fix(computer-use): vision capture returns an image on cua-driver >=0.5.x

Vision mode called a `screenshot` MCP tool that cua-driver dropped in 0.5.x (full-window PNG capture was folded into `get_window_state`). The driver replied "Unknown tool: screenshot", so `images` came back empty, `png_b64` stayed None, and capture returned a 0x0 result with no image on every call. `som`/`ax` were unaffected because they already use `get_window_state`, which masked the regression. Route vision by capability: (1) if the driver advertises `screenshot` (older builds), use it (no AX walk); (2) otherwise, call `get_window_state` but discard the AX tree/elements, returning only the PNG so vision stays free of element noise; (3) if capabilities are not yet discovered, try `screenshot` and fall back to `get_window_state` on an empty image, so the path self-heals. Add `_image_from_tool_result` to pull the PNG from either an MCP image content-part or `structuredContent.screenshot_png_b64`, and use it on the som path too so the image won't silently drop on driver builds that deliver it via structuredContent instead of a content part. Verified live (vision: 1568x954, 0 elements; som: image + 527 elements) and with unit coverage of all four routing cases.
2026-07-04 01:05:21 +08:00 · 2026-06-22 17:40:18 -05:00
1374 changed files with 14148 additions and 126825 deletions
--- a/.envrc
+++ b/.envrc
@@ -1,5 +1,5 @@
 watch_file pyproject.toml uv.lock
 watch_file package-lock.json package.json web/package.json ui-tui/package.json website/package.json apps/shared/package.json apps/desktop/package.json ui-tui/packages/hermes-ink/package.json
-watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix nix/hermes-agent.nix nix/desktop.nix
+watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix

 use flake
--- a/.github/actions/detect-changes/action.yml
+++ b/.github/actions/detect-changes/action.yml
@@ -1,62 +0,0 @@
-name: Detect affected areas
-description: >-
-  Classify a PR's changed files into CI work lanes (python, frontend, site,
-  scan, deps, mcp_catalog) so the orchestrator can conditionally call only
-  the sub-workflows a PR can affect. Outputs are always "true" on push/dispatch
-  events and fail open (everything "true") when the diff cannot be computed.
-
-outputs:
-  python:
-    description: Run Python tests / ruff / ty / windows-footguns.
-    value: ${{ steps.classify.outputs.python }}
-  frontend:
-    description: Run the TypeScript typecheck matrix + desktop build.
-    value: ${{ steps.classify.outputs.frontend }}
-  docker_meta:
-    description: Docker setup and meta files have changed.
-    value: ${{ steps.classify.outputs.docker_meta }}
-  site:
-    description: Build the Docusaurus docs site.
-    value: ${{ steps.classify.outputs.site }}
-  scan:
-    description: Run the supply-chain critical-pattern scanner.
-    value: ${{ steps.classify.outputs.scan }}
-  deps:
-    description: Check pyproject.toml dependency upper bounds.
-    value: ${{ steps.classify.outputs.deps }}
-  mcp_catalog:
-    description: Require MCP catalog security review label.
-    value: ${{ steps.classify.outputs.mcp_catalog }}
-
-runs:
-  using: composite
-  steps:
-    - name: Classify changed files
-      id: classify
-      shell: bash
-      env:
-        GH_TOKEN: ${{ github.token }}
-        REPO: ${{ github.repository }}
-        EVENT_NAME: ${{ github.event_name }}
-        BASE_SHA: ${{ github.event.pull_request.base.sha }}
-        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
-      run: |
-        set -euo pipefail
-
-        # Only pull_request events are gated. Other events (push, release,
-        # dispatch) leave CHANGED empty, so the classifier fails open and every
-        # lane runs. Post-merge / on-demand validation is never weakened.
-        if [ "$EVENT_NAME" = "pull_request" ]; then
-          # Use the compare endpoint with the pinned base/head SHAs from the
-          # event payload instead of the "current PR files" endpoint. The SHAs
-          # are frozen at trigger time, so the file list is deterministic even
-          # if the PR receives a new push between trigger and detect.
-          CHANGED="$(gh api \
-            --paginate \
-            "repos/${REPO}/compare/${BASE_SHA}...${HEAD_SHA}" \
-            --jq '.files[].filename' || true)"
-        fi
-
-        echo "Changed files:"
-        printf '%s\n' "${CHANGED:-(none)}"
-        printf '%s\n' "${CHANGED:-}" | python3 scripts/ci/classify_changes.py
--- a/.github/actions/hermes-smoke-test/action.yml
+++ b/.github/actions/hermes-smoke-test/action.yml
@@ -0,0 +1,50 @@
+name: Hermes smoke test
+description: >
+  Run the image's built-in entrypoint against `--help` and `dashboard --help`
+  to catch basic runtime regressions before publishing.  Requires the image
+  to already be loaded into the local Docker daemon under `image`.
+
+  Works identically on amd64 and arm64 runners.
+
+inputs:
+  image:
+    description: Fully-qualified image tag (e.g. nousresearch/hermes-agent:test)
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Ensure /tmp/hermes-test is hermes-writable
+      shell: bash
+      run: |
+        # The image runs as the hermes user (UID 10000).  GitHub Actions
+        # creates /tmp/hermes-test root-owned by default, which hermes
+        # can't write to — chown it to match the in-container UID before
+        # bind-mounting.  Real users doing `docker run -v ~/.hermes:...`
+        # with their own UID hit the same issue and have their own
+        # remediations (HERMES_UID env var, or chown locally).
+        mkdir -p /tmp/hermes-test
+        sudo chown -R 10000:10000 /tmp/hermes-test
+
+    - name: hermes --help
+      shell: bash
+      run: |
+        # Use the image's real ENTRYPOINT (/init + main-wrapper.sh) so
+        # this exercises the actual production startup path. PR #30136
+        # review caught that an --entrypoint override here had been
+        # silently neutered by the s6-overlay migration — stage2-hook
+        # ignores its CMD args, so the smoke test was a no-op.
+        docker run --rm \
+          -v /tmp/hermes-test:/opt/data \
+          "${{ inputs.image }}" --help
+
+    - name: hermes dashboard --help
+      shell: bash
+      run: |
+        # Regression guard for #9153: dashboard was present in source but
+        # missing from the published image.  If this fails, something in
+        # the Dockerfile is excluding the dashboard subcommand from the
+        # installed package.
+        docker run --rm \
+          -v /tmp/hermes-test:/opt/data \
+          "${{ inputs.image }}" dashboard --help
--- a/.github/actions/retry/action.yml
+++ b/.github/actions/retry/action.yml
@@ -1,50 +0,0 @@
-name: Retry a flaky command
-description: >-
-  Run a shell command, retrying on non-zero exit. For dependency installs
-  (npm ci, uv sync) whose only failures are transient network/toolchain
-  flakes — a node-gyp header fetch, a registry blip — so CI self-heals
-  instead of needing a manual re-run.
-
-inputs:
-  command:
-    description: Shell command to run (and retry).
-    required: true
-  attempts:
-    description: Max attempts before giving up.
-    default: "3"
-  delay:
-    description: Seconds to wait between attempts.
-    default: "10"
-  working-directory:
-    description: Directory to run in.
-    default: "."
-
-runs:
-  using: composite
-  steps:
-    - shell: bash
-      working-directory: ${{ inputs.working-directory }}
-      # command goes through env, never interpolated into the script body, so
-      # a command with quotes/specials can't break or inject into the runner.
-      env:
-        _CMD: ${{ inputs.command }}
-        _ATTEMPTS: ${{ inputs.attempts }}
-        _DELAY: ${{ inputs.delay }}
-      run: |
-        set -uo pipefail
-        n=0
-        while :; do
-          n=$((n + 1))
-          echo "::group::attempt $n/$_ATTEMPTS: $_CMD"
-          if bash -c "$_CMD"; then
-            echo "::endgroup::"
-            exit 0
-          fi
-          echo "::endgroup::"
-          if [ "$n" -ge "$_ATTEMPTS" ]; then
-            echo "::error::failed after $n attempts: $_CMD"
-            exit 1
-          fi
-          echo "::warning::attempt $n failed; retrying in ${_DELAY}s: $_CMD"
-          sleep "$_DELAY"
-        done
--- a/.github/workflows/build-windows-installer.yml
+++ b/.github/workflows/build-windows-installer.yml
@@ -0,0 +1,100 @@
+name: Build Windows Installer
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  # Gate: workflow_dispatch is already restricted to users with write access,
+  # but we want ADMIN-only. Explicitly check the triggering actor's repo
+  # permission via the API and fail fast for anyone below admin.
+  authorize:
+    name: Authorize (admins only)
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - name: Check actor is a repo admin
+        env:
+          GH_TOKEN: ${{ github.token }}
+          ACTOR: ${{ github.actor }}
+        run: |
+          set -euo pipefail
+          perm=$(gh api \
+            "repos/${{ github.repository }}/collaborators/${ACTOR}/permission" \
+            --jq '.permission')
+          echo "Actor '${ACTOR}' has permission: ${perm}"
+          if [ "${perm}" != "admin" ]; then
+            echo "::error::'${ACTOR}' is not a repo admin (permission=${perm}). Refusing to build/sign."
+            exit 1
+          fi
+          echo "Authorized: '${ACTOR}' is an admin."
+
+  build:
+    name: Hermes-Setup.exe
+    needs: authorize
+    runs-on: windows-latest
+    timeout-minutes: 30
+    permissions:
+      contents: read
+      # Required for OIDC auth to Azure (azure/login federated credentials).
+      id-token: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+
+      - name: Setup Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
+        with:
+          node-version: 22
+          cache: npm
+
+      - name: Install npm dependencies
+        run: npm ci
+
+      - name: Setup Rust
+        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable
+
+      - name: Cache Rust targets
+        uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32  # v2
+        with:
+          workspaces: apps/bootstrap-installer/src-tauri
+
+      - name: Build installer
+        run: npm run tauri:build
+        working-directory: apps/bootstrap-installer
+
+      - name: Azure login (OIDC)
+        uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5  # v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Sign Hermes-Setup.exe with Azure Artifact Signing
+        uses: azure/artifact-signing-action@c7ab2a863ab5f9a846ddb8265964877ef296ee82  # v2
+        with:
+          endpoint: ${{ vars.AZURE_SIGNING_ENDPOINT }}
+          signing-account-name: ${{ vars.AZURE_SIGNING_ACCOUNT_NAME }}
+          certificate-profile-name: ${{ vars.AZURE_SIGNING_CERTIFICATE_PROFILE }}
+          # Sign both the raw exe and the bundled NSIS installer.
+          files-folder: ${{ github.workspace }}\apps\bootstrap-installer\src-tauri\target\release
+          files-folder-filter: exe
+          files-folder-recurse: true
+          file-digest: SHA256
+          timestamp-rfc3161: http://timestamp.acs.microsoft.com
+          timestamp-digest: SHA256
+
+      - name: Upload NSIS installer
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
+        with:
+          name: Hermes-Setup-installer
+          path: apps/bootstrap-installer/src-tauri/target/release/bundle/nsis/*.exe
+
+      - name: Upload raw exe
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
+        with:
+          name: Hermes-Setup-exe
+          path: apps/bootstrap-installer/src-tauri/target/release/Hermes-Setup.exe
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,167 +0,0 @@
-name: CI
-
-# Orchestrator workflow. Runs ``detect-changes`` once, then conditionally
-# calls the sub-workflows that a PR can actually affect. A final
-# ``all-checks-pass`` gate job aggregates results so branch protection only
-# needs to require a single check.
-#
-# Sub-workflows are triggered via ``workflow_call`` and keep their own job
-# definitions, matrices, and concurrency settings. They no longer have
-# ``push:`` / ``pull_request:`` triggers of their own — everything flows
-# through this file.
-
-on:
-  pull_request:
-  push:
-    branches: [main]
-
-permissions:
-  contents: read
-  pull-requests: write # needed by lint (PR comment) + supply-chain (PR comment)
-  actions: read # needed by osv-scanner (SARIF upload)
-  security-events: write # needed by osv-scanner (SARIF upload)
-  packages: write # needed by docker build
-
-concurrency:
-  group: ci-${{ github.ref }}
-  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
-
-jobs:
-  # ─────────────────────────────────────────────────────────────────────
-  # detect: run the classifier once. Every downstream job reads its outputs
-  # to decide whether to run. On push/dispatch the classifier fails open
-  # (all lanes true) so post-merge validation is never weakened.
-  # ─────────────────────────────────────────────────────────────────────
-  detect:
-    name: Detect affected areas
-    runs-on: ubuntu-latest
-    outputs:
-      python: ${{ steps.classify.outputs.python }}
-      frontend: ${{ steps.classify.outputs.frontend }}
-      site: ${{ steps.classify.outputs.site }}
-      scan: ${{ steps.classify.outputs.scan }}
-      deps: ${{ steps.classify.outputs.deps }}
-      docker_meta: ${{ steps.classify.outputs.docker_meta }}
-      mcp_catalog: ${{ steps.classify.outputs.mcp_catalog }}
-      event_name: ${{ github.event_name }}
-    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      - name: Detect affected areas
-        id: classify
-        uses: ./.github/actions/detect-changes
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Lane-gated sub-workflows. Each runs in parallel after detect finishes.
-  # Skipped workflows (if condition is false) don't spin up runners.
-  # ─────────────────────────────────────────────────────────────────────
-  tests:
-    name: Python tests
-    needs: detect
-    if: needs.detect.outputs.python == 'true'
-    uses: ./.github/workflows/tests.yml
-    with:
-      slice_count: 8
-
-  lint:
-    name: Python lints
-    needs: detect
-    if: needs.detect.outputs.python == 'true'
-    uses: ./.github/workflows/lint.yml
-    with:
-      event_name: ${{ needs.detect.outputs.event_name }}
-
-  typecheck:
-    name: TypeScript
-    needs: detect
-    if: needs.detect.outputs.frontend == 'true'
-    uses: ./.github/workflows/typecheck.yml
-
-  docs-site:
-    name: Docs Site
-    needs: detect
-    if: needs.detect.outputs.site == 'true'
-    uses: ./.github/workflows/docs-site-checks.yml
-
-  history-check:
-    name: Deny unrelated histories
-    needs: detect
-    if: needs.detect.outputs.event_name == 'pull_request'
-    uses: ./.github/workflows/history-check.yml
-
-  contributor-check:
-    name: Check contributors
-    needs: detect
-    if: needs.detect.outputs.python == 'true'
-    uses: ./.github/workflows/contributor-check.yml
-
-  uv-lockfile:
-    name: Check uv.lock
-    needs: detect
-    uses: ./.github/workflows/uv-lockfile-check.yml
-
-  docker-lint:
-    name: Lint Docker scripts
-    needs: detect
-    if: needs.detect.outputs.docker_meta == 'true'
-    uses: ./.github/workflows/docker-lint.yml
-
-  docker:
-    name: Build&Test Docker image
-    needs: detect
-    if: needs.detect.outputs.python == 'true' || needs.detect.outputs.frontend == 'true' || needs.detect.outputs.docker_meta == 'true'
-    uses: ./.github/workflows/docker.yml
-    secrets: inherit
-
-  supply-chain:
-    name: Supply-chain scan
-    needs: detect
-    if: needs.detect.outputs.event_name == 'pull_request' && (needs.detect.outputs.scan == 'true' || needs.detect.outputs.deps == 'true' || needs.detect.outputs.mcp_catalog == 'true')
-    uses: ./.github/workflows/supply-chain-audit.yml
-    with:
-      event_name: ${{ needs.detect.outputs.event_name }}
-      scan: ${{ needs.detect.outputs.scan == 'true' }}
-      deps: ${{ needs.detect.outputs.deps == 'true' }}
-      mcp_catalog: ${{ needs.detect.outputs.mcp_catalog == 'true' }}
-
-  osv-scanner:
-    name: OSV scan
-    uses: ./.github/workflows/osv-scanner.yml
-
-  # ─────────────────────────────────────────────────────────────────────
-  # Gate: runs after everything. ``if: always()`` ensures it reports a
-  # status even when some deps were skipped. Only actual ``failure``
-  # results cause it to fail; ``skipped`` is treated as success.
-  #
-  # Branch protection should require ONLY this check.
-  # ─────────────────────────────────────────────────────────────────────
-  all-checks-pass:
-    name: All required checks pass
-    needs:
-      - tests
-      - lint
-      - typecheck
-      - docs-site
-      - history-check
-      - contributor-check
-      - uv-lockfile
-      - docker-lint
-      - supply-chain
-      - osv-scanner
-      # we don't require docker to pass rn because it's so slow lol
-      # - docker
-    if: always()
-    runs-on: ubuntu-latest
-    steps:
-      - name: Evaluate job results
-        env:
-          RESULTS: ${{ toJSON(needs.*.result) }}
-        run: |
-          echo "$RESULTS" | python3 -c "
-          import json, sys
-          results = json.load(sys.stdin)
-          failed = [r for r in results if r == 'failure']
-          if failed:
-              print(f'::error::{len(failed)} job(s) failed')
-              sys.exit(1)
-          print('All checks passed (or were skipped)')
-          "
--- a/.github/workflows/contributor-check.yml
+++ b/.github/workflows/contributor-check.yml
@@ -1,8 +1,11 @@
 name: Contributor Attribution Check

 on:
-  workflow_call:
-
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]
 permissions:
  contents: read

@@ -14,7 +17,21 @@ jobs:
        with:
          fetch-depth: 0  # Full history needed for git log

+      - name: Check if relevant files changed
+        id: filter
+        run: |
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+          CHANGED=$(git diff --name-only "$BASE"..."$HEAD" -- '*.py' '**/*.py' '.github/workflows/contributor-check.yml' || true)
+          if [ -n "$CHANGED" ]; then
+            echo "run=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "run=false" >> "$GITHUB_OUTPUT"
+            echo "No Python files changed, skipping attribution check."
+          fi
+
      - name: Check for unmapped contributor emails
+        if: steps.filter.outputs.run == 'true'
        run: |
          # Get the merge base between this PR and main
          MERGE_BASE=$(git merge-base origin/main HEAD)
--- a/.github/workflows/docker-lint.yml
+++ b/.github/workflows/docker-lint.yml
@@ -2,7 +2,7 @@ name: Docker / shell lint

 # Lints the container build inputs: Dockerfile (via hadolint) and any shell
 # scripts under docker/ (via shellcheck). These catch the class of regression
-# the behavioral docker smoke test can't — unquoted variable
+# the behavioral docker-publish smoke test can't — unquoted variable
 # expansions, silently-failing RUN commands, etc.
 #
 # Rules and ignores are documented in .hadolint.yaml at the repo root.
@@ -11,7 +11,19 @@ name: Docker / shell lint
 # activate script doesn't exist at lint time.

 on:
-  workflow_call:
+  push:
+    branches: [main]
+    paths:
+      - Dockerfile
+      - docker/**
+      - .hadolint.yaml
+      - .github/workflows/docker-lint.yml
+
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]

 permissions:
  contents: read
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,9 +1,25 @@
-name: Docker Build, Test, and Publish
+name: Docker Build and Publish

 on:
+  push:
+    branches: [main]
+    paths:
+      - '**/*.py'
+      - 'pyproject.toml'
+      - 'uv.lock'
+      - 'Dockerfile'
+      - 'docker/**'
+      - '.github/workflows/docker-publish.yml'
+      - '.github/actions/hermes-smoke-test/**'
+
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]
+
  release:
    types: [published]
-  workflow_call:

 permissions:
  contents: read
@@ -24,7 +40,11 @@ env:
  IMAGE_NAME: nousresearch/hermes-agent

 jobs:
-  # Build, test, and optionally push the amd64 image.
+  # ---------------------------------------------------------------------------
+  # Build amd64 natively.  This job also runs the smoke tests (basic --help
+  # and the dashboard subcommand regression guard from #9153), because amd64
+  # is the only arch we can `load` into the local daemon on an amd64 runner.
+  # ---------------------------------------------------------------------------
  build-amd64:
    # Only run on the upstream repository, not on forks
    if: github.repository == 'NousResearch/hermes-agent'
@@ -34,19 +54,16 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

-      # The image build + integration tests run on every event
-      # (PRs, push-to-main, release). Publish steps below are gated to
-      # push-to-main / release only.
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

-      # Build once, load into the local daemon for testing.  Cached
+      # Build once, load into the local daemon for smoke testing.  Cached
      # to gha with a per-arch scope; the push step below reuses every
      # layer from this build.
-      - name: Build image (amd64)
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+      - name: Build image (amd64, smoke test)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -58,12 +75,24 @@ jobs:
          cache-from: type=gha,scope=docker-amd64
          cache-to: type=gha,mode=max,scope=docker-amd64

+      - name: Smoke test image
+        uses: ./.github/actions/hermes-smoke-test
+        with:
+          image: ${{ env.IMAGE_NAME }}:test
+
+      # ---------------------------------------------------------------------
      # Run the docker-integration test suite against the freshly-built
-      # image already loaded into the local daemon (`:test`).
+      # image already loaded into the local daemon (`:test`).  These tests
+      # are excluded from the sharded `tests.yml :: test` matrix on purpose
+      # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each
+      # shard would otherwise reach the session-scoped ``built_image``
+      # fixture in ``tests/docker/conftest.py`` and start a 3-7min
+      # ``docker build`` — guaranteed to
+      # die in fixture setup.
      #
-      # Piggybacking here avoids a second image build: the build step
-      # already loaded the image into the daemon under
-      # `${IMAGE_NAME}:test`, so we just point ``HERMES_TEST_IMAGE`` at
+      # Piggybacking here avoids a second image build: the smoke test
+      # already proved the image loads + runs, so the daemon has it under
+      # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at
      # that.  The fixture's ``HERMES_TEST_IMAGE`` branch (see
      # tests/docker/conftest.py:62-63) short-circuits the rebuild.
      #
@@ -73,18 +102,20 @@ jobs:
      # cheapest path to coverage on every PR that touches docker code.
      # ---------------------------------------------------------------------
      - name: Install uv (for docker tests)
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5

      - name: Set up Python 3.11 (for docker tests)
        run: uv python install 3.11

      - name: Install Python dependencies (for docker tests)
        run: |
+          uv venv .venv --python 3.11
+          source .venv/bin/activate
          # ``dev`` extra pulls in pytest, pytest-asyncio —
          # everything tests/docker/ needs.  We deliberately avoid ``all``
          # here because the docker tests only drive the container via
          # subprocess and don't import hermes_agent's optional deps.
-          uv sync --locked --python 3.11 --extra dev
+          uv pip install -e ".[dev]"

      - name: Run docker integration tests
        env:
@@ -96,11 +127,12 @@ jobs:
          OPENAI_API_KEY: ""
          NOUS_API_KEY: ""
        run: |
-          scripts/run_tests.sh tests/docker/ --file-timeout 600
+          source .venv/bin/activate
+          python -m pytest tests/docker/ -v --tb=short

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -111,7 +143,7 @@ jobs:
      - name: Push amd64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -135,7 +167,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: digest-amd64
          path: /tmp/digests/*
@@ -143,7 +175,10 @@ jobs:
          retention-days: 1

  # ---------------------------------------------------------------------------
-  # Build, test, and optionally push the arm64 image.
+  # Build arm64 natively on GitHub's free arm64 runner.  This replaces the
+  # previous QEMU-emulated arm64 build, which was ~5-10x slower and shared
+  # a cache scope with amd64.  Matches the amd64 job's shape: build+load,
+  # smoke test, then on push/release push by digest.
  # ---------------------------------------------------------------------------
  build-arm64:
    if: github.repository == 'NousResearch/hermes-agent'
@@ -153,35 +188,57 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

      # Log in to ghcr.io so the registry-backed build cache below can be
      # read (cache-from) on every event and written (cache-to) on
      # push/release.  Uses the workflow's GITHUB_TOKEN, which is valid for
      # the whole job — unlike the gha cache backend's short-lived Azure SAS
      # token, which expired mid-build on slow cold-cache arm64 runs and
-      # crashed the build before the tests ran (the reason the gha cache
+      # crashed the build before the smoke test (the reason the gha cache
      # was removed from arm64 PRs in the first place).
      - name: Log in to ghcr.io (build cache)
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # Build once, load into the local daemon for testing, then push
-      # by digest below. Reads AND writes the registry-backed cache so the
-      # push reuses layers from this build and the next build starts warm.
+      # Build once, load into the local daemon for smoke testing.
+      #
+      # PR builds use the registry-backed cache READ-ONLY (cache-from only):
+      # they pull warm layers pushed by the most recent main build but never
+      # write, so rapid PR pushes don't race on cache writes or pollute the
+      # cache ref.  This restores warm-cache speed to arm64 PR builds (which
+      # were running fully uncached and were ~45% slower than amd64, making
+      # them the job most often cancelled on supersede).
      #
      # Registry cache (type=registry on ghcr.io) is used instead of the gha
      # cache that previously broke here: its credential is the job-lifetime
      # GITHUB_TOKEN, not a short-lived SAS token, so the cold-build-outlives-
      # token failure mode cannot recur.
-      - name: Build image (arm64, cached publish)
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+      - name: Build image (arm64, smoke test, cache read-only PR)
+        if: github.event_name == 'pull_request'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        with:
+          context: .
+          file: Dockerfile
+          load: true
+          platforms: linux/arm64
+          tags: ${{ env.IMAGE_NAME }}:test
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
+          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
+
+      # Main/release builds read AND write the registry cache so the digest
+      # push below reuses layers from this smoke-test build, and so the next
+      # PR/main build starts warm.
+      - name: Build image (arm64, smoke test, cached publish)
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -193,29 +250,14 @@ jobs:
          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
          cache-to: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64,mode=max

-      - name: Install uv for docker tests
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
-
-      - name: Set up Python 3.11 for docker tests
-        run: uv python install 3.11
-
-      - name: Install Python dependencies for docker tests
-        run: |
-          uv sync --locked --python 3.11 --extra dev
-
-      - name: Run docker tests
-        env:
-          # Skip rebuild; use the image already loaded by the build step.
-          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
-          OPENROUTER_API_KEY: ""
-          OPENAI_API_KEY: ""
-          NOUS_API_KEY: ""
-        run: |
-          scripts/run_tests.sh tests/docker/ --file-timeout 600
+      - name: Smoke test image
+        uses: ./.github/actions/hermes-smoke-test
+        with:
+          image: ${{ env.IMAGE_NAME }}:test

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -223,7 +265,7 @@ jobs:
      - name: Push arm64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -245,7 +287,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: digest-arm64
          path: /tmp/digests/*
@@ -267,17 +309,17 @@ jobs:
    timeout-minutes: 10
    steps:
      - name: Download digests
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
          path: /tmp/digests
          pattern: digest-*
          merge-multiple: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

      - name: Log in to Docker Hub
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
--- a/.github/workflows/docs-site-checks.yml
+++ b/.github/workflows/docs-site-checks.yml
@@ -1,7 +1,13 @@
 name: Docs Site Checks

 on:
-  workflow_call:
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]
+
+  workflow_dispatch:

 permissions:
  contents: read
@@ -19,19 +25,15 @@ jobs:
          cache-dependency-path: website/package-lock.json

      - name: Install website dependencies
-        uses: ./.github/actions/retry
-        with:
-          command: npm ci
-          working-directory: website
+        run: npm ci
+        working-directory: website

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"

      - name: Install ascii-guard
-        uses: ./.github/actions/retry
-        with:
-          command: python -m pip install ascii-guard==2.3.0 pyyaml==6.0.3
+        run: python -m pip install ascii-guard==2.3.0 pyyaml==6.0.3

      - name: Extract skill metadata for dashboard
        run: python3 website/scripts/extract-skills.py
--- a/.github/workflows/history-check.yml
+++ b/.github/workflows/history-check.yml
@@ -14,7 +14,11 @@ name: History Check
 # the PR head and main to be non-empty.

 on:
-  workflow_call:
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]

 permissions:
  contents: read
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,12 +9,18 @@ name: Lint (ruff + ty)
 #      enforcement fails.

 on:
-  workflow_call:
-    inputs:
-      event_name:
-        description: The event name from the calling orchestrator (pull_request or push).
-        type: string
-        required: true
+  push:
+    branches: [main]
+    paths-ignore:
+      - "**/*.md"
+      - "docs/**"
+      - "website/**"
+
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]

 permissions:
  contents: read
@@ -27,7 +33,6 @@ concurrency:
 jobs:
  lint-diff:
    name: ruff + ty diff
-    if: inputs.event_name == 'pull_request'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
@@ -37,19 +42,19 @@ jobs:
          fetch-depth: 0 # need full history for merge-base + worktree

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5

      - name: Install ruff + ty
-        uses: ./.github/actions/retry
-        with:
-          command: uv tool install ruff && uv tool install ty
+        run: |
+          uv tool install ruff
+          uv tool install ty

      - name: Determine base ref
        id: base
        run: |
          # For PRs, diff against the merge base with the target branch.
          # For pushes to main, diff against the previous commit on main.
-          if [ "${{ inputs.event_name }}" = "pull_request" ]; then
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
            BASE_SHA=$(git merge-base "origin/${{ github.base_ref }}" HEAD)
            BASE_REF="origin/${{ github.base_ref }}"
          else
@@ -105,19 +110,19 @@ jobs:
            --base-ty   .lint-reports/base/ty.json \
            --head-ty   .lint-reports/head/ty.json \
            --base-ref  "${{ steps.base.outputs.ref }}" \
-            --head-ref  "${{ inputs.event_name == 'pull_request' && github.head_ref || github.ref_name }}" \
+            --head-ref  "${{ github.event_name == 'pull_request' && github.head_ref || github.ref_name }}" \
            --output    .lint-reports/summary.md
          cat .lint-reports/summary.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload reports as artifact
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: lint-reports
          path: .lint-reports/
          retention-days: 14

      - name: Post / update PR comment
-        if: inputs.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
+        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
        continue-on-error: true
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7
        with:
@@ -164,12 +169,10 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5

      - name: Install ruff
-        uses: ./.github/actions/retry
-        with:
-          command: uv tool install ruff
+        run: uv tool install ruff

      - name: ruff check .
        # No --exit-zero, no || true. Exit code propagates to the job,
--- a/.github/workflows/osv-scanner.yml
+++ b/.github/workflows/osv-scanner.yml
@@ -1,8 +1,8 @@
 name: OSV-Scanner

 # Scans lockfiles (uv.lock, package-lock.json) against the OSV vulnerability
-# database. Runs on every PR/push (via the ci.yml orchestrator's workflow_call)
-# and on a weekly schedule against main.
+# database. Runs on every PR, on pushes to main that touch a dependency
+# manifest or lockfile, and on a weekly schedule against main.
 #
 # This is detection-only — OSV-Scanner does NOT open PRs or modify pins.
 # It reports known CVEs in currently-pinned dependency versions so we can
@@ -10,9 +10,9 @@ name: OSV-Scanner
 # (full SHA / exact version) is preserved; only the notification signal
 # is added.
 #
-# Complements the supply-chain-audit.yml workflow (which scans for malicious
-# code patterns in PR diffs) by covering the orthogonal "currently-pinned
-# dep became known-vulnerable" case.
+# Complements the existing supply-chain-audit.yml workflow (which scans
+# for malicious code patterns in PR diffs) by covering the orthogonal
+# "currently-pinned dep became known-vulnerable" case.
 #
 # Uses Google's officially-recommended reusable workflow, pinned by SHA.
 # Findings land in the repo's Security tab (Code Scanning > OSV-Scanner).
@@ -20,7 +20,19 @@ name: OSV-Scanner
 # vulnerabilities in pinned deps that we may need to patch deliberately.

 on:
-  workflow_call:
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+    paths:
+      - "uv.lock"
+      - "pyproject.toml"
+      - "package.json"
+      - "package-lock.json"
+      - "website/package-lock.json"
  schedule:
    # Weekly scan against main — catches CVEs published after merge for
    # deps that haven't changed since.
--- a/.github/workflows/skills-index.yml
+++ b/.github/workflows/skills-index.yml
@@ -3,17 +3,17 @@ name: Build Skills Index
 on:
  schedule:
    # Run twice daily: 6 AM and 6 PM UTC
-    - cron: "0 6,18 * * *"
-  workflow_dispatch: # Manual trigger
+    - cron: '0 6,18 * * *'
+  workflow_dispatch:  # Manual trigger
  push:
    branches: [main]
    paths:
-      - "scripts/build_skills_index.py"
-      - ".github/workflows/skills-index.yml"
+      - 'scripts/build_skills_index.py'
+      - '.github/workflows/skills-index.yml'

 permissions:
  contents: read
-  actions: write # to trigger deploy-site.yml on schedule
+  actions: write   # to trigger deploy-site.yml on schedule

 jobs:
  build-index:
@@ -21,11 +21,11 @@ jobs:
    if: github.repository == 'NousResearch/hermes-agent'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
-          python-version: "3.11"
+          python-version: '3.11'

      - name: Install dependencies
        run: pip install httpx==0.28.1 pyyaml==6.0.2
@@ -36,7 +36,7 @@ jobs:
        run: python scripts/build_skills_index.py

      - name: Upload index artifact
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: skills-index
          path: website/static/api/skills-index.json
--- a/.github/workflows/supply-chain-audit.yml
+++ b/.github/workflows/supply-chain-audit.yml
@@ -1,5 +1,16 @@
 name: Supply Chain Audit

+on:
+  # No paths filter — the jobs must always run so required checks
+  # report a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+permissions:
+  pull-requests: write
+  contents: read
+
 # Narrow, high-signal scanner. Only fires on critical indicators of supply
 # chain attacks (e.g. the litellm-style payloads). Low-signal heuristics
 # (plain base64, plain exec/eval, dependency/Dockerfile/workflow edits,
@@ -8,40 +19,56 @@ name: Supply Chain Audit
 # the scanner. Keep this file's checks ruthlessly narrow: if you find
 # yourself adding WARNING-tier patterns here again, make a separate
 # advisory-only workflow instead.
-#
-# Path-gating is handled centrally by the ``ci.yml`` orchestrator's
-# ``detect`` job. The orchestrator passes ``scan`` / ``deps`` /
-# ``mcp_catalog`` booleans as inputs; this workflow's jobs gate on those
-# inputs instead of re-computing the diff.
-
-on:
-  workflow_call:
-    inputs:
-      event_name:
-        description: The event name from the calling orchestrator.
-        type: string
-        required: true
-      scan:
-        description: Whether supply-chain-relevant files changed.
-        type: boolean
-        required: true
-      deps:
-        description: Whether pyproject.toml changed.
-        type: boolean
-        required: true
-      mcp_catalog:
-        description: Whether the MCP catalog / installer changed.
-        type: boolean
-        required: true
-
-permissions:
-  pull-requests: write
-  contents: read

 jobs:
+  # ── Path filter (shared by scan, dep-bounds and mcp-catalog-review) ──
+  changes:
+    runs-on: ubuntu-latest
+    outputs:
+      # True when any file the scanner cares about changed in this PR
+      scan: ${{ steps.filter.outputs.scan }}
+      # True when pyproject.toml changed in this PR
+      deps: ${{ steps.filter.outputs.deps }}
+      # True when the curated MCP catalog / bundled MCP manifests changed.
+      mcp_catalog: ${{ steps.filter.outputs.mcp_catalog }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 0
+      - name: Check for relevant file changes
+        id: filter
+        run: |
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+          SCAN_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- \
+            '*.py' '**/*.py' '*.pth' '**/*.pth' \
+            'setup.py' 'setup.cfg' \
+            'sitecustomize.py' 'usercustomize.py' '__init__.pth' \
+            'pyproject.toml' || true)
+          if [ -n "$SCAN_FILES" ]; then
+            echo "scan=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "scan=false" >> "$GITHUB_OUTPUT"
+          fi
+          DEPS_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- 'pyproject.toml' || true)
+          if [ -n "$DEPS_FILES" ]; then
+            echo "deps=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "deps=false" >> "$GITHUB_OUTPUT"
+          fi
+          MCP_CATALOG_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- \
+            'optional-mcps/**' \
+            'hermes_cli/mcp_catalog.py' || true)
+          if [ -n "$MCP_CATALOG_FILES" ]; then
+            echo "mcp_catalog=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "mcp_catalog=false" >> "$GITHUB_OUTPUT"
+          fi
+
  scan:
    name: Scan PR for critical supply chain risks
-    if: inputs.scan
+    needs: changes
+    if: needs.changes.outputs.scan == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@@ -84,7 +111,7 @@ jobs:
          fi

          # --- base64 decode + exec/eval on the same line (the litellm attack pattern) ---
-          B64_EXEC_HITS=$(echo "$DIFF" | grep -n '^+' | grep -iE 'base64\.(b64decode|decodebytes|urlsafe_b64decode)' | grep -iE 'exec\(|eval\(' | head -10 || true)
+          B64_EXEC_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -iE 'base64\.(b64decode|decodebytes|urlsafe_b64decode)' | grep -iE 'exec\(|eval\(' | head -10 || true)
          if [ -n "$B64_EXEC_HITS" ]; then
            FINDINGS="${FINDINGS}
          ### 🚨 CRITICAL: base64 decode + exec/eval combo
@@ -98,7 +125,7 @@ jobs:
          fi

          # --- subprocess with encoded/obfuscated command argument ---
-          PROC_HITS=$(echo "$DIFF" | grep -n '^+' | grep -E 'subprocess\.(Popen|call|run)\s*\(' | grep -iE 'base64|\\x[0-9a-f]{2}|chr\(' | head -10 || true)
+          PROC_HITS=$(echo "$DIFF" | grep -n '^\+' | grep -E 'subprocess\.(Popen|call|run)\s*\(' | grep -iE 'base64|\\x[0-9a-f]{2}|chr\(' | head -10 || true)
          if [ -n "$PROC_HITS" ]; then
            FINDINGS="${FINDINGS}
          ### 🚨 CRITICAL: subprocess with encoded/obfuscated command
@@ -160,9 +187,23 @@ jobs:
          echo "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
          exit 1

+  # Gate: reports success when scan was skipped (no relevant files changed).
+  # This ensures the required check always gets a status.
+  scan-gate:
+    name: Scan PR for critical supply chain risks
+    needs: changes
+    # always() so the gate still reports SUCCESS even if `changes` fails/is
+    # skipped, instead of leaving the required check unreported ("pending").
+    # NOTE: this is fail-open — if `changes` fails, the scan never ran.
+    if: always() && needs.changes.outputs.scan != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No supply-chain-relevant files changed, skipping scan."
+
  dep-bounds:
    name: Check PyPI dependency upper bounds
-    if: inputs.deps
+    needs: changes
+    if: needs.changes.outputs.deps == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@@ -212,7 +253,7 @@ jobs:
          $(cat /tmp/unbounded.txt)
          \`\`\`

-          **Fix:** Add an upper bound, e.g. \`"package>=1.2.0,<2"\`
+          **Fix:** Add an upper bound, e.g. \`\"package>=1.2.0,<2\"\`

          ---
          *See PR #2810 and CONTRIBUTING.md for the full policy rationale.*"
@@ -225,9 +266,23 @@ jobs:
          echo "::error::PyPI dependencies without upper bounds detected. Add <next_major ceiling per CONTRIBUTING.md policy."
          exit 1

+  # Gate: reports success when dep-bounds was skipped (pyproject.toml unchanged).
+  # This ensures the required check always gets a status.
+  dep-bounds-gate:
+    name: Check PyPI dependency upper bounds
+    needs: changes
+    # always() so the gate still reports SUCCESS even if `changes` fails/is
+    # skipped, instead of leaving the required check unreported ("pending").
+    # NOTE: this is fail-open — if `changes` fails, the check never ran.
+    if: always() && needs.changes.outputs.deps != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No pyproject.toml changes, skipping dependency bounds check."
+
  mcp-catalog-review:
    name: MCP catalog security review
-    if: inputs.mcp_catalog
+    needs: changes
+    if: needs.changes.outputs.mcp_catalog == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@@ -262,3 +317,11 @@ jobs:
          gh pr comment "$PR" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs)"
          echo "::error::MCP catalog changes require the mcp-catalog-reviewed label."
          exit 1
+
+  mcp-catalog-review-gate:
+    name: MCP catalog security review
+    needs: changes
+    if: always() && needs.changes.outputs.mcp_catalog != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No MCP catalog changes, skipping MCP catalog security review."
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,27 +1,33 @@
 name: Tests

 on:
-  workflow_call:
-    inputs:
-      slice_count:
-        description: Number of parallel test slices
-        type: number
-        default: 8
+  push:
+    branches: [main]
+    paths-ignore:
+      - "**/*.md"
+      - "docs/**"
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]

 permissions:
  contents: read

-# Cancel in-progress runs for the same ref
+# Cancel in-progress runs for the same PR/branch
 concurrency:
  group: tests-${{ github.ref }}
  cancel-in-progress: true

 jobs:
-  generate:
-    name: "Generate slices"
+  test:
    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.matrix.outputs.matrix }}
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix:
+        slice: [1, 2, 3, 4, 5, 6]
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -30,33 +36,20 @@ jobs:
        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: test_durations.json
+          # main always writes a new suffix, but jobs restore the latest cache with the same prefix.
+          # Quoted from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses :
+          # "If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys.
+          # If there are no exact matches, the action searches for partial matches of the restore keys.
+          # When the action finds a partial match, the most recent cache is restored to the path directory."
          key: test-durations

-      - name: Generate test slices
-        id: matrix
-        run: |
-          MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }})
-          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
-
-  test:
-    name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}
-    needs: generate
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix: ${{ fromJSON(needs.generate.outputs.matrix) }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-
      - name: Install ripgrep (prebuilt binary)
        run: |
          set -euo pipefail
          RG_VERSION=15.1.0
          RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
          RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
-          curl -sSfL --retry 3 --retry-delay 5 -o "$RG_TARBALL" \
+          curl -sSfL -o "$RG_TARBALL" \
            "https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
          echo "${RG_SHA256}  ${RG_TARBALL}" | sha256sum -c -
          tar -xzf "$RG_TARBALL"
@@ -65,7 +58,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
@@ -85,28 +78,40 @@ jobs:
        # fails if the lock is out of sync with pyproject.toml), giving a
        # reproducible env. It also creates .venv itself, so no separate
        # `uv venv` step is needed.
-        uses: ./.github/actions/retry
-        with:
-          command: uv sync --locked --python 3.11 --extra all --extra dev
+        run: uv sync --locked --python 3.11 --extra all --extra dev

      - name: Minimize uv cache
        # Optimized for CI: prunes pre-built wheels that are cheap to
        # re-download, keeping the persisted cache small and fast to restore.
        run: uv cache prune --ci

-      - name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }})
-        # Per-file isolation via scripts/run_tests.sh: each test file runs
-        # in its own freshly-spawned `python -m pytest <file>` subprocess
+      - name: Run tests (slice ${{ matrix.slice }}/6)
+        # Per-file isolation via scripts/run_tests_parallel.py: discovers
+        # every test_*.py file under tests/ (excluding integration/ + e2e/),
+        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
        # with bounded parallelism. No xdist, no shared workers, no
        # module-level state leakage between files.
        #
-        # File list is pre-computed by the generate job (--generate-slices)
-        # which runs LPT distribution once and passes the file list to each
-        # matrix job via --files. Previously each job re-discovered files and
-        # re-ran LPT independently — redundant N times.
+        # Why per-file (not per-test): per-test spawn cost (~250ms × 17k
+        # tests = 70min CPU minimum) blew the wall-clock budget. Per-file
+        # spawn (~250ms × ~850 files = ~3.5min) fits while still giving
+        # every file a fresh interpreter — the only isolation boundary
+        # that matters in practice (cross-file leakage was the original
+        # flake source; intra-file is the test author's responsibility).
+        #
+        # Why drop xdist entirely: xdist's persistent workers accumulate
+        # state across files, which is exactly the leakage we wanted to
+        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
+        # the job with cleaner semantics.
+        #
+        # Matrix slicing (--slice I/N): files are distributed across 6
+        # jobs by cached duration (LPT algorithm) so each job gets
+        # roughly equal wall time. Without a cache, files default to 2s
+        # estimate and get split roughly evenly by count — still correct,
+        # just not perfectly balanced.
        run: |
          source .venv/bin/activate
-          scripts/run_tests.sh --files '${{ matrix.slice.files }}'
+          python scripts/run_tests_parallel.py --slice ${{ matrix.slice }}/6
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@@ -116,7 +121,7 @@ jobs:
      - name: Upload per-slice durations
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: test-durations-slice-${{ matrix.slice.index }}
+          name: test-durations-slice-${{ matrix.slice }}
          path: test_durations.json
          retention-days: 1

@@ -166,7 +171,7 @@ jobs:
          RG_VERSION=15.1.0
          RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
          RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
-          curl -sSfL --retry 3 --retry-delay 5 -o "$RG_TARBALL" \
+          curl -sSfL -o "$RG_TARBALL" \
            "https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
          echo "${RG_SHA256}  ${RG_TARBALL}" | sha256sum -c -
          tar -xzf "$RG_TARBALL"
@@ -175,7 +180,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
@@ -195,9 +200,7 @@ jobs:
        # fails if the lock is out of sync with pyproject.toml), giving a
        # reproducible env. It also creates .venv itself, so no separate
        # `uv venv` step is needed.
-        uses: ./.github/actions/retry
-        with:
-          command: uv sync --locked --python 3.11 --extra all --extra dev
+        run: uv sync --locked --python 3.11 --extra all --extra dev

      - name: Minimize uv cache
        # Optimized for CI: prunes pre-built wheels that are cheap to
--- a/.github/workflows/typecheck.yml
+++ b/.github/workflows/typecheck.yml
@@ -2,11 +2,16 @@
 name: Typecheck

 on:
-  workflow_call:
+  push:
+    branches: [main]
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]

 jobs:
  typecheck:
-    name: Check TypeScript
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -19,13 +24,7 @@ jobs:
        with:
          node-version: 22
          cache: npm
-      # --ignore-scripts: typecheck only needs the TS sources + type defs, not
-      # native builds. Skipping install scripts drops node-pty's node-gyp
-      # header fetch — the transient flake that killed this job pre-`tsc` — and
-      # is faster. retry covers the remaining registry blips.
-      - uses: ./.github/actions/retry
-        with:
-          command: npm ci --ignore-scripts
+      - run: npm ci
      - run: npm run --prefix ${{ matrix.package }} typecheck

  # Production build of the desktop renderer. `typecheck` runs `tsc` only,
@@ -35,7 +34,6 @@ jobs:
  # users build apps/desktop from source on install/update. Run the real
  # `vite build` here so that class of break fails in CI instead.
  desktop-build:
-    name: Build desktop app
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -43,9 +41,5 @@ jobs:
        with:
          node-version: 22
          cache: npm
-      # Keep install scripts here: the production build may need node-pty's
-      # native binary. retry handles the transient install-time fetch flakes.
-      - uses: ./.github/actions/retry
-        with:
-          command: npm ci
+      - run: npm ci
      - run: npm run --prefix apps/desktop build
--- a/.github/workflows/upload_to_pypi.yml
+++ b/.github/workflows/upload_to_pypi.yml
@@ -5,11 +5,11 @@ name: Publish to PyPI
 on:
  push:
    tags:
-      - "v20*" # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
+      - 'v20*'  # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
  workflow_dispatch:
    inputs:
      confirm_tag:
-        description: "Tag to publish (e.g. v2026.5.15). Must already exist."
+        description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
        required: true
        type: string

@@ -27,7 +27,7 @@ jobs:
    name: Build distribution 📦
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false
          # On workflow_dispatch, check out the confirmed tag.
@@ -43,17 +43,17 @@ jobs:
          fi

      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
-          python-version: "3.13"
+          python-version: '3.13'

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e  # v6

      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
        with:
-          node-version: "22"
+          node-version: '22'

      - name: Build web dashboard
        run: cd web && npm ci && npm run build
@@ -81,7 +81,7 @@ jobs:
        run: uv build --sdist --wheel

      - name: Upload distribution artifacts
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -94,17 +94,17 @@ jobs:
      name: pypi
      url: https://pypi.org/p/hermes-agent
    permissions:
-      id-token: write # OIDC trusted publishing
+      id-token: write  # OIDC trusted publishing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b  # v1.14.0
        with:
          skip-existing: true

@@ -116,12 +116,12 @@ jobs:
    needs: publish
    runs-on: ubuntu-latest
    permissions:
-      contents: write # attach assets to the existing release
-      id-token: write # sigstore signing
+      contents: write   # attach assets to the existing release
+      id-token: write   # sigstore signing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -145,7 +145,7 @@ jobs:

      - name: Sign with Sigstore
        if: env.skip_sign != 'true'
-        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc # v3.3.0
+        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc  # v3.3.0
        with:
          inputs: >-
            ./dist/*.tar.gz
--- a/.github/workflows/uv-lockfile-check.yml
+++ b/.github/workflows/uv-lockfile-check.yml
@@ -4,7 +4,7 @@ name: uv.lock check
 # that modify pyproject.toml without regenerating uv.lock (or vice versa)
 # must not merge, because the Docker build's `uv sync --frozen` step will
 # fail on a stale lockfile and we'd rather catch it here than in the
-# docker workflow on main.
+# docker-publish workflow on main.
 #
 # ─────────────────────────────────────────────────────────────────────────
 # IMPORTANT: this check runs against the MERGED state, not just your branch
@@ -44,14 +44,25 @@ name: uv.lock check
 # the same way.  Better to catch it here than after merge.

 on:
-  workflow_call:
+  push:
+    branches: [main]
+    paths:
+      - "pyproject.toml"
+      - "uv.lock"
+      - ".github/workflows/uv-lockfile-check.yml"
+
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).
+  pull_request:
+    branches: [main]

 permissions:
  contents: read

 concurrency:
  group: uv-lockfile-check-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

 jobs:
  check:
@@ -63,7 +74,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5

      # `uv lock --check` re-resolves the project from pyproject.toml and
      # compares the result to uv.lock, exiting non-zero if they disagree.
@@ -100,7 +111,7 @@ jobs:

          This check is blocking because the Docker image build uses
          `uv sync --frozen --extra all`, which rejects stale lockfiles
-          — catching it here avoids a ~15 min failed docker run
+          — catching it here avoids a ~15 min failed docker-publish run
          on `main` post-merge.
          EOF
            echo "::error title=uv.lock out of sync::Run \`uv lock\` locally and commit the result. If on a PR, sync with main first."
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -123,17 +123,6 @@ conservative at the waist.
  without E2E proof, and plugins that touch core files.** Plugins live in their
  own directory and work within the ABCs/hooks we provide; if a plugin needs
  more, widen the generic plugin surface, don't special-case it in core.
- **Third-party products / other people's projects integrated into the core
-  tree.** Observability backends, vendor SaaS integrations, analytics dashboards,
-  and similar "someone else's product" plugins do NOT land under `plugins/` in
-  this repo. They place an ongoing maintenance burden on us to keep them working
-  against a fast-moving core, for a backend we don't own. Ship them as a
-  **standalone plugin repo** users install into `~/.hermes/plugins/` (or via a
-  pip entry point), and promote them in the Nous Research Discord
-  (`#plugins-skills-and-skins`). This is a coupling-and-maintenance decision, not
-  a quality bar — the plugin can be excellent and still be a close. PRs that add
-  such a directory to the tree are closed with a pointer to publish it as its own
-  repo.

 ### Before you call it a bug — verify the premise (and when NOT to close)

@@ -794,24 +783,6 @@ landing in this tree. PRs that add a new directory under
 provider as its own repo. Existing in-tree providers stay; bug fixes
 to them are welcome.

-**No new third-party-product plugins in-tree (policy, June 2026):** the
-same rule applies beyond memory providers. Plugins that integrate
-someone else's product or project — observability/metrics backends,
-vendor SaaS connectors, analytics dashboards, paid-service tie-ins —
-must ship as **standalone plugin repos** that users install into
-`~/.hermes/plugins/` (or via pip entry points). They register through
-the existing plugin discovery path and use the ABCs/hooks/ctx surface
-we expose; nothing special is needed in core. The reason is
-maintenance load: every product we absorb into the tree becomes our
-burden to keep working against a fast-moving core, for a backend we
-don't own. Promote standalone plugins in the Nous Research Discord
-(`#plugins-skills-and-skins`). PRs that add such a directory under
-`plugins/` are closed with a pointer to publish it as its own repo —
-this is a coupling decision, not a quality judgment. (The
-`observability/`, `kanban/`, `disk-cleanup/`, etc. directories already
-in the tree are existing precedent, not an invitation to add more
-third-party-product plugins alongside them.)
-
 ### Model-provider plugins (`plugins/model-providers/<name>/`)

 Every inference backend (openrouter, anthropic, gmi, deepseek, nvidia, …)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -85,23 +85,6 @@ This isn't a quality bar — it's a coupling-and-maintenance decision. Memory pr

 ---

-## Third-Party Product Integrations: Ship as a Standalone Plugin
-
-The same rule extends to **any plugin that integrates someone else's product or project** — observability/metrics backends, vendor SaaS connectors, analytics dashboards, paid-service tie-ins, and similar third-party integrations. **These do not land in this repo.**
-
-The reason is maintenance load, not quality. Every external product absorbed into the core tree becomes ours to keep working against a fast-moving codebase, for a backend we don't own and can't control. Hermes ships a lot and the core moves quickly; coupling third-party products into it creates an open-ended burden on the maintainers.
-
-Publish these as a **standalone plugin repo** instead:
-
- Implement the relevant ABC and use the existing plugin discovery path (`~/.hermes/plugins/`, project `.hermes/plugins/`, or a pip entry point) — see [Build a Hermes Plugin](https://hermes-agent.nousresearch.com/docs/guides/build-a-hermes-plugin)
- Register lifecycle hooks (`pre_tool_call`, `post_tool_call`, `pre_llm_call`, `post_llm_call`, `on_session_start`, `on_session_end`), tools (`ctx.register_tool`), and CLI subcommands (`ctx.register_cli_command`) through the surface we already expose — no core changes needed
- If your plugin needs a capability the framework doesn't expose, that's a feature request to **widen the generic plugin surface** (a new hook or `ctx` method) — never special-case your plugin in core
- Promote it in the [Nous Research Discord](https://discord.gg/NousResearch) `#plugins-skills-and-skins` channel so users can find and install it
-
-A well-built third-party-product plugin can clear automated review and still be closed for this reason — it's a placement decision, not a verdict on the code. PRs that add such a directory under `plugins/` will be closed with a pointer to publish it as its own repo.
-
---
-
 ## Development Setup

 ### Prerequisites
--- a/Dockerfile
+++ b/Dockerfile
@@ -189,13 +189,7 @@ RUN cd web && npm run build && \

 # ---------- Source code ----------
 # .dockerignore excludes node_modules, so the installs above survive.
-# --link decouples this layer from parents for cache purposes; --chmod bakes
-# the final read-only permissions at copy time so we skip the separate
-# `chmod -R` pass that previously walked ~30k files across the venv +
-# node_modules + source (21s amd64 / 222s arm64 — #49113).  `a+rX,go-w`
-# gives the non-root hermes user read + traverse but no write; root retains
-# write so the build steps below don't need chmod u+w dances.
-COPY --link --chmod=a+rX,go-w . .
+COPY . .

 # ---------- Permissions ----------
 # Link hermes-agent itself (editable). Deps are already installed in the
@@ -203,15 +197,19 @@ COPY --link --chmod=a+rX,go-w . .
 # resolution or downloads.
 RUN uv pip install --no-cache-dir --no-deps -e "."

-# Wire the exec shim and install-method stamp.  Files under /opt/hermes are
-# already root-owned (COPY, uv sync, npm install all run as root) and
-# read-only for the hermes user (go-w from the --chmod above).
-
+# Keep /opt/hermes immutable for the runtime hermes user. Hosted/container
+# instances must not be able to self-edit the installed source or venv; user
+# data, skills, plugins, config, logs, and dashboard uploads live under
+# /opt/data instead. Root can still repair the image during build/boot, but
+# supervised Hermes processes drop to the non-root hermes user.
 USER root
 RUN mkdir -p /opt/hermes/bin && \
    cp /opt/hermes/docker/hermes-exec-shim.sh /opt/hermes/bin/hermes && \
    chmod 0755 /opt/hermes/bin/hermes && \
-    printf 'docker\n' > /opt/hermes/.install_method
+    printf 'docker\n' > /opt/hermes/.install_method && \
+    chown -R root:root /opt/hermes && \
+    chmod -R a+rX /opt/hermes && \
+    chmod -R a-w /opt/hermes
 # The ``.install_method`` stamp is baked next to the running code (the install
 # tree), NOT into $HERMES_HOME. $HERMES_HOME (/opt/data) is a shared data
 # volume that is commonly bind-mounted from the host and even shared with a
@@ -238,11 +236,13 @@ RUN mkdir -p /opt/hermes/bin && \
 #
 # The arg is optional — local `docker build` without --build-arg simply
 # omits the file, and the runtime falls back to live-git lookup.  CI
-# (.github/workflows/docker.yml) passes ${{ github.sha }} so
+# (.github/workflows/docker-publish.yml) passes ${{ github.sha }} so
 # every published image has it.
 ARG HERMES_GIT_SHA=
 RUN if [ -n "${HERMES_GIT_SHA}" ]; then \
-        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha; \
+        chmod u+w /opt/hermes && \
+        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha && \
+        chmod a-w /opt/hermes /opt/hermes/.hermes_build_sha; \
    fi

 # ---------- s6-overlay service wiring ----------
@@ -290,19 +290,6 @@ ENV HERMES_TUI_DIR=/opt/hermes/ui-tui
 ENV HERMES_HOME=/opt/data
 ENV HERMES_WRITE_SAFE_ROOT=/opt/data
 ENV HERMES_DISABLE_LAZY_INSTALLS=1
-# The published image seals /opt/hermes (root-owned, read-only) so a runtime
-# lazy install can't mutate the agent's own venv and brick it. But opt-in
-# backends (Firecrawl web search, Exa, Feishu, …) keep their SDKs in
-# tools/lazy_deps.py — deliberately NOT baked into [all] (see pyproject.toml
-# policy 2026-05-12: one quarantined release must not break every install).
-# Redirect those lazy installs to a writable dir on the durable data volume.
-# lazy_deps appends this dir to the END of sys.path, so a package installed
-# here can only ADD modules — it can never shadow or downgrade a core module,
-# so the sealed-venv guarantee holds even with installs re-enabled. The dir
-# is seeded + chowned to the hermes user by docker/stage2-hook.sh and lives
-# on the /opt/data volume, so it persists across container recreates / image
-# updates (an ABI stamp invalidates it if a rebuild bumps the interpreter).
-ENV HERMES_LAZY_INSTALL_TARGET=/opt/data/lazy-packages

 # `docker exec` privilege-drop shim. When operators run
 # `docker exec <c> hermes ...` they default to root, and any file the
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@

 **The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.

-Use any model you want — [Nous Portal](https://portal.nousresearch.com), OpenRouter, OpenAI, your own endpoint, and [many others](https://hermes-agent.nousresearch.com/docs/integrations/providers). Switch with `hermes model` — no code changes, no lock-in.
+Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [NovitaAI](https://novita.ai) (AI-native cloud for Model API, Agent Sandbox, and GPU Cloud), [NVIDIA NIM](https://build.nvidia.com) (Nemotron), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.

 <table>
 <tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
--- a/acp_adapter/entry.py
+++ b/acp_adapter/entry.py
@@ -23,11 +23,6 @@ except ModuleNotFoundError:
    # new code but ``uv pip install -e .`` didn't finish.  Missing bootstrap
    # means UTF-8 stdio setup is skipped on Windows; POSIX is unaffected.
    pass
-else:
-    # Stop a ``utils/``/``proxy/``/``ui/`` package in the launch directory from
-    # shadowing Hermes's own modules — ``hermes acp`` can be started from any
-    # cwd, including a project that has same-named packages on its path.
-    hermes_bootstrap.harden_import_path()

 import argparse
 import asyncio
--- a/acp_adapter/tools.py
+++ b/acp_adapter/tools.py
@@ -74,7 +74,7 @@ _POLISHED_TOOLS = {
    "kanban_create", "kanban_show", "kanban_comment", "kanban_complete",
    "kanban_block", "kanban_link", "kanban_heartbeat",
    "yb_query_group_info", "yb_query_group_members", "yb_search_sticker",
-    "yb_send_dm", "yb_send_sticker",
+    "yb_send_dm", "yb_send_sticker", "mixture_of_agents",
 }


--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -106,12 +106,7 @@ def _custom_provider_extra_body_for_agent(
    base_url: str,
    custom_providers: List[Dict[str, Any]],
 ) -> Optional[Dict[str, Any]]:
-    provider_norm = (provider or "").strip().lower()
-    if provider_norm == "custom":
-        provider_key_filter = ""
-    elif provider_norm.startswith("custom:"):
-        provider_key_filter = provider_norm.split(":", 1)[1].strip()
-    else:
+    if (provider or "").strip().lower() != "custom":
        return None

    target_url = _normalized_custom_base_url(base_url)
@@ -122,13 +117,6 @@ def _custom_provider_extra_body_for_agent(
    for entry in custom_providers or []:
        if not isinstance(entry, dict):
            continue
-        if provider_key_filter:
-            entry_keys = {
-                str(entry.get("provider_key", "") or "").strip().lower(),
-                str(entry.get("name", "") or "").strip().lower(),
-            }
-            if provider_key_filter not in entry_keys:
-                continue
        if _normalized_custom_base_url(entry.get("base_url")) != target_url:
            continue
        extra_body = entry.get("extra_body")
@@ -719,55 +707,6 @@ def init_agent(
                    print("🔑 Using credentials: Microsoft Entra ID")
                elif isinstance(effective_key, str) and len(effective_key) > 12:
                    print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}")
-    elif agent.provider == "moa":
-        from agent.moa_loop import MoAClient
-        agent.api_mode = "chat_completions"
-
-        # Route reference-model outputs to the agent's tool_progress_callback so
-        # every surface that already consumes it (CLI spinner/scrollback, TUI,
-        # desktop, gateway) can show each reference's answer as a labelled block
-        # before the aggregator acts. The facade emits "moa.reference" and
-        # "moa.aggregating" events; we forward them through the same callback
-        # the tool lifecycle uses. Best-effort and cache-safe — these are
-        # display-only events, they never touch the message history.
-        def _moa_reference_relay(event: str, **kwargs: Any) -> None:
-            cb = getattr(agent, "tool_progress_callback", None)
-            if cb is None:
-                return
-            try:
-                if event == "moa.reference":
-                    label = str(kwargs.get("label") or "")
-                    text = str(kwargs.get("text") or "")
-                    idx = kwargs.get("index")
-                    count = kwargs.get("count")
-                    cb(
-                        "moa.reference",
-                        label,
-                        text,
-                        None,
-                        moa_index=idx,
-                        moa_count=count,
-                    )
-                elif event == "moa.aggregating":
-                    cb(
-                        "moa.aggregating",
-                        str(kwargs.get("aggregator") or ""),
-                        None,
-                        None,
-                        moa_ref_count=kwargs.get("ref_count"),
-                    )
-            except Exception:
-                pass
-
-        agent.client = MoAClient(
-            agent.model or "default",
-            reference_callback=_moa_reference_relay,
-        )
-        agent._client_kwargs = {}
-        agent.api_key = api_key or "moa-virtual-provider"
-        agent.base_url = "moa://local"
-        if not agent.quiet_mode:
-            print(f"🤖 AI Agent initialized with MoA preset: {agent.model}")
    elif agent.api_mode == "bedrock_converse":
        # AWS Bedrock — uses boto3 directly, no OpenAI client needed.
        # Region is extracted from the base_url or defaults to us-east-1.
@@ -1307,12 +1246,6 @@ def init_agent(
        _agent_section = {}
    agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")

-    # Intent-ack continuation config: "auto" (default — codex_responses only,
-    # the historical gate), true (all api_modes), false (never), or a list of
-    # model-name substrings.  Resolved against the active api_mode/model in the
-    # conversation loop's intent-ack block.
-    agent._intent_ack_continuation = _agent_section.get("intent_ack_continuation", "auto")
-
    # Universal task-completion guidance toggle.  Default True.  Surfaced
    # as a separate flag from tool_use_enforcement because the guidance
    # applies to ALL models, not just the model families enforcement
@@ -1573,7 +1506,6 @@ def init_agent(
    # 3. Check general plugin system (user-installed plugins)
    # 4. Fall back to built-in ContextCompressor
    _selected_engine = None
-    _copy_failed = False
    _engine_name = "compressor"  # default
    try:
        _ctx_cfg = _agent_cfg.get("context", {}) if isinstance(_agent_cfg, dict) else {}
@@ -1591,35 +1523,15 @@ def init_agent(

        # Try general plugin system as fallback
        if _selected_engine is None:
-            _candidate = None
            try:
                from hermes_cli.plugins import get_plugin_context_engine
                _candidate = get_plugin_context_engine()
+                if _candidate and _candidate.name == _engine_name:
+                    _selected_engine = _candidate
            except Exception:
-                _candidate = None
-            if _candidate is not None and _candidate.name == _engine_name:
-                # Deep-copy the shared plugin singleton so a child agent's
-                # update_model() can't mutate the parent's compressor (#42449).
-                # Copy can fail for engines holding uncopyable state (locks, DB
-                # connections, clients); in that case fall back to the built-in
-                # compressor with an ACCURATE message rather than silently
-                # mislabelling it "not found".
-                import copy
-                try:
-                    _selected_engine = copy.deepcopy(_candidate)
-                except Exception as _copy_err:
-                    _copy_failed = True
-                    _ra().logger.warning(
-                        "Context engine '%s' could not be safely copied for this "
-                        "agent (%s) — falling back to built-in compressor. Plugin "
-                        "engines that hold uncopyable state (locks, DB connections) "
-                        "should implement __deepcopy__ to copy only mutable budget "
-                        "state.",
-                        _engine_name, _copy_err,
-                    )
-                    _selected_engine = None
+                pass

-        if _selected_engine is None and not _copy_failed:
+        if _selected_engine is None:
            _ra().logger.warning(
                "Context engine '%s' not found — falling back to built-in compressor",
                _engine_name,
@@ -1676,10 +1588,8 @@ def init_agent(
            f"Model {agent.model} has a context window of {_ctx:,} tokens, "
            f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
            f"by Hermes Agent.  Choose a model with at least "
-            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context.  If your server "
-            f"reports a window smaller than the model's true window, set "
-            f"model.context_length in config.yaml to the real value "
-            f"(this must be at least {MINIMUM_CONTEXT_LENGTH // 1000}K)."
+            f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
+            f"model.context_length in config.yaml to override."
        )

    # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand).
@@ -1711,27 +1621,16 @@ def init_agent(
            for t in agent.tools
            if isinstance(t, dict)
        }
-        from agent.memory_manager import normalize_tool_schema as _normalize_tool_schema
-        for _raw_schema in agent.context_compressor.get_tool_schemas():
-            _schema = _normalize_tool_schema(_raw_schema)
-            if _schema is None:
-                # A schema with no resolvable name (e.g. an already-wrapped
-                # entry) would append a nameless tool that strict providers
-                # 400 on, disabling the whole toolset (#47707). Skip it.
-                _ra().logger.warning(
-                    "Context engine returned a tool schema with no resolvable "
-                    "name; skipping to avoid poisoning the request (%r)",
-                    _raw_schema,
-                )
-                continue
-            _tname = _schema["name"]
-            if _tname in _existing_tool_names:
+        for _schema in agent.context_compressor.get_tool_schemas():
+            _tname = _schema.get("name", "")
+            if _tname and _tname in _existing_tool_names:
                continue  # already registered via plugin/cache path
            _wrapped = {"type": "function", "function": _schema}
            agent.tools.append(_wrapped)
-            agent.valid_tool_names.add(_tname)
-            agent._context_engine_tool_names.add(_tname)
-            _existing_tool_names.add(_tname)
+            if _tname:
+                agent.valid_tool_names.add(_tname)
+                agent._context_engine_tool_names.add(_tname)
+                _existing_tool_names.add(_tname)

    # Notify context engine of session start
    if hasattr(agent, "context_compressor") and agent.context_compressor:
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@@ -42,14 +42,6 @@ from utils import base_url_host_matches, base_url_hostname, env_var_enabled, ato
 logger = logging.getLogger(__name__)


-# Max consecutive successful credential-pool token refreshes of the SAME entry
-# on a persistent auth failure before we give up and let the fallback chain
-# activate. A single-entry OAuth pool can re-mint a fresh token indefinitely
-# even when the upstream keeps rejecting it, so without this cap the retry loop
-# spins forever and never reaches ``_try_activate_fallback``. See #26080.
-_MAX_AUTH_REFRESH_ATTEMPTS = 2
-
-
 def _ra():
    """Lazy ``run_agent`` reference for test-patch routing."""
    import run_agent
@@ -783,30 +775,6 @@ def recover_with_credential_pool(
            return False, has_retried_429
        refreshed = pool.try_refresh_current()
        if refreshed is not None:
-            # ``try_refresh_current()`` re-mints a fresh OAuth token and reports
-            # success even when the upstream keeps rejecting it — a single-entry
-            # pool (common for OAuth/Max subscribers) has nothing to rotate to,
-            # so a bare "refreshed → retry" loop spins forever on the same dead
-            # token and the configured fallback never activates. Cap consecutive
-            # same-entry refreshes and fall through to fallback once exceeded.
-            # See #26080.
-            refreshed_id = getattr(refreshed, "id", None)
-            if refreshed_id is not None:
-                refresh_counts = getattr(agent, "_auth_pool_refresh_counts", None)
-                if refresh_counts is None:
-                    refresh_counts = {}
-                    agent._auth_pool_refresh_counts = refresh_counts
-                refresh_key = (agent.provider, refreshed_id)
-                refresh_counts[refresh_key] = refresh_counts.get(refresh_key, 0) + 1
-                if refresh_counts[refresh_key] > _MAX_AUTH_REFRESH_ATTEMPTS:
-                    _ra().logger.warning(
-                        "Credential auth failure persists after %s refreshes for "
-                        "pool entry %s — treating as unrecoverable and allowing "
-                        "fallback to activate.",
-                        refresh_counts[refresh_key] - 1,
-                        refreshed_id,
-                    )
-                    return False, has_retried_429
            _ra().logger.info(f"Credential auth failure — refreshed pool entry {getattr(refreshed, 'id', '?')}")
            agent._swap_credential(refreshed)
            return True, has_retried_429
@@ -1078,34 +1046,6 @@ def restore_primary_runtime(agent) -> bool:
            api_mode=rt.get("compressor_api_mode", ""),
        )

-        # ── Re-select from the credential pool if one is available ──
-        # The snapshot's api_key was captured at construction time.  Across
-        # turns the pool may have rotated (token revocation, billing/rate-limit
-        # exhaustion, cooldown), leaving the snapshot key stale.  Restoring it
-        # blindly re-fails on the first request and burns through the remaining
-        # pool entries before cross-provider fallback even gets a chance.  Ask
-        # the pool for its current best entry and swap the live credential in.
-        # When the pool is absent, empty, or the entry has no usable key, we
-        # keep the snapshot key (the existing behavior).  Fixes #25205.
-        pool = getattr(agent, "_credential_pool", None)
-        if pool is not None and pool.has_available():
-            entry = pool.select()
-            if entry is not None:
-                entry_key = (
-                    getattr(entry, "runtime_api_key", None)
-                    or getattr(entry, "access_token", "")
-                )
-                if entry_key:
-                    # ``_swap_credential`` rebuilds the OpenAI/Anthropic client,
-                    # reapplies base-url-scoped headers, and carries the
-                    # accumulated base_url / OAuth-detection fixes (#33163).
-                    agent._swap_credential(entry)
-                    logger.info(
-                        "Restore re-selected pool entry %s (%s)",
-                        getattr(entry, "id", "?"),
-                        getattr(entry, "label", "?"),
-                    )
-
        # ── Reset fallback chain for the new turn ──
        agent._fallback_activated = False
        agent._fallback_index = 0
@@ -1480,15 +1420,6 @@ def create_openai_client(agent, client_kwargs: dict, *, reason: str, shared: boo
        keepalive_http = agent._build_keepalive_http_client(client_kwargs.get("base_url", ""))
        if keepalive_http is not None:
            client_kwargs["http_client"] = keepalive_http
-    # Delegate all rate-limit / 5xx retry to hermes's outer conversation loop,
-    # which honors Retry-After and applies adaptive/jittered backoff. The OpenAI
-    # SDK default (max_retries=2) uses its own 1-2s backoff that ignores
-    # Retry-After and double-retries inside our loop — the same deadlock the
-    # Anthropic clients hit (#26293). This is the single chokepoint every primary
-    # OpenAI/aggregator client passes through (init, switch_model, recovery,
-    # restore, request-scoped); auxiliary_client builds its own clients and keeps
-    # SDK retries because it is NOT wrapped by the conversation loop.
-    client_kwargs.setdefault("max_retries", 0)
    # Uses the module-level `OpenAI` name, resolved lazily on first
    # access via __getattr__ below. Tests patch via `run_agent.OpenAI`.
    client = _ra().OpenAI(**client_kwargs)
@@ -1568,10 +1499,6 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
    # _client_kwargs is a dict — snapshot a shallow copy so mutating the
    # live dict doesn't poison the rollback target.
    _snapshot["_client_kwargs"] = dict(getattr(agent, "_client_kwargs", {}) or {})
-    # Snapshot the credential pool reference so a failed client rebuild can
-    # restore the original pool (issue #52727: pool reload is part of this
-    # switch and must be reversible on rollback).
-    _snapshot["_credential_pool"] = getattr(agent, "_credential_pool", _MISSING)

    try:
        # Clear the per-config context_length override so the new model's
@@ -1596,36 +1523,8 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
        if api_key:
            agent.api_key = api_key

-        # ── Reload credential pool for the new provider (issue #52727) ──
-        # Without this, ``recover_with_credential_pool`` sees a
-        # ``pool.provider != agent.provider`` mismatch and short-circuits,
-        # leaving the new provider with no rotation/recovery on 401/429 and
-        # burning the original pool's entries. Only reload when the provider
-        # actually changed (or the pool was missing) — re-selecting the same
-        # provider must not churn the pool reference. A reload failure is
-        # logged + swallowed: the switch itself must still complete.
-        old_norm = (old_provider or "").strip().lower()
-        new_norm = (new_provider or "").strip().lower()
-        if old_norm != new_norm or getattr(agent, "_credential_pool", None) is None:
-            try:
-                from agent.credential_pool import load_pool
-                agent._credential_pool = load_pool(new_provider)
-            except Exception as _pool_exc:  # noqa: BLE001
-                logger.warning(
-                    "switch_model: credential pool reload failed for %s (%s); "
-                    "continuing without pool rotation this turn",
-                    new_provider, _pool_exc,
-                )
-
        # ── Build new client ──
-        if (new_provider or "").strip().lower() == "moa":
-            from agent.moa_loop import MoAClient
-
-            agent.api_key = api_key or "moa-virtual-provider"
-            agent.base_url = "moa://local"
-            agent._client_kwargs = {}
-            agent.client = MoAClient(agent.model or "default")
-        elif api_mode == "anthropic_messages":
+        if api_mode == "anthropic_messages":
            from agent.anthropic_adapter import (
                build_anthropic_client,
                resolve_anthropic_token,
@@ -1798,27 +1697,6 @@ def switch_model(agent, new_model, new_provider, api_key='', base_url='', api_mo
        old_model, old_provider, new_model, new_provider,
    )

-    # ── Persist billing route to session DB ──
-    # The agent's _session_db / session_id may not be set in all contexts
-    # (tests, bare agents without a session DB, etc.).  This ensures the
-    # dashboard Model cards show the actual provider after a mid-session
-    # /model switch instead of the stale session-creation provider.
-    # See #48248 for the full bug description.
-    _session_db = getattr(agent, "_session_db", None)
-    _session_id = getattr(agent, "session_id", None)
-    if _session_db is not None and _session_id:
-        try:
-            _session_db.update_session_billing_route(
-                _session_id,
-                provider=agent.provider,
-                base_url=agent.base_url,
-                billing_mode=getattr(agent, "api_mode", None),
-            )
-        except Exception:
-            logger.warning(
-                "Failed to persist billing route after model switch",
-                exc_info=True,
-            )


 def invoke_tool(agent, function_name: str, function_args: dict, effective_task_id: str,
@@ -2205,21 +2083,8 @@ def looks_like_codex_intermediate_ack(
    user_message: str,
    assistant_content: str,
    messages: List[Dict[str, Any]],
-    require_workspace: bool = True,
 ) -> bool:
-    """Detect a planning/ack message that should continue instead of ending the turn.
-
-    ``require_workspace`` (default True) keeps the original codex-coding scope:
-    the ack must reference a filesystem/repo workspace. The conversation loop
-    passes ``require_workspace=False`` when the user has explicitly opted into
-    intent-ack continuation for all api_modes (``agent.intent_ack_continuation``
-    is ``true`` or a model-list), so general autonomous workflows ("I'll run a
-    health check on the server", "I'll start the deployment") — which carry a
-    future-ack and an action verb but no filesystem reference — are caught too.
-    The future-ack + short-content + no-prior-tools + action-verb requirements
-    always apply, which is what keeps conversational "I'll help you brainstorm"
-    replies from tripping it.
-    """
+    """Detect a planning/ack message that should continue instead of ending the turn."""
    if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
        return False

@@ -2272,67 +2137,17 @@ def looks_like_codex_intermediate_ack(
        "path",
    )

-    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
-    if not assistant_mentions_action:
-        return False
-
-    # Opted-in (all-api_mode) path: a future-ack + action verb + no prior tool
-    # call is enough — the user asked us to keep going when the model only
-    # announces intent, regardless of whether a filesystem is involved.
-    if not require_workspace:
-        return True
-
    user_text = (user_message or "").strip().lower()
    user_targets_workspace = (
        any(marker in user_text for marker in workspace_markers)
        or "~/" in user_text
        or "/" in user_text
    )
+    assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
    assistant_targets_workspace = any(
        marker in assistant_text for marker in workspace_markers
    )
-    return user_targets_workspace or assistant_targets_workspace
-
-
-def intent_ack_continuation_mode(agent) -> str:
-    """Classify the resolved intent-ack continuation mode for this turn.
-
-    Returns one of:
-      * ``"off"``        — never continue.
-      * ``"codex_only"`` — historical scope: continue only on the
-        ``codex_responses`` api_mode, and only for codebase/workspace acks
-        (``require_workspace=True``).
-      * ``"all"``        — user opted in for every api_mode; continue on any
-        future-ack + action verb (``require_workspace=False``).
-
-    Mirrors the four-mode shape of ``agent.tool_use_enforcement``: ``"auto"``
-    (default) → codex_only; ``True``/"true"/"always"/"yes"/"on" → all;
-    ``False``/"false"/"never"/"no"/"off" → off; ``list`` → all when a substring
-    matches the active model name, else off.
-    """
-    mode = getattr(agent, "_intent_ack_continuation", "auto")
-
-    if mode is True or (isinstance(mode, str) and mode.lower() in {"true", "always", "yes", "on"}):
-        return "all"
-    if mode is False or (isinstance(mode, str) and mode.lower() in {"false", "never", "no", "off"}):
-        return "off"
-    if isinstance(mode, list):
-        model_lower = (agent.model or "").lower()
-        return "all" if any(p.lower() in model_lower for p in mode if isinstance(p, str)) else "off"
-    # "auto" or any unrecognised value — historical codex-only behavior.
-    return "codex_only" if agent.api_mode == "codex_responses" else "off"
-
-
-def intent_ack_continuation_enabled(agent) -> bool:
-    """Whether intent-ack continuation should fire at all for this turn.
-
-    The ``codex_ack_continuations < 2`` per-turn cap and the
-    ``looks_like_codex_intermediate_ack`` detector are applied by the caller;
-    this only decides the on/off gate. Callers that also need to know whether
-    the workspace requirement applies should use ``intent_ack_continuation_mode``
-    directly (``"codex_only"`` ⇒ require_workspace=True, ``"all"`` ⇒ False).
-    """
-    return intent_ack_continuation_mode(agent) != "off"
+    return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action



--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -673,9 +673,6 @@ def _build_anthropic_client_with_bearer_hook(
    kwargs = {
        "timeout": timeout_obj,
        "http_client": http_client,
-        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
-        # default max_retries=2 ignores it and double-retries. (#26293)
-        "max_retries": 0,
        # The SDK requires *something* for api_key/auth_token. Our
        # event hook overrides Authorization per request so this value
        # is never sent. The sentinel string makes accidental leaks
@@ -760,12 +757,6 @@ def build_anthropic_client(
    _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
    kwargs = {
        "timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
-        # Delegate all rate-limit / 5xx retry to hermes's outer conversation
-        # loop, which honors Retry-After. The SDK default (max_retries=2) uses
-        # its own 1-2s backoff that ignores Retry-After and double-retries
-        # inside our loop — burning request slots against a bucket that won't
-        # refill for minutes. (#26293)
-        "max_retries": 0,
    }
    if normalized_base_url:
        # Azure Anthropic endpoints require an ``api-version`` query parameter.
@@ -861,9 +852,6 @@ def build_anthropic_bedrock_client(region: str):
    return _anthropic_sdk.AnthropicBedrock(
        aws_region=region,
        timeout=Timeout(timeout=900.0, connect=10.0),
-        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
-        # default max_retries=2 ignores it and double-retries. (#26293)
-        max_retries=0,
        default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
    )

@@ -926,72 +914,44 @@ def _read_claude_code_credentials_from_keychain() -> Optional[Dict[str, Any]]:
    return None


-def _read_claude_code_credentials_from_file() -> Optional[Dict[str, Any]]:
-    """Read Claude Code OAuth credentials from ~/.claude/.credentials.json.
-
-    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
-    """
-    cred_path = Path.home() / ".claude" / ".credentials.json"
-    if not cred_path.exists():
-        return None
-    try:
-        data = json.loads(cred_path.read_text(encoding="utf-8"))
-    except (json.JSONDecodeError, OSError, IOError) as e:
-        logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)
-        return None
-
-    oauth_data = data.get("claudeAiOauth")
-    if not (oauth_data and isinstance(oauth_data, dict)):
-        return None
-    access_token = oauth_data.get("accessToken", "")
-    if not access_token:
-        return None
-    return {
-        "accessToken": access_token,
-        "refreshToken": oauth_data.get("refreshToken", ""),
-        "expiresAt": oauth_data.get("expiresAt", 0),
-        "source": "claude_code_credentials_file",
-    }
-
-
 def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    """Read refreshable Claude Code OAuth credentials.

-    Reads from two possible sources and reconciles them:
+    Checks two sources in order:
      1. macOS Keychain (Darwin only) — "Claude Code-credentials" entry
      2. ~/.claude/.credentials.json file

-    Selection rules when both are present:
-      - If exactly one is non-expired, prefer that one. (Handles the case
-        where Claude Code refreshes one source but not the other — observed
-        in the wild on Claude Code 2.1.x.)
-      - Otherwise, prefer the source with the later ``expiresAt`` so that
-        any subsequent refresh uses the most recent ``refreshToken``.
-
    This intentionally excludes ~/.claude.json primaryApiKey. Opencode's
    subscription flow is OAuth/setup-token based with refreshable credentials,
    and native direct Anthropic provider usage should follow that path rather
    than auto-detecting Claude's first-party managed key.

-    Returns dict with {accessToken, refreshToken?, expiresAt?, source} or None.
+    Returns dict with {accessToken, refreshToken?, expiresAt?, source?} or None.
    """
+    # Try macOS Keychain first (covers Claude Code >=2.1.114)
    kc_creds = _read_claude_code_credentials_from_keychain()
-    file_creds = _read_claude_code_credentials_from_file()
+    if kc_creds:
+        return kc_creds

-    if kc_creds and file_creds:
-        kc_valid = is_claude_code_token_valid(kc_creds)
-        file_valid = is_claude_code_token_valid(file_creds)
-        if kc_valid and not file_valid:
-            return kc_creds
-        if file_valid and not kc_valid:
-            return file_creds
-        # Both valid or both expired: prefer the later expiresAt so the
-        # downstream refresh path uses the freshest refresh_token.
-        kc_exp = kc_creds.get("expiresAt", 0) or 0
-        file_exp = file_creds.get("expiresAt", 0) or 0
-        return kc_creds if kc_exp >= file_exp else file_creds
+    # Fall back to JSON file
+    cred_path = Path.home() / ".claude" / ".credentials.json"
+    if cred_path.exists():
+        try:
+            data = json.loads(cred_path.read_text(encoding="utf-8"))
+            oauth_data = data.get("claudeAiOauth")
+            if oauth_data and isinstance(oauth_data, dict):
+                access_token = oauth_data.get("accessToken", "")
+                if access_token:
+                    return {
+                        "accessToken": access_token,
+                        "refreshToken": oauth_data.get("refreshToken", ""),
+                        "expiresAt": oauth_data.get("expiresAt", 0),
+                        "source": "claude_code_credentials_file",
+                    }
+        except (json.JSONDecodeError, OSError, IOError) as e:
+            logger.debug("Failed to read ~/.claude/.credentials.json: %s", e)

-    return kc_creds or file_creds
+    return None


 def is_claude_code_token_valid(creds: Dict[str, Any]) -> bool:
@@ -1074,40 +1034,8 @@ def refresh_anthropic_oauth_pure(refresh_token: str, *, use_json: bool = False)


 def _refresh_oauth_token(creds: Dict[str, Any]) -> Optional[str]:
-    """Attempt to refresh an expired Claude Code OAuth token.
-
-    Claude Code's OAuth refresh tokens are single-use: a successful refresh
-    rotates the pair and invalidates the old refresh token. Claude Code itself
-    also refreshes on its own schedule (IDE/CLI activity), so by the time
-    Hermes notices an expired token, Claude Code may have already rotated it.
-    POSTing our now-stale refresh token in that window races Claude Code and
-    fails with ``invalid_grant``.
-
-    So before refreshing, re-read the live credential sources. If Claude Code
-    has already produced a valid token, adopt it and skip the POST entirely.
-    Only fall back to refreshing ourselves when no fresh credential is found.
-    """
-    # Claude Code may have already refreshed — adopt its token rather than
-    # racing it with our (possibly already-rotated) refresh token. Only adopt
-    # when the live re-read produced a DIFFERENT token with a real future
-    # expiry: re-adopting the same credential we were just handed would be a
-    # no-op, and a 0/absent ``expiresAt`` means "managed key / unknown expiry"
-    # (see is_claude_code_token_valid) which must NOT be treated as a fresh
-    # refresh here.
-    current = read_claude_code_credentials()
-    if current:
-        current_token = current.get("accessToken", "")
-        current_exp = current.get("expiresAt", 0) or 0
-        if (
-            current_token
-            and current_token != creds.get("accessToken", "")
-            and current_exp > 0
-            and is_claude_code_token_valid(current)
-        ):
-            logger.debug("Adopted Claude Code's already-refreshed OAuth token")
-            return current_token
-
-    refresh_token = (current or {}).get("refreshToken", "") or creds.get("refreshToken", "")
+    """Attempt to refresh an expired Claude Code OAuth token."""
+    refresh_token = creds.get("refreshToken", "")
    if not refresh_token:
        logger.debug("No refresh token available — cannot refresh")
        return None
@@ -1369,15 +1297,7 @@ def run_oauth_setup_token() -> Optional[str]:
 # Stores credentials in ~/.hermes/.anthropic_oauth.json (our own file).

 _OAUTH_CLIENT_ID = "9d1c250a-e61b-44d9-88ed-5944d1962f5e"
-# Anthropic migrated the OAuth token endpoint to platform.claude.com;
-# console.anthropic.com now 404s. Callers should iterate _OAUTH_TOKEN_URLS
-# (new host first, console fallback). _OAUTH_TOKEN_URL is kept as the primary
-# for backward compatibility with existing imports and now points at the live host.
-_OAUTH_TOKEN_URLS = [
-    "https://platform.claude.com/v1/oauth/token",
-    "https://console.anthropic.com/v1/oauth/token",
-]
-_OAUTH_TOKEN_URL = _OAUTH_TOKEN_URLS[0]
+_OAUTH_TOKEN_URL = "https://console.anthropic.com/v1/oauth/token"
 _OAUTH_REDIRECT_URI = "https://console.anthropic.com/oauth/code/callback"
 _OAUTH_SCOPES = "org:create_api_key user:profile user:inference"
 _HERMES_OAUTH_FILE = get_hermes_home() / ".anthropic_oauth.json"
@@ -1475,34 +1395,18 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
            "code_verifier": verifier,
        }).encode()

-        # Anthropic migrated the OAuth token endpoint to platform.claude.com;
-        # console.anthropic.com now 404s. Try the new host first, then fall
-        # back to console for older deployments (mirrors the refresh path).
-        result = None
-        last_error = None
-        for endpoint in _OAUTH_TOKEN_URLS:
-            req = urllib.request.Request(
-                endpoint,
-                data=exchange_data,
-                headers={
-                    "Content-Type": "application/json",
-                    "User-Agent": f"claude-cli/{_get_claude_code_version()} (external, cli)",
-                },
-                method="POST",
-            )
-            try:
-                with urllib.request.urlopen(req, timeout=15) as resp:
-                    result = json.loads(resp.read().decode())
-                break
-            except Exception as exc:
-                last_error = exc
-                logger.debug("Anthropic token exchange failed at %s: %s", endpoint, exc)
-                continue
+        req = urllib.request.Request(
+            _OAUTH_TOKEN_URL,
+            data=exchange_data,
+            headers={
+                "Content-Type": "application/json",
+                "User-Agent": f"claude-cli/{_get_claude_code_version()} (external, cli)",
+            },
+            method="POST",
+        )

-        if result is None:
-            raise last_error if last_error is not None else ValueError(
-                "Anthropic token exchange failed"
-            )
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            result = json.loads(resp.read().decode())
    except Exception as e:
        print(f"Token exchange failed: {e}")
        return None
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -101,8 +101,6 @@ class _OpenAIProxy:
 OpenAI = _OpenAIProxy()  # module-level name, resolves lazily on call/isinstance

 from agent.credential_pool import load_pool
-from agent.model_metadata import MINIMUM_CONTEXT_LENGTH, get_model_context_length
-from agent.process_bootstrap import build_keepalive_http_client
 from hermes_cli.config import get_hermes_home
 from hermes_constants import OPENROUTER_BASE_URL
 from utils import base_url_host_matches, base_url_hostname, env_float, model_forces_max_completion_tokens, normalize_proxy_env_vars
@@ -110,23 +108,6 @@ from utils import base_url_host_matches, base_url_hostname, env_float, model_for
 logger = logging.getLogger(__name__)


-def _openai_http_client_kwargs(
-    base_url: Optional[str],
-    *,
-    async_mode: bool = False,
-) -> Dict[str, Any]:
-    """Inject keepalive httpx client with env-only proxy (not macOS system proxy)."""
-    client = build_keepalive_http_client(str(base_url or ""), async_mode=async_mode)
-    if client is None:
-        return {}
-    return {"http_client": client}
-
-
-def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
-    kwargs = {**_openai_http_client_kwargs(base_url), **kwargs}
-    return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
-
-
 # ── Interrupt protection for atomic auxiliary tasks ──────────────────────
 # Some auxiliary tasks must NOT be aborted mid-flight by a gateway interrupt
 # (e.g. an incoming user message while the agent is busy). Context
@@ -684,28 +665,6 @@ def _pool_runtime_base_url(entry: Any, fallback: str = "") -> str:
    return str(url or "").strip().rstrip("/")


-# Hostnames (lowercase, exact) that the auxiliary Anthropic path is allowed to
-# be pointed at via config.yaml model.base_url. Anything else falls back to the
-# Anthropic default — operators routing main-session traffic through a
-# non-Anthropic host (e.g. OpenRouter, OpenAI) with provider=anthropic in config
-# must NOT have that foreign host leak into the auxiliary client. See #52608.
-_ANTHROPIC_COMPATIBLE_HOSTS = frozenset({
-    "api.anthropic.com",
-})
-
-
-def _is_anthropic_compatible_host(url: str) -> bool:
-    """Return True if ``url``'s hostname is an Anthropic endpoint we trust for aux calls."""
-    if not url:
-        return False
-    try:
-        from urllib.parse import urlparse
-        host = (urlparse(url).hostname or "").strip().lower().rstrip(".")
-        return host in _ANTHROPIC_COMPATIBLE_HOSTS
-    except Exception:
-        return False
-
-
 def _nous_min_key_ttl_seconds() -> int:
    try:
        return max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800")))
@@ -1632,7 +1591,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
            _merged_aux = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_aux:
                extra["default_headers"] = _merged_aux
-            _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
+            _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
            _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
            return _client, model

@@ -1672,7 +1631,7 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
        _merged_aux2 = _apply_user_default_headers(extra.get("default_headers"))
        if _merged_aux2:
            extra["default_headers"] = _merged_aux2
-        _client = _create_openai_client(api_key=api_key, base_url=base_url, **extra)
+        _client = OpenAI(api_key=api_key, base_url=base_url, **extra)
        _client = _maybe_wrap_anthropic(_client, model, api_key, raw_base_url)
        return _client, model

@@ -1687,21 +1646,20 @@ def _try_openrouter(explicit_api_key: str = None, model: str = None) -> Tuple[Op
    pool_present, entry = _select_pool_entry("openrouter")
    if pool_present:
        or_key = explicit_api_key or _pool_runtime_api_key(entry)
-        if or_key:
-            base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
-            logger.debug("Auxiliary client: OpenRouter via pool")
-            return _create_openai_client(api_key=or_key, base_url=base_url,
-                           default_headers=build_or_headers()), model or _OPENROUTER_MODEL
-        # Pool exists but is exhausted (no usable runtime key) — fall through to
-        # the OPENROUTER_API_KEY env-var path rather than failing outright.
-        logger.debug("Auxiliary client: OpenRouter pool exhausted, trying OPENROUTER_API_KEY")
+        if not or_key:
+            _mark_provider_unhealthy("openrouter", ttl=60)
+            return None, None
+        base_url = _pool_runtime_base_url(entry, OPENROUTER_BASE_URL) or OPENROUTER_BASE_URL
+        logger.debug("Auxiliary client: OpenRouter via pool")
+        return OpenAI(api_key=or_key, base_url=base_url,
+                       default_headers=build_or_headers()), model or _OPENROUTER_MODEL

    or_key = explicit_api_key or os.getenv("OPENROUTER_API_KEY")
    if not or_key:
        _mark_provider_unhealthy("openrouter", ttl=60)
        return None, None
    logger.debug("Auxiliary client: OpenRouter")
-    return _create_openai_client(api_key=or_key, base_url=OPENROUTER_BASE_URL,
+    return OpenAI(api_key=or_key, base_url=OPENROUTER_BASE_URL,
                   default_headers=build_or_headers()), model or _OPENROUTER_MODEL


@@ -1794,7 +1752,7 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
            return None, None
        base_url = str((nous or {}).get("inference_base_url") or _nous_base_url()).rstrip("/")
    return (
-        _create_openai_client(
+        OpenAI(
            api_key=api_key,
            base_url=base_url,
        ),
@@ -2071,7 +2029,7 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
    if _custom_headers:
        _extra["default_headers"] = _custom_headers
    if custom_mode == "codex_responses":
-        real_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
+        real_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
        return CodexAuxiliaryClient(real_client, model), model
    if custom_mode == "anthropic_messages":
        # Third-party Anthropic-compatible gateway (MiniMax, Zhipu GLM,
@@ -2085,14 +2043,14 @@ def _try_custom_endpoint() -> Tuple[Optional[Any], Optional[str]]:
                "Custom endpoint declares api_mode=anthropic_messages but the "
                "anthropic SDK is not installed — falling back to OpenAI-wire."
            )
-            return _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra), model
+            return OpenAI(api_key=custom_key, base_url=_clean_base, **_extra), model
        return (
            AnthropicAuxiliaryClient(real_client, model, custom_key, custom_base, is_oauth=False),
            model,
        )
    # URL-based anthropic detection for custom endpoints that didn't set
    # api_mode explicitly (e.g. kimi.com/coding reached via custom config).
-    _fallback_client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **_extra)
+    _fallback_client = OpenAI(api_key=custom_key, base_url=_clean_base, **_extra)
    _fallback_client = _maybe_wrap_anthropic(
        _fallback_client, model, custom_key, custom_base, custom_mode,
    )
@@ -2121,7 +2079,7 @@ def _build_xai_oauth_aux_client(model: str) -> Tuple[Optional[Any], Optional[str
        return None, None
    api_key, base_url = resolved
    logger.debug("Auxiliary client: xAI OAuth (%s via Responses API)", model)
-    real_client = _create_openai_client(api_key=api_key, base_url=base_url)
+    real_client = OpenAI(api_key=api_key, base_url=base_url)
    return CodexAuxiliaryClient(real_client, model), model


@@ -2158,7 +2116,7 @@ def _build_codex_client(model: str) -> Tuple[Optional[Any], Optional[str]]:
            return None, None
        base_url = _CODEX_AUX_BASE_URL
    logger.debug("Auxiliary client: Codex OAuth (%s via Responses API)", model)
-    real_client = _create_openai_client(
+    real_client = OpenAI(
        api_key=codex_token,
        base_url=base_url,
        default_headers=_codex_cloudflare_headers(codex_token),
@@ -2258,7 +2216,7 @@ def _try_azure_foundry(
    if _dq:
        extra["default_query"] = _dq

-    client = _create_openai_client(api_key=api_key, base_url=_clean_base, **extra)
+    client = OpenAI(api_key=api_key, base_url=_clean_base, **extra)

    if runtime_api_mode == "codex_responses":
        # GPT-5.x / o-series / codex models on Azure Foundry are
@@ -2297,16 +2255,9 @@ def _try_anthropic(explicit_api_key: str = None) -> Tuple[Optional[Any], Optiona
    if not token:
        return None, None

-    # Allow base URL override from config.yaml model.base_url, but only when:
-    #   1. the configured provider is anthropic (otherwise a non-Anthropic
-    #      base_url, e.g. Codex endpoint, would leak into Anthropic requests), AND
-    #   2. the override URL actually points at an Anthropic-compatible endpoint.
-    # Without gate (2), operators who route main-session traffic through a
-    # non-Anthropic provider that accepts Anthropic-format requests (e.g.
-    # OpenRouter at openrouter.ai/api/v1, with provider=anthropic in config.yaml)
-    # would have every auxiliary side-channel call (memory extractors,
-    # reflection, vision, title generation) 401 from the foreign host —
-    # see issue #52608.
+    # Allow base URL override from config.yaml model.base_url, but only
+    # when the configured provider is anthropic — otherwise a non-Anthropic
+    # base_url (e.g. Codex endpoint) would leak into Anthropic requests.
    base_url = _pool_runtime_base_url(entry, _ANTHROPIC_DEFAULT_BASE_URL) if pool_present else _ANTHROPIC_DEFAULT_BASE_URL
    try:
        from hermes_cli.config import load_config
@@ -2316,7 +2267,7 @@ def _try_anthropic(explicit_api_key: str = None) -> Tuple[Optional[Any], Optiona
            cfg_provider = str(model_cfg.get("provider") or "").strip().lower()
            if cfg_provider == "anthropic":
                cfg_base_url = (model_cfg.get("base_url") or "").strip().rstrip("/")
-                if cfg_base_url and _is_anthropic_compatible_host(cfg_base_url):
+                if cfg_base_url:
                    base_url = cfg_base_url
    except Exception:
        pass
@@ -2519,7 +2470,7 @@ def _is_payment_error(exc: Exception) -> bool:
    # but sometimes wrap them in 429 or other codes.
    # Daily quota exhaustion from Bedrock, Vertex AI, and similar providers
    # uses different language but is semantically identical to credit exhaustion.
-    if status in {402, 403, 404, 429, None}:
+    if status in {402, 404, 429, None}:
        if any(kw in err_lower for kw in (
            "credits", "insufficient funds",
            "can only afford", "billing",
@@ -2528,8 +2479,6 @@ def _is_payment_error(exc: Exception) -> bool:
            "balance_depleted", "no usable credits",
            "model_not_supported_on_free_tier",
            "not available on the free tier",
-            "requires a subscription", "upgrade for access",
-            "upgrade for higher limits", "reached your session usage limit",
            # Daily / monthly / weekly quota exhaustion keywords
            "quota exceeded", "quota_exceeded",
            "too many tokens per day", "daily limit",
@@ -2748,79 +2697,6 @@ def _is_model_not_found_error(exc: Exception) -> bool:
    ))


-def _is_model_incompatible_error(exc: Exception) -> bool:
-    """Detect "this route cannot serve this model" 400s (capability mismatch).
-
-    Distinct from :func:`_is_model_not_found_error` (the model does not exist
-    anywhere): here the model name is valid but the *current provider/account*
-    is structurally unable to run it. The canonical case is a configured
-    fallback that cannot run the main model — e.g. an ``openai-codex`` /
-    ChatGPT-account fallback asked to compress a ``glm-5.2`` conversation::
-
-        Error code: 400 - {'detail': "The 'glm-5.2' model is not supported
-        when using Codex with a ChatGPT account."}
-
-    The candidate authenticates fine and builds a client, so the auth and
-    payment predicates don't fire and the call would otherwise raise and
-    abort the whole auxiliary task (commonly compression — which then drops
-    middle turns and churns the session, destroying the prompt cache).
-    Treating it as a fallback-worthy capability error lets the chain skip the
-    incapable route and continue to the next candidate, mirroring the
-    context-window feasibility screen (#52392).
-
-    Billing/quota 400s belong to :func:`_is_payment_error`; "model does not
-    exist" 400s belong to :func:`_is_model_not_found_error`. This predicate
-    explicitly excludes both so the three don't overlap.
-    """
-    status = getattr(exc, "status_code", None)
-    if status not in {400, None}:
-        return False
-    err_lower = str(exc).lower()
-    # Not-found 400s ("invalid model ID", "model does not exist") are owned by
-    # _is_model_not_found_error. Billing/free-tier 400s are owned by the
-    # payment path — key on the billing keywords directly here rather than
-    # calling _is_payment_error(), because that predicate is status-gated
-    # ({402,403,404,429,None}) and would not recognise a 400-coded billing
-    # body, letting it leak into this capability bucket.
-    if _is_model_not_found_error(exc):
-        return False
-    if any(kw in err_lower for kw in (
-        "credits", "insufficient funds", "billing", "out of funds",
-        "balance_depleted", "no usable credits", "payment required",
-        "free tier", "free-tier", "not available on the free tier",
-        "model_not_supported_on_free_tier", "quota",
-    )):
-        return False
-    return any(kw in err_lower for kw in (
-        "is not supported when using",   # codex/ChatGPT-account model gating
-        "model is not supported",
-        "not supported with this",
-        "not supported for this account",
-        "model_not_supported",
-        "does not support this model",
-        "unsupported model",
-    ))
-
-
-def _is_invalid_aux_response_error(exc: Exception) -> bool:
-    """Detect provider responses that authenticated but cannot serve aux shape.
-
-    Some OpenAI-compatible routes return HTTP 200 with an empty/malformed
-    ChatCompletion instead of a normal provider error.  That is still a
-    provider/model capability failure for auxiliary tasks: downstream callers
-    need ``choices[0].message`` and should be able to continue through the
-    same fallback path as explicit model-incompatibility errors.
-    """
-    if not isinstance(exc, RuntimeError):
-        return False
-    msg = str(exc).lower()
-    return (
-        "auxiliary " in msg
-        and "llm returned invalid response" in msg
-        and "choices[0].message" in msg
-    )
-
-
 def _evict_cached_clients(provider: str) -> None:
    """Drop cached auxiliary clients for a provider so fresh creds are used."""
    normalized = _normalize_aux_provider(provider)
@@ -3271,88 +3147,6 @@ def _try_main_agent_model_fallback(
    return client, resolved_model or main_model, label


-# ── Context-window screening for runtime fallback chains (issue #52392) ──
-#
-# When the runtime auxiliary fallback chain selects a candidate that is
-# reachable but has a context window smaller than the compression task
-# requires, the call errors out instead of continuing to the next, viable
-# candidate. The startup feasibility check in
-# ``agent.conversation_compression.check_compression_model_feasibility``
-# already filters too-small auxiliary models at startup, but the runtime
-# fallback chain (``_try_configured_fallback_chain`` and
-# ``_try_main_fallback_chain``) does not apply the same filter, so
-# compression can stop at the first alive door even if the room behind it
-# is too small.
-#
-# The helpers below screen each candidate by its effective context window
-# before it is returned. ``None`` results from ``get_model_context_length``
-# are passed through (we cannot prove a model is too small, so we do not
-# block it). This preserves the existing fallback surface for
-# unrecognised/custom models while closing the gap on the well-known ones.
-
-def _task_minimum_context_length(task: Optional[str]) -> Optional[int]:
-    """Return the minimum context length required for an auxiliary task.
-
-    Only ``compression`` carries an explicit minimum today (the same
-    ``MINIMUM_CONTEXT_LENGTH`` (64K) floor that
-    ``check_compression_model_feasibility`` already enforces at startup).
-    Other tasks (``vision``, ``title_generation``, ``web_extract``,
-    ``skills_hub``, ``mcp``, ``session_search``) return ``None`` — they
-    have no per-task context floor and the runtime chain must remain
-    permissive for them.
-
-    Returns ``None`` for an empty/``None`` task name so the helper is a
-    safe no-op when called from generic sites.
-    """
-    if not task:
-        return None
-    if task == "compression":
-        return MINIMUM_CONTEXT_LENGTH
-    return None
-
-
-def _candidate_context_window(
-    provider: str,
-    model: str,
-    base_url: str = "",
-    api_key: str = "",
-) -> Optional[int]:
-    """Resolve the effective context window for a fallback candidate.
-
-    Thin wrapper around :func:`agent.model_metadata.get_model_context_length`
-    that swallows probe failures (returns ``None``). Callers treat
-    ``None`` as "unknown — pass through" so the existing fallback
-    surface is preserved when the context-length resolver chain cannot
-    determine a value (custom endpoints, models not in the registry,
-    offline endpoints).
-
-    Best-effort, never raises — the runtime fallback chain must keep
-    moving even if the resolver hits a probe error.
-    """
-    if not model:
-        return None
-    try:
-        ctx = get_model_context_length(
-            model,
-            base_url=base_url,
-            api_key=api_key,
-            provider=provider,
-        )
-    except Exception as exc:
-        logger.debug(
-            "Auxiliary fallback: could not resolve context window for %s/%s: %s",
-            provider, model, exc,
-        )
-        return None
-    # ``get_model_context_length`` returns an int (with a 256K default
-    # fallback when nothing else matches). We still propagate ``None`` if
-    # a future change returns ``Optional[int]`` — being explicit is
-    # cheap and the test suite covers both shapes.
-    if isinstance(ctx, int) and ctx > 0:
-        return ctx
-    return None
-
-
 def _try_configured_fallback_chain(
    task: str,
    failed_provider: str,
@@ -3377,7 +3171,6 @@ def _try_configured_fallback_chain(

    skip = failed_provider.lower().strip()
    tried = []
-    min_ctx = _task_minimum_context_length(task)

    for i, entry in enumerate(chain):
        if not isinstance(entry, dict):
@@ -3395,20 +3188,6 @@ def _try_configured_fallback_chain(
            fb_client, resolved_model = None, None

        if fb_client is not None:
-            if min_ctx is not None and resolved_model:
-                fb_ctx = _candidate_context_window(
-                    fb_provider,
-                    resolved_model,
-                    base_url=str(entry.get("base_url") or ""),
-                    api_key=_fallback_entry_api_key(entry) or "",
-                )
-                if fb_ctx is not None and fb_ctx < min_ctx:
-                    logger.info(
-                        "Auxiliary %s: skipping %s (%s context=%d < min=%d), continuing chain",
-                        task, label, resolved_model, fb_ctx, min_ctx,
-                    )
-                    tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})")
-                    continue
            logger.info(
                "Auxiliary %s: %s on %s — configured fallback to %s (%s)",
                task, reason, failed_provider, label, resolved_model or fb_model or "default",
@@ -3424,28 +3203,6 @@ def _try_configured_fallback_chain(
    return None, None, ""


-def _try_configured_fallback_for_unavailable_client(
-    task: Optional[str],
-    failed_provider: str,
-) -> Tuple[Optional[Any], Optional[str], str]:
-    """Try task fallback_chain when an explicit aux provider cannot build.
-
-    This covers the "no client" case before any request is sent: missing
-    raw env key, unavailable OAuth/pool credentials, or provider resolver
-    returning ``(None, None)``.  It deliberately stops at the configured
-    per-task fallback chain; the main-agent model remains the last-resort
-    runtime fallback for request-time capacity errors.
-    """
-    explicit = (failed_provider or "").strip().lower()
-    if not task or not explicit or explicit in {"auto"}:
-        return None, None, ""
-    return _try_configured_fallback_chain(
-        task,
-        explicit,
-        reason="provider unavailable",
-    )
-
-
 def _fallback_entry_api_key(entry: Dict[str, Any]) -> Optional[str]:
    """Resolve inline or env-backed API key from a fallback-chain entry."""
    explicit = str(entry.get("api_key") or "").strip()
@@ -3504,7 +3261,6 @@ def _try_main_fallback_chain(
    main_norm = (_read_main_provider() or "").strip().lower()
    skip = {p for p in (failed_norm, main_norm, "auto") if p}
    tried: List[str] = []
-    min_ctx = _task_minimum_context_length(task)

    for i, entry in enumerate(chain):
        if not isinstance(entry, dict):
@@ -3528,20 +3284,6 @@ def _try_main_fallback_chain(
            logger.debug("Auxiliary %s: main fallback %s failed to resolve: %s", task or "call", label, exc)
            fb_client, resolved_model = None, None
        if fb_client is not None:
-            if min_ctx is not None:
-                fb_ctx = _candidate_context_window(
-                    fb_provider,
-                    resolved_model or fb_model,
-                    base_url=str(entry.get("base_url") or ""),
-                    api_key=_fallback_entry_api_key(entry) or "",
-                )
-                if fb_ctx is not None and fb_ctx < min_ctx:
-                    logger.info(
-                        "Auxiliary %s: skipping %s (context=%d < min=%d), continuing chain",
-                        task or "call", label, fb_ctx, min_ctx,
-                    )
-                    tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})")
-                    continue
            logger.info(
                "Auxiliary %s: %s on %s — main fallback chain to %s (%s)",
                task or "call", reason, failed_provider or "auto", label,
@@ -3643,37 +3385,6 @@ def _resolve_auto(
    # config.yaml (auxiliary.<task>.provider) still win over this.
    main_provider = str(runtime_provider or _read_main_provider() or "")
    main_model = str(runtime_model or _read_main_model() or "")
-
-    # MoA virtual provider: the "model" is a preset name (e.g. "opus-gpt") and
-    # there is no real "moa" HTTP endpoint, so resolving an aux client against
-    # provider="moa"/model=<preset> sends the preset name as the model id and
-    # the provider 400s ("opus-gpt is not a valid model ID"). Auxiliary tasks
-    # (title generation, compression, vision, …) don't need the reference
-    # fan-out — they should run on the aggregator, which is the preset's acting
-    # model. Resolve the MoA preset to its aggregator slot and continue Step 1
-    # with that real provider+model. Mirrors the MoA context-length resolution.
-    if main_provider == "moa":
-        try:
-            from hermes_cli.config import load_config
-            from hermes_cli.moa_config import resolve_moa_preset
-
-            _preset = resolve_moa_preset(load_config().get("moa") or {}, main_model)
-            _agg = _preset.get("aggregator") or {}
-            _agg_provider = str(_agg.get("provider") or "").strip()
-            _agg_model = str(_agg.get("model") or "").strip()
-            if _agg_provider and _agg_model and _agg_provider.lower() != "moa":
-                main_provider = _agg_provider
-                main_model = _agg_model
-                # The MoA virtual runtime carries a non-HTTP base_url
-                # ("moa://local") and a placeholder api_key; they belong to the
-                # facade, not the aggregator's real provider. Drop them so the
-                # aggregator resolves through its own provider credentials.
-                runtime_base_url = ""
-                runtime_api_key = ""
-                runtime_api_mode = ""
-        except Exception:
-            logger.debug("MoA aux resolution to aggregator failed", exc_info=True)
-
    if (main_provider and main_model
            and main_provider not in {"auto", ""}):
        resolved_provider = main_provider
@@ -3820,10 +3531,6 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
    _merged_async = _apply_user_default_headers(async_kwargs.get("default_headers"))
    if _merged_async:
        async_kwargs["default_headers"] = _merged_async
-    async_kwargs = {
-        **_openai_http_client_kwargs(sync_base_url, async_mode=True),
-        **async_kwargs,
-    }
    return AsyncOpenAI(**async_kwargs), model


@@ -4034,7 +3741,7 @@ def resolve_provider_client(
                               "but no Codex OAuth token found (run: hermes model)")
                return None, None
            final_model = _normalize_resolved_model(model, provider)
-            raw_client = _create_openai_client(
+            raw_client = OpenAI(
                api_key=codex_token,
                base_url=_CODEX_AUX_BASE_URL,
                default_headers=_codex_cloudflare_headers(codex_token),
@@ -4115,7 +3822,7 @@ def resolve_provider_client(
            _merged_custom = _apply_user_default_headers(extra.get("default_headers"))
            if _merged_custom:
                extra["default_headers"] = _merged_custom
-            client = _create_openai_client(api_key=custom_key, base_url=_clean_base, **extra)
+            client = OpenAI(api_key=custom_key, base_url=_clean_base, **extra)
            client = _wrap_if_needed(client, final_model, custom_base, custom_key)
            return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                    else (client, final_model))
@@ -4219,7 +3926,7 @@ def resolve_provider_client(
                        _fb_headers = _apply_user_default_headers(_fb_extra.get("default_headers"))
                        if _fb_headers:
                            _fb_extra["default_headers"] = _fb_headers
-                        client = _create_openai_client(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
+                        client = OpenAI(api_key=custom_key, base_url=_fb_clean, **_fb_extra)
                        return (_to_async_client(client, final_model, is_vision=is_vision) if async_mode
                                else (client, final_model))
                    sync_anthropic = AnthropicAuxiliaryClient(
@@ -4228,7 +3935,7 @@ def resolve_provider_client(
                    if async_mode:
                        return AsyncAnthropicAuxiliaryClient(sync_anthropic), final_model
                    return sync_anthropic, final_model
-                client = _create_openai_client(api_key=custom_key, base_url=_clean_base2, **_extra2)
+                client = OpenAI(api_key=custom_key, base_url=_clean_base2, **_extra2)
                # codex_responses or inherited auto-detect (via _wrap_if_needed).
                # _wrap_if_needed reads the closed-over `api_mode` (the task-level
                # override). Named-provider entry api_mode=codex_responses also
@@ -4370,7 +4077,7 @@ def resolve_provider_client(
        _merged_main = _apply_user_default_headers(headers)
        if _merged_main:
            headers = _merged_main
-        client = _create_openai_client(api_key=api_key, base_url=base_url,
+        client = OpenAI(api_key=api_key, base_url=base_url,
                        **({"default_headers": headers} if headers else {}))

        # Copilot GPT-5+ models (except gpt-5-mini) require the Responses
@@ -4906,7 +4613,7 @@ def _refresh_nous_auxiliary_client(
        return None, model

    fresh_key, fresh_base_url = runtime
-    sync_client = _create_openai_client(api_key=fresh_key, base_url=fresh_base_url)
+    sync_client = OpenAI(api_key=fresh_key, base_url=fresh_base_url)
    final_model = model

    current_loop = None
@@ -5547,9 +5254,6 @@ def _validate_llm_response(response: Any, task: str = None) -> Any:
        if not choices or not hasattr(choices[0], "message"):
            raise AttributeError("missing choices[0].message")
    except (AttributeError, TypeError, IndexError) as exc:
-        recovered = _recover_aux_response_message(response)
-        if recovered is not None:
-            return recovered
        response_type = type(response).__name__
        response_preview = str(response)[:120]
        raise RuntimeError(
@@ -5561,64 +5265,6 @@ def _validate_llm_response(response: Any, task: str = None) -> Any:
    return response


-def _recover_aux_response_message(response: Any) -> Optional[Any]:
-    """Synthesize chat-completions shape from Responses-style text fields.
-
-    Auxiliary callers consume ``choices[0].message``.  Some compatible
-    endpoints return text outside ``choices`` (for example ``output_text`` or
-    ``output`` items).  Preserve that response before declaring it malformed.
-    """
-    text = _extract_aux_response_text(response)
-    if not text:
-        return None
-
-    choice = SimpleNamespace(
-        message=SimpleNamespace(content=text),
-        finish_reason=getattr(response, "finish_reason", None) or "stop",
-    )
-    try:
-        response.choices = [choice]
-        return response
-    except Exception:
-        return SimpleNamespace(
-            id=getattr(response, "id", ""),
-            model=getattr(response, "model", ""),
-            object=getattr(response, "object", "chat.completion"),
-            choices=[choice],
-            usage=getattr(response, "usage", None),
-        )
-
-
-def _extract_aux_response_text(response: Any) -> str:
-    output_text = _obj_get(response, "output_text")
-    if isinstance(output_text, str) and output_text.strip():
-        return output_text.strip()
-
-    output = _obj_get(response, "output")
-    if not isinstance(output, list):
-        return ""
-
-    parts: List[str] = []
-    for item in output:
-        item_type = _obj_get(item, "type")
-        if item_type and item_type != "message":
-            continue
-        for part in (_obj_get(item, "content") or []):
-            part_type = _obj_get(part, "type")
-            if part_type in {"output_text", "text", None}:
-                text = _obj_get(part, "text")
-                if isinstance(text, str) and text.strip():
-                    parts.append(text.strip())
-    return "\n".join(parts).strip()
-
-
-def _obj_get(obj: Any, key: str, default: Any = None) -> Any:
-    value = getattr(obj, key, default)
-    if value is default and isinstance(obj, dict):
-        value = obj.get(key, default)
-    return value
-
-
 def call_llm(
    task: str = None,
    *,
@@ -5698,30 +5344,21 @@ def call_llm(
        )
        if client is None:
            # When the user explicitly chose a non-OpenRouter provider but no
-            # credentials were found, honor the task fallback_chain before
-            # raising.  Missing raw env keys are recoverable for auxiliary
-            # tasks because fallback entries may use OAuth / credential-pool
-            # auth (for example openai-codex).
+            # credentials were found, fail fast instead of silently routing
+            # through OpenRouter (which causes confusing 404s).
            _explicit = (resolved_provider or "").strip().lower()
            if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
-                fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
-                    task, _explicit,
+                raise RuntimeError(
+                    f"Provider '{_explicit}' is set in config.yaml but no API key "
+                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
+                    f"variable, or switch to a different provider with `hermes model`."
                )
-                if fb_client is not None:
-                    client, final_model = fb_client, fb_model
-                    resolved_provider = fb_label or resolved_provider
-                else:
-                    raise RuntimeError(
-                        f"Provider '{_explicit}' is set in config.yaml but no API key "
-                        f"was found. Set the {_explicit.upper()}_API_KEY environment "
-                        f"variable, or switch to a different provider with `hermes model`."
-                    )
            # For auto/custom with no credentials, try the full auto chain
            # rather than hardcoding OpenRouter (which may be depleted).
            # Pass model=None so each provider uses its own default —
            # resolved_model may be an OpenRouter-format slug that doesn't
            # work on other providers.
-            if client is None and not resolved_base_url:
+            if not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
                client, final_model = _get_cached_client("auto", main_runtime=main_runtime, task=task)
@@ -6016,21 +5653,10 @@ def call_llm(
        # When the provider returns a 429 rate-limit (not billing), fall
        # back to an alternative provider instead of exhausting retries
        # against the same rate-limited endpoint.
-        #
-        # ── Auth error fallback (#21165) ─────────────────────────────
-        # When the resolved provider returns 401 and neither the Nous
-        # refresh path nor explicit provider credential refresh applies,
-        # fall back to an alternative provider instead of dropping the
-        # auxiliary task on the floor (silent compression failure /
-        # message loss). Auth is NOT a capacity error: it only bypasses
-        # the explicit-provider gate when the user is in auto mode.
        should_fallback = (
-            _is_auth_error(first_err)
-            or _is_payment_error(first_err)
+            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
-            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
        )
        # Respect explicit provider choice for transient errors (auth, request
        # validation, etc.) but allow fallback when the provider clearly cannot
@@ -6041,24 +5667,9 @@ def call_llm(
        is_auto = resolved_provider in {"auto", "", None}
        # Capacity errors bypass the explicit-provider gate: the provider
        # literally cannot serve this request regardless of user intent.
-        # Rate limits are included: after retries are exhausted, a 429 means
-        # the provider cannot serve this request — fall back. See #52228.
-        # Model-incompatibility 400s are also a hard capability mismatch (the
-        # route cannot run this model at all — e.g. a codex/ChatGPT-account
-        # fallback asked to compress a glm-5.2 conversation), so they bypass
-        # the explicit-provider gate and continue to the next candidate
-        # instead of aborting the auxiliary task and churning the session.
-        is_capacity_error = (
-            _is_payment_error(first_err)
-            or _is_connection_error(first_err)
-            or _is_rate_limit_error(first_err)
-            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
-        )
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_auth_error(first_err):
-                reason = "auth error"
-            elif _is_payment_error(first_err):
+            if _is_payment_error(first_err):
                reason = "payment error"
                # Resolve the actual provider label (resolved_provider may be
                # "auto"; the client's base_url tells us which backend got the
@@ -6069,10 +5680,6 @@ def call_llm(
                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
-            elif _is_model_incompatible_error(first_err):
-                reason = "model incompatible with route"
-            elif _is_invalid_aux_response_error(first_err):
-                reason = "invalid provider response"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
@@ -6247,21 +5854,12 @@ async def async_call_llm(
        if client is None:
            _explicit = (resolved_provider or "").strip().lower()
            if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
-                fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
-                    task, _explicit,
+                raise RuntimeError(
+                    f"Provider '{_explicit}' is set in config.yaml but no API key "
+                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
+                    f"variable, or switch to a different provider with `hermes model`."
                )
-                if fb_client is not None:
-                    client, final_model = _to_async_client(
-                        fb_client, fb_model or "", is_vision=(task == "vision")
-                    )
-                    resolved_provider = fb_label or resolved_provider
-                else:
-                    raise RuntimeError(
-                        f"Provider '{_explicit}' is set in config.yaml but no API key "
-                        f"was found. Set the {_explicit.upper()}_API_KEY environment "
-                        f"variable, or switch to a different provider with `hermes model`."
-                    )
-            if client is None and not resolved_base_url:
+            if not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
                client, final_model = _get_cached_client("auto", async_mode=True, main_runtime=main_runtime, task=task)
@@ -6507,47 +6105,24 @@ async def async_call_llm(
                        raise

        # ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ──
-        # Auth error fallback (#21165): a 401 that survived the refresh path
-        # falls back in auto mode just like the sync call_llm() path. Auth is
-        # NOT a capacity error, so on an explicit provider it still respects
-        # the user's choice (handled by the is_auto/is_capacity_error gate).
        should_fallback = (
-            _is_auth_error(first_err)
-            or _is_payment_error(first_err)
-            or _is_connection_error(first_err)
-            or _is_rate_limit_error(first_err)
-            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
-        )
-        # Capacity errors (payment/quota/connection/rate-limit) bypass the
-        # explicit-provider gate — the provider cannot serve the request
-        # regardless of user intent. Rate limits are included: after retries
-        # are exhausted, a 429 means the provider is at capacity. See #52228.
-        # See #26803: daily token quota must fall back like a 402 credit error.
-        # Model-incompatibility 400s (route cannot run this model at all)
-        # bypass the gate too — see the sync call_llm() path for rationale.
-        is_auto = resolved_provider in {"auto", "", None}
-        is_capacity_error = (
            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
-            or _is_model_incompatible_error(first_err)
-            or _is_invalid_aux_response_error(first_err)
        )
+        # Capacity errors (payment/quota/connection) bypass the explicit-provider
+        # gate — the provider cannot serve the request regardless of user intent.
+        # See #26803: daily token quota must fall back like a 402 credit error.
+        is_auto = resolved_provider in {"auto", "", None}
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
-            if _is_auth_error(first_err):
-                reason = "auth error"
-            elif _is_payment_error(first_err):
+            if _is_payment_error(first_err):
                reason = "payment error"
                _mark_provider_unhealthy(
                    _recoverable_pool_provider(resolved_provider, client) or resolved_provider
                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
-            elif _is_model_incompatible_error(first_err):
-                reason = "model incompatible with route"
-            elif _is_invalid_aux_response_error(first_err):
-                reason = "invalid provider response"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -37,18 +37,6 @@ from tools.terminal_tool import is_persistent_env
 from utils import base_url_host_matches, base_url_hostname, env_float, env_int

 logger = logging.getLogger(__name__)
-_OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}
-
-# When the fallback chain is fully exhausted on a non-rate-limit failure
-# (e.g. every provider returns a non-retryable client error like HTTP 400),
-# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
-# and does not reset _fallback_index=0 to replay the entire chain again.
-# Without this, a client/gateway that re-submits immediately would re-marshal
-# the full (potentially 80k-token) context once per provider every turn and
-# can drive a constrained host into memory/swap exhaustion.  Rate-limit /
-# billing reasons keep their own 60s cooldown (set above); this is the
-# narrower non-rate-limit case.  See issue #24996.
-_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0


 def _ra():
@@ -127,23 +115,6 @@ def _is_openai_codex_backend(agent) -> bool:
    )


-def _validated_openrouter_provider_sort(raw_sort: Any) -> Optional[str]:
-    """Return a normalized OpenRouter provider.sort value or None."""
-    if not isinstance(raw_sort, str):
-        return None
-    sort_value = raw_sort.strip().lower()
-    if not sort_value:
-        return None
-    if sort_value in _OPENROUTER_PROVIDER_SORT_VALUES:
-        return sort_value
-    logger.warning(
-        "Ignoring invalid OpenRouter provider.sort value %r (allowed: %s)",
-        raw_sort,
-        ", ".join(sorted(_OPENROUTER_PROVIDER_SORT_VALUES)),
-    )
-    return None
-
-
 def _env_float(name: str, default: float) -> float:
    try:
        return float(os.getenv(name, str(default)))
@@ -258,11 +229,6 @@ def interruptible_api_call(agent, api_kwargs: dict):
                        invalidate_runtime_client(region)
                    raise
                result["response"] = normalize_converse_response(raw_response)
-            elif agent.provider == "moa":
-                # MoA is a virtual chat-completions provider backed by the
-                # in-process MoAClient facade. Do not rebuild a request-local
-                # OpenAI client from the virtual runtime metadata.
-                result["response"] = agent.client.chat.completions.create(**api_kwargs)
            else:
                request_client = _set_request_client(
                    agent._create_request_openai_client(
@@ -732,9 +698,8 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
        _prefs["ignore"] = agent.providers_ignored
    if agent.providers_order:
        _prefs["order"] = agent.providers_order
-    _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
-    if _provider_sort:
-        _prefs["sort"] = _provider_sort
+    if agent.provider_sort:
+        _prefs["sort"] = agent.provider_sort
    if agent.provider_require_parameters:
        _prefs["require_parameters"] = True
    if agent.provider_data_collection:
@@ -1050,23 +1015,18 @@ def build_assistant_message(agent, assistant_message, finish_reason: str) -> dic
                    "arguments": tool_call.function.arguments
                },
            }
-            # Tool-call arguments are intentionally NOT redacted here. This
-            # dict enters the in-memory conversation history that is replayed
-            # to the model on every subsequent turn AND persisted to state.db,
-            # which is itself replayed verbatim on session resume
-            # (get_messages_as_conversation). Masking a credential to `***`
-            # here poisons that replay: the model reads back its own
-            # `PGPASSWORD='***' psql ...` call and copies the placeholder into
-            # the next tool call, breaking every credential-dependent command
-            # on the second turn (#43083). The masking also provided no real
-            # protection — the same secret still leaks verbatim through tool
-            # OUTPUT (file contents, command output, diffs, the compaction
-            # block), none of which this pass ever touched. Keeping secrets
-            # out of the replayable store is a separate tokenization/vault
-            # concern, not something arg-redaction can deliver without
-            # breaking replay. Storage-time redaction remains governed by the
-            # `security.redact_secrets` toggle. (#19798 introduced this;
-            # #43083 removed it.)
+            # Defence-in-depth: redact credentials from tool call arguments
+            # before they enter conversation history, e.g. when a model inlines
+            # a secret (`curl -H 'Authorization: Bearer sk-...'`) into a tool
+            # call. Tool execution uses the raw API response object, not this
+            # dict. NOTE: history is replayed to the model on later turns, so
+            # the masked value can be read back and copied into subsequent tool
+            # calls (#43083) -- this is not storage-only. (#19798)
+            if isinstance(tc_dict["function"]["arguments"], str):
+                from agent.redact import redact_sensitive_text
+                tc_dict["function"]["arguments"] = redact_sensitive_text(
+                    tc_dict["function"]["arguments"]
+                )
            # Preserve extra_content (e.g. Gemini thought_signature) so it
            # is sent back on subsequent API calls.  Without this, Gemini 3
            # thinking models reject the request with a 400 error.
@@ -1133,22 +1093,8 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
        if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
            agent._rate_limited_until = time.monotonic() + 60
    if agent._fallback_index >= len(agent._fallback_chain):
-        # Chain exhausted.  If we actually walked a non-empty chain and the
-        # failure was NOT a rate-limit/billing event (those already armed
-        # their own 60s cooldown above), arm a short cooldown so the next
-        # turn's restore_primary_runtime stays gated instead of resetting
-        # _fallback_index=0 and re-marshaling the whole context across every
-        # provider again.  Guards the cross-turn replay storm in #24996.
-        if (
-            len(agent._fallback_chain) > 0
-            and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
-        ):
-            _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
-            agent._rate_limited_until = max(
-                _existing_cooldown,
-                time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
-            )
        return False
+
    fb = agent._fallback_chain[agent._fallback_index]
    agent._fallback_index += 1
    fb_provider = (fb.get("provider") or "").strip().lower()
@@ -1264,16 +1210,14 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
            agent._transport_cache.clear()
        agent._fallback_activated = True

-        # Rebind the credential pool to the fallback provider when the provider
-        # changes.  Keeping the primary pool attached would make downstream
-        # recovery (rate_limit / billing / auth) mutate the wrong credential
-        # set and can overwrite the fallback's base_url back to the primary
-        # endpoint.  See #33163.
-        #
+        # Clear the credential pool when the fallback provider doesn't match
+        # the pool's provider.  The pool was seeded for the primary provider;
+        # leaving it attached means downstream recovery (rate_limit / billing /
+        # auth) calls ``_swap_credential`` with a primary entry which overwrites
+        # the agent's ``base_url`` back to the primary's endpoint — every
+        # fallback request then 404s against the wrong host.  See #33163.
        # When the fallback shares the pool's provider (e.g. both openrouter
-        # entries with different routing) the pool is preserved.  When the
-        # providers differ, load the fallback provider's own pool if one exists
-        # so provider-specific rotation continues to work after the switch.
+        # entries with different routing) the pool is preserved.
        _existing_pool = getattr(agent, "_credential_pool", None)
        if _existing_pool is not None:
            _pool_provider = (getattr(_existing_pool, "provider", "") or "").strip().lower()
@@ -1284,22 +1228,6 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
                    fb_provider, fb_model, _pool_provider,
                )
                agent._credential_pool = None
-        if getattr(agent, "_credential_pool", None) is None:
-            try:
-                from agent.credential_pool import load_pool
-
-                fallback_pool = load_pool(fb_provider)
-                if fallback_pool and fallback_pool.has_credentials():
-                    agent._credential_pool = fallback_pool
-                    logger.info(
-                        "Fallback to %s/%s: attached fallback credential pool",
-                        fb_provider, fb_model,
-                    )
-            except Exception as exc:
-                logger.debug(
-                    "Fallback to %s/%s: could not attach credential pool: %s",
-                    fb_provider, fb_model, exc,
-                )

        # Honor per-provider / per-model request_timeout_seconds for the
        # fallback target (same knob the primary client uses).  None = use
@@ -1530,9 +1458,8 @@ def handle_max_iterations(agent, messages: list, api_call_count: int) -> str:
                provider_preferences["ignore"] = agent.providers_ignored
            if agent.providers_order:
                provider_preferences["order"] = agent.providers_order
-            _provider_sort = _validated_openrouter_provider_sort(agent.provider_sort)
-            if _provider_sort:
-                provider_preferences["sort"] = _provider_sort
+            if agent.provider_sort:
+                provider_preferences["sort"] = agent.provider_sort
            if provider_preferences and (
                (agent.provider or "").strip().lower() == "openrouter"
                or agent._is_openrouter_url()
@@ -2464,19 +2391,12 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            diag=request_client_holder.get("diag"),
                        )
                        _close_request_client_once("stream_mid_tool_retry_cleanup")
-                        if agent.api_mode == "anthropic_messages":
-                            try:
-                                agent._anthropic_client.close()
-                                agent._rebuild_anthropic_client()
-                            except Exception:
-                                pass
-                        else:
-                            try:
-                                agent._replace_primary_openai_client(
-                                    reason="stream_mid_tool_retry_pool_cleanup"
-                                )
-                            except Exception:
-                                pass
+                        try:
+                            agent._replace_primary_openai_client(
+                                reason="stream_mid_tool_retry_pool_cleanup"
+                            )
+                        except Exception:
+                            pass
                        continue

                    # SSE error events from proxies (e.g. OpenRouter sends
@@ -2524,19 +2444,12 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            _close_request_client_once("stream_retry_cleanup")
                            # Also rebuild the primary client to purge
                            # any dead connections from the pool.
-                            if agent.api_mode == "anthropic_messages":
-                                try:
-                                    agent._anthropic_client.close()
-                                    agent._rebuild_anthropic_client()
-                                except Exception:
-                                    pass
-                            else:
-                                try:
-                                    agent._replace_primary_openai_client(
-                                        reason="stream_retry_pool_cleanup"
-                                    )
-                                except Exception:
-                                    pass
+                            try:
+                                agent._replace_primary_openai_client(
+                                    reason="stream_retry_pool_cleanup"
+                                )
+                            except Exception:
+                                pass
                            continue
                        # Retries exhausted. Log the final failure with
                        # full diagnostic detail (chain, headers,
@@ -2648,17 +2561,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
            _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0)
        else:
            _stream_stale_timeout = _stream_stale_timeout_base
-        # Reasoning-model floor: known reasoning models (Nemotron 3 Ultra,
-        # OpenAI o1/o3, Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ,
-        # xAI Grok reasoning, etc.) routinely exceed the default 180s chat-
-        # model threshold during their thinking phase.  The cloud gateway
-        # upstream kills the socket first, surfacing as BrokenPipeError.
-        # Raises the floor only — never overrides explicit user config
-        # (handled by get_provider_stale_timeout above).
-        from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
-        _reasoning_floor = get_reasoning_stale_timeout_floor(api_kwargs.get("model"))
-        if _reasoning_floor is not None:
-            _stream_stale_timeout = max(_stream_stale_timeout, _reasoning_floor)

    t = threading.Thread(target=_call, daemon=True)
    t.start()
@@ -2707,17 +2609,10 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                pass
            # Rebuild the primary client too — its connection pool
            # may hold dead sockets from the same provider outage.
-            if agent.api_mode == "anthropic_messages":
-                try:
-                    agent._anthropic_client.close()
-                    agent._rebuild_anthropic_client()
-                except Exception:
-                    pass
-            else:
-                try:
-                    agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
-                except Exception:
-                    pass
+            try:
+                agent._replace_primary_openai_client(reason="stale_stream_pool_cleanup")
+            except Exception:
+                pass
            # Reset the timer so we don't kill repeatedly while
            # the inner thread processes the closure.
            last_chunk_time["t"] = time.time()
@@ -2793,30 +2688,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                role="assistant", content=_partial_text, tool_calls=None,
                reasoning_content=None,
            )
-            # Detect provider output-layer content filtering (e.g. MiniMax
-            # "output new_sensitive (1027)", Azure/OpenAI content_filter,
-            # Anthropic safety refusal).  The raw error is about to be
-            # swallowed into a finish_reason=length stub, so classify it HERE
-            # while we still have it and stamp the stub.  Retrying such a
-            # content-deterministic filter on the same primary just re-hits
-            # the filter — the conversation loop reads this tag and activates
-            # the fallback chain instead of burning continuation retries.
-            # error_classifier is the single source of truth for "what counts
-            # as a content filter" (#32421).
-            _content_filter_terminated = False
-            try:
-                from agent.error_classifier import classify_api_error, FailoverReason
-                _cls = classify_api_error(
-                    result["error"],
-                    provider=str(getattr(agent, "provider", "") or ""),
-                    model=str(getattr(agent, "model", "") or ""),
-                )
-                _content_filter_terminated = (
-                    _cls.reason == FailoverReason.content_policy_blocked
-                )
-            except Exception:
-                _content_filter_terminated = False
-            _stub = SimpleNamespace(
+            return SimpleNamespace(
                id=PARTIAL_STREAM_STUB_ID,
                model=getattr(agent, "model", "unknown"),
                choices=[SimpleNamespace(
@@ -2825,9 +2697,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                usage=None,
                _dropped_tool_names=_partial_names or None,
            )
-            if _content_filter_terminated:
-                _stub._content_filter_terminated = True
-            return _stub
        raise result["error"]
    return result["response"]

--- a/agent/coding_context.py
+++ b/agent/coding_context.py
@@ -60,8 +60,6 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional

-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
-
 logger = logging.getLogger("hermes.coding_context")

 CODING_TOOLSET = "coding"
@@ -85,59 +83,6 @@ _PROJECT_MARKERS = (
 # Agent-instruction files surfaced separately from manifests in the snapshot.
 _CONTEXT_FILES = ("AGENTS.md", "CLAUDE.md", ".cursorrules")

-# Source-file extensions that make a git repo a *code* workspace even with no
-# manifest. Without this, `git init` on a notes/writing/research folder (a huge
-# non-coding use case) would flip the whole session into the coding posture just
-# for having a `.git`. A manifest still wins on its own (see `_PROJECT_MARKERS`).
-_CODE_EXTENSIONS = frozenset({
-    ".py", ".pyi", ".ipynb", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
-    ".go", ".rs", ".java", ".kt", ".kts", ".scala", ".rb", ".php", ".c", ".h",
-    ".cc", ".cpp", ".hpp", ".cs", ".swift", ".m", ".mm", ".dart", ".ex", ".exs",
-    ".lua", ".sh", ".bash", ".zsh", ".sql", ".vue", ".svelte", ".r", ".jl",
-    ".hs", ".clj", ".erl", ".pl",
-})
-
-# Dirs never worth scanning for the code check (deps/build/vcs/venv noise).
-_CODE_SCAN_SKIP_DIRS = frozenset({
-    ".git", "node_modules", "venv", ".venv", "__pycache__", "dist", "build",
-    "target", ".next", ".turbo", "vendor",
-})
-
-# Bounded sweep: a code workspace reveals itself in the first handful of entries.
-_CODE_SCAN_MAX_ENTRIES = 500
-
-
-def _has_code_files(root: Path) -> bool:
-    """Cheap, bounded check for source files in a repo's top two levels.
-
-    Lets a git repo of loose scripts (no manifest) still read as a code
-    workspace while a bare notes/writing repo does not. Scans the root and its
-    immediate subdirectories only, capped at ``_CODE_SCAN_MAX_ENTRIES`` stats —
-    a handful of readdirs at session start, not a full walk.
-    """
-    seen = 0
-    stack = [(root, True)]
-    while stack:
-        directory, is_root = stack.pop()
-        try:
-            with os.scandir(directory) as entries:
-                for entry in entries:
-                    seen += 1
-                    if seen > _CODE_SCAN_MAX_ENTRIES:
-                        return False
-                    name = entry.name
-                    try:
-                        if entry.is_file():
-                            if os.path.splitext(name)[1].lower() in _CODE_EXTENSIONS:
-                                return True
-                        elif is_root and entry.is_dir() and name not in _CODE_SCAN_SKIP_DIRS and not name.startswith("."):
-                            stack.append((Path(entry.path), False))
-                    except OSError:
-                        continue
-        except OSError:
-            continue
-    return False
-
 # Lockfile → package manager, checked in priority order.
 _PY_LOCKFILES = (("uv.lock", "uv"), ("poetry.lock", "poetry"), ("Pipfile.lock", "pipenv"))
 _JS_LOCKFILES = (
@@ -423,16 +368,10 @@ def _detect_profile_name(mode: str, platform: str, cwd_str: str) -> str:
    if platform and platform.strip().lower() not in INTERACTIVE_CODING_PLATFORMS:
        return GENERAL_PROFILE.name
    cwd = Path(cwd_str)
-    # A recognized project root (manifest / AGENTS.md / .cursorrules) is a code
-    # workspace on its own — cheap stat checks, no scan.
-    if _marker_root(cwd) is not None:
-        return CODING_PROFILE.name
    git_root = _git_root(cwd)
    if git_root is not None and git_root == _home():
        git_root = None  # dotfiles repo at $HOME — not a code workspace
-    # A bare git repo only counts when it actually holds code, so `git init` on a
-    # notes/writing/research folder stays in the general posture.
-    if git_root is not None and _has_code_files(git_root):
+    if git_root is not None or _marker_root(cwd) is not None:
        return CODING_PROFILE.name
    return GENERAL_PROFILE.name

@@ -649,14 +588,12 @@ def _enabled_mcp_servers(config: Optional[dict[str, Any]]) -> list[str]:


 def _git(cwd: Path, *args: str) -> str:
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        out = subprocess.run(
            ["git", "-C", str(cwd), *args],
            capture_output=True,
            text=True,
            timeout=_GIT_TIMEOUT,
-            **_popen_kwargs,
        )
    except (OSError, subprocess.SubprocessError):
        return ""
@@ -698,32 +635,25 @@ def _read_small(path: Path) -> str:
        return ""


-@dataclass(frozen=True)
-class ProjectFacts:
-    """Structured project facts — the model's verify loop, detected once.
+def _project_facts(root: Path) -> list[str]:
+    """Detected project facts for the workspace snapshot.

-    The same data that feeds the workspace snapshot, exposed structurally so
-    non-prompt consumers (e.g. the desktop verify UI) read it instead of
-    re-detecting and drifting from the prompt.
+    The point is to hand the model its *verify loop* up front — which manifest,
+    which package manager, and the exact test/lint/build commands — instead of
+    making it rediscover them every session. Cheap: stat calls plus reads of a
+    couple of small files; built once at prompt-build time (cache-safe).
    """
+    facts: list[str] = []

-    manifests: list[str]
-    package_managers: list[str]
-    verify_commands: list[str]
-    context_files: list[str]
-
-
-def detect_project_facts(root: Path) -> ProjectFacts:
-    """Detect manifests, package manager(s), verify commands, and context files.
-
-    Cheap: stat calls plus reads of a couple of small files. The single source
-    of truth for both the prompt snapshot (:func:`_project_facts`) and the
-    gateway's ``project.facts`` — so the UI never re-sniffs verify commands.
-    """
    manifests = [m for m in _PROJECT_MARKERS if m not in _CONTEXT_FILES and (root / m).is_file()]
-    package_managers = list(
-        dict.fromkeys(pm for lock, pm in (*_PY_LOCKFILES, *_JS_LOCKFILES) if (root / lock).is_file())
-    )
+    package_managers = [
+        pm for lock, pm in (*_PY_LOCKFILES, *_JS_LOCKFILES) if (root / lock).is_file()
+    ]
+    if manifests:
+        line = f"- Project: {', '.join(manifests[:6])}"
+        if package_managers:
+            line += f" ({'/'.join(dict.fromkeys(package_managers))})"
+        facts.append(line)

    verify: list[str] = []
    if (root / "scripts" / "run_tests.sh").is_file():
@@ -743,61 +673,17 @@ def detect_project_facts(root: Path) -> ProjectFacts:
            f"make {name}" for name in _VERIFY_TARGETS
            if re.search(rf"^{re.escape(name)}\s*:", makefile, re.MULTILINE)
        )
+    if verify:
+        deduped = list(dict.fromkeys(verify))[:_MAX_VERIFY_COMMANDS]
+        facts.append(f"- Verify: {'; '.join(deduped)}")

-    return ProjectFacts(
-        manifests=manifests,
-        package_managers=package_managers,
-        verify_commands=list(dict.fromkeys(verify))[:_MAX_VERIFY_COMMANDS],
-        context_files=[c for c in _CONTEXT_FILES if (root / c).is_file()],
-    )
-
-
-def _project_facts(root: Path) -> list[str]:
-    """Render :func:`detect_project_facts` as workspace-snapshot lines.
-
-    Hands the model its *verify loop* up front — which manifest, which package
-    manager, and the exact test/lint/build commands — instead of making it
-    rediscover them every session. Built once at prompt-build time; the string
-    output must stay byte-stable to preserve the prompt cache.
-    """
-    f = detect_project_facts(root)
-    facts: list[str] = []
-
-    if f.manifests:
-        line = f"- Project: {', '.join(f.manifests[:6])}"
-        if f.package_managers:
-            line += f" ({'/'.join(f.package_managers)})"
-        facts.append(line)
-    if f.verify_commands:
-        facts.append(f"- Verify: {'; '.join(f.verify_commands)}")
-    if f.context_files:
-        facts.append(f"- Context files: {', '.join(f.context_files)}")
+    context_files = [c for c in _CONTEXT_FILES if (root / c).is_file()]
+    if context_files:
+        facts.append(f"- Context files: {', '.join(context_files)}")

    return facts


-def project_facts_for(cwd: Optional[str | Path] = None) -> Optional[dict[str, Any]]:
-    """Structured project facts for ``cwd`` — ``None`` outside a workspace.
-
-    Same detection the system-prompt snapshot uses (git root, else marker root),
-    exposed for non-prompt consumers (the desktop verify UI) so they never
-    re-derive "are we coding?" or duplicate the verify-command sniffing.
-    """
-    resolved = _resolve_cwd(cwd)
-    root = _git_root(resolved) or _marker_root(resolved)
-    if root is None:
-        return None
-
-    f = detect_project_facts(root)
-    return {
-        "root": str(root),
-        "manifests": f.manifests,
-        "packageManagers": f.package_managers,
-        "verifyCommands": f.verify_commands,
-        "contextFiles": f.context_files,
-    }
-
-
 def build_coding_workspace_block(cwd: Optional[str | Path] = None) -> str:
    """Workspace snapshot for the system prompt (empty outside a workspace).

--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -890,15 +890,7 @@ class ContextCompressor(ContextEngine):
        # This is independent of the abort_on_summary_failure config flag:
        # rotating on a broken credential is never the right behavior.
        self._last_summary_auth_failure: bool = False
-        # Set when summary generation ultimately fails due to a transient
-        # network/connection error (httpx/httpcore connection drop, premature
-        # stream close, etc.) — distinct from auth failures but treated the
-        # same way by compress(): ABORT and preserve the session unchanged
-        # rather than destroy the middle window for a deterministic
-        # "summary unavailable" marker. Retrying once the network recovers is
-        # strictly better than discarding context for a transient blip
-        # (#29559, #25585). Independent of abort_on_summary_failure.
-        self._last_summary_network_failure: bool = False
+        # When a user-configured summary model fails and we recover by
        # retrying on the main model, record the failure so gateway /
        # CLI callers can still warn the user even though compression
        # succeeded.  Silent recovery would hide the broken config.
@@ -1695,7 +1687,6 @@ This compaction should PRIORITISE preserving all information related to the focu
            self._summary_model_fallen_back = False
            self._last_summary_error = None
            self._last_summary_auth_failure = False
-            self._last_summary_network_failure = False
            return self._with_summary_prefix(summary)
        except Exception as e:
            # ``call_llm`` raises ``RuntimeError`` for two very different cases:
@@ -1828,15 +1819,6 @@ This compaction should PRIORITISE preserving all information related to the focu
            if len(err_text) > 220:
                err_text = err_text[:217].rstrip() + "..."
            self._last_summary_error = err_text
-            # A terminal connection/network failure (we reach this branch only
-            # after any main-model fallback has already been tried or is
-            # unavailable). Flag it so compress() ABORTS and preserves the
-            # session unchanged instead of destroying the middle window for a
-            # placeholder marker — retrying once the network recovers is
-            # strictly better than dropping context (#29559, #25585). Mirrors
-            # the auth-failure carve-out; independent of abort_on_summary_failure.
-            if _is_streaming_closed:
-                self._last_summary_network_failure = True
            logger.warning(
                "Failed to generate context summary: %s. "
                "Further summary attempts paused for %d seconds.",
@@ -2400,7 +2382,6 @@ This compaction should PRIORITISE preserving all information related to the focu
        self._last_aux_model_failure_model = None
        self._last_compress_aborted = False
        self._last_summary_auth_failure = False
-        self._last_summary_network_failure = False

        # Manual /compress (force=True) bypasses the failure cooldown so the
        # user can retry immediately after an auto-compress abort.  Without
@@ -2517,21 +2498,15 @@ This compaction should PRIORITISE preserving all information related to the focu
        #           surface a warning.
        # Default is False (historical behavior).
        #
-        # EXCEPTION — auth AND transient network failures always abort. A
-        # 401/403 from the summary call means the credential or endpoint is
-        # broken (invalid/blocked key, or a token pointed at the wrong
-        # inference host). A connection/stream-close error means the network
-        # blipped at the compaction moment (#29559). In BOTH cases rotating into
+        # EXCEPTION — auth failures always abort. A 401/403 from the summary
+        # call means the credential or endpoint is broken (invalid/blocked
+        # key, or a token pointed at the wrong inference host). Rotating into
        # a child session with a placeholder summary on a broken credential
        # strands the user on a degraded session for zero benefit — every
        # subsequent call fails the same way. So when the failure was an auth
        # error we abort regardless of abort_on_summary_failure, preserving
        # the conversation unchanged until the credential is fixed.
-        if not summary and (
-            self.abort_on_summary_failure
-            or self._last_summary_auth_failure
-            or self._last_summary_network_failure
-        ):
+        if not summary and (self.abort_on_summary_failure or self._last_summary_auth_failure):
            n_skipped = compress_end - compress_start
            self._last_summary_dropped_count = 0  # nothing actually dropped
            self._last_summary_fallback_used = False
@@ -2546,15 +2521,6 @@ This compaction should PRIORITISE preserving all information related to the focu
                        "with /compress or start fresh with /new.",
                        n_skipped,
                    )
-                elif self._last_summary_network_failure:
-                    logger.warning(
-                        "Summary generation failed with a network/connection "
-                        "error — aborting compression. %d message(s) preserved "
-                        "unchanged; the session was NOT rotated. This is "
-                        "transient: retry with /compress once connectivity "
-                        "recovers, or continue the conversation as-is.",
-                        n_skipped,
-                    )
                else:
                    logger.warning(
                        "Summary generation failed — aborting compression "
--- a/agent/context_references.py
+++ b/agent/context_references.py
@@ -12,7 +12,6 @@ from pathlib import Path
 from typing import Awaitable, Callable

 from agent.model_metadata import estimate_tokens_rough
-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags

 _QUOTED_REFERENCE_VALUE = r'(?:`[^`\n]+`|"[^"\n]+"|\'[^\'\n]+\')'
 REFERENCE_PATTERN = re.compile(
@@ -291,7 +290,6 @@ def _expand_git_reference(
    args: list[str],
    label: str,
 ) -> tuple[str | None, str | None]:
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["git", *args],
@@ -300,7 +298,6 @@ def _expand_git_reference(
            text=True,
            timeout=30,
            stdin=subprocess.DEVNULL,
-            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"{ref.raw}: git command timed out (30s)", None
@@ -486,7 +483,6 @@ def _iter_visible_entries(path: Path, cwd: Path, limit: int) -> list[Path]:


 def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        result = subprocess.run(
            ["rg", "--files", str(path.relative_to(cwd))],
@@ -495,7 +491,6 @@ def _rg_files(path: Path, cwd: Path, limit: int) -> list[Path] | None:
            text=True,
            timeout=10,
            stdin=subprocess.DEVNULL,
-            **_popen_kwargs,
        )
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return None
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -90,7 +90,6 @@ def check_compression_model_feasibility(agent: Any) -> None:
    try:
        from agent.auxiliary_client import (
            _resolve_task_provider_model,
-            _try_configured_fallback_for_unavailable_client,
            get_text_auxiliary_client,
        )
        from agent.model_metadata import (
@@ -98,6 +97,10 @@ def check_compression_model_feasibility(agent: Any) -> None:
            get_model_context_length,
        )

+        client, aux_model = get_text_auxiliary_client(
+            "compression",
+            main_runtime=agent._current_main_runtime(),
+        )
        # Best-effort aux provider label for the warning message. The
        # configured provider may be "auto", in which case we fall back
        # to the client's base_url hostname so the user can still tell
@@ -106,19 +109,6 @@ def check_compression_model_feasibility(agent: Any) -> None:
            _aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
        except Exception:
            _aux_cfg_provider = ""
-        client, aux_model = get_text_auxiliary_client(
-            "compression",
-            main_runtime=agent._current_main_runtime(),
-        )
-        if client is None or not aux_model:
-            fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
-                "compression",
-                _aux_cfg_provider,
-            )
-            if fb_client is not None and fb_model:
-                client, aux_model = fb_client, fb_model
-                if "(" in fb_label and fb_label.endswith(")"):
-                    _aux_cfg_provider = fb_label.rsplit("(", 1)[1][:-1]
        if client is None or not aux_model:
            if _aux_cfg_provider and _aux_cfg_provider != "auto":
                msg = (
@@ -288,29 +278,6 @@ def replay_compression_warning(agent: Any) -> None:
            pass


-def conversation_history_after_compression(agent: Any, messages: list) -> Optional[list]:
-    """Return the correct flush baseline after a compression boundary.
-
-    Legacy compression rotates to a fresh child session. That child has not
-    seen the compacted transcript through the normal same-turn flush path yet,
-    so callers must clear ``conversation_history`` to ``None`` and let the next
-    persistence call write the whole compacted list.
-
-    In-place compaction is different: ``archive_and_compact()`` has already
-    soft-archived the previous active rows and inserted ``messages`` as the new
-    active live transcript under the same session id. If the same agent turn
-    continues with ``conversation_history=None``, the identity-based flush path
-    treats those already-persisted compacted dicts as new and appends them a
-    second time, doubling the active context and retriggering compression.
-
-    A shallow copy is intentional: it captures the current compacted dict
-    identities as history while allowing later same-turn appends to remain new.
-    """
-    if bool(getattr(agent, "_last_compaction_in_place", False)):
-        return list(messages)
-    return None
-
-
 def compress_context(
    agent: Any,
    messages: list,
@@ -838,11 +805,10 @@ def try_shrink_image_parts_in_messages(
    Pillow couldn't help (caller should surface the original error).

    Strategy: look for ``image_url`` / ``input_image`` parts carrying a
-    ``data:image/...;base64,...`` payload, plus Anthropic-native
-    ``{"type": "image", "source": {"type": "base64", ...}}`` blocks.
-    For each one whose encoded size exceeds 4 MB (a safe target that slides
-    under Anthropic's 5 MB ceiling with header overhead) or whose longest side
-    exceeds ``max_dimension``, write the base64 to a tempfile, call
+    ``data:image/...;base64,...`` payload.  For each one whose encoded
+    size exceeds 4 MB (a safe target that slides under Anthropic's 5 MB
+    ceiling with header overhead) or whose longest side exceeds
+    ``max_dimension``, write the base64 to a tempfile, call
    ``vision_tools._resize_image_for_vision`` to produce a smaller data
    URL, and substitute it in place.

@@ -998,28 +964,6 @@ def try_shrink_image_parts_in_messages(
            logger.warning("image-shrink recovery: re-encode failed — %s", exc)
            return None, triggered_by is not None

-    def _source_to_data_url(source: Any) -> Optional[str]:
-        if not isinstance(source, dict) or source.get("type") != "base64":
-            return None
-        data = source.get("data")
-        if not isinstance(data, str) or not data:
-            return None
-        media_type = str(source.get("media_type") or "image/jpeg").strip()
-        if not media_type.startswith("image/"):
-            media_type = "image/jpeg"
-        return f"data:{media_type};base64,{data}"
-
-    def _write_data_url_to_source(source: dict, data_url: str) -> None:
-        header, _, data = data_url.partition(",")
-        media_type = "image/jpeg"
-        if header.startswith("data:"):
-            candidate = header[len("data:"):].split(";", 1)[0].strip()
-            if candidate.startswith("image/"):
-                media_type = candidate
-        source["type"] = "base64"
-        source["media_type"] = media_type
-        source["data"] = data
-
    for msg in api_messages:
        if not isinstance(msg, dict):
            continue
@@ -1030,16 +974,6 @@ def try_shrink_image_parts_in_messages(
            if not isinstance(part, dict):
                continue
            ptype = part.get("type")
-            if ptype == "image":
-                source = part.get("source")
-                url = _source_to_data_url(source)
-                resized, unshrinkable = _shrink_data_url(url or "")
-                if resized and isinstance(source, dict):
-                    _write_data_url_to_source(source, resized)
-                    changed_count += 1
-                elif unshrinkable:
-                    unshrinkable_oversized += 1
-                continue
            if ptype not in {"image_url", "input_image"}:
                continue
            image_value = part.get("image_url")
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -28,7 +28,6 @@ import uuid
 from typing import Any, Dict, List, Optional

 from agent.codex_responses_adapter import _summarize_user_message_for_log
-from agent.conversation_compression import conversation_history_after_compression
 from agent.display import KawaiiSpinner
 from agent.error_classifier import FailoverReason, classify_api_error
 from agent.iteration_budget import IterationBudget
@@ -36,7 +35,6 @@ from agent.turn_context import build_turn_context
 from agent.turn_retry_state import TurnRetryState
 from agent.memory_manager import build_memory_context_block
 from agent.message_sanitization import (
-    close_interrupted_tool_sequence,
    _repair_tool_call_arguments,
    _sanitize_messages_non_ascii,
    _sanitize_messages_surrogates,
@@ -57,7 +55,7 @@ from agent.model_metadata import (
 )
 from agent.process_bootstrap import _install_safe_stdio
 from agent.prompt_caching import apply_anthropic_cache_control
-from agent.retry_utils import adaptive_rate_limit_backoff, jittered_backoff
+from agent.retry_utils import jittered_backoff
 from agent.trajectory import has_incomplete_scratchpad
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
 from hermes_constants import PARTIAL_STREAM_STUB_ID
@@ -503,7 +501,6 @@ def run_conversation(
    stream_callback: Optional[callable] = None,
    persist_user_message: Optional[str] = None,
    persist_user_timestamp: Optional[float] = None,
-    moa_config: Optional[dict[str, Any]] = None,
 ) -> Dict[str, Any]:
    """
    Run a complete conversation with tool calling until completion.
@@ -526,19 +523,6 @@ def run_conversation(
    Returns:
        Dict: Complete conversation result with final response and message history
    """
-    if moa_config is None:
-        try:
-            from hermes_cli.moa_config import decode_moa_turn
-
-            _decoded_message, _decoded_moa_config = decode_moa_turn(user_message)
-            if _decoded_moa_config is not None:
-                user_message = _decoded_message
-                moa_config = _decoded_moa_config
-                if persist_user_message is None:
-                    persist_user_message = _decoded_message
-        except Exception:
-            pass
-
    # ── Per-turn setup (the prologue) ──
    # All once-per-turn setup — stdio guarding, retry-counter resets, user
    # message sanitization, todo/nudge hydration, system-prompt restore-or-
@@ -588,13 +572,6 @@ def run_conversation(
    compression_attempts = 0
    _turn_exit_reason = "unknown"  # Diagnostic: why the loop ended

-    # Per-turn tally of consecutive successful credential-pool token refreshes,
-    # keyed by (provider, pool-entry-id). A persistent upstream 401 lets
-    # ``try_refresh_current()`` "succeed" forever on a single-entry OAuth pool,
-    # so this tally caps same-entry refreshes and lets the fallback chain take
-    # over instead of spinning. Reset here so each turn starts fresh. See #26080.
-    agent._auth_pool_refresh_counts = {}
-
    # Optional opt-in runtime: if api_mode == codex_app_server, hand the
    # turn to the codex app-server subprocess (terminal/file ops/patching
    # all run inside Codex). Default Hermes path is bypassed entirely.
@@ -824,28 +801,6 @@ def run_conversation(
        if effective_system:
            api_messages = [{"role": "system", "content": effective_system}] + api_messages

-        if moa_config:
-            try:
-                from agent.moa_loop import aggregate_moa_context
-
-                _moa_context = aggregate_moa_context(
-                    user_prompt=original_user_message if isinstance(original_user_message, str) else str(original_user_message),
-                    api_messages=api_messages,
-                    reference_models=moa_config.get("reference_models") or [],
-                    aggregator=moa_config.get("aggregator") or {},
-                    temperature=float(moa_config.get("reference_temperature", 0.6) or 0.6),
-                    aggregator_temperature=float(moa_config.get("aggregator_temperature", 0.4) or 0.4),
-                )
-                if _moa_context:
-                    for _msg in reversed(api_messages):
-                        if _msg.get("role") == "user":
-                            _base = _msg.get("content", "")
-                            if isinstance(_base, str):
-                                _msg["content"] = _base + "\n\n" + _moa_context
-                            break
-            except Exception as _moa_exc:
-                logger.warning("MoA context aggregation failed: %s", _moa_exc)
-
        # Inject ephemeral prefill messages right after the system prompt
        # but before conversation history. Same API-call-time-only pattern.
        if agent.prefill_messages:
@@ -1167,7 +1122,7 @@ def run_conversation(
                # stream.  Mirror the ACP exclusion used for Responses
                # API upgrade (lines ~1083-1085).
                elif (
-                    agent.provider in {"copilot-acp", "moa"}
+                    agent.provider == "copilot-acp"
                    or str(agent.base_url or "").lower().startswith("acp://copilot")
                    or str(agent.base_url or "").lower().startswith("acp+tcp://")
                ):
@@ -1441,12 +1396,10 @@ def run_conversation(
                    while time.time() < sleep_end:
                        if agent._interrupt_requested:
                            agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
-                            _interrupt_text = f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries})."
-                            close_interrupted_tool_sequence(messages, _interrupt_text)
                            agent._persist_session(messages, conversation_history)
                            agent.clear_interrupt()
                            return {
-                                "final_response": _interrupt_text,
+                                "final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).",
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
@@ -1699,56 +1652,6 @@ def run_conversation(

                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
-                        # ── Content-filter stream stall → fallback (#32421) ──
-                        # When the provider's output-layer safety filter (e.g.
-                        # MiniMax "output new_sensitive (1027)", Azure
-                        # content_filter) kills the stream mid-delivery, the
-                        # raw error was classified at the swallow point and the
-                        # stub tagged ``_content_filter_terminated``.  This
-                        # filter is content-deterministic — continuation
-                        # retries against the SAME primary just re-hit it and
-                        # burn paid attempts (the loop used to give up with
-                        # "Response remained truncated after 3 continuation
-                        # attempts" and never consult the fallback chain).
-                        # Escalate to the configured fallback BEFORE retrying.
-                        _cf_terminated = getattr(
-                            response, "_content_filter_terminated", False
-                        )
-                        if (
-                            _cf_terminated
-                            and agent._fallback_index < len(agent._fallback_chain)
-                        ):
-                            agent._vprint(
-                                f"{agent.log_prefix}🛡️  Content filter terminated "
-                                f"stream — activating fallback provider...",
-                                force=True,
-                            )
-                            agent._emit_status(
-                                "Content filter terminated stream; switching to fallback..."
-                            )
-                            if agent._try_activate_fallback():
-                                # Roll the partial content (if any was already
-                                # appended in a prior continuation pass) back to
-                                # the last clean turn so the fallback provider
-                                # gets a coherent continuation point.
-                                if truncated_response_parts:
-                                    messages = agent._get_messages_up_to_last_assistant(messages)
-                                agent._session_messages = messages
-                                length_continue_retries = 0
-                                truncated_response_parts = []
-                                retry_count = 0
-                                compression_attempts = 0
-                                _retry.primary_recovery_attempted = False
-                                _retry.restart_with_rebuilt_messages = True
-                                break
-                            # No fallback available — fall through to normal
-                            # continuation (best-effort, may loop).
-                            agent._vprint(
-                                f"{agent.log_prefix}⚠️  No fallback provider "
-                                f"configured — retrying with same provider "
-                                f"(may re-hit filter)...",
-                                force=True,
-                            )
                        if assistant_message is not None and not _trunc_has_tool_calls:
                            length_continue_retries += 1
                            interim_msg = agent._build_assistant_message(assistant_message, finish_reason)
@@ -2068,21 +1971,9 @@ def run_conversation(
                    agent.thinking_callback("")
                api_elapsed = time.time() - api_start_time
                agent._vprint(f"{agent.log_prefix}⚡ Interrupted during API call.", force=True)
-                interrupted = True
-                # Preserve any assistant text already streamed to the user
-                # before the stop landed. Dropping it leaves history with no
-                # record of the half-finished reply on screen, so the next turn
-                # the model "forgets" what it just said — exactly what users hit
-                # when they stop to redirect mid-response.
-                _partial = agent._strip_think_blocks(
-                    getattr(agent, "_current_streamed_assistant_text", "") or ""
-                ).strip()
-                if _partial:
-                    messages.append({"role": "assistant", "content": _partial})
-                    final_response = _partial
-                else:
-                    final_response = f"{INTERRUPT_WAITING_FOR_MODEL_PREFIX}{api_elapsed:.1f}s elapsed)."
                agent._persist_session(messages, conversation_history)
+                interrupted = True
+                final_response = f"{INTERRUPT_WAITING_FOR_MODEL_PREFIX}{api_elapsed:.1f}s elapsed)."
                break

            except Exception as api_error:
@@ -2316,15 +2207,6 @@ def run_conversation(
                    # "unknown variant `image_url`, expected `text`".
                    "unknown variant `image_url`, expected `text`",
                    "unknown variant image_url, expected text",
-                    # OpenRouter routes a request to upstream endpoints and,
-                    # when none of the candidate endpoints for the model accept
-                    # image input, returns HTTP 404 "No endpoints found that
-                    # support image input". Without this phrase the agent never
-                    # strips the images, the retry loop re-sends the same
-                    # rejected request until exhaustion, and the gateway leaves
-                    # every subsequent message queued behind the stuck turn —
-                    # the P1 in issue #21160. The 404 passes the 4xx gate below.
-                    "no endpoints found that support image input",
                )
                _err_lower = _err_body.lower()
                _looks_like_image_rejection = any(
@@ -2781,12 +2663,10 @@ def run_conversation(
                # Check for interrupt before deciding to retry
                if agent._interrupt_requested:
                    agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
-                    _interrupt_text = f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))})."
-                    close_interrupted_tool_sequence(messages, _interrupt_text)
                    agent._persist_session(messages, conversation_history)
                    agent.clear_interrupt()
                    return {
-                        "final_response": _interrupt_text,
+                        "final_response": f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))}).",
                        "messages": messages,
                        "api_calls": api_call_count,
                        "completed": False,
@@ -2896,9 +2776,10 @@ def run_conversation(
                            approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )
-                        conversation_history = conversation_history_after_compression(
-                            agent, messages
-                        )
+                        # Compression created a new session — clear history
+                        # so _flush_messages_to_session_db writes compressed
+                        # messages to the new session, not skipping them.
+                        conversation_history = None
                        if len(messages) < original_len or old_ctx > _reduced_ctx:
                            agent._buffer_status(
                                f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
@@ -2910,25 +2791,15 @@ def run_conversation(
                    # Fall through to normal error handling if compression
                    # is exhausted or didn't help.

-                # Eager fallback for rate-limit errors (429 or quota exhaustion)
-                # and transport errors (connection failure / timeout / provider
-                # overloaded).  Rate limits and billing: switch immediately —
-                # the primary provider won't recover within the retry window.
-                # Transport errors: allow 1 retry first (transient hiccups
-                # recover), then fall back if the provider is truly unreachable.
+                # Eager fallback for rate-limit errors (429 or quota exhaustion).
+                # When a fallback model is configured, switch immediately instead
+                # of burning through retries with exponential backoff -- the
+                # primary provider won't recover within the retry window.
                is_rate_limited = classified.reason in {
                    FailoverReason.rate_limit,
                    FailoverReason.billing,
                }
-                _is_transport_failure = classified.reason in {
-                    FailoverReason.timeout,
-                    FailoverReason.overloaded,
-                }
-                _should_fallback = (
-                    is_rate_limited
-                    or (_is_transport_failure and retry_count >= 2)
-                )
-                if _should_fallback and agent._fallback_index < len(agent._fallback_chain):
+                if is_rate_limited and agent._fallback_index < len(agent._fallback_chain):
                    # Don't eagerly fallback if credential pool rotation may
                    # still recover.  See _pool_may_recover_from_rate_limit
                    # for the single-credential-pool and CloudCode-quota
@@ -2943,10 +2814,6 @@ def run_conversation(
                            agent._buffer_status(
                                "⚠️ Billing or credits exhausted — switching to fallback provider..."
                            )
-                        elif _is_transport_failure:
-                            agent._buffer_status(
-                                "⚠️ Provider unreachable — switching to fallback provider..."
-                            )
                        else:
                            agent._buffer_status("⚠️ Rate limited — switching to fallback provider...")
                        if agent._try_activate_fallback(reason=classified.reason):
@@ -3121,9 +2988,10 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    conversation_history = conversation_history_after_compression(
-                        agent, messages
-                    )
+                    # Compression created a new session — clear history
+                    # so _flush_messages_to_session_db writes compressed
+                    # messages to the new session, not skipping them.
+                    conversation_history = None

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3287,9 +3155,10 @@ def run_conversation(
                        messages, system_message, approx_tokens=approx_tokens,
                        task_id=effective_task_id,
                    )
-                    conversation_history = conversation_history_after_compression(
-                        agent, messages
-                    )
+                    # Compression created a new session — clear history
+                    # so _flush_messages_to_session_db writes compressed
+                    # messages to the new session, not skipping them.
+                    conversation_history = None

                    # Re-estimate tokens after compression.  Same-message-count
                    # compression (tool-result pruning, in-place summarization)
@@ -3551,13 +3420,6 @@ def run_conversation(
                    ):
                        _retry.primary_recovery_attempted = True
                        retry_count = 0
-                        # Primary transport recovery starts a fresh attempt
-                        # cycle. Re-open fallback state so a follow-on 429 can
-                        # still activate fallback_providers after stale
-                        # pre-recovery fallback/credential-pool bookkeeping.
-                        _retry.has_retried_429 = False
-                        agent._fallback_index = 0
-                        agent._fallback_activated = False
                        continue
                    # Try fallback before giving up entirely
                    if agent._has_pending_fallback():
@@ -3623,65 +3485,6 @@ def run_conversation(
                            force=True,
                        )

-                    # Detect thinking-timeout pattern: a known reasoning model
-                    # hit a transport-layer error before the first content
-                    # token arrived.  Distinct from _is_stream_drop above
-                    # (which fires for large file-write stream drops) and
-                    # from any classifier reason that's not a transport
-                    # timeout.  Reuses the reasoning-model allowlist from
-                    # agent/reasoning_timeouts.py (Fixes #52217) so the
-                    # trigger is consistent with what the per-model
-                    # stale-timeout floor covers.  After the classifier
-                    # override at agent/error_classifier.py:720-738 (this
-                    # PR), transport disconnects on reasoning models route
-                    # to FailoverReason.timeout rather than
-                    # context_overflow, so this branch actually fires.
-                    # Detection and message text live in
-                    # agent.thinking_timeout_guidance so they're
-                    # unit-testable without driving the full retry loop.
-                    # (Part 2 of Fixes #52310.)
-                    from agent.thinking_timeout_guidance import (
-                        is_thinking_timeout,
-                    )
-                    _is_thinking_timeout = is_thinking_timeout(
-                        classified,
-                        _model,
-                        error_msg,
-                    )
-                    if _is_thinking_timeout:
-                        agent._vprint(
-                            f"{agent.log_prefix}   💡 The model's thinking "
-                            f"phase exceeded the upstream proxy's idle "
-                            f"timeout before the first content token "
-                            f"arrived. This is a known issue with "
-                            f"reasoning models behind cloud gateways "
-                            f"(NVIDIA NIM, OpenAI, Anthropic, DeepSeek).",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      Workarounds in priority order:",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      1. Set "
-                            f"`providers.{_provider}.models.{_model}.stale_timeout_seconds: 900` "
-                            f"in `~/.hermes/config.yaml` to extend the per-call "
-                            f"timeout. (Hermes's built-in floor is 600s for "
-                            f"known reasoning models — if you still see this "
-                            f"after raising, the upstream cap is even shorter.)",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      2. Lower `reasoning_budget` or set "
-                            f"`reasoning_effort: medium` on this model if the provider supports it.",
-                            force=True,
-                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      3. Use a smaller / faster reasoning "
-                            f"model if the task doesn't require deep thinking.",
-                            force=True,
-                        )
-
                    logger.error(
                        "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s",
                        agent.log_prefix, max_retries, _final_summary,
@@ -3698,22 +3501,7 @@ def run_conversation(
                            _final_response += f"\n\n{_billing_guidance}"
                    else:
                        _final_response = f"API call failed after {max_retries} retries: {_final_summary}"
-                    if _is_thinking_timeout:
-                        # Thinking-timeout guidance overrides the generic
-                        # stream-drop guidance — the latter is wrong for
-                        # this case (it suggests splitting large file
-                        # writes, which isn't what happened).  See the
-                        # reasoning-model override at
-                        # agent/error_classifier.py:720-738 and the
-                        # detection block above for context.
-                        from agent.thinking_timeout_guidance import (
-                            build_thinking_timeout_guidance,
-                        )
-                        _final_response += build_thinking_timeout_guidance(
-                            provider=_provider,
-                            model=_model,
-                        )
-                    elif _is_stream_drop:
+                    if _is_stream_drop:
                        _final_response += (
                            "\n\nThe provider's stream connection keeps "
                            "dropping — this often happens when generating "
@@ -3745,47 +3533,20 @@ def run_conversation(
                        _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                        if _ra_raw:
                            try:
-                                # Cap at 10 minutes. Anthropic Tier 1 input-token
-                                # buckets reset in ~171s, so a 120s cap caused us to
-                                # retry before the actual reset window and re-trip the
-                                # limit. 600s covers all realistic provider reset
-                                # windows while still rejecting pathological values. (#26293)
-                                _retry_after = min(float(_ra_raw), 600)
+                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
                            except (TypeError, ValueError):
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
-                _backoff_policy = None
-                if is_rate_limited and not _retry_after:
-                    wait_time, _backoff_policy = adaptive_rate_limit_backoff(
-                        retry_count,
-                        base_url=str(_base),
-                        model=_model,
-                        error=api_error,
-                        default_wait=wait_time,
-                    )
                if is_rate_limited:
-                    _policy_note = ""
-                    if _backoff_policy == "zai_coding_overload_long":
-                        _policy_note = " (Z.AI Coding overload adaptive long backoff)"
-                    elif _backoff_policy == "zai_coding_overload_short":
-                        _policy_note = " (Z.AI Coding overload short retry)"
-                    _rate_limit_status = f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries}){_policy_note}..."
-                    # Normal retries are buffered to avoid noisy transient chatter. Long
-                    # Z.AI Coding waits are different: they can last minutes, so surface
-                    # progress immediately instead of making the TUI look frozen.
-                    if _backoff_policy == "zai_coding_overload_long":
-                        agent._emit_status(_rate_limit_status)
-                    else:
-                        agent._buffer_status(_rate_limit_status)
+                    agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
                else:
                    agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
                logger.warning(
-                    "Retrying API call in %ss (attempt %s/%s) %s policy=%s error=%s",
+                    "Retrying API call in %ss (attempt %s/%s) %s error=%s",
                    wait_time,
                    retry_count,
                    max_retries,
                    agent._client_log_context(),
-                    _backoff_policy or "default",
                    api_error,
                )
                # Sleep in small increments so we can respond to interrupts quickly
@@ -3795,12 +3556,10 @@ def run_conversation(
                while time.time() < sleep_end:
                    if agent._interrupt_requested:
                        agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
-                        _interrupt_text = f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries})."
-                        close_interrupted_tool_sequence(messages, _interrupt_text)
                        agent._persist_session(messages, conversation_history)
                        agent.clear_interrupt()
                        return {
-                            "final_response": _interrupt_text,
+                            "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
@@ -3831,17 +3590,6 @@ def run_conversation(
            _retry.restart_with_compressed_messages = False
            continue

-        if _retry.restart_with_rebuilt_messages:
-            # A content-filter stream stall (#32421) was escalated to the
-            # fallback chain and the partial content rolled back.  Re-issue
-            # the API call against the now-active fallback provider.  Refund
-            # the budget/count for the stalled attempt so the fallback gets a
-            # fair turn.
-            api_call_count -= 1
-            agent.iteration_budget.refund()
-            _retry.restart_with_rebuilt_messages = False
-            continue
-
        if _retry.restart_with_length_continuation:
            # Progressively boost the output token budget on each retry.
            # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
@@ -4302,19 +4050,6 @@ def run_conversation(

                messages.append(assistant_msg)
                agent._emit_interim_assistant_message(assistant_msg)
-                try:
-                    # Persist the assistant tool-call turn before any tool
-                    # side effects run. If a destructive tool restarts or
-                    # terminates Hermes mid-turn, resume logic still sees the
-                    # exact tool-call block that already executed.
-                    agent._flush_messages_to_session_db(messages, conversation_history)
-                except Exception as exc:
-                    logger.warning(
-                        "Incremental tool-call persistence failed before execution "
-                        "(session=%s): %s",
-                        agent.session_id or "none",
-                        exc,
-                    )

                # Close any open streaming display (response box, reasoning
                # box) before tool execution begins.  Intermediate turns may
@@ -4416,9 +4151,10 @@ def run_conversation(
                        approx_tokens=agent.context_compressor.last_prompt_tokens,
                        task_id=effective_task_id,
                    )
-                    conversation_history = conversation_history_after_compression(
-                        agent, messages
-                    )
+                    # Compression created a new session — clear history so
+                    # _flush_messages_to_session_db writes compressed messages
+                    # to the new session (see preflight compression comment).
+                    conversation_history = None
                
                # Save session log incrementally (so progress is visible even if interrupted)
                agent._session_messages = messages
@@ -4460,11 +4196,7 @@ def run_conversation(
                            "as final response"
                        )
                        final_response = _recovered
-                        # Streaming delivered a fragment, not a confirmed
-                        # final preview. Leave response_previewed false so
-                        # gateway fallback delivery can send the recovered
-                        # text plus the abnormal-turn explanation.
-                        agent._response_was_previewed = False
+                        agent._response_was_previewed = True
                        break

                    # If the previous turn already delivered real content alongside
@@ -4709,20 +4441,14 @@ def run_conversation(
                # status from earlier failed attempts in this turn.
                agent._clear_status_buffer()

-                from agent.agent_runtime_helpers import (
-                    intent_ack_continuation_mode,
-                )
-
-                _ack_mode = intent_ack_continuation_mode(agent)
                if (
-                    _ack_mode != "off"
+                    agent.api_mode == "codex_responses"
                    and agent.valid_tool_names
                    and codex_ack_continuations < 2
                    and agent._looks_like_codex_intermediate_ack(
                        user_message=user_message,
                        assistant_content=final_response,
                        messages=messages,
-                        require_workspace=(_ack_mode == "codex_only"),
                    )
                ):
                    codex_ack_continuations += 1
@@ -4753,10 +4479,9 @@ def run_conversation(
                final_msg = agent._build_assistant_message(assistant_message, finish_reason)

                # Pop thinking-only prefill and empty-response retry
-                # scaffolding before appending either a final response or a
-                # verification-stop follow-up. These internal turns are only
-                # for the next API retry and should not become durable
-                # transcript context.
+                # scaffolding before appending the final response.  These
+                # internal turns are only for the next API retry and should
+                # not become durable transcript context.
                while (
                    messages
                    and isinstance(messages[-1], dict)
@@ -4768,48 +4493,6 @@ def run_conversation(
                ):
                    messages.pop()

-                try:
-                    from agent.verification_stop import (
-                        build_verify_on_stop_nudge,
-                        verify_on_stop_enabled,
-                    )
-
-                    if verify_on_stop_enabled():
-                        _verify_nudge = build_verify_on_stop_nudge(
-                            session_id=getattr(agent, "session_id", None),
-                            changed_paths=getattr(agent, "_turn_file_mutation_paths", set()),
-                            attempts=getattr(agent, "_verification_stop_nudges", 0),
-                        )
-                    else:
-                        _verify_nudge = None
-                except Exception:
-                    logger.debug("verification stop-loop check failed", exc_info=True)
-                    _verify_nudge = None
-
-                if _verify_nudge:
-                    agent._verification_stop_nudges = (
-                        getattr(agent, "_verification_stop_nudges", 0) + 1
-                    )
-                    final_msg["finish_reason"] = "verification_required"
-                    messages.append(final_msg)
-                    # Keep the attempted final answer in model history so the
-                    # synthetic user nudge preserves role alternation, but do
-                    # not surface it to the user as an interim answer. The
-                    # whole point of this guard is to prevent premature
-                    # "done" claims before checks run.
-                    messages.append({
-                        "role": "user",
-                        "content": _verify_nudge,
-                        "_verification_stop_synthetic": True,
-                    })
-                    agent._session_messages = messages
-                    # Run the verification-stop loop silently — the nudge is an
-                    # internal turn that should not add noise to the user's
-                    # terminal. Keep a debug breadcrumb in agent.log for tracing.
-                    logger.debug("verification stop-loop nudge issued (attempt %d)",
-                                 agent._verification_stop_nudges)
-                    continue
-
                messages.append(final_msg)
                
                _turn_exit_reason = f"text_response(finish_reason={finish_reason})"
--- a/agent/copilot_acp_client.py
+++ b/agent/copilot_acp_client.py
@@ -23,7 +23,6 @@ from typing import Any

 from agent.file_safety import get_read_block_error, is_write_denied
 from agent.redact import redact_sensitive_text
-from tools.environments.local import hermes_subprocess_env

 ACP_MARKER_BASE_URL = "acp://copilot"
 _DEFAULT_TIMEOUT_SECONDS = 900.0
@@ -95,10 +94,7 @@ def _resolve_home_dir() -> str:


 def _build_subprocess_env() -> dict[str, str]:
-    # Copilot ACP is a model-driving CLI executor: it legitimately needs LLM
-    # provider credentials. Route through the central helper so Tier-1 secrets
-    # (gateway bot tokens, GitHub auth, infra) are still stripped (#29157).
-    env = hermes_subprocess_env(inherit_credentials=True)
+    env = os.environ.copy()
    home = _resolve_home_dir()
    env["HOME"] = home
    from hermes_constants import apply_subprocess_home_env
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -11,7 +11,6 @@ import uuid
 import re
 from dataclasses import dataclass, fields, replace
 from datetime import datetime, timezone
-from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple

 from hermes_constants import OPENROUTER_BASE_URL
@@ -448,63 +447,6 @@ def get_pool_strategy(provider: str) -> str:
 DEFAULT_MAX_CONCURRENT_PER_CREDENTIAL = 1


-def _write_through_provider_state_to_global_root(
-    provider_id: str, state: Dict[str, Any]
-) -> None:
-    """Persist a rotated OAuth ``state`` into the global-root auth.json.
-
-    Best-effort write-through for the multi-profile rotation hazard
-    (#48415 / #43589): nous, openai-codex, and xai-oauth rotate the
-    refresh_token on refresh, so when a profile pool refresh rotates a grant
-    it resolved from the root fallback, the rotated chain must land back in
-    root. Otherwise root keeps a now-revoked refresh token and every other
-    profile reading the stale root grant dies with ``refresh_token_reused`` /
-    ``invalid_grant`` once its access token expires.
-
-    Only updates ``providers.<provider_id>`` in the root store; never touches
-    the profile store (the caller already saved that). Swallows all errors — a
-    failed write-through degrades to the pre-existing behavior (root stale), it
-    must never break the profile's own successful save. Mirrors
-    ``hermes_cli.auth._write_through_xai_oauth_to_global_root`` (which covers
-    the non-pool xAI refresh path) for the credential-pool refresh path.
-    """
-    try:
-        global_path = auth_mod._global_auth_file_path()
-    except Exception:
-        return
-    if global_path is None:
-        # Classic mode (profile == root); the profile save already hit root.
-        return
-    # Seat belt: under pytest, refuse to write the real user's
-    # ~/.hermes/auth.json even when HERMES_HOME points at a profile path
-    # (mirrors the read-side guard in _load_global_auth_store). Uses the
-    # unmodified HOME env, not Path.home() which fixtures may monkeypatch.
-    if os.environ.get("PYTEST_CURRENT_TEST"):
-        real_home_env = os.environ.get("HOME", "")
-        if real_home_env:
-            real_root = Path(real_home_env) / ".hermes" / "auth.json"
-            try:
-                if global_path.resolve(strict=False) == real_root.resolve(strict=False):
-                    return
-            except Exception:
-                return
-    try:
-        if global_path.exists():
-            global_store = _load_auth_store(global_path)
-        else:
-            global_store = {}
-        if not isinstance(global_store, dict):
-            return
-        _store_provider_state(global_store, provider_id, dict(state), set_active=False)
-        auth_mod._save_auth_store(global_store, global_path)
-    except Exception as exc:  # pragma: no cover - best effort
-        logger.debug(
-            "%s pool refresh: write-through to global root failed: %s",
-            provider_id,
-            exc,
-        )
-
-
 class CredentialPool:
    def __init__(self, provider: str, entries: List[PooledCredential]):
        self.provider = provider
@@ -537,11 +479,10 @@ class CredentialPool:
                self._entries[idx] = new
                return

-    def _persist(self, *, removed_ids: Optional[List[str]] = None) -> None:
+    def _persist(self) -> None:
        write_credential_pool(
            self.provider,
            [entry.to_dict() for entry in self._entries],
-            removed_ids=removed_ids,
        )

    def _is_terminal_auth_failure(
@@ -859,28 +800,6 @@ class CredentialPool:
        try:
            with _auth_store_lock():
                auth_store = _load_auth_store()
-                # Decide BEFORE writing whether this profile is reading the
-                # grant from the global root (no own providers.<id> block) vs.
-                # genuinely shadowing it. A pool refresh rotates single-use
-                # OAuth refresh tokens, so a profile that resolved the grant
-                # from root MUST write the rotated chain back to root too —
-                # otherwise root keeps a revoked refresh token and every other
-                # profile reading the stale root grant dies with
-                # refresh_token_reused / invalid_grant once its access token
-                # expires. This mirrors the xAI write-through in
-                # hermes_cli.auth._save_xai_oauth_tokens (#43589); the pool
-                # refresh path is the Codex/xAI analog reported in #48415.
-                _wt_provider_id = {
-                    "nous": "nous",
-                    "openai-codex": "openai-codex",
-                    "xai-oauth": "xai-oauth",
-                }.get(self.provider)
-                write_through_to_root = bool(_wt_provider_id) and not (
-                    isinstance(auth_store.get("providers"), dict)
-                    and isinstance(
-                        auth_store["providers"].get(_wt_provider_id), dict
-                    )
-                )
                if self.provider == "nous":
                    state = _load_provider_state(auth_store, "nous")
                    if state is None:
@@ -936,10 +855,6 @@ class CredentialPool:
                    return

                _save_auth_store(auth_store)
-                if write_through_to_root and _wt_provider_id:
-                    _write_through_provider_state_to_global_root(
-                        _wt_provider_id, state
-                    )
        except Exception as exc:
            logger.debug("Failed to sync %s pool entry back to auth store: %s", self.provider, exc)

@@ -1125,17 +1040,13 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal xAI OAuth state: %s", clear_exc
                        )
-                    removed_ids = [
-                        item.id for item in self._entries
-                        if item.source == "loopback_pkce"
-                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "loopback_pkce"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist(removed_ids=removed_ids)
+                    self._persist()
                    return None
            # For openai-codex: same race as xAI/nous — another Hermes process
            # may have consumed the refresh token between our proactive sync
@@ -1195,17 +1106,13 @@ class CredentialPool:
                        logger.debug(
                            "Failed to clear terminal Codex OAuth state: %s", clear_exc
                        )
-                    removed_ids = [
-                        item.id for item in self._entries
-                        if item.source == "device_code"
-                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source != "device_code"
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist(removed_ids=removed_ids)
+                    self._persist()
                    return None
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
@@ -1262,17 +1169,13 @@ class CredentialPool:
                        auth_mod.NOUS_DEVICE_CODE_SOURCE,
                        f"manual:{auth_mod.NOUS_DEVICE_CODE_SOURCE}",
                    }
-                    removed_ids = [
-                        item.id for item in self._entries
-                        if item.source in singleton_sources
-                    ]
                    self._entries = [
                        item for item in self._entries
                        if item.source not in singleton_sources
                    ]
                    if self._current_id == entry.id:
                        self._current_id = None
-                    self._persist(removed_ids=removed_ids)
+                    self._persist()
                    return None
            self._mark_exhausted(entry, None)
            return None
@@ -1434,7 +1337,7 @@ class CredentialPool:
            pruned_ids = set(entries_to_prune)
            self._entries = [e for e in self._entries if e.id not in pruned_ids]
        if cleared_any:
-            self._persist(removed_ids=entries_to_prune)
+            self._persist()
        return available

    def _select_unlocked(self) -> Optional[PooledCredential]:
@@ -1608,11 +1511,7 @@ class CredentialPool:
            replace(entry, priority=new_priority)
            for new_priority, entry in enumerate(self._entries)
        ]
-        write_credential_pool(
-            self.provider,
-            [entry.to_dict() for entry in self._entries],
-            removed_ids=[removed.id],
-        )
+        self._persist()
        if self._current_id == removed.id:
            self._current_id = None
        return removed
@@ -2274,11 +2173,6 @@ def _seed_custom_pool(pool_key: str, entries: List[PooledCredential]) -> Tuple[b
 def load_pool(provider: str) -> CredentialPool:
    provider = (provider or "").strip().lower()
    raw_entries = read_credential_pool(provider)
-    disk_ids = {
-        entry.get("id")
-        for entry in raw_entries
-        if isinstance(entry, dict) and entry.get("id")
-    }
    raw_needs_sanitization = any(
        isinstance(payload, dict)
        and sanitize_borrowed_credential_payload(payload, provider) != payload
@@ -2307,10 +2201,8 @@ def load_pool(provider: str) -> CredentialPool:
        changed |= _normalize_pool_priorities(provider, entries)

    if changed:
-        new_ids = {entry.id for entry in entries}
        write_credential_pool(
            provider,
            [entry.to_dict() for entry in sorted(entries, key=lambda item: item.priority)],
-            removed_ids=disk_ids - new_ids,
        )
    return CredentialPool(provider, entries)
--- a/agent/curator.py
+++ b/agent/curator.py
@@ -273,21 +273,6 @@ def should_run_now(now: Optional[datetime] = None) -> bool:
 # Automatic state transitions (pure function, no LLM)
 # ---------------------------------------------------------------------------

-def _cron_referenced_skills() -> Set[str]:
-    """Skill names referenced by any cron job (incl. paused/disabled).
-
-    Best-effort: a cron-module import error or corrupt jobs store must never
-    break the curator, so any failure yields an empty set (no protection,
-    but no crash).
-    """
-    try:
-        from cron.jobs import referenced_skill_names as _refs
-        return _refs()
-    except Exception as e:
-        logger.debug("Curator could not read cron skill references: %s", e, exc_info=True)
-        return set()
-
-
 def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int]:
    """Walk every curator-managed skill and move active/stale/archived based on
    the latest real activity timestamp. Pinned skills are never touched.
@@ -307,8 +292,6 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
    stale_cutoff = now - timedelta(days=get_stale_after_days())
    archive_cutoff = now - timedelta(days=get_archive_after_days())

-    cron_referenced = _cron_referenced_skills()
-
    counts = {"marked_stale": 0, "archived": 0, "reactivated": 0, "checked": 0, "seeded": 0}

    for row in _u.agent_created_report():
@@ -317,15 +300,6 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int
        if row.get("pinned"):
            continue

-        # A skill referenced by any cron job (incl. paused/disabled) is in
-        # use by definition — resuming or the next fire must find it. The
-        # scheduler only bumps usage when a job actually fires, so jobs that
-        # fire less often than archive_after_days, paused jobs, and far-future
-        # one-shots would otherwise have their skills aged out from under
-        # them. Treat referenced skills like pinned: never auto-transition.
-        if name in cron_referenced:
-            continue
-
        # First sight of a curation-eligible skill with no persisted record
        # (e.g. a newly-eligible built-in): anchor its clock to now and defer.
        if not row.get("_persisted", True):
@@ -342,18 +316,6 @@ def apply_automatic_transitions(now: Optional[datetime] = None) -> Dict[str, int

        current = row.get("state", _u.STATE_ACTIVE)

-        # Never-used skills (use_count == 0) get a grace floor: don't archive
-        # one until it is at least stale_after_days old. A use=0 skill is
-        # absence of evidence, not evidence of staleness — a skill created
-        # recently may simply not have had its trigger come up yet.
-        never_used = int(row.get("use_count", 0) or 0) == 0
-        if never_used and anchor > stale_cutoff:
-            # Younger than the stale window — leave it alone entirely.
-            if current == _u.STATE_STALE:
-                _u.set_state(name, _u.STATE_ACTIVE)
-                counts["reactivated"] += 1
-            continue
-
        if anchor <= archive_cutoff and current != _u.STATE_ARCHIVED:
            ok, _msg = _u.archive_skill(name)
            if ok:
@@ -415,10 +377,8 @@ CURATOR_REVIEW_PROMPT = (
    "bodies + `references/`, `templates/`, and `scripts/` subfiles for "
    "session-specific detail — not one-session-one-skill micro-entries.\n\n"
    "Hard rules — do not violate:\n"
-    "1. DO NOT touch bundled, hub-installed, or external-dir skills "
-    "(`skills.external_dirs`). The candidate list below is already filtered "
-    "to local curator-managed skills only; external skills are externally "
-    "owned and read-only to this background curator.\n"
+    "1. DO NOT touch bundled or hub-installed skills. The candidate list "
+    "below is already filtered to agent-created skills only.\n"
    "2. DO NOT delete any skill. Archiving (moving the skill's directory "
    "into ~/.hermes/skills/.archive/) is the maximum destructive action. "
    "Archives are recoverable; deletion is not.\n"
@@ -428,19 +388,10 @@ CURATOR_REVIEW_PROMPT = (
    "back load-bearing UX (slash-command entry points referenced in docs and "
    "tips) and are filtered out of the candidate list below — never resurrect "
    "one as an archive or absorb target.\n"
-    "3c. DO NOT archive or prune any skill marked `cron=yes` in the candidate "
-    "list. A cron job depends on it and will fail to load it on its next "
-    "run. You MAY still consolidate it into an umbrella — but only because "
-    "the curator rewrites cron job skill references to follow consolidations; "
-    "never simply prune it.\n"
    "4. DO NOT use usage counters as a reason to skip consolidation. The "
    "counters are new and often mostly zero. Judge overlap on CONTENT, "
    "not on use_count. 'use=0' is not evidence a skill is valuable; it's "
-    "absence of evidence either way. Corollary: 'use=0' is ALSO not a "
-    "reason to PRUNE a skill. Never archive a never-used skill (use=0) "
-    "unless it is at least 30 days old (check last_activity / created date) "
-    "AND its content is genuinely obsolete or fully absorbed elsewhere — a "
-    "recently-created skill simply may not have had its trigger come up yet.\n"
+    "absence of evidence either way.\n"
    "5. DO NOT reject consolidation on the grounds that 'each skill has "
    "a distinct trigger'. Pairwise distinctness is the wrong bar. The "
    "right bar is: 'would a human maintainer write this as N separate "
@@ -518,9 +469,8 @@ CURATOR_REVIEW_PROMPT = (
    "skill, or `absorbed_into=\"\"` when you're truly pruning with no "
    "forwarding target. This drives cron-job skill-reference migration — "
    "guessing from your YAML summary after the fact is fragile.\n"
-    "  - terminal                       — move LOCAL candidate content into "
-    "a support subfile when package integrity requires it; never mv, cp, rm, "
-    "patch, or rewrite bundled, hub-installed, or external-dir skills\n\n"
+    "  - terminal                       — mv a sibling into the archive "
+    "OR move its content into a support subfile\n\n"
    "'keep' is a legitimate decision ONLY when the skill is already a "
    "class-level umbrella and none of the proposed merges would improve "
    "discoverability. 'This is narrow but distinct from its siblings' "
@@ -1460,14 +1410,12 @@ def _render_candidate_list() -> str:
    rows = skill_usage.agent_created_report()
    if not rows:
        return "No agent-created skills to review."
-    cron_referenced = _cron_referenced_skills()
    lines = [f"Agent-created skills ({len(rows)}):\n"]
    for r in rows:
        lines.append(
            f"- {r['name']}  "
            f"state={r['state']}  "
            f"pinned={'yes' if r.get('pinned') else 'no'}  "
-            f"cron={'yes' if r['name'] in cron_referenced else 'no'}  "
            f"activity={r.get('activity_count', 0)}  "
            f"use={r.get('use_count', 0)}  "
            f"view={r.get('view_count', 0)}  "
@@ -1895,14 +1843,6 @@ def _run_llm_review(prompt: str) -> Dict[str, Any]:
        # Disable recursive nudges — the curator must never spawn its own review.
        review_agent._memory_nudge_interval = 0
        review_agent._skill_nudge_interval = 0
-        # Tag this fork as autonomous background curation so skill_manage's
-        # background-review write guard fires. Without this the fork inherits
-        # the default "assistant_tool" origin, is_background_review() is False,
-        # and the external/bundled/hub-installed skill_manage guards never
-        # trigger during the curation pass they exist to protect against.
-        # turn_context.py binds this onto the write-origin ContextVar at turn
-        # start (see agent/turn_context.py).
-        review_agent._memory_write_origin = "background_review"

        # Redirect the forked agent's stdout/stderr to /dev/null while it
        # runs so its tool-call chatter doesn't pollute the foreground
--- a/agent/display.py
+++ b/agent/display.py
@@ -6,7 +6,6 @@ Used by AIAgent._execute_tool_calls for CLI feedback.

 import logging
 import os
-import re
 import sys
 import threading
 import time
@@ -16,7 +15,6 @@ from pathlib import Path
 from typing import Any

 from utils import safe_json_loads
-from agent.redact import redact_sensitive_text
 from agent.tool_result_classification import file_mutation_result_landed

 # ANSI escape codes for coloring tool failure indicators
@@ -179,223 +177,6 @@ def _truncate_preview(text: str, max_len: int | None) -> str:
    return text


-_SHELL_SILENT_HEADS = {"cd", "pushd", "popd", "export", "set", "unset", "source", ".", "true", "false", ":"}
-_SHELL_PIPE_TAIL_HEADS = {"head", "tail", "wc", "sort", "uniq"}
-
-
-def _shell_basename(head: str) -> str:
-    return head.rsplit("/", 1)[-1] if head else ""
-
-
-def _split_shell_words(segment: str) -> list[str]:
-    words: list[str] = []
-    buf: list[str] = []
-    quote: str | None = None
-
-    for i, ch in enumerate(segment):
-        if quote:
-            buf.append(ch)
-            if ch == quote and (i == 0 or segment[i - 1] != "\\"):
-                quote = None
-            continue
-
-        if ch in {"'", '"'}:
-            quote = ch
-            buf.append(ch)
-            continue
-
-        if ch.isspace():
-            if buf:
-                words.append("".join(buf))
-                buf = []
-            continue
-
-        buf.append(ch)
-
-    if buf:
-        words.append("".join(buf))
-
-    return words
-
-
-def _strip_shell_pipe_tail(segment: str) -> str:
-    words = _split_shell_words(segment)
-    out: list[str] = []
-
-    for i, word in enumerate(words):
-        if word == "|" and _shell_basename(words[i + 1] if i + 1 < len(words) else "") in _SHELL_PIPE_TAIL_HEADS:
-            break
-        out.append(word)
-
-    return " ".join(out).strip()
-
-
-def _split_shell_compound(command: str) -> list[str]:
-    segments: list[str] = []
-    buf: list[str] = []
-    quote: str | None = None
-    i = 0
-
-    while i < len(command):
-        ch = command[i]
-
-        if quote:
-            buf.append(ch)
-            if ch == quote and (i == 0 or command[i - 1] != "\\"):
-                quote = None
-            i += 1
-            continue
-
-        if ch in {"'", '"'}:
-            quote = ch
-            buf.append(ch)
-            i += 1
-            continue
-
-        op_len = 2 if command.startswith("&&", i) or command.startswith("||", i) else 1 if ch in {";", "\n"} else 0
-        if op_len:
-            segment = _strip_shell_pipe_tail("".join(buf).strip())
-            if segment:
-                segments.append(segment)
-            buf = []
-            i += op_len
-            continue
-
-        buf.append(ch)
-        i += 1
-
-    segment = _strip_shell_pipe_tail("".join(buf).strip())
-    if segment:
-        segments.append(segment)
-
-    return segments
-
-
-def _shell_head_word(segment: str) -> str:
-    words = _split_shell_words(segment)
-    index = 0
-    while index < len(words) and re.match(r"^[A-Za-z_]\w*=", words[index]):
-        index += 1
-    return _shell_basename(words[index] if index < len(words) else "")
-
-
-def _clean_shell_segment(segment: str) -> str:
-    words = _split_shell_words(segment)
-    out: list[str] = []
-    i = 0
-    while i < len(words):
-        word = words[i]
-        if re.match(r"^\d*(?:>>?|<)$", word):
-            i += 2
-            continue
-        if re.match(r"^\d*(?:>&|<&)\d+$", word) or re.match(r"^\d*>&\d+$", word):
-            i += 1
-            continue
-        out.append(word)
-        i += 1
-    return " ".join(out).strip()
-
-
-def _is_shell_boundary_echo(segment: str) -> bool:
-    words = _split_shell_words(segment)
-    if _shell_basename(words[0] if words else "") != "echo":
-        return False
-    rest = " ".join(words[1:])
-    return bool(re.search(r"-{2,}|_exit=|(?:^|\s|=)\$[?{]|PIPESTATUS", rest))
-
-
-def summarize_shell_command(command: str) -> str:
-    """Compact shell wrapper/plumbing for display while preserving raw command elsewhere."""
-    original = _oneline(command)
-    if not original:
-        return ""
-
-    segments = _split_shell_compound(original)
-    if len(segments) <= 1:
-        return _clean_shell_segment(segments[0] if segments else original) or original
-
-    core: list[str] = []
-    for segment in segments:
-        cleaned = _clean_shell_segment(segment)
-        head = _shell_head_word(cleaned)
-        if cleaned and head not in _SHELL_SILENT_HEADS and not _is_shell_boundary_echo(cleaned):
-            core.append(cleaned)
-
-    if not core:
-        return original
-    if len(core) == 1:
-        return core[0]
-
-    count = len(core) - 1
-    return f"{core[0]} + {count} {'command' if count == 1 else 'commands'}"
-
-
-def _read_file_line_label(args: dict) -> str:
-    offset = args.get("offset")
-    limit = args.get("limit")
-    if not isinstance(offset, int) or offset <= 0:
-        return ""
-    if not isinstance(limit, int) or limit <= 1:
-        return f"L{offset}"
-    return f"L{offset}-{offset + limit - 1}"
-
-
-def redact_browser_typed_text_for_display(value: Any, typed_text: Any) -> Any:
-    """Apply secret redaction to browser_type text in display-facing payloads.
-
-    Backends sometimes echo the attempted input in error strings or fallback
-    metadata.  When the raw typed value contains a recognizable secret (API
-    key, token, JWT, etc.) the redacted form differs from the raw value, so we
-    replace every occurrence of the raw value with its redacted form before a
-    browser_type result reaches logs, callbacks, the model, or chat history.
-
-    Normal typed text (search queries, addresses, form fields) matches no
-    secret pattern, so it passes through unchanged and stays readable.
-
-    Redaction is forced here regardless of the global ``security.redact_secrets``
-    preference: a typed credential leaking into chat history is a security
-    boundary, not mere log hygiene.
-    """
-    if typed_text is None:
-        return value
-    needle = str(typed_text)
-    if needle == "":
-        return value
-    redacted = redact_sensitive_text(needle, force=True)
-    if redacted == needle:
-        # Nothing secret-looking in the typed text; leave payload untouched.
-        return value
-    if isinstance(value, str):
-        return value.replace(needle, redacted)
-    if isinstance(value, dict):
-        return {
-            key: redact_browser_typed_text_for_display(item, typed_text)
-            for key, item in value.items()
-        }
-    if isinstance(value, list):
-        return [redact_browser_typed_text_for_display(item, typed_text) for item in value]
-    if isinstance(value, tuple):
-        return tuple(redact_browser_typed_text_for_display(item, typed_text) for item in value)
-    return value
-
-
-def redact_tool_args_for_display(tool_name: str, args: dict | None) -> dict | None:
-    """Return a copy of tool args safe for logs/progress UI.
-
-    For ``browser_type`` the ``text`` argument is run through the same
-    secret-pattern redactor used for logs.  Recognizable credentials (API
-    keys, tokens) are masked before the value reaches tool progress
-    notifications; normal typed text is left intact for debuggability.
-    """
-    if not isinstance(args, dict):
-        return args
-    if tool_name == "browser_type" and isinstance(args.get("text"), str):
-        safe_args = dict(args)
-        safe_args["text"] = redact_sensitive_text(args["text"], force=True)
-        return safe_args
-    return args
-
-
 def _delegate_task_goal_parts(tasks: Any, *, per_goal_len: int) -> tuple[int, list[str]]:
    if not isinstance(tasks, list):
        return 0, []
@@ -419,14 +200,13 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
        max_len = _tool_preview_max_len
    if not args:
        return None
-    args = redact_tool_args_for_display(tool_name, args) or args
    primary_args = {
        "terminal": "command", "web_search": "query", "web_extract": "urls",
        "read_file": "path", "write_file": "path", "patch": "path",
        "search_files": "pattern", "browser_navigate": "url",
        "browser_click": "ref", "browser_type": "text",
        "image_generate": "prompt", "text_to_speech": "text",
-        "vision_analyze": "question",
+        "vision_analyze": "question", "mixture_of_agents": "user_prompt",
        "skill_view": "name", "skills_list": "category",
        "cronjob": "action",
        "execute_code": "code", "delegate_task": "goal",
@@ -473,23 +253,6 @@ def build_tool_preview(tool_name: str, args: dict, max_len: int | None = None) -
        else:
            return f"planning {len(todos_arg)} task(s)"

-    if tool_name in {"terminal", "execute_code"}:
-        key = "code" if tool_name == "execute_code" else "command"
-        command = args.get(key)
-        if command is None:
-            return None
-        preview = summarize_shell_command(str(command))
-        return _truncate_preview(preview, max_len) if preview else None
-
-    if tool_name == "read_file":
-        path = args.get("path") or args.get("file") or args.get("filepath")
-        if path is None:
-            return None
-        label = Path(str(path).replace("\\", "/")).name or str(path)
-        line_label = _read_file_line_label(args)
-        preview = f"{label} {line_label}".strip()
-        return _truncate_preview(preview, max_len) if preview else None
-
    if tool_name == "session_search":
        query = _oneline(args.get("query", ""))
        return f"recall: \"{query[:25]}{'...' if len(query) > 25 else ''}\""
@@ -1143,7 +906,6 @@ def get_cute_tool_message(
    When *result* is provided the line is checked for failure indicators.
    Failed tool calls get a red prefix and an informational suffix.
    """
-    args = redact_tool_args_for_display(tool_name, args) or args
    dur = f"{duration:.1f}s"
    is_failure, failure_suffix = _detect_tool_failure(tool_name, result)
    skin_prefix = get_skin_tool_prefix()
@@ -1181,7 +943,7 @@ def get_cute_tool_message(
            return _wrap(f"┊ 📄 fetch     {_trunc(domain, 35)}{extra}  {dur}")
        return _wrap(f"┊ 📄 fetch     pages  {dur}")
    if tool_name == "terminal":
-        return _wrap(f"┊ 💻 $         {_trunc(build_tool_preview(tool_name, args) or args.get('command', ''), 42)}  {dur}")
+        return _wrap(f"┊ 💻 $         {_trunc(args.get('command', ''), 42)}  {dur}")
    if tool_name == "process":
        action = args.get("action", "?")
        sid = args.get("session_id", "")[:12]
@@ -1189,7 +951,7 @@ def get_cute_tool_message(
                  "wait": f"wait {sid}", "kill": f"kill {sid}", "write": f"write {sid}", "submit": f"submit {sid}"}
        return _wrap(f"┊ ⚙️  proc      {labels.get(action, f'{action} {sid}')}  {dur}")
    if tool_name == "read_file":
-        return _wrap(f"┊ 📖 read      {_trunc(build_tool_preview(tool_name, args) or args.get('path', ''), 42)}  {dur}")
+        return _wrap(f"┊ 📖 read      {_path(args.get('path', ''))}  {dur}")
    if tool_name == "write_file":
        return _wrap(f"┊ ✍️  write     {_path(args.get('path', ''))}  {dur}")
    if tool_name == "patch":
@@ -1275,6 +1037,8 @@ def get_cute_tool_message(
        return _wrap(f"┊ 🔊 speak     {_trunc(args.get('text', ''), 30)}  {dur}")
    if tool_name == "vision_analyze":
        return _wrap(f"┊ 👁️  vision    {_trunc(args.get('question', ''), 30)}  {dur}")
+    if tool_name == "mixture_of_agents":
+        return _wrap(f"┊ 🧠 reason    {_trunc(args.get('user_prompt', ''), 30)}  {dur}")
    if tool_name == "send_message":
        return _wrap(f"┊ 📨 send      {args.get('target', '?')}: \"{_trunc(args.get('message', ''), 25)}\"  {dur}")
    if tool_name == "cronjob":
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -133,31 +133,6 @@ _RATE_LIMIT_PATTERNS = [
    "servicequotaexceededexception",
 ]

-# Patterns that indicate provider-side overload, NOT a per-credential rate
-# limit or billing problem.  The credential is valid — the server is just
-# busy — so the correct recovery is "back off and retry the same key", never
-# "rotate the credential" (rotating exhausts the pool while the endpoint is
-# still busy; a single-key user has nothing to rotate to).  Some providers
-# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
-# status path matches the body against this list before falling through to
-# the rate_limit default.  Phrases are kept narrow and overload-flavoured so a
-# normal rate-limit message ("you have been rate-limited") doesn't hit this
-# bucket. (#14038, #15297)
-_OVERLOADED_PATTERNS = [
-    "overloaded",
-    "temporarily overloaded",
-    "service is temporarily overloaded",
-    "service may be temporarily overloaded",
-    "server is overloaded",
-    "server overloaded",
-    "service overloaded",
-    "service is overloaded",
-    "upstream overloaded",
-    "currently overloaded",
-    "at capacity",
-    "over capacity",
-]
-
 # Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
 _USAGE_LIMIT_PATTERNS = [
    "usage limit",
@@ -355,14 +330,6 @@ _CONTENT_POLICY_BLOCKED_PATTERNS = [
    # echo back; the underscore form is provider-specific enough.
    "content_filter",
    "responsibleaipolicyviolation",
-    # MiniMax output-layer safety filter. The error string is surfaced
-    # verbatim by MiniMax SDK / OpenAI-compatible endpoints, usually in the
-    # form "output new_sensitive (1027)" when the model's *output* (often a
-    # large tool-call argument block) trips the upstream safety filter and
-    # the SSE stream is truncated mid-flight. ``new_sensitive`` is the
-    # filter name and is narrow enough that billing / format / auth error
-    # strings will not collide. See #32421.
-    "new_sensitive",
 ]

 # Auth patterns (non-status-code signals)
@@ -750,26 +717,6 @@ def classify_api_error(

    is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
    if is_disconnect and not status_code:
-        # Reasoning-model override: a transport disconnect on a reasoning
-        # model is much more likely the upstream proxy idle-killing a
-        # long thinking stream than a true context overflow — even on
-        # large sessions.  The default disconnect+large-session routing
-        # below would otherwise send the user into the compression
-        # branch (should_compress=True) and silently delete
-        # conversation history on a phantom context-length error.
-        # Reasoning models have multi-minute thinking phases that
-        # routinely exceed the cloud gateway's idle window (NVIDIA
-        # NIM ~120s — first-party repro at NVIDIA/NemoClaw#4846;
-        # OpenAI worker / Anthropic stream-idle similar).  The
-        # per-reasoning-model stale-timeout floor in
-        # agent/reasoning_timeouts.py raises the stale-detector
-        # threshold to tolerate long thinking, so a true
-        # transport-layer failure here is recoverable via the retry
-        # path — not via context compression.  Reclassify as timeout.
-        # (Part 1 of Fixes #52310.)
-        from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
-        if get_reasoning_stale_timeout_floor(model) is not None:
-            return _result(FailoverReason.timeout, retryable=True)
        # Absolute token/message-count thresholds are only a proxy for smaller
        # context windows.  Large-context sessions can have hundreds of
        # messages while still being far below their actual token budget.
@@ -896,19 +843,7 @@ def _classify_by_status(
        )

    if status_code == 429:
-        # Already checked long_context_tier above. Some providers (notably
-        # Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
-        # code as a true per-credential rate limit, but the credential is
-        # valid and the correct recovery is "back off and retry the same key",
-        # NOT "rotate the credential" (which exhausts the pool while the
-        # endpoint is still busy, and does nothing for a single-key user).
-        # Disambiguate on the error body so an overload 429 takes the
-        # transient-overload path instead of burning the pool. (#14038)
-        if any(p in error_msg for p in _OVERLOADED_PATTERNS):
-            return result_fn(
-                FailoverReason.overloaded,
-                retryable=True,
-            )
+        # Already checked long_context_tier above; this is a normal rate limit
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
@@ -1259,17 +1194,6 @@ def _classify_by_message(
            should_fallback=True,
        )

-    # Overloaded / server-busy patterns — must come BEFORE the rate_limit and
-    # billing checks so that a message-only "overloaded" (no 503/529 status,
-    # e.g. some Anthropic-compatible proxies) classifies as a transient
-    # overload (backoff + retry) instead of falling through to `unknown` or
-    # incorrectly triggering credential rotation.
-    if any(p in error_msg for p in _OVERLOADED_PATTERNS):
-        return result_fn(
-            FailoverReason.overloaded,
-            retryable=True,
-        )
-
    # Billing patterns
    if any(p in error_msg for p in _BILLING_PATTERNS):
        return result_fn(
@@ -1359,25 +1283,19 @@ def _extract_status_code(error: Exception) -> Optional[int]:


 def _extract_error_body(error: Exception) -> dict:
-    """Extract the structured error body from an SDK exception or its cause chain."""
-    current = error
-    for _ in range(5):  # Match _extract_status_code() traversal depth.
-        body = getattr(current, "body", None)
-        if isinstance(body, dict):
-            return body
-        # Some errors have .response.json()
-        response = getattr(current, "response", None)
-        if response is not None:
-            try:
-                json_body = response.json()
-                if isinstance(json_body, dict):
-                    return json_body
-            except Exception:
-                pass
-        cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
-        if cause is None or cause is current:
-            break
-        current = cause
+    """Extract the structured error body from an SDK exception."""
+    body = getattr(error, "body", None)
+    if isinstance(body, dict):
+        return body
+    # Some errors have .response.json()
+    response = getattr(error, "response", None)
+    if response is not None:
+        try:
+            json_body = response.json()
+            if isinstance(json_body, dict):
+                return json_body
+        except Exception:
+            pass
    return {}


--- a/agent/file_safety.py
+++ b/agent/file_safety.py
@@ -77,22 +77,15 @@ def build_write_denied_prefixes(home: str) -> list[str]:
    ]


-def get_safe_write_roots() -> set[str]:
-    """Return resolved HERMES_WRITE_SAFE_ROOT paths. Supports multiple directories
-    separated by ``os.pathsep`` (``:`` on Unix, ``;`` on Windows).
-    E.g., ``/opt/data:/var/www/html`` on Unix, ``C:\\data;D:\\www`` on Windows."""
-    env = os.getenv("HERMES_WRITE_SAFE_ROOT", "")
-    if not env:
-        return set()
-    roots: set[str] = set()
-    for path in env.split(os.pathsep):
-        if path:
-            try:
-                resolved = os.path.realpath(os.path.expanduser(path))
-                roots.add(resolved)
-            except (OSError, ValueError):
-                continue
-    return roots
+def get_safe_write_root() -> Optional[str]:
+    """Return the resolved HERMES_WRITE_SAFE_ROOT path, or None if unset."""
+    root = os.getenv("HERMES_WRITE_SAFE_ROOT", "")
+    if not root:
+        return None
+    try:
+        return os.path.realpath(os.path.expanduser(root))
+    except Exception:
+        return None


 def is_write_denied(path: str) -> bool:
@@ -131,15 +124,9 @@ def is_write_denied(path: str) -> bool:
        except Exception:
            pass

-    safe_roots = get_safe_write_roots()
-    if safe_roots:
-        allowed = False
-        for safe_root in safe_roots:
-            if resolved == safe_root or resolved.startswith(safe_root + os.sep):
-                allowed = True
-                break
-        if not allowed:
-            return True
+    safe_root = get_safe_write_root()
+    if safe_root and not (resolved == safe_root or resolved.startswith(safe_root + os.sep)):
+        return True

    return False

--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@@ -388,98 +388,14 @@ def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]:
    # BMP: "BM"
    if raw.startswith(b"BM"):
        return "image/bmp"
-    # ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12
-    if len(raw) >= 12 and raw[4:8] == b"ftyp":
-        brand = raw[8:12]
-        if brand in {b"avif", b"avis"}:
-            return "image/avif"
-        if brand in {
-            b"heic", b"heix", b"hevc", b"hevx",
-            b"mif1", b"msf1", b"heim", b"heis",
-        }:
-            return "image/heic"
-    # TIFF: II*\0 (little-endian) or MM\0* (big-endian)
-    if raw[:4] in {b"II*\x00", b"MM\x00*"}:
-        return "image/tiff"
-    # ICO: 00 00 01 00 (reserved=0, type=1=icon)
-    if raw[:4] == b"\x00\x00\x01\x00":
-        return "image/x-icon"
-    # SVG: text-based, look for an <svg tag near the start (skip BOM/whitespace)
-    head = raw[:512].lstrip().lower()
-    if head.startswith(b"<?xml") or head.startswith(b"<svg"):
-        if b"<svg" in head:
-            return "image/svg+xml"
+    # HEIC/HEIF: ftypheic / ftypheix / ftypmif1 / ftypmsf1 etc.
+    if len(raw) >= 12 and raw[4:8] == b"ftyp" and raw[8:12] in {
+        b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1", b"heim", b"heis",
+    }:
+        return "image/heic"
    return None


-# Formats every major vision provider (Anthropic, OpenAI, Gemini, Bedrock)
-# accepts natively. Anything outside this set has to be transcoded to PNG
-# before we declare media_type, otherwise the provider returns HTTP 400
-# ("Could not process image" / "Unsupported image media type") and the
-# whole turn fails with no salvage path.
-#
-# Discord (and a few other chat platforms) freely accept attachments in
-# formats outside this set -- AVIF screenshots from Chromium, HEIC from
-# iPhones, TIFF from scanners, BMP from old Windows tools, ICO -- so users
-# do hit this in practice. SVG is vector and Pillow cannot rasterize it;
-# it is skipped (logged) rather than transcoded.
-_UNIVERSALLY_SUPPORTED_MIMES = frozenset({
-    "image/png", "image/jpeg", "image/gif", "image/webp",
-})
-
-
-def _transcode_to_png(raw: bytes) -> Optional[bytes]:
-    """Decode arbitrary image bytes with Pillow and re-encode as PNG.
-
-    Returns None if Pillow isn't installed or can't decode the input
-    (rare formats, corrupted bytes, missing optional decoder plugin for
-    HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping
-    the image so the rest of the turn still works.
-
-    HEIC/HEIF and AVIF need optional Pillow plugins; we try to register
-    them on demand and swallow ImportError so a missing plugin just
-    looks like 'Pillow can't decode this' rather than crashing.
-    """
-    try:
-        from PIL import Image
-    except ImportError:
-        logger.info(
-            "image_routing: Pillow not installed; cannot transcode "
-            "non-standard image format to PNG. Install with `pip install Pillow` "
-            "(and `pillow-heif` / `pillow-avif-plugin` for those formats)."
-        )
-        return None
-    # Optional plugin registration. Silent on failure: an unsupported
-    # format will just fall through to Image.open raising below.
-    try:
-        import pillow_heif  # type: ignore
-
-        pillow_heif.register_heif_opener()
-    except Exception:
-        pass
-    try:
-        import pillow_avif  # type: ignore  # noqa: F401  -- registers AVIF on import
-    except Exception:
-        pass
-    try:
-        from io import BytesIO
-
-        with Image.open(BytesIO(raw)) as im:
-            # Pick an output mode PNG can serialise. Anything other than
-            # the standard set gets normalised to RGBA so transparency is
-            # preserved where the source had it.
-            if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}:
-                im = im.convert("RGBA")
-            buf = BytesIO()
-            im.save(buf, format="PNG", optimize=False)
-            return buf.getvalue()
-    except Exception as exc:
-        logger.info(
-            "image_routing: Pillow could not transcode image to PNG -- %s", exc
-        )
-        return None
-
-
 def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str:
    """Return image MIME type for *path*.

@@ -515,18 +431,8 @@ def _file_to_data_url(path: Path) -> Optional[str]:
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

-    Format compatibility IS handled here: if the sniffed MIME isn't one
-    of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF,
-    HEIC, BMP, TIFF, or ICO that some providers reject outright), we
-    transcode to PNG with Pillow before declaring media_type. This fixes
-    the user-visible "Could not process image" HTTP 400 from Anthropic on
-    Discord-attached AVIF/HEIC/BMP files.
-
-    Returns None if the file can't be read OR if the format isn't
-    universally supported AND Pillow can't transcode it (Pillow missing,
-    HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The
-    caller reports those paths in ``skipped`` and the rest of the turn
-    proceeds.
+    Returns None only if the file can't be read (missing, permission
+    denied, etc.); the caller reports those paths in ``skipped``.
    """
    try:
        raw = path.read_bytes()
@@ -534,22 +440,6 @@ def _file_to_data_url(path: Path) -> Optional[str]:
        logger.warning("image_routing: failed to read %s — %s", path, exc)
        return None
    mime = _guess_mime(path, raw=raw)
-    if mime not in _UNIVERSALLY_SUPPORTED_MIMES:
-        transcoded = _transcode_to_png(raw)
-        if transcoded is None:
-            logger.warning(
-                "image_routing: %s is %s which is not accepted by all major "
-                "vision providers and could not be transcoded to PNG; "
-                "skipping this attachment.",
-                path, mime,
-            )
-            return None
-        logger.info(
-            "image_routing: transcoded %s (%s) -> image/png for provider compatibility",
-            path.name, mime,
-        )
-        raw = transcoded
-        mime = "image/png"
    b64 = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{b64}"

--- a/agent/insights.py
+++ b/agent/insights.py
@@ -81,19 +81,6 @@ def _bar_chart(values: List[int], max_width: int = 20) -> List[str]:
    return ["█" * max(1, int(v / peak * max_width)) if v > 0 else "" for v in values]


-def _fmt_ms(ms: float) -> str:
-    """Compact human duration from milliseconds (e.g. 850ms, 2.4s, 1.5m)."""
-    try:
-        ms = float(ms or 0)
-    except (TypeError, ValueError):
-        return "0ms"
-    if ms < 1000:
-        return f"{int(ms)}ms"
-    if ms < 60_000:
-        return f"{ms / 1000:.1f}s"
-    return f"{ms / 60_000:.1f}m"
-
-
 class InsightsEngine:
    """
    Analyzes session history and produces usage insights.
@@ -151,7 +138,6 @@ class InsightsEngine:
                },
                "activity": {},
                "top_sessions": [],
-                "telemetry": {},
            }

        # Compute insights
@@ -162,7 +148,6 @@ class InsightsEngine:
        skills = self._compute_skill_breakdown(skill_usage)
        activity = self._compute_activity_patterns(sessions)
        top_sessions = self._compute_top_sessions(sessions)
-        telemetry = self._compute_telemetry(cutoff)

        return {
            "days": days,
@@ -176,37 +161,8 @@ class InsightsEngine:
            "skills": skills,
            "activity": activity,
            "top_sessions": top_sessions,
-            "telemetry": telemetry,
        }

-    # =========================================================================
-    # Telemetry (observability) — from the tel_* tables (local telemetry)
-    # =========================================================================
-
-    def _compute_telemetry(self, cutoff: float) -> Dict[str, Any]:
-        """Roll up the local telemetry tables for the same window.
-
-        Reuses the engine's existing connection. Fully fail-soft: if the tel_*
-        tables are empty or absent (telemetry.local disabled, fresh install), this
-        returns an empty dict and the renderer skips the section.
-        """
-        try:
-            from agent.telemetry import metrics
-        except Exception:
-            return {}
-        try:
-            since_ns = int(cutoff * 1e9)
-            if not metrics.has_data(conn=self._conn):
-                return {}
-            return {
-                "workflows": metrics.workflow_summary(since_ns=since_ns, conn=self._conn),
-                "model_calls": metrics.model_call_summary(since_ns=since_ns, conn=self._conn),
-                "tool_calls": metrics.tool_call_summary(conn=self._conn),
-                "errors": metrics.error_summary(conn=self._conn),
-            }
-        except Exception:
-            return {}
-
    # =========================================================================
    # Data gathering (SQL queries)
    # =========================================================================
@@ -896,80 +852,8 @@ class InsightsEngine:
                lines.append(f"  {ts['label']:<20} {ts['value']:<18} ({ts['date']}, {ts['session_id']})")
            lines.append("")

-        # Telemetry / observability (local telemetry) — only when data exists
-        tel = report.get("telemetry") or {}
-        if tel:
-            self._append_telemetry_section(lines, tel)
-
        return "\n".join(lines)

-    def _append_telemetry_section(self, lines: List[str], tel: Dict[str, Any]) -> None:
-        """Render the observability rollups (workflows, tools, providers, errors)."""
-        wf = tel.get("workflows", {})
-        mc = tel.get("model_calls", {})
-        tc = tel.get("tool_calls", {})
-        errs = tel.get("errors", {}).get("by_class", {})
-
-        lines.append("  📡 Observability (local telemetry)")
-        lines.append("  " + "─" * 56)
-
-        total_runs = wf.get("total_runs", 0)
-        if total_runs:
-            sr = wf.get("success_rate", 0.0) * 100
-            p50 = wf.get("duration_ms_p50", 0)
-            p95 = wf.get("duration_ms_p95", 0)
-            lines.append(
-                f"  Workflows: {total_runs:,}   Success: {sr:.1f}%   "
-                f"Duration p50/p95: {_fmt_ms(p50)} / {_fmt_ms(p95)}"
-            )
-            by_entry = wf.get("by_entrypoint", {})
-            if by_entry:
-                entry_str = ", ".join(
-                    f"{k}: {v}" for k, v in sorted(by_entry.items(), key=lambda x: -x[1])
-                )
-                lines.append(f"  Entrypoints: {entry_str}")
-
-        # Tool reliability
-        if tc.get("total"):
-            fail_pct = tc.get("failure_rate", 0.0) * 100
-            lines.append(
-                f"  Tool calls: {tc['total']:,}   Failure rate: {fail_pct:.1f}%"
-            )
-            tools = tc.get("by_tool", {})
-            fails = tc.get("failures_by_tool", {})
-            top = sorted(tools.items(), key=lambda x: -x[1])[:6]
-            if top:
-                parts = []
-                for name, n in top:
-                    f = fails.get(name, 0)
-                    parts.append(f"{name}: {n}" + (f" ({f} failed)" if f else ""))
-                lines.append("    " + "   ".join(parts))
-
-        # Provider / model mix + cache (real names)
-        by_provider = mc.get("by_provider", {})
-        if by_provider:
-            prov_str = ", ".join(
-                f"{k}: {v}" for k, v in sorted(by_provider.items(), key=lambda x: -x[1])
-            )
-            lines.append(f"  Providers: {prov_str}")
-        by_model = mc.get("by_model", {})
-        if by_model:
-            model_str = ", ".join(
-                f"{k}: {v}" for k, v in sorted(by_model.items(), key=lambda x: -x[1])[:8]
-            )
-            cache = mc.get("cache_hit_rate", 0.0) * 100
-            suffix = f"   Cache hit: {cache:.1f}%" if cache else ""
-            lines.append(f"  Models: {model_str}{suffix}")
-
-        # Error classes
-        if errs:
-            err_str = ", ".join(
-                f"{k}: {v}" for k, v in sorted(errs.items(), key=lambda x: -x[1])[:6]
-            )
-            lines.append(f"  Errors: {err_str}")
-
-        lines.append("")
-
    def format_gateway(self, report: Dict) -> str:
        """Format the insights report for gateway/messaging (shorter)."""
        if report.get("empty"):
--- a/agent/learn_prompt.py
+++ b/agent/learn_prompt.py
@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-"""``/learn`` — build the standards-guided prompt that turns whatever the user
-described into a reusable skill.
-
-``/learn`` is open-ended. The user can point it at anything they can describe:
-a directory of code, an API doc URL, a workflow they just walked the agent
-through in this conversation, or pasted notes. This module builds ONE prompt
-that instructs the live agent to:
-
-  1. Gather the sources the user named, using the tools it already has
-     (``read_file`` / ``search_files`` for dirs, ``web_extract`` for URLs, the
-     current conversation for "what I just did", the user's text for pasted
-     material).
-  2. Author a single ``SKILL.md`` via ``skill_manage`` that follows the Hermes
-     skill-authoring standards (description <=60 chars, the modern section
-     order, Hermes-tool framing, no invented commands).
-
-There is no separate distillation engine and no model-tool footprint: the
-agent does the work with its existing toolset, so this works identically on
-local, Docker, and remote terminal backends. Every surface (CLI ``/learn``,
-gateway ``/learn``, the dashboard "Learn a skill" panel) calls
-:func:`build_learn_prompt` and feeds the result to the agent as a normal turn.
-"""
-
-from __future__ import annotations
-
-# The house-style rules, distilled from AGENTS.md "Skill authoring standards
-# (HARDLINE)" and the hermes-agent-dev new-skill salvage reference. Embedded in
-# the prompt so the agent authors skills the way a maintainer would by hand.
-_AUTHORING_STANDARDS = """\
-Follow the Hermes skill-authoring standards exactly. These are the same
-HARDLINE rules a maintainer enforces in review:
-
-Frontmatter:
- name: lowercase-hyphenated, <=64 chars, no spaces.
- description: ONE sentence, **<=60 characters**, ends with a period. State the
-  capability, not the implementation. No marketing words (powerful,
-  comprehensive, seamless, advanced, robust). Do NOT repeat the skill name. If
-  the description contains a colon, wrap the whole value in double quotes.
-  This is the most-violated rule and it is NOT cosmetic: the system-prompt
-  skill index truncates the description to 60 chars and loads it every
-  session, so anything past char 60 is silently cut and never routes. After
-  you write the description, COUNT the characters; if it is over 60, cut it
-  down before saving — do not ship a sentence and hope.
-    Good (<=60): `Search arXiv papers by keyword, author, or ID.`
-    Bad (123):   `A comprehensive skill that lets the agent search arXiv for
-                  academic papers using keywords, authors, and categories.`
- version: 0.1.0
- author: always the literal value `Hermes`. NEVER fill it from the host
-  environment — the OS/login username (e.g. the `user=` line in your
-  environment hints), git config, or any identity you can probe must not be
-  written. Skills get shared and published, so an environment-derived name is
-  a privacy leak the user never opted into; the skill names itself as Hermes.
- platforms: declare `[macos]`, `[linux]`, and/or `[windows]` IF the skill
-  uses OS-bound primitives (osascript/apt/systemctl => the matching OS; /proc,
-  os.setsid, signal.SIGKILL => linux; fcntl/termios => POSIX). Prefer fixing it
-  cross-platform first (tempfile.gettempdir(), pathlib.Path, psutil); gate only
-  when the dependency is genuinely platform-bound. Omit the field for portable
-  skills.
- metadata.hermes.tags: a few Capitalized, Relevant, Tags.
-
-Body section order (omit a section only if it genuinely has no content):
-1. "# <Human Title>" then a 2-3 sentence intro: what it does, what it does NOT
-   do, and the key dependency stance (e.g. "stdlib only").
-2. "## When to Use" — bullet list of concrete trigger phrases.
-3. "## Prerequisites" — exact env vars, install steps, credentials.
-4. "## How to Run" — the canonical invocation, framed through Hermes tools.
-5. "## Quick Reference" — a flat command/endpoint list, no narration.
-6. "## Procedure" — numbered steps with copy-paste-exact commands.
-7. "## Pitfalls" — known limits, rate limits, things that look broken but aren't.
-8. "## Verification" — a single command/check that proves the skill worked.
-
-Hermes-tool framing (this is what makes it a skill, not shell docs):
- Frame running scripts as "invoke through the `terminal` tool".
- Reference Hermes tools by name in backticks: `terminal`, `read_file`,
-  `write_file`, `search_files`, `patch`, `web_extract`, `web_search`,
-  `vision_analyze`, `browser_navigate`, `delegate_task`, `image_generate`,
-  `text_to_speech`, `cronjob`, `memory`, `skill_view`, `execute_code`.
- Do NOT name shell utilities the agent already has wrapped: say `read_file`
-  not cat/head/tail, `search_files` not grep/rg/find/ls, `patch` not sed/awk,
-  `web_extract` not curl-to-scrape, `write_file` not echo>file or heredocs.
- Third-party CLIs (ffmpeg, gh, an SDK) are fine inside a script file, but the
-  prose still frames them as "invoke through the `terminal` tool". If the
-  skill needs an MCP server, name it and document its setup in Prerequisites.
-
-Quality bar:
- Prefer exact commands, endpoint URLs, function signatures, and config keys
-  that appear VERBATIM in the source. NEVER invent flags, paths, or APIs — if
-  you didn't see it in the source, don't write it.
- Keep it tight and scannable: ~100 lines for a simple skill, ~200 for a
-  complex one. Don't re-paste the source docs.
- Don't write a router/index/hub skill that only points at other skills.
- Larger scripts/parsers belong in a `scripts/` file (add via
-  `skill_manage` write_file), referenced from SKILL.md by relative path — not
-  inlined for the agent to re-type every run. References go in `references/`,
-  templates in `templates/`."""
-
-
-def build_learn_prompt(user_request: str) -> str:
-    """Build the agent prompt for an open-ended ``/learn`` request.
-
-    Args:
-        user_request: the free-text the user gave after ``/learn`` — a
-            description of the workflow, paths, URLs, or "what I just did".
-
-    Returns:
-        A complete instruction the agent runs as a normal turn. The agent
-        gathers the described sources with its existing tools and authors the
-        skill via ``skill_manage``.
-    """
-    req = (user_request or "").strip()
-    if not req:
-        req = (
-            "the workflow we just went through in this conversation — review "
-            "the steps taken and distill them into a reusable skill"
-        )
-
-    return (
-        "[/learn] The user wants you to learn a reusable skill from the "
-        "source(s) they described below, and save it.\n\n"
-        f"WHAT TO LEARN FROM:\n{req}\n\n"
-        "Do this:\n"
-        "1. Gather the material. Resolve whatever the user named using the "
-        "tools you already have — `read_file`/`search_files` for local files "
-        "or directories, `web_extract` for URLs, the current conversation "
-        "history if they referred to something you just did, and the text "
-        "they pasted as-is. If the request is ambiguous about scope, make a "
-        "reasonable choice and note it; do not stall.\n"
-        "2. Author ONE SKILL.md and save it with the `skill_manage` tool "
-        "(action=\"create\"). Pick a sensible category. If the procedure needs "
-        "a non-trivial script, add it under the skill's `scripts/` with "
-        "`skill_manage` write_file and reference it by relative path.\n\n"
-        f"{_AUTHORING_STANDARDS}\n\n"
-        "When done, tell the user the skill name, its category, and a "
-        "one-line summary of what it captured."
-    )
--- a/agent/memory_manager.py
+++ b/agent/memory_manager.py
@@ -46,39 +46,6 @@ logger = logging.getLogger(__name__)
 _SYNC_DRAIN_TIMEOUT_S = 5.0


-def normalize_tool_schema(schema: Any) -> Optional[Dict[str, Any]]:
-    """Return a function-tool dict with a resolvable top-level ``name``.
-
-    Context engines and memory providers expose tool schemas via
-    ``get_tool_schemas()``. The expected shape is a bare function schema
-    (``{"name": ..., "description": ..., "parameters": ...}``) which callers
-    wrap as ``{"type": "function", "function": schema}``.
-
-    Some providers instead return an entry that is *already* in OpenAI tool
-    form (``{"type": "function", "function": {"name": ...}}``). Wrapping that
-    a second time produces ``{"type": "function", "function": {"type":
-    "function", "function": {...}}}`` whose ``function`` has no top-level
-    ``name``. Strict providers (e.g. DeepSeek) reject the *entire* request
-    with ``tools[N].function: missing field name`` (HTTP 400), so one bad
-    schema disables the whole toolset and breaks every turn (#47707).
-
-    This helper normalizes both shapes to the bare function schema and
-    returns ``None`` for anything without a resolvable name, so callers can
-    skip-with-warning rather than appending a nameless tool.
-    """
-    if not isinstance(schema, dict):
-        return None
-    # Unwrap an already-wrapped OpenAI tool entry.
-    if schema.get("type") == "function" and isinstance(schema.get("function"), dict):
-        schema = schema["function"]
-        if not isinstance(schema, dict):
-            return None
-    name = schema.get("name", "")
-    if not name or not isinstance(name, str):
-        return None
-    return schema
-
-
 def memory_provider_tools_enabled(enabled_toolsets: Optional[List[str]]) -> bool:
    """Return whether external memory-provider tools should be exposed."""
    if enabled_toolsets is None:
@@ -125,17 +92,11 @@ def inject_memory_provider_tools(agent: Any) -> int:
        agent.valid_tool_names = valid_tool_names

    added = 0
-    for raw_schema in get_schemas():
-        schema = normalize_tool_schema(raw_schema)
-        if schema is None:
-            logger.warning(
-                "Memory provider returned a tool schema with no resolvable "
-                "name; skipping to avoid poisoning the request (%r)",
-                raw_schema,
-            )
+    for schema in get_schemas():
+        if not isinstance(schema, dict):
            continue
-        tool_name = schema["name"]
-        if tool_name in existing_tool_names:
+        tool_name = schema.get("name", "")
+        if not tool_name or tool_name in existing_tool_names:
            continue
        tools.append({"type": "function", "function": schema})
        valid_tool_names.add(tool_name)
@@ -409,11 +370,8 @@ class MemoryManager:
        _core_tool_names = set(_HERMES_CORE_TOOLS)

        # Index tool names → provider for routing
-        for raw_schema in provider.get_tool_schemas():
-            schema = normalize_tool_schema(raw_schema)
-            if schema is None:
-                continue
-            tool_name = schema["name"]
+        for schema in provider.get_tool_schemas():
+            tool_name = schema.get("name", "")
            if tool_name in _core_tool_names:
                logger.warning(
                    "Memory provider '%s' tool '%s' shadows a reserved core "
@@ -700,19 +658,11 @@ class MemoryManager:
        seen = set()
        for provider in self._providers:
            try:
-                for raw_schema in provider.get_tool_schemas():
-                    schema = normalize_tool_schema(raw_schema)
-                    if schema is None:
-                        logger.warning(
-                            "Memory provider '%s' returned a tool schema with "
-                            "no resolvable name; skipping (%r)",
-                            provider.name, raw_schema,
-                        )
-                        continue
-                    name = schema["name"]
+                for schema in provider.get_tool_schemas():
+                    name = schema.get("name", "")
                    if name in _core_tool_names:
                        continue
-                    if name not in seen:
+                    if name and name not in seen:
                        schemas.append(schema)
                        seen.add(name)
            except Exception as e:
--- a/agent/message_sanitization.py
+++ b/agent/message_sanitization.py
@@ -279,38 +279,6 @@ def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
    return "{}"


-def close_interrupted_tool_sequence(messages: list, final_response: Any = None) -> bool:
-    """Append a synthetic assistant turn when an interrupted tail is a tool result.
-
-    A turn cut short by ``/stop`` can leave the transcript ending on a raw
-    ``tool`` message (a tool finished, or its execution was cancelled, but the
-    model never streamed a closing assistant turn). Persisting that tail means
-    the next user message lands as ``… tool → user`` — a role-alternation
-    violation that strict providers (Gemini, Claude) react to by hallucinating
-    a continuation of the user's message and ignoring prior context, which
-    reads to the user as "lost context" (#48879).
-
-    ``finalize_turn`` closes this on the happy interrupt path, but the
-    retry/backoff/error interrupt aborts in ``conversation_loop`` ``return``
-    early and never reach it — this shared helper closes the sequence on all of
-    them. ``final_response`` is usually empty on an interrupt, so an explicit
-    placeholder is used rather than an empty-content assistant turn.
-
-    Mutates ``messages`` in place. Returns True if a closing turn was appended.
-    """
-    if not messages:
-        return False
-    last = messages[-1]
-    if not isinstance(last, dict) or last.get("role") != "tool":
-        return False
-    text = final_response if isinstance(final_response, str) else ""
-    messages.append({
-        "role": "assistant",
-        "content": text.strip() or "Operation interrupted.",
-    })
-    return True
-
-
 def _strip_non_ascii(text: str) -> str:
    """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing.

@@ -463,7 +431,6 @@ def _sanitize_structure_non_ascii(payload: Any) -> bool:

 __all__ = [
    "_SURROGATE_RE",
-    "close_interrupted_tool_sequence",
    "_sanitize_surrogates",
    "_sanitize_structure_surrogates",
    "_sanitize_messages_surrogates",
--- a/agent/moa_loop.py
+++ b/agent/moa_loop.py
@@ -1,586 +0,0 @@
-"""Mixture-of-Agents runtime helpers for /moa turns.
-
-The slash command is deliberately not a model tool. It marks one user turn as
-MoA-enabled; the normal Hermes agent loop still owns tool calling and turn
-termination, while this module gathers reference-model context before each model
-iteration.
-"""
-
-from __future__ import annotations
-
-import hashlib
-import logging
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any
-
-from agent.auxiliary_client import call_llm
-from agent.transports import get_transport
-
-logger = logging.getLogger(__name__)
-
-# Upper bound on concurrent reference-model calls. References are independent
-# advisory calls (no tools, no inter-dependence), so we fan them out the same
-# way delegate_task runs a batch: all in flight at once, results collected when
-# every reference finishes. Presets rarely list more than a handful of
-# references; this cap just protects against a pathologically large preset
-# opening dozens of sockets at once.
-_MAX_REFERENCE_WORKERS = 8
-
-# Per-tool-result character budget for the advisory reference view. Tool
-# results can be huge (a full diff, a 5000-line file dump); replaying them
-# verbatim per reference per tool-loop step would blow the reference model's
-# context window and cost. We keep the agent's *actions* (tool calls) in full —
-# they are cheap, high-signal, and tell the reference what the agent did — but
-# preview each tool *result* head+tail so the reference still sees what came
-# back without replaying megabytes. The acting aggregator always gets the full,
-# untrimmed transcript; this budget only shapes the advisory copy.
-_REFERENCE_TOOL_RESULT_BUDGET = 4000
-
-# System prompt prepended to every reference-model call. References are
-# advisory — they do NOT act, call tools, or own the task. Without this
-# framing a reference receives the bare trimmed conversation and assumes it is
-# the acting agent: it then refuses ("I can't access repositories / URLs from
-# here") or tries to call tools it doesn't have. The prompt reframes the model
-# as an analyst whose job is to reason about the presented state and hand its
-# best thinking to the aggregator/orchestrator that will actually act.
-_REFERENCE_SYSTEM_PROMPT = (
-    "You are a reference advisor in a Mixture of Agents (MoA) process. You are "
-    "NOT the acting agent and you do NOT execute anything: you cannot call "
-    "tools, run commands, browse, or access files, repositories, or URLs, and "
-    "you should not try to or apologize for being unable to. A separate "
-    "aggregator/orchestrator model holds those capabilities and will take the "
-    "actual actions.\n\n"
-    "The conversation below is the current state of a task handled by that "
-    "acting agent. Your job is to give your most intelligent analysis of that "
-    "state: understand the goal, reason about the problem, and advise on what "
-    "to do next. Surface the best approach, concrete next steps and tool-use "
-    "strategy, likely pitfalls and risks, and anything the acting agent may "
-    "have missed or gotten wrong. Assume any referenced files, URLs, or "
-    "systems exist and reason about them from the context given rather than "
-    "asking for access.\n\n"
-    "Respond with your advice directly — no preamble, no disclaimers about "
-    "tools or access. Your response is private guidance handed to the "
-    "aggregator, not an answer shown to the user."
-)
-
-
-
-def _slot_label(slot: dict[str, str]) -> str:
-    return f"{slot.get('provider', '').strip()}:{slot.get('model', '').strip()}"
-
-
-def _slot_runtime(slot: dict[str, str]) -> dict[str, Any]:
-    """Resolve a reference/aggregator slot to real runtime call kwargs.
-
-    A MoA slot is just a model selection — it must be called the same way any
-    model is called elsewhere, not through a bare ``call_llm(provider=...,
-    model=...)`` that leaves base_url/api_key/api_mode unresolved and lets the
-    auxiliary auto-detector guess. We route the slot's provider through
-    ``resolve_runtime_provider`` (the canonical provider→api_mode/base_url/
-    api_key resolver the CLI, gateway, and delegate_task all use), so the slot
-    gets its provider's real API surface — e.g. MiniMax → anthropic_messages,
-    GPT-5/o-series → max_completion_tokens, custom endpoints → their base_url.
-
-    Returns the kwargs to pass through to ``call_llm`` (provider/model plus the
-    resolved base_url/api_key when available). Falls back to the bare
-    provider/model on any resolution error so a misconfigured slot still
-    attempts the call rather than aborting the whole MoA turn.
-    """
-    provider = str(slot.get("provider") or "").strip()
-    model = str(slot.get("model") or "").strip()
-    out: dict[str, Any] = {"provider": provider, "model": model}
-    try:
-        from hermes_cli.runtime_provider import resolve_runtime_provider
-
-        rt = resolve_runtime_provider(requested=provider, target_model=model)
-        resolved_provider = str(rt.get("provider") or provider).strip().lower()
-        # call_llm treats an explicit base_url as a custom endpoint. That is
-        # correct for ordinary OpenAI-compatible targets, but wrong for OAuth /
-        # provider-backed targets whose provider branch adds auth refresh,
-        # request metadata, or request-shape adapters. Keep those providers
-        # identified by name.
-        if resolved_provider in {"nous", "openai-codex", "xai-oauth"}:
-            return out
-        # Pass the resolved endpoint through so call_llm builds the request for
-        # the provider's actual API surface instead of auto-detecting. base_url
-        # routes call_llm to the right adapter (incl. anthropic_messages mode);
-        # api_key is the resolved credential for that provider.
-        if rt.get("base_url"):
-            out["base_url"] = rt["base_url"]
-        if rt.get("api_key"):
-            out["api_key"] = rt["api_key"]
-    except Exception as exc:  # pragma: no cover - defensive
-        logger.debug("MoA slot runtime resolution failed for %s: %s", _slot_label(slot), exc)
-    return out
-
-
-def _run_reference(
-    slot: dict[str, str],
-    ref_messages: list[dict[str, Any]],
-    *,
-    temperature: float | None = None,
-    max_tokens: int | None = None,
-) -> tuple[str, str]:
-    """Call one reference model and return ``(label, text)``.
-
-    The slot is resolved to its provider's real runtime (via ``_slot_runtime``)
-    and called through the same ``call_llm`` request-building path any model
-    uses, so per-model wire-format handling (anthropic_messages,
-    max_completion_tokens, fixed/forbidden temperature) applies identically to
-    a reference as it would if that model were the acting model. MoA imposes no
-    cap of its own (``max_tokens`` defaults to ``None`` → omitted → the model's
-    real maximum); ``temperature`` is only the user's configured preset value,
-    which call_llm may still override per model.
-
-    Never raises: a failed reference becomes a labelled note so the aggregator
-    can still act with partial context. Designed to run inside a thread pool —
-    ``call_llm`` is synchronous/blocking, so threads (not asyncio) are the right
-    concurrency primitive, mirroring ``delegate_task``'s batch fan-out.
-    """
-    label = _slot_label(slot)
-    try:
-        # Prepend the advisory-role system prompt so the reference understands
-        # it is analyzing state for an aggregator, not acting on the task. The
-        # trimmed view (_reference_messages) already strips the agent's own
-        # system prompt, so this is the only system message the reference sees.
-        messages = [{"role": "system", "content": _REFERENCE_SYSTEM_PROMPT}, *ref_messages]
-        response = call_llm(
-            task="moa_reference",
-            messages=messages,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            **_slot_runtime(slot),
-        )
-        return label, _extract_text(response) or "(empty response)"
-    except Exception as exc:
-        logger.warning("MoA reference model %s failed: %s", label, exc)
-        return label, f"[failed: {exc}]"
-
-
-def _run_references_parallel(
-    reference_models: list[dict[str, str]],
-    ref_messages: list[dict[str, Any]],
-    *,
-    temperature: float | None = None,
-    max_tokens: int | None = None,
-) -> list[tuple[str, str]]:
-    """Fan out all reference models in parallel, returning outputs in order.
-
-    Like ``delegate_task``'s batch mode, every reference is dispatched at once
-    and we block until all of them finish before handing the joined results to
-    the aggregator. Output order matches ``reference_models`` so the
-    ``Reference {idx}`` labelling stays stable. MoA presets that reference
-    another MoA preset are skipped here (recursion guard) with a labelled note.
-    """
-    if not reference_models:
-        return []
-
-    results: list[tuple[str, str] | None] = [None] * len(reference_models)
-    futures = {}
-    workers = min(_MAX_REFERENCE_WORKERS, len(reference_models))
-    with ThreadPoolExecutor(max_workers=workers) as executor:
-        for idx, slot in enumerate(reference_models):
-            if slot.get("provider") == "moa":
-                results[idx] = (
-                    _slot_label(slot),
-                    "[skipped: MoA presets cannot recursively reference MoA]",
-                )
-                continue
-            futures[
-                executor.submit(
-                    _run_reference,
-                    slot,
-                    ref_messages,
-                    temperature=temperature,
-                    max_tokens=max_tokens,
-                )
-            ] = idx
-        # Collect every reference before returning — the aggregator needs the
-        # complete set, so there is no early-exit / first-completed path here.
-        for future, idx in futures.items():
-            results[idx] = future.result()
-
-    return [r for r in results if r is not None]
-
-
-def _truncate_tool_result(text: str, budget: int = _REFERENCE_TOOL_RESULT_BUDGET) -> str:
-    """Head+tail preview of a tool result for the advisory view.
-
-    Keeps the first and last halves of the budget with a ``[... N chars
-    omitted ...]`` marker between them, so a reference sees both how the result
-    started and how it ended without replaying the whole payload.
-    """
-    if not text or len(text) <= budget:
-        return text
-    half = budget // 2
-    omitted = len(text) - 2 * half
-    return f"{text[:half]}\n[... {omitted} chars omitted ...]\n{text[-half:]}"
-
-
-def _render_tool_calls(tool_calls: Any) -> str:
-    """Render an assistant turn's tool_calls as readable text lines.
-
-    The advisory view cannot carry real ``tool_calls`` payloads (strict
-    providers reject tool_calls the reference never produced), so the agent's
-    actions are flattened to text the reference can read and reason about.
-    """
-    lines: list[str] = []
-    for tc in tool_calls or []:
-        fn = (tc.get("function") or {}) if isinstance(tc, dict) else {}
-        name = fn.get("name") or (tc.get("name") if isinstance(tc, dict) else "") or "tool"
-        args = fn.get("arguments")
-        if isinstance(args, str):
-            args_text = args
-        elif args is not None:
-            try:
-                import json
-
-                args_text = json.dumps(args, ensure_ascii=False)
-            except Exception:
-                args_text = str(args)
-        else:
-            args_text = ""
-        lines.append(f"[called tool: {name}({args_text})]" if args_text else f"[called tool: {name}]")
-    return "\n".join(lines)
-
-
-def _reference_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    """Build an advisory view of the conversation for reference models.
-
-    A reference gives an INFORMED judgement on the current state, so it must
-    see what the agent actually did — its tool calls AND the tool results that
-    came back — not just the agent's narration. We therefore preserve the whole
-    conversation flow, but flatten it into clean user/assistant *text* turns:
-
-      - system prompt: dropped (8K of Hermes boilerplate, not advisory signal).
-      - assistant turns: kept; any ``tool_calls`` are rendered inline as
-        ``[called tool: name(args)]`` text lines appended to the turn's text.
-      - ``tool``-role results: NOT dropped. Each is folded (head+tail preview,
-        see ``_truncate_tool_result``) into the *preceding* assistant turn as a
-        ``[tool result: ...]`` block, so the reference sees what came back.
-
-    This emits ZERO ``tool``-role messages and ZERO ``tool_calls`` arrays — only
-    plain user/assistant text — so strict providers (Mistral, Fireworks) that
-    reject orphan tool messages / unproduced tool_calls don't 400, while the
-    reference still has the full picture.
-
-    The view MUST end with a ``user`` turn. Anthropic (and OpenRouter→Anthropic)
-    interpret a trailing assistant turn as an assistant *prefill* to continue,
-    and no-prefill models (e.g. Claude Opus 4.8) reject it with
-    ``400 ... must end with a user message``. Rather than DELETE the agent's
-    latest context to satisfy that (which would blind the reference to the
-    current state), we APPEND a synthetic user turn asking the reference to
-    judge the state above. End-on-user is satisfied and no context is lost.
-
-    The acting aggregator always receives the full, untrimmed transcript; this
-    function only shapes the disposable advisory copy.
-    """
-    advisory_instruction = (
-        "[The conversation above is the current state of the task. Give your "
-        "most intelligent judgement: what is going on, what should happen next, "
-        "what risks or mistakes you see, and how the acting agent should "
-        "proceed.]"
-    )
-
-    rendered: list[dict[str, Any]] = []
-    last_user_content: str | None = None
-    for msg in messages:
-        role = msg.get("role")
-        content = msg.get("content")
-        text = content if isinstance(content, str) else ""
-
-        if role == "system":
-            continue
-        if role == "user":
-            if text.strip():
-                last_user_content = text
-            rendered.append({"role": "user", "content": text})
-        elif role == "assistant":
-            parts: list[str] = []
-            if text.strip():
-                parts.append(text.strip())
-            calls_text = _render_tool_calls(msg.get("tool_calls"))
-            if calls_text:
-                parts.append(calls_text)
-            # Empty assistant turns (no text, no calls) carry nothing advisory.
-            if parts:
-                rendered.append({"role": "assistant", "content": "\n".join(parts)})
-        elif role == "tool":
-            # Fold the tool result into the preceding assistant turn as text so
-            # the reference sees what came back, without emitting a tool-role
-            # message a reference never produced.
-            result_text = _truncate_tool_result(text)
-            block = f"[tool result: {result_text}]"
-            if rendered and rendered[-1].get("role") == "assistant":
-                rendered[-1]["content"] = rendered[-1]["content"] + "\n" + block
-            else:
-                # No assistant turn to attach to (e.g. a leading tool result);
-                # keep it as advisory context on its own assistant-role line.
-                rendered.append({"role": "assistant", "content": block})
-        # Any other role is ignored.
-
-    # End on a user turn: append a synthetic advisory request rather than
-    # deleting the agent's latest assistant context. This satisfies Anthropic's
-    # no-trailing-assistant-prefill rule while preserving full state.
-    if rendered and rendered[-1].get("role") == "assistant":
-        rendered.append({"role": "user", "content": advisory_instruction})
-    elif rendered and rendered[-1].get("role") == "user":
-        # Already ends on a user turn (fresh user prompt, no agent action yet).
-        # Leave it — the reference answers that prompt directly.
-        pass
-
-    if not rendered:
-        # Degenerate case: nothing rendered. Fall back to the latest user turn.
-        if last_user_content is not None:
-            return [{"role": "user", "content": last_user_content}]
-        for msg in reversed(messages):
-            if msg.get("role") == "user" and isinstance(msg.get("content"), str):
-                return [{"role": "user", "content": msg["content"]}]
-    return rendered
-
-
-
-def _extract_text(response: Any) -> str:
-    try:
-        transport = get_transport("chat_completions")
-        if transport is None:
-            raise RuntimeError("chat_completions transport unavailable")
-        normalized = transport.normalize_response(response)
-        text = (normalized.content or "").strip()
-        if text:
-            return text
-    except Exception:
-        pass
-    try:
-        content = response.choices[0].message.content
-        return (content or "").strip()
-    except Exception:
-        return ""
-
-
-def aggregate_moa_context(
-    *,
-    user_prompt: str,
-    api_messages: list[dict[str, Any]],
-    reference_models: list[dict[str, str]],
-    aggregator: dict[str, str],
-    temperature: float = 0.6,
-    aggregator_temperature: float = 0.4,
-    max_tokens: int | None = None,
-) -> str:
-    """Run configured reference models and synthesize their advice.
-
-    Failures are returned as model-specific notes instead of aborting the normal
-    agent loop; the main model can still act with partial context.
-
-    ``max_tokens`` is ``None`` by default: MoA does not cap reference or
-    aggregator output, so each model uses its own maximum. ``call_llm`` omits
-    the parameter entirely when it is ``None`` (see its docstring), which also
-    sidesteps providers that reject ``max_tokens`` outright. A hardcoded cap
-    here previously truncated long aggregator syntheses.
-    """
-    reference_outputs: list[tuple[str, str]] = []
-    ref_messages = _reference_messages(api_messages)
-    reference_outputs = _run_references_parallel(
-        reference_models,
-        ref_messages,
-        temperature=temperature,
-        max_tokens=max_tokens,
-    )
-
-    joined = "\n\n".join(
-        f"Reference {idx} — {label}:\n{text}"
-        for idx, (label, text) in enumerate(reference_outputs, start=1)
-    )
-    synth_prompt = (
-        "You are the aggregator in a Mixture of Agents process. Synthesize the "
-        "reference responses into concise, actionable guidance for the main "
-        "Hermes agent. Focus on next steps, tool-use strategy, risks, and any "
-        "disagreements. Do not answer the user directly unless that is all that "
-        "is needed; produce context the main agent should use in its normal loop.\n\n"
-        f"Original user prompt:\n{user_prompt}\n\n"
-        f"Reference responses:\n{joined}"
-    )
-
-    agg_label = _slot_label(aggregator)
-    try:
-        response = call_llm(
-            task="moa_aggregator",
-            messages=[{"role": "user", "content": synth_prompt}],
-            temperature=aggregator_temperature,
-            max_tokens=max_tokens,
-            **_slot_runtime(aggregator),
-        )
-        synthesis = _extract_text(response)
-    except Exception as exc:
-        logger.warning("MoA aggregator model %s failed: %s", agg_label, exc)
-        synthesis = ""
-
-    if not synthesis:
-        synthesis = joined
-
-    return (
-        "[Mixture of Agents context — use this as private guidance for the "
-        "normal Hermes agent loop. You may call tools, continue reasoning, or "
-        "finish normally.]\n"
-        f"Aggregator: {agg_label}\n"
-        f"References: {', '.join(_slot_label(slot) for slot in reference_models)}\n\n"
-        f"{synthesis.strip()}"
-    )
-
-
-class MoAChatCompletions:
-    """OpenAI-chat-compatible facade where the aggregator is the acting model."""
-
-    def __init__(self, preset_name: str, reference_callback: Any = None):
-        self.preset_name = preset_name or "default"
-        # Optional display hook. Called as reference outputs become available so
-        # frontends can show each reference model's answer as a labelled block
-        # before the aggregator acts. Signature:
-        #   reference_callback(event, **kwargs)
-        # where event is one of:
-        #   "moa.reference"   kwargs: index, count, label, text
-        #   "moa.aggregating" kwargs: aggregator (label), ref_count
-        # Never raises into the model call — display is best-effort.
-        self.reference_callback = reference_callback
-        # State-scoped reference cache. The agent loop calls create() once per
-        # tool-loop iteration; references should re-run whenever the task STATE
-        # advances — i.e. on every new user message AND every new tool result —
-        # so each reference judges the latest state. The advisory view
-        # (_reference_messages) now renders tool calls + results as text, so its
-        # signature changes on every new tool response; the cache key is that
-        # signature, so a new tool result is a cache MISS (references re-run)
-        # while a redundant create() call with identical state is a HIT (no
-        # re-run, no re-emit). This gives "fire on every user/tool response"
-        # for free, without re-firing on a pure no-op re-call.
-        self._ref_cache_key: tuple | None = None
-        self._ref_cache_outputs: list[tuple[str, str]] = []
-
-    def _emit(self, event: str, **kwargs: Any) -> None:
-        cb = self.reference_callback
-        if cb is None:
-            return
-        try:
-            cb(event, **kwargs)
-        except Exception as exc:  # pragma: no cover - display must never break the turn
-            logger.debug("MoA reference_callback failed for %s: %s", event, exc)
-
-    def create(self, **api_kwargs: Any) -> Any:
-        from hermes_cli.config import load_config
-        from hermes_cli.moa_config import resolve_moa_preset
-
-        preset = resolve_moa_preset(load_config().get("moa") or {}, self.preset_name)
-        messages = list(api_kwargs.get("messages") or [])
-        reference_models = preset.get("reference_models") or []
-        aggregator = preset.get("aggregator") or {}
-        # MoA does not cap reference or aggregator output: each model uses its
-        # own maximum. Passing max_tokens=None makes call_llm omit the parameter
-        # (it never caps by default), so a long aggregator synthesis is never
-        # truncated and providers that reject max_tokens don't 400.
-        temperature = float(preset.get("reference_temperature", 0.6) or 0.6)
-        aggregator_temperature = float(preset.get("aggregator_temperature", api_kwargs.get("temperature") or 0.4) or 0.4)
-
-        # When the preset is disabled, skip the reference fan-out and let the
-        # configured aggregator act alone — it is the preset's acting model, so
-        # a disabled MoA preset is simply "use the aggregator directly."
-        if not preset.get("enabled", True):
-            reference_models = []
-
-        reference_outputs: list[tuple[str, str]] = []
-        ref_messages = _reference_messages(messages)
-
-        # Turn-scoped cache: only run + display references when the advisory
-        # view changed (i.e. a new user turn). Within one turn the agent loop
-        # calls create() once per tool iteration with the same advisory view;
-        # reuse the cached outputs and skip both the re-run and the re-emit.
-        _sig = hashlib.sha256(
-            "\u0000".join(
-                f"{m.get('role')}:{m.get('content')}" for m in ref_messages
-            ).encode("utf-8", "replace")
-        ).hexdigest()
-        _cache_key = (self.preset_name, _sig, tuple(_slot_label(s) for s in reference_models))
-        _refs_from_cache = _cache_key == self._ref_cache_key and bool(self._ref_cache_outputs)
-
-        if _refs_from_cache:
-            reference_outputs = list(self._ref_cache_outputs)
-        else:
-            reference_outputs = _run_references_parallel(
-                reference_models,
-                ref_messages,
-                temperature=temperature,
-                max_tokens=None,
-            )
-            self._ref_cache_key = _cache_key
-            self._ref_cache_outputs = list(reference_outputs)
-
-            # Surface each reference model's answer to the display BEFORE the
-            # aggregator acts — once per turn (only on the iteration that
-            # actually ran them). The user sees one labelled block per
-            # reference (rendered like a thinking block) so the MoA process is
-            # visible rather than a silent pause. Best-effort: never blocks the
-            # turn.
-            _ref_count = len(reference_outputs)
-            for _idx, (_label, _text) in enumerate(reference_outputs, start=1):
-                self._emit(
-                    "moa.reference",
-                    index=_idx,
-                    count=_ref_count,
-                    label=_label,
-                    text=_text,
-                )
-            if _ref_count:
-                self._emit(
-                    "moa.aggregating",
-                    aggregator=_slot_label(aggregator),
-                    ref_count=_ref_count,
-                )
-
-        agg_messages = [dict(m) for m in messages]
-        if reference_outputs:
-            joined = "\n\n".join(
-                f"Reference {idx} — {label}:\n{text}"
-                for idx, (label, text) in enumerate(reference_outputs, start=1)
-            )
-            guidance = (
-                "[Mixture of Agents reference context]\n"
-                f"Preset: {self.preset_name}\n"
-                f"Aggregator/acting model: {_slot_label(aggregator)}\n"
-                f"References: {', '.join(label for label, _ in reference_outputs)}\n\n"
-                "Use the reference responses below as private context. You are the aggregator and acting model: "
-                "answer the user directly or call tools as needed.\n\n"
-                f"{joined}"
-            )
-            for msg in reversed(agg_messages):
-                if msg.get("role") == "user" and isinstance(msg.get("content"), str):
-                    msg["content"] = msg["content"] + "\n\n" + guidance
-                    break
-            else:
-                agg_messages.append({"role": "user", "content": guidance})
-
-        if aggregator.get("provider") == "moa":
-            raise RuntimeError("MoA aggregator cannot be another MoA preset")
-        agg_kwargs = dict(api_kwargs)
-        agg_kwargs["messages"] = agg_messages
-        # The aggregator is the acting model. Resolve its slot to the provider's
-        # real runtime (base_url/api_key/api_mode) and call it through the same
-        # request-building path any model uses — so per-model wire-format
-        # handling (anthropic_messages, max_completion_tokens, fixed/forbidden
-        # temperature) applies identically to it. MoA imposes no output cap:
-        # max_tokens is passed through from the caller (normally None → omitted
-        # → the model's real maximum). The preset's old hardcoded 4096 default
-        # is gone — it truncated long syntheses.
-        return call_llm(
-            task="moa_aggregator",
-            messages=agg_messages,
-            temperature=aggregator_temperature,
-            max_tokens=agg_kwargs.get("max_tokens"),
-            tools=agg_kwargs.get("tools"),
-            extra_body=agg_kwargs.get("extra_body"),
-            **_slot_runtime(aggregator),
-        )
-
-
-class MoAClient:
-    def __init__(self, preset_name: str, reference_callback: Any = None):
-        self.chat = type("_MoAChat", (), {})()
-        self.chat.completions = MoAChatCompletions(preset_name, reference_callback=reference_callback)
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -1646,34 +1646,6 @@ def get_model_context_length(
    if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0:
        return config_context_length

-    # 0a. MoA virtual provider — ``model`` is a preset name, not a real model,
-    # and ``base_url`` is the local virtual endpoint, so every probe below would
-    # miss and fall through to the 256K default. The aggregator is the acting
-    # model, so resolve the context window from the aggregator slot's real
-    # provider+model instead. References are advisory-only and never bound the
-    # acting context, so they're ignored here.
-    if (provider or "").strip().lower() == "moa":
-        try:
-            from hermes_cli.config import load_config
-            from hermes_cli.moa_config import resolve_moa_preset
-            from hermes_cli.runtime_provider import resolve_runtime_provider
-
-            preset = resolve_moa_preset(load_config().get("moa") or {}, model)
-            agg = preset.get("aggregator") or {}
-            agg_provider = str(agg.get("provider") or "").strip()
-            agg_model = str(agg.get("model") or "").strip()
-            if agg_model and agg_provider and agg_provider.lower() != "moa":
-                rt = resolve_runtime_provider(requested=agg_provider, target_model=agg_model)
-                return get_model_context_length(
-                    agg_model,
-                    base_url=rt.get("base_url", "") or "",
-                    api_key=rt.get("api_key", "") or "",
-                    provider=agg_provider,
-                )
-        except Exception:
-            logger.debug("MoA aggregator context-length resolution failed", exc_info=True)
-        # Fall through to the generic default if aggregator resolution failed.
-
    # 0b. custom_providers per-model override — check before any probe.
    # This closes the gap where /model switch and display paths used to fall
    # back to 128K despite the user having a per-model context_length set.
--- a/agent/oneshot.py
+++ b/agent/oneshot.py
@@ -1,158 +0,0 @@
-"""Shared one-off LLM requests for non-conversational helpers.
-
-A "one-shot" is a single, stateless model call that runs *outside* any
-conversation: it never touches a session's history, never breaks prompt
-caching, and returns plain text. UI surfaces use it for small generative
-chores — a commit message from a diff, a rename suggestion, a summary —
-where spinning up an agent turn would be wrong (it would pollute the thread)
-and hand-rolling an LLM call at every call site would be worse.
-
-Two ways to call it:
-
-  * ``run_oneshot(instructions=..., user_input=...)`` — caller supplies the
-    full prompt.
-  * ``run_oneshot(template="commit_message", variables={...})`` — caller
-    names a registered template and passes its variables; the template owns
-    the prompt engineering so it stays consistent across CLI/TUI/desktop.
-
-Model selection rides the same auxiliary plumbing as title generation
-(:func:`agent.auxiliary_client.call_llm`): pass ``main_runtime`` to inherit
-the live session's provider/model, otherwise the configured ``task`` (default
-``title_generation``) resolves a cheap/fast backend.
-"""
-
-import logging
-from typing import Any, Callable, Dict, Optional, Tuple
-
-from agent.auxiliary_client import call_llm, extract_content_or_reasoning
-
-logger = logging.getLogger(__name__)
-
-# A template turns a variables dict into a (instructions, user_input) pair.
-# Templates are plain callables (not str.format) so diff/code payloads with
-# literal "{" / "}" pass through untouched.
-PromptTemplate = Callable[[Dict[str, Any]], Tuple[str, str]]
-
-
-def _truncate(text: str, limit: int) -> str:
-    text = text or ""
-    if len(text) <= limit:
-        return text
-    return text[:limit].rstrip() + "\n…(truncated)"
-
-
-_COMMIT_INSTRUCTIONS = (
-    "You write git commit messages. Given a diff of staged changes, write ONE "
-    "concise Conventional Commits message describing what the change does and why.\n"
-    "Rules:\n"
-    "- Subject line: type(scope): summary — imperative mood, lower-case, no "
-    "trailing period, ≤ 72 characters. Types: feat, fix, refactor, perf, docs, "
-    "test, build, chore, style, ci.\n"
-    "- Omit the scope if it isn't obvious.\n"
-    "- Add a short body (wrapped at ~72 cols) ONLY when the change needs "
-    "explanation; skip it for small/obvious changes.\n"
-    "- Describe the actual change, never restate the diff line-by-line.\n"
-    "- Return ONLY the commit message text — no quotes, no markdown fences, no "
-    "preamble."
-)
-
-
-def _commit_message_template(variables: Dict[str, Any]) -> Tuple[str, str]:
-    diff = _truncate(str(variables.get("diff") or ""), 12000)
-    recent = _truncate(str(variables.get("recent_commits") or ""), 1500)
-
-    parts = []
-    if recent.strip():
-        parts.append(
-            "Recent commit subjects from this repo (match their style/conventions):\n"
-            f"{recent}"
-        )
-    parts.append("Diff to describe:\n" + (diff or "(no textual diff available)"))
-
-    # "Regenerate" must yield something new even on models that decode greedily
-    # / pin temperature server-side. A trailing nonce isn't enough, so we hand
-    # back the previous message and require a genuinely different one.
-    avoid = _truncate(str(variables.get("avoid") or "").strip(), 1000)
-    if avoid:
-        parts.append(
-            "You already proposed the message below and the user wants a "
-            "different one. Write a NEW message with different wording (and, if "
-            "reasonable, a different emphasis or scope framing) — do not repeat "
-            f"it:\n{avoid}"
-        )
-
-    return _COMMIT_INSTRUCTIONS, "\n\n".join(parts)
-
-
-# Registry of named templates. Add an entry here to give a new surface a
-# consistent, reusable prompt without teaching every caller the prompt text.
-PROMPT_TEMPLATES: Dict[str, PromptTemplate] = {
-    "commit_message": _commit_message_template,
-}
-
-
-def render_template(name: str, variables: Optional[Dict[str, Any]] = None) -> Tuple[str, str]:
-    """Resolve a registered template into (instructions, user_input).
-
-    Raises KeyError if the template name is unknown so callers fail loudly
-    instead of silently sending an empty prompt.
-    """
-    template = PROMPT_TEMPLATES.get(name)
-    if template is None:
-        raise KeyError(f"unknown one-shot template: {name}")
-    return template(variables or {})
-
-
-def run_oneshot(
-    *,
-    instructions: str = "",
-    user_input: str = "",
-    template: Optional[str] = None,
-    variables: Optional[Dict[str, Any]] = None,
-    task: str = "title_generation",
-    max_tokens: int = 1024,
-    temperature: Optional[float] = 0.3,
-    timeout: float = 60.0,
-    main_runtime: Optional[Dict[str, Any]] = None,
-) -> str:
-    """Run a single stateless LLM request and return its text.
-
-    Provide either a registered ``template`` (+ ``variables``) or an explicit
-    ``instructions`` / ``user_input`` pair. Returns the model's text answer,
-    stripped of surrounding whitespace and any wrapping code fence.
-
-    Raises RuntimeError when no LLM provider is configured (surfaced from
-    :func:`call_llm`) and KeyError for an unknown template name.
-    """
-    if template:
-        instructions, user_input = render_template(template, variables)
-
-    if not (instructions or "").strip() and not (user_input or "").strip():
-        raise ValueError("run_oneshot requires a template or instructions/user_input")
-
-    messages = []
-    if (instructions or "").strip():
-        messages.append({"role": "system", "content": instructions})
-    messages.append({"role": "user", "content": user_input or ""})
-
-    response = call_llm(
-        task=task,
-        messages=messages,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        timeout=timeout,
-        main_runtime=main_runtime,
-    )
-
-    text = (extract_content_or_reasoning(response) or "").strip()
-    return _strip_code_fence(text)
-
-
-def _strip_code_fence(text: str) -> str:
-    """Drop a single wrapping ``` fence the model may have added."""
-    if not text.startswith("```"):
-        return text
-    lines = text.splitlines()
-    if len(lines) >= 2 and lines[0].startswith("```") and lines[-1].strip() == "```":
-        return "\n".join(lines[1:-1]).strip()
-    return text
--- a/agent/pet/init.py
+++ b/agent/pet/init.py
@@ -1,51 +0,0 @@
-"""Petdex pet engine — shared core for the CLI, TUI, and desktop surfaces.
-
-Petdex (https://github.com/crafter-station/petdex) is a public gallery of
-animated sprite "pets" for coding agents.  Each pet is a ``pet.json`` plus a
-``spritesheet.{webp,png}`` of 192×208 px cells. Current Codex/petdex sheets use
-an 8-column × 9-row atlas; older Hermes/petdex sheets used an 8-row atlas.
-Hermes infers the row taxonomy from the sheet and maps agent activity onto
-idle/run/review/failed/wave/jump.
-
-This package is the **single source of truth** for the feature so the base
-CLI (Python) and TUI (Ink, via ``tui_gateway``) never duplicate the hard
-parts:
-
- :mod:`agent.pet.constants` — frame geometry + the :class:`PetState` enum.
- :mod:`agent.pet.state`     — map agent activity → a :class:`PetState`.
- :mod:`agent.pet.manifest`  — fetch the public petdex manifest.
- :mod:`agent.pet.store`     — install / list / resolve pets on disk
-                               (profile-aware via ``get_hermes_home()``).
- :mod:`agent.pet.render`    — decode a spritesheet and encode frames for a
-                               terminal (kitty / iTerm2 / sixel graphics
-                               protocols, with a Unicode half-block
-                               fallback).
-
-Rendering in the Electron desktop is necessarily TypeScript (canvas), but it
-reuses the same on-disk store and the same state semantics.
-
-The whole feature is a *display* concern: it adds no model tool, mutates no
-system prompt or toolset, and therefore has zero effect on prompt caching.
-"""
-
-from agent.pet.constants import (
-    DEFAULT_SCALE,
-    FRAME_H,
-    FRAME_W,
-    FRAMES_PER_STATE,
-    LOOP_MS,
-    STATE_ROWS,
-    PetState,
-)
-from agent.pet.state import derive_pet_state
-
-__all__ = [
-    "DEFAULT_SCALE",
-    "FRAME_H",
-    "FRAME_W",
-    "FRAMES_PER_STATE",
-    "LOOP_MS",
-    "STATE_ROWS",
-    "PetState",
-    "derive_pet_state",
-]
--- a/agent/pet/constants.py
+++ b/agent/pet/constants.py
@@ -1,167 +0,0 @@
-"""Pet sprite geometry + animation-state taxonomy.
-
-These values are the common petdex/Codex pet geometry. The real ``pet.json``
-usually only carries ``id``/``displayName``/``description``/``spritesheetPath``;
-row taxonomy is inferred from the atlas shape so Hermes can render both legacy
-8-row sheets and current 9-row Codex sheets.
-"""
-
-from __future__ import annotations
-
-from enum import Enum
-
-# Frame geometry (pixels). Current Codex/petdex spritesheets are 8 columns x 9
-# rows (1536x1872), while older Hermes/petdex sheets used 9 columns x 8 rows
-# (1728x1664). Renderers derive both row taxonomy and real column count from the
-# concrete sheet, so either shape works.
-FRAME_W = 192
-FRAME_H = 208
-
-# Frames consumed per animation state (the petdex web app uses CSS
-# ``steps(6)``).  A sheet may physically contain more columns; we only step
-# through the first ``FRAMES_PER_STATE``.
-FRAMES_PER_STATE = 6
-
-# Full-loop duration for one state, milliseconds (petdex default).
-LOOP_MS = 1100
-
-# Default on-screen scale relative to native frame size.  ``display.pet.scale``
-# is the single master scalar: the desktop canvas multiplies its native pixels
-# by it and every terminal surface derives its half-block/kitty column width
-# from it (see :func:`cols_for_scale`), so one number shrinks all three
-# interfaces together.  (petdex's own clients render at 0.7; we default smaller
-# so the kitty/GUI mascot stays a glanceable corner sprite.  The half-block
-# fallback can't shrink as far — see ``UNICODE_MIN_COLS`` — and clamps to its
-# legibility floor instead.)
-DEFAULT_SCALE = 0.33
-
-# User-settable scale bounds (``/pet scale``, desktop slider).  Floor keeps the
-# pet clickable/visible; ceiling stops a fat-fingered value from filling the
-# screen.  The unicode fallback additionally clamps to ``UNICODE_MIN_COLS``.
-MIN_SCALE = 0.1
-MAX_SCALE = 3.0
-
-
-def clamp_scale(scale: float) -> float:
-    """Clamp *scale* to ``[MIN_SCALE, MAX_SCALE]`` (the single validation point)."""
-    return max(MIN_SCALE, min(MAX_SCALE, scale))
-
-# Terminal cells one native frame spans at ``scale == 1.0``.  A cell is ~8px
-# wide, a frame is ``FRAME_W`` (192) px → 24 cells.  This mirrors the kitty
-# graphics placement (``scaled_px // 8``) so at full scale every renderer agrees.
-BASE_UNICODE_COLS = FRAME_W // 8
-
-# Legibility floor for the half-block fallback.  A half-block cell samples the
-# sprite at only 1 horizontal + 2 vertical taps, so below this width a 192×208
-# pet collapses into an unreadable blob *regardless* of scale.  kitty/GUI draw
-# true pixels and have no such floor — that's why the same ``scale: 0.33`` is
-# crisp there but mush in half-blocks.  ``scale`` shrinks the unicode pet down
-# TO this floor (and grows it above), instead of past it into noise.
-UNICODE_MIN_COLS = 16
-
-
-def cols_for_scale(scale: float) -> int:
-    """Half-block width implied by *scale*, clamped to the legibility floor.
-
-    Above the floor it tracks the kitty cell box (``scaled_px // 8``) so the two
-    renderers converge at larger sizes; below it the floor keeps the sprite
-    readable rather than letting it devolve into a blob.
-    """
-    return max(UNICODE_MIN_COLS, round(BASE_UNICODE_COLS * (scale or DEFAULT_SCALE)))
-
-
-def resolve_cols(scale: float, unicode_cols: int = 0) -> int:
-    """Resolve terminal width: explicit *unicode_cols* override, else from *scale*."""
-    return int(unicode_cols) if unicode_cols and int(unicode_cols) > 0 else cols_for_scale(scale)
-
-
-class PetState(str, Enum):
-    """Animation state a pet can be shown in.
-
-    These are Hermes' activity state names. They are not always identical to the
-    source atlas row names: Codex-format pets use rows like ``jumping`` /
-    ``running`` while the UI keeps the shorter ``jump`` / ``run`` names.
-    """
-
-    IDLE = "idle"
-    WAVE = "wave"
-    RUN = "run"
-    FAILED = "failed"
-    REVIEW = "review"
-    JUMP = "jump"
-    WAITING = "waiting"
-
-
-# Legacy Hermes/petdex row order (top -> bottom) used by the older 8-row,
-# 9-column atlas shape.
-LEGACY_STATE_ROWS: list[str] = [
-    PetState.IDLE.value,
-    PetState.WAVE.value,
-    PetState.RUN.value,
-    PetState.FAILED.value,
-    PetState.REVIEW.value,
-    PetState.JUMP.value,
-    "extra1",
-    "extra2",
-]
-
-# Current Petdex row order (top -> bottom) used by 1536x1872 atlases:
-# 8 columns x 9 rows of 192x208 cells.
-CODEX_STATE_ROWS: list[str] = [
-    PetState.IDLE.value,
-    "running-right",
-    "running-left",
-    "waving",
-    "jumping",
-    PetState.FAILED.value,
-    PetState.WAITING.value,
-    "running",
-    PetState.REVIEW.value,
-]
-
-# Default/fallback for callers without a sheet. Prefer the current 9-row Codex
-# format because generated pets and the public Codex pet contract use it.
-STATE_ROWS: list[str] = CODEX_STATE_ROWS
-
-# Canonical Hermes activity names -> accepted row-name aliases in descending
-# preference. This keeps our internal state names stable (`wave`/`jump`/`run`)
-# while matching Petdex's current `waving`/`jumping`/`running` taxonomy.
-STATE_ALIASES: dict[str, tuple[str, ...]] = {
-    PetState.IDLE.value: (PetState.IDLE.value,),
-    PetState.WAVE.value: (PetState.WAVE.value, "waving"),
-    PetState.JUMP.value: (PetState.JUMP.value, "jumping"),
-    PetState.RUN.value: (PetState.RUN.value, "running"),
-    PetState.FAILED.value: (PetState.FAILED.value,),
-    PetState.REVIEW.value: (PetState.REVIEW.value,),
-    PetState.WAITING.value: (PetState.WAITING.value,),
-}
-
-
-def state_aliases_for(state: "PetState | str") -> tuple[str, ...]:
-    """Return accepted row-name aliases for *state* (always non-empty)."""
-    value = state.value if isinstance(state, PetState) else str(state)
-    aliases = STATE_ALIASES.get(value)
-    return aliases if aliases else (value,)
-
-
-def state_rows_for_grid(row_count: int | None) -> list[str]:
-    """Return the row taxonomy for a spritesheet with *row_count* rows."""
-    try:
-        rows = int(row_count or 0)
-    except (TypeError, ValueError):
-        rows = 0
-
-    if rows >= len(CODEX_STATE_ROWS):
-        return CODEX_STATE_ROWS
-    return LEGACY_STATE_ROWS
-
-
-def state_row_index(state: "PetState | str", row_count: int | None = None) -> int:
-    """Return the spritesheet row index for *state* (clamped, never raises)."""
-    rows = state_rows_for_grid(row_count)
-    for name in state_aliases_for(state):
-        try:
-            return rows.index(name)
-        except ValueError:
-            continue
-    return 0  # fall back to the idle row
--- a/agent/pet/generate/init.py
+++ b/agent/pet/generate/init.py
@@ -1,29 +0,0 @@
-"""Pet generation — base-draft → hatch pipeline.
-
-Public surface used by the gateway RPCs, the CLI ``hermes pets generate``
-command, and tests:
-
- :func:`generate_base_drafts` / :func:`hatch_pet` — the two-step flow.
- :class:`HatchResult`, :class:`GenerationError`.
- :mod:`atlas` — deterministic frame extraction + atlas composition/validation.
-
-Image generation is delegated to the active reference-capable
-:class:`~agent.image_gen_provider.ImageGenProvider` (OpenAI gpt-image-2 or Krea);
-atlas assembly is fully deterministic so it's testable without any API calls.
-"""
-
-from __future__ import annotations
-
-from agent.pet.generate.imagegen import GenerationError
-from agent.pet.generate.orchestrate import (
-    HatchResult,
-    generate_base_drafts,
-    hatch_pet,
-)
-
-__all__ = [
-    "GenerationError",
-    "HatchResult",
-    "generate_base_drafts",
-    "hatch_pet",
-]
--- a/agent/pet/generate/atlas.py
+++ b/agent/pet/generate/atlas.py
--- a/agent/pet/generate/imagegen.py
+++ b/agent/pet/generate/imagegen.py
@@ -1,251 +0,0 @@
-"""Thin image-generation layer for pet sprites.
-
-Wraps the active :class:`~agent.image_gen_provider.ImageGenProvider` with the
-two things sprite generation needs that the agent-facing ``image_generate`` tool
-doesn't expose: **N variants** (loop) and **reference-image grounding** (so each
-animation row stays the same character as the chosen base).
-
-Reference grounding only works on providers that support it — currently OpenAI
-``gpt-image-2`` (image edits) and Krea (style references). We resolve to one of
-those and surface a clear, actionable error otherwise rather than silently
-producing an ungrounded, drifting pet.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-from dataclasses import dataclass
-from pathlib import Path
-
-logger = logging.getLogger(__name__)
-
-# Providers that can ground generation on a reference image, in preference order
-# (Nous Portal → OpenAI → OpenRouter → …). OpenRouter/Nous run a quality-first
-# model chain and may fall back depending on account access and endpoint behavior,
-# so fidelity can vary by configured backend + model availability.
-_REF_CAPABLE = ("nous", "openai", "openai-codex", "openrouter", "krea")
-
-# Friendly display label per reference-capable provider, surfaced in the desktop
-# pet-gen picker.
-_PROVIDER_LABELS: dict[str, str] = {
-    "nous": "Nous Portal",
-    "openrouter": "OpenRouter",
-    "openai": "OpenAI",
-    "openai-codex": "OpenAI (Codex)",
-    "krea": "Krea",
-}
-
-
-def _forced_provider_from_env() -> str | None:
-    """Optional QA override to force a pet-gen backend.
-
-    `HERMES_PET_IMAGE_PROVIDER=<name>` (e.g. `openrouter`) bypasses the normal
-    active/default provider resolution for pet generation only. Unknown values are
-    ignored so existing users are unaffected.
-    """
-    forced = os.environ.get("HERMES_PET_IMAGE_PROVIDER", "").strip().lower()
-    return forced if forced in _REF_CAPABLE else None
-
-
-class GenerationError(RuntimeError):
-    """Raised on any image-generation failure (no provider, API error, IO)."""
-
-
-@dataclass(frozen=True)
-class SpriteProvider:
-    """Resolved provider plus whether it can take reference images."""
-
-    name: str
-    provider: object
-    supports_references: bool
-
-
-def _discover() -> None:
-    try:
-        from hermes_cli.plugins import _ensure_plugins_discovered
-
-        _ensure_plugins_discovered()
-    except Exception as exc:  # noqa: BLE001 - discovery is best-effort
-        logger.debug("image-gen plugin discovery failed: %s", exc)
-
-
-def resolve_provider(*, require_references: bool = True, prefer: str | None = None) -> SpriteProvider:
-    """Pick the image provider to use for sprite work.
-
-    Preference: an explicit *prefer* choice (the desktop pet-gen picker) when it's
-    reference-capable and configured, then the configured/active provider when
-    it's reference-capable, else the first available reference-capable provider.
-    With *require_references* off we fall back to any available provider (used for
-    prompt-only base drafts).
-    """
-    _discover()
-    from agent.image_gen_registry import get_active_provider, get_provider
-
-    # QA override: force one provider for pet-gen iteration regardless of the
-    # globally active image_gen backend.
-    forced = _forced_provider_from_env()
-    if forced:
-        chosen = get_provider(forced)
-        if chosen is not None and chosen.is_available():
-            return SpriteProvider(name=forced, provider=chosen, supports_references=True)
-
-    # An explicit user pick wins when it's reference-capable and has credentials;
-    # otherwise we ignore it and fall through to the normal resolution.
-    if prefer:
-        chosen = get_provider(prefer)
-        if prefer in _REF_CAPABLE and chosen is not None and chosen.is_available():
-            return SpriteProvider(name=prefer, provider=chosen, supports_references=True)
-
-    # Configured / active provider first.
-    active = None
-    try:
-        active = get_active_provider()
-    except Exception:  # noqa: BLE001
-        active = None
-    if active is not None:
-        name = getattr(active, "name", "")
-        if name in _REF_CAPABLE and active.is_available():
-            return SpriteProvider(name=name, provider=active, supports_references=True)
-
-    # Any available reference-capable provider.
-    for name in _REF_CAPABLE:
-        provider = get_provider(name)
-        if provider is not None and provider.is_available():
-            return SpriteProvider(name=name, provider=provider, supports_references=True)
-
-    if not require_references and active is not None and active.is_available():
-        return SpriteProvider(
-            name=getattr(active, "name", "unknown"), provider=active, supports_references=False
-        )
-
-    raise GenerationError(
-        "Pet generation needs an image backend that supports reference images. "
-        "Open `hermes tools` → Image Generation and configure Nous Portal, "
-        "OpenRouter, or OpenAI (gpt-image-2) with an API key."
-    )
-
-
-def list_sprite_providers() -> list[dict]:
-    """The reference-capable providers available to pick for pet generation.
-
-    Returns ``[{name, label, default}]`` for every ref-capable provider the user
-    actually has credentials for, in preference order, marking the one
-    :func:`resolve_provider` would choose with no explicit preference. Empty when
-    none is configured (the picker hides itself). Best-effort: discovery hiccups
-    yield an empty list.
-    """
-    _discover()
-    from agent.image_gen_registry import get_provider
-
-    try:
-        default_name = resolve_provider(require_references=True).name
-    except GenerationError:
-        default_name = ""
-
-    out: list[dict] = []
-    for name in _REF_CAPABLE:
-        provider = get_provider(name)
-        if provider is None or not provider.is_available():
-            continue
-        out.append(
-            {
-                "name": name,
-                "label": _PROVIDER_LABELS.get(name, name),
-                "default": name == default_name,
-            }
-        )
-    return out
-
-
-def _save_local(image_ref: str, *, prefix: str) -> Path:
-    """Return a local path for *image_ref*, downloading it if it's a URL."""
-    if image_ref.startswith(("http://", "https://")):
-        from agent.image_gen_provider import save_url_image
-
-        return Path(save_url_image(image_ref, prefix=prefix))
-    return Path(image_ref)
-
-
-def _rejected_background(error: str) -> bool:
-    """True when a provider error is specifically about the ``background`` param.
-
-    Transparent backgrounds are a per-model capability (e.g. some gpt-image tiers
-    reject ``background=transparent`` outright). We detect that one rejection so
-    we can retry without the flag rather than failing the whole pet — our chroma
-    key pass makes the result transparent regardless.
-    """
-    lowered = (error or "").lower()
-    return "background" in lowered and ("not supported" in lowered or "transparent" in lowered)
-
-
-def generate(
-    prompt: str,
-    *,
-    n: int = 1,
-    reference_images: list[Path] | None = None,
-    provider: SpriteProvider | None = None,
-    prefix: str = "pet_gen",
-    aspect_ratio: str = "square",
-) -> list[Path]:
-    """Generate *n* sprite images and return their local paths.
-
-    *reference_images* grounds the output on a base image (required for rows).
-    *aspect_ratio* picks the canvas: ``"square"`` for single-character base
-    drafts, ``"landscape"`` for multi-frame row strips (the wider 1536px canvas
-    gives every frame real horizontal room so winged poses don't have to be
-    shrunk to avoid touching their neighbors).
-    We *ask* for a transparent background, but fall back to an opaque generation
-    (cleaned up downstream by the chroma-key pass) on models that reject the
-    flag. Raises :class:`GenerationError` if nothing usable comes back.
-    """
-    sprite = provider or resolve_provider(require_references=bool(reference_images))
-    if reference_images and not sprite.supports_references:
-        raise GenerationError(
-            f"image backend '{sprite.name}' cannot use reference images; "
-            "configure OpenAI gpt-image-2 or Krea for pet generation"
-        )
-
-    refs = [str(p) for p in (reference_images or [])]
-
-    def _run(extra: dict) -> tuple[Path | None, str]:
-        kwargs: dict = {"aspect_ratio": aspect_ratio, **extra}
-        if refs:
-            # Providers disagree on the ref kwarg name: our OpenRouter/Nous
-            # backends read ``reference_images``, OpenAI's gpt-image-2 reads
-            # ``reference_image_urls``. Send both; each ignores the other.
-            kwargs["reference_images"] = refs
-            kwargs["reference_image_urls"] = refs
-        try:
-            result = sprite.provider.generate(prompt, **kwargs)
-        except Exception as exc:  # noqa: BLE001 - normalize provider crashes
-            logger.debug("provider.generate crashed: %s", exc)
-            return None, str(exc)
-        if not isinstance(result, dict) or not result.get("success"):
-            return None, (result or {}).get("error", "unknown error") if isinstance(result, dict) else "no result"
-        image_ref = result.get("image")
-        if not image_ref:
-            return None, "provider returned no image"
-        try:
-            return _save_local(str(image_ref), prefix=prefix), ""
-        except Exception as exc:  # noqa: BLE001
-            return None, f"could not save generated image: {exc}"
-
-    out: list[Path] = []
-    last_error = ""
-    allow_transparent = True
-    for _ in range(max(1, n)):
-        path, err = _run({"background": "transparent"} if allow_transparent else {})
-        # Model doesn't support the transparent flag → drop it for this and every
-        # remaining variant (no point re-probing a capability we just disproved).
-        if path is None and allow_transparent and _rejected_background(err):
-            allow_transparent = False
-            path, err = _run({})
-        if path is not None:
-            out.append(path)
-        else:
-            last_error = err
-
-    if not out:
-        raise GenerationError(last_error or "image generation produced no output")
-    return out
--- a/agent/pet/generate/orchestrate.py
+++ b/agent/pet/generate/orchestrate.py
@@ -1,358 +0,0 @@
-"""Pet generation orchestration — the base-draft → hatch flow.
-
-Two steps, mirroring the UX across every surface:
-
-1. :func:`generate_base_drafts` — a handful of prompt-only "what should this pet
-   look like" variants. Cheap; the user picks one (or retries for a fresh set).
-2. :func:`hatch_pet` — takes the chosen base and generates one grounded row
-   strip per Hermes state, slices each into frames, composes the atlas, validates
-   it, and writes the pet into the store.
-
-Splitting it this way bounds cost (4 cheap base calls per round; the ~6 row
-calls happen once, on the pet you actually keep) and gives each UI a natural
-preview/loading point.
-"""
-
-from __future__ import annotations
-
-import logging
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Callable
-
-from agent.pet.generate import atlas, imagegen, prompts
-from agent.pet.generate.imagegen import GenerationError, SpriteProvider
-
-logger = logging.getLogger(__name__)
-
-# (event, detail) — e.g. ("row", "idle"), ("compose", ""), ("save", "<slug>").
-ProgressFn = Callable[[str, str], None]
-
-# Image generations are independent network calls, so we fan them out instead of
-# blocking on each in turn — a hatch is ~8 row calls that would otherwise run
-# back-to-back and routinely blow past the client's RPC timeout. Capped so we
-# don't hammer the provider's rate limit (one cold call can still be slow).
-_MAX_PARALLEL_GENERATIONS = 4
-# How many times to (re)generate a single row before accepting a best-effort
-# slice. Early attempts demand clean per-pose gutters; the last is lenient so a
-# stubborn row still yields frames instead of dropping out entirely.
-_ROW_GEN_ATTEMPTS = 3
-_MIN_FILLED_STATES = 6
-_REQUIRED_STATES = frozenset({"idle", "running-right", "waving"})
-
-
-@dataclass(frozen=True)
-class HatchResult:
-    """Outcome of a successful :func:`hatch_pet`."""
-
-    slug: str
-    display_name: str
-    spritesheet: Path
-    states: list[str]
-    validation: dict
-
-
-def _harden_transparency(path: Path) -> Path:
-    """Key out any solid backdrop the provider painted; save as an RGBA PNG.
-
-    ``background=transparent`` is requested on every call, but image models honor
-    it inconsistently — some still paint a flat (often near-white) backdrop. We
-    run the same chroma-key pass the row extractor uses so every base draft the
-    user picks between (and the reference the rows are grounded on) is a clean
-    cutout. Best-effort: a decode failure leaves the original untouched.
-    """
-    from PIL import Image
-
-    try:
-        with Image.open(path) as opened:
-            keyed = atlas.remove_background(opened.convert("RGBA"))
-        # Zero the RGB of any leftover semi-transparent edge pixels so a keyed
-        # draft has no colored halo when composited on the dark UI.
-        keyed = atlas._clear_transparent_rgb(keyed)
-        out = path.with_suffix(".png")
-        keyed.save(out, format="PNG")
-        return out
-    except Exception as exc:  # noqa: BLE001 - cosmetic; fall back to the raw image
-        logger.debug("base draft transparency hardening failed for %s: %s", path, exc)
-        return path
-
-
-def generate_base_drafts(
-    concept: str,
-    *,
-    n: int = 4,
-    style: str = "auto",
-    reference_images: list[Path] | None = None,
-    provider: SpriteProvider | None = None,
-    on_draft: Callable[[int, Path], None] | None = None,
-    is_cancelled: Callable[[], bool] | None = None,
-) -> list[Path]:
-    """Generate *n* candidate base looks for *concept*; returns image paths.
-
-    Each draft is hardened to a transparent cutout (see :func:`_harden_transparency`).
-    Drafts are generated concurrently and *on_draft(index, path)* fires as each
-    one finishes (not at the end) so callers can stream previews to the UI
-    instead of leaving it blank until the whole batch is done.
-
-    *is_cancelled*, when supplied, is polled cooperatively: a draft that hasn't
-    started yet is skipped, and once it trips we stop staging/streaming further
-    drafts and cancel any queued work (already-in-flight provider calls can't be
-    hard-killed, but their results are dropped).
-    """
-    # A user reference image (e.g. their own pet) grounds every draft, so it
-    # needs a reference-capable provider — same requirement as the row passes.
-    refs = reference_images or None
-    sprite = provider or imagegen.resolve_provider(require_references=bool(refs))
-    cancelled = is_cancelled or (lambda: False)
-
-    # Each draft is its own one-shot generation, run concurrently so the user
-    # waits for one image, not N. A single draft failing must not sink the set.
-    # Each gets a distinct variation nudge so the options aren't near-duplicates.
-    logger.info("pet generate: drafting %d base looks for %r (style=%s)", n, concept, style)
-
-    def _one(index: int) -> tuple[int, Path | None, str | None]:
-        if cancelled():
-            return index, None, None
-        t0 = time.monotonic()
-        variation = prompts.BASE_VARIATIONS[index % len(prompts.BASE_VARIATIONS)]
-        prompt = prompts.build_base_prompt(concept, style=style, variation=variation)
-        try:
-            out = imagegen.generate(prompt, n=1, reference_images=refs, provider=sprite, prefix="pet_base")
-        except Exception as exc:  # noqa: BLE001 - tolerate a single failed draft
-            logger.warning("pet generate: draft %d failed after %.1fs: %s", index, time.monotonic() - t0, exc)
-            return index, None, str(exc)
-        if not out:
-            logger.warning("pet generate: draft %d produced no image", index)
-            return index, None, "the image provider returned no image"
-        logger.info("pet generate: draft %d ready in %.1fs", index, time.monotonic() - t0)
-        return index, _harden_transparency(out[0]), None
-
-    workers = max(1, min(n, _MAX_PARALLEL_GENERATIONS))
-    results: dict[int, Path] = {}
-    errors: list[str] = []
-    with ThreadPoolExecutor(max_workers=workers) as pool:
-        futures = [pool.submit(_one, i) for i in range(n)]
-        # as_completed runs in *this* (the caller's) thread, so on_draft — and any
-        # gateway event it emits — inherits the request's bound transport, unlike
-        # the worker threads above.
-        for fut in as_completed(futures):
-            if cancelled():
-                logger.info("pet generate: cancelled — dropping remaining drafts")
-                for pending in futures:
-                    pending.cancel()
-                break
-            index, path, err = fut.result()
-            if path is None:
-                if err:
-                    errors.append(err)
-                continue
-            results[index] = path
-            if on_draft is not None:
-                try:
-                    on_draft(index, path)
-                except Exception as exc:  # noqa: BLE001 - progress is best-effort
-                    logger.debug("on_draft callback failed: %s", exc)
-
-    drafts = [results[i] for i in sorted(results)]
-    if not drafts and not cancelled():
-        # Surface *why* — every draft failed for a reason (a content-policy refusal
-        # on a name like "minion", a provider/auth error, …); the most common one
-        # is the representative cause. Far more useful than "no usable drafts".
-        raise GenerationError(_drafts_failed_reason(errors))
-    return drafts
-
-
-def _drafts_failed_reason(errors: list[str]) -> str:
-    """The representative reason a draft round produced nothing, humanized."""
-    if not errors:
-        return "image generation produced no usable drafts"
-    from collections import Counter
-
-    return _humanize_image_error(Counter(errors).most_common(1)[0][0])
-
-
-def _humanize_image_error(error: str) -> str:
-    """Turn a raw provider error into a friendly, actionable sentence.
-
-    The big one is moderation: image models refuse trademarked characters and
-    real people (e.g. "minion"), which reads as an opaque 400 otherwise.
-    """
-    low = error.lower()
-    if any(s in low for s in ("moderation_blocked", "safety system", "content policy", "content_policy")):
-        return (
-            "The image provider blocked this prompt — its safety filter rejects "
-            "trademarked characters and real people. Try an original description."
-        )
-    if any(s in low for s in ("api key", "unauthorized", "401", "auth")):
-        return "The image provider rejected the request — check your API key in Settings → Providers."
-    if "rate limit" in low or "429" in low:
-        return "The image provider is rate-limiting — wait a moment and try again."
-    # Otherwise the first line, trimmed of the noisy provider envelope.
-    return error.splitlines()[0].strip()[:200]
-
-
-def hatch_pet(
-    *,
-    base_image: str | Path,
-    slug: str,
-    display_name: str = "",
-    description: str = "",
-    concept: str = "",
-    style: str = "auto",
-    on_progress: ProgressFn | None = None,
-    provider: SpriteProvider | None = None,
-    is_cancelled: Callable[[], bool] | None = None,
-) -> HatchResult:
-    """Turn an approved base image into a full, installed Hermes pet.
-
-    Generates a grounded row strip per state, extracts frames, composes +
-    validates the atlas, and registers it. The idle row falls back to the base
-    look so the pet always renders. Raises :class:`GenerationError` on failure.
-
-    *is_cancelled*, when supplied, is polled cooperatively: rows that haven't
-    started are skipped, queued rows are cancelled, and once every row is done we
-    abort (raising :class:`GenerationError`) before composing/saving so a stopped
-    hatch never writes a half-built pet.
-    """
-    base = Path(base_image)
-    if not base.is_file():
-        raise GenerationError(f"base image not found: {base}")
-
-    sprite = provider or imagegen.resolve_provider(require_references=True)
-    progress = on_progress or (lambda *_: None)
-    cancelled = is_cancelled or (lambda: False)
-    label = concept or display_name or slug
-
-    frames_by_state: dict[str, list] = {}
-    total_rows = len(atlas.ROW_SPECS)
-    logger.info("pet hatch %r: generating %d animation rows", slug, total_rows)
-
-    # Generate every state's row strip concurrently — they're independent
-    # grounded calls, so the hatch waits for the slowest row, not their sum. A
-    # single row failing is tolerated (idle is guaranteed below).
-    def _gen_row(spec: tuple[str, int, int]) -> tuple[str, list | None]:
-        state, _row, count = spec
-        if cancelled():
-            return state, None
-        t0 = time.monotonic()
-        last_exc: Exception | None = None
-        # Self-healing: a model occasionally returns a row whose poses are touching
-        # (no clean gutters), which slices badly. We retry such rolls; only the
-        # final attempt falls back to lenient ``auto`` slicing so a stubborn row
-        # still yields *something* rather than dropping the whole row.
-        for attempt in range(_ROW_GEN_ATTEMPTS):
-            if cancelled():
-                return state, None
-            strict = attempt < _ROW_GEN_ATTEMPTS - 1
-            try:
-                strips = imagegen.generate(
-                    prompts.build_row_prompt(state, count, label, style=style),
-                    n=1,
-                    reference_images=[base],
-                    provider=sprite,
-                    prefix=f"pet_row_{state}",
-                    # Wider canvas → each frame gets real horizontal room, so winged
-                    # poses keep a full, healthy size and still leave clean gutters.
-                    aspect_ratio="landscape",
-                )
-                # ``components`` requires clean per-pose gutters (raises otherwise),
-                # so a touching roll is rejected and regenerated; the last attempt
-                # uses ``auto`` (equal-slot fallback, never raises). Raw (fit=False)
-                # so normalize_cells registers the whole pet at once.
-                method = "components" if strict else "auto"
-                frames = atlas.extract_strip_frames(strips[0], count, method=method, fit=False)
-                logger.info(
-                    "pet hatch %r: row %r ready in %.1fs (attempt %d)",
-                    slug, state, time.monotonic() - t0, attempt + 1,
-                )
-                return state, frames
-            except Exception as exc:  # noqa: BLE001 - retried; one bad row is tolerated
-                last_exc = exc
-                logger.warning(
-                    "pet hatch %r: row %r attempt %d/%d failed: %s",
-                    slug, state, attempt + 1, _ROW_GEN_ATTEMPTS, exc,
-                )
-        logger.warning(
-            "pet hatch %r: row %r gave up after %.1fs: %s",
-            slug, state, time.monotonic() - t0, last_exc,
-        )
-        return state, None
-
-    # running-left is derived by mirroring running-right (guaranteed-consistent
-    # and one fewer generation), so we don't generate it directly.
-    generated_specs = [spec for spec in atlas.ROW_SPECS if spec[0] != "running-left"]
-
-    workers = max(1, min(len(generated_specs), _MAX_PARALLEL_GENERATIONS))
-    done = 0
-    with ThreadPoolExecutor(max_workers=workers) as pool:
-        futures = [pool.submit(_gen_row, spec) for spec in generated_specs]
-        # as_completed runs on the caller (request) thread, so progress events
-        # emitted here inherit the request transport — unlike the worker threads.
-        for fut in as_completed(futures):
-            if cancelled():
-                logger.info("pet hatch %r: cancelled — dropping remaining rows", slug)
-                for pending in futures:
-                    pending.cancel()
-                break
-            state, frames = fut.result()
-            done += 1
-            progress("row", f"{state}:{done}:{total_rows}")
-            if frames:
-                frames_by_state[state] = frames
-
-    if cancelled():
-        raise GenerationError("hatch cancelled")
-
-    # Derive running-left from the approved running-right row (per-frame mirror,
-    # preserving order/timing). Missing running-right is rejected below; a pet
-    # without its canonical walk cycle is a failed hatch, not a shippable mascot.
-    right = frames_by_state.get("running-right")
-    if right:
-        done += 1
-        progress("row", f"running-left:{done}:{total_rows}")
-        frames_by_state["running-left"] = atlas.mirror_frames(right)
-        logger.info("pet hatch %r: row 'running-left' mirrored from running-right", slug)
-    else:
-        logger.warning("pet hatch %r: no running-right to mirror; left walk left empty", slug)
-
-    # Idle is the resting state the renderer falls back to — guarantee it.
-    if not frames_by_state.get("idle"):
-        progress("row", "idle-fallback")
-        frames_by_state["idle"] = [atlas.single_frame(base, fit=False)]
-
-    progress("compose", "")
-    logger.info("pet hatch %r: composing atlas from %d states", slug, len(frames_by_state))
-    # One shared scale + baseline across every state so the pet never slides or
-    # pulses size between frames; compose just packs the normalized cells.
-    sheet = atlas.compose_atlas(atlas.normalize_cells(frames_by_state))
-    validation = atlas.validate_atlas(sheet)
-    if not validation["ok"]:
-        raise GenerationError("; ".join(validation["errors"]) or "atlas validation failed")
-    filled_states = set(validation["filled_states"])
-    missing_required = sorted(_REQUIRED_STATES - filled_states)
-    if missing_required:
-        raise GenerationError(f"missing required animation row(s): {', '.join(missing_required)}")
-    if len(filled_states) < _MIN_FILLED_STATES:
-        raise GenerationError(
-            f"only {len(filled_states)}/{len(atlas.ROW_SPECS)} animation rows were usable; regenerate"
-        )
-
-    from agent.pet import store
-
-    progress("save", slug)
-    logger.info("pet hatch %r: saving pet", slug)
-    pet = store.register_local_pet(
-        sheet,
-        slug=slug,
-        display_name=display_name or slug,
-        description=description,
-    )
-    return HatchResult(
-        slug=pet.slug,
-        display_name=pet.display_name,
-        spritesheet=pet.spritesheet,
-        states=validation["filled_states"],
-        validation=validation,
-    )
--- a/agent/pet/generate/prompts.py
+++ b/agent/pet/generate/prompts.py
@@ -1,183 +0,0 @@
-"""Prompt builders for pet generation.
-
-Two prompt shapes: a *base* prompt (prompt-only, produces the canonical look the
-user picks between) and per-*state* *row* prompts (grounded on the chosen base,
-produce one horizontal strip of N poses). Prompts stay concise and
-sprite-production oriented; the identity lock and "one transparent row" framing
-matter more than flowery description.
-
-We generate the full petdex/Codex nine-state set (see
-:data:`agent.pet.generate.atlas.ROW_SPECS`) so a hatched pet is a valid
-``petdex submit`` spritesheet.
-"""
-
-from __future__ import annotations
-
-# What each petdex/Codex state should depict (kept short — these go straight into
-# the row prompt). Phrased to avoid the common sprite-gen failure modes (detached
-# effects, motion lines, shadows). Critical distinction: ``running`` is the
-# *working* state (in place), while ``running-right`` / ``running-left`` are the
-# actual directional walk/run cycles.
-STATE_ACTIONS: dict[str, str] = {
-    "idle": "a calm idle loop: subtle breathing, a tiny blink or gentle bob, no big gestures",
-    "running-right": (
-        "a sideways walk/run locomotion cycle moving to the RIGHT: the character "
-        "faces and travels right with clear directional steps, a smooth gait loop"
-    ),
-    "running-left": (
-        "a sideways walk/run locomotion cycle moving to the LEFT: the character "
-        "faces and travels left with clear directional steps (the mirror of the "
-        "right-facing run)"
-    ),
-    "waving": "a friendly greeting: raising a paw/hand/limb to wave, clear up-and-down gesture",
-    "jumping": "a happy celebration jump: anticipation, lift off the ground, peak, and land",
-    "failed": "a sad or deflated reaction: slumped, dejected, small frown — readable but not noisy",
-    "waiting": (
-        "an expectant 'waiting on you' pose: looking up/out as if asking for input "
-        "or approval — distinct from idle and review"
-    ),
-    "running": (
-        "focused active work, staying IN PLACE (NOT walking or foot-running): "
-        "leaning in, concentrating, busy 'thinking / processing / typing' energy"
-    ),
-    "review": "careful inspection: a focused lean, head tilt, studying something intently",
-}
-
-_STYLE_HINTS: dict[str, str] = {
-    # Default to the popular petdex look: crisp 16-bit PIXEL ART, not the smooth
-    # 2D illustration (let alone 3D render) gpt-image reaches for by default.
-    "auto": (
-        " Style: crisp 16-bit PIXEL-ART game sprite — visible square pixels, a small "
-        "limited palette, clean dark outline, flat cel shading, chunky chibi "
-        "proportions, like a classic SNES/JRPG party member or a petdex.dev mascot. "
-        "Absolutely NOT 3D-rendered, NOT a smooth painted or vector illustration, "
-        "NOT photorealistic — no soft gradients, no realistic lighting, no figurine look."
-    ),
-    "pixel": " Render in clean 16-bit pixel-art style with visible square pixels and a limited palette.",
-    "plush": " Render as a soft plush toy.",
-    "clay": " Render as a claymation / soft 3D clay figure.",
-    "sticker": " Render as a glossy die-cut sticker.",
-    "flat-vector": " Render in flat vector mascot style.",
-    "3d-toy": " Render as a glossy 3D toy.",
-    "painterly": " Render in a soft painterly style.",
-}
-
-_BACKGROUND = (
-    "Center the character on a SINGLE flat, uniform, high-contrast chroma-key "
-    "background — pure hot magenta #FF00FF (only if magenta appears on the "
-    "character, use pure green #00FF00 instead). The background is ONE continuous "
-    "even color that completely surrounds the character with NO gradient, "
-    "vignette, texture, pattern, scenery, shadow, ground line, frame, border, "
-    "panel, comic cell, gutter line, grid, or divider of any kind, so it keys out "
-    "cleanly. The background color must not appear anywhere on the character. "
-    "No text, no labels, no speech bubbles, no UI."
-)
-
-
-def style_hint(style: str | None) -> str:
-    return _STYLE_HINTS.get((style or "auto").strip().lower(), "")
-
-
-# Row strips are generated on the wider landscape canvas (see imagegen.generate /
-# orchestrate). The extra width is what lets each pose stay a healthy size AND
-# leave a real gutter — used here only to cite concrete pixel numbers.
-_ASSUMED_STRIP_WIDTH = 1536
-
-
-def _spacing_spec(frame_count: int) -> tuple[int, int]:
-    """(per-pose width px, gap px) for a row of *frame_count* poses.
-
-    Pixel counts alone don't hold — the model fills each slot edge-to-edge with
-    the full wingspan, so neighbors touch even when bodies are spaced. The lever
-    that works is proportional containment on a wide canvas: give each pose its
-    own equal cell and keep the ENTIRE silhouette (wings/tail/halo included)
-    inside it. On the 1536px landscape strip ~70% occupancy still leaves a
-    generous gutter, so the pet stays a normal, good-looking size — no shrinking.
-    """
-    slots = max(1, frame_count)
-    slot_w = _ASSUMED_STRIP_WIDTH / slots
-    pose_px = round(slot_w * 0.7)
-    gap_px = max(48, round(slot_w * 0.3))
-    return pose_px, gap_px
-
-
-# Per-draft nudges so the 4 base options are actually distinct — gpt-image returns
-# near-duplicates for a single prompt. We vary the *look* (palette, build,
-# expression, accents), NOT the pose, so the chosen base still grounds clean,
-# consistent animation rows.
-BASE_VARIATIONS: tuple[str, ...] = (
-    "",
-    "a distinctly different colour palette and markings",
-    "a heavier, broader silhouette with sturdier proportions",
-    "a different facial structure and expression matching the concept tone, with unique accent/accessory details",
-    "a leaner, taller build and an alternate colour scheme",
-    "bolder, more saturated colours and a stronger expression matching the concept tone",
-)
-
-
-def build_base_prompt(concept: str, *, style: str | None = "auto", variation: str = "") -> str:
-    """The base look: a single, clean, centered full-body mascot.
-
-    *variation* differentiates one draft from the next (see :data:`BASE_VARIATIONS`).
-    """
-    concept = (concept or "a distinctive mascot creature").strip()
-    nudge = f" Make this design distinct: {variation}." if variation else ""
-    return (
-        f"A stylized mascot pet character: {concept}. "
-        "Honor the requested tone and mood exactly (cute, eerie, scary, menacing, whimsical, etc.) "
-        "while staying non-graphic. "
-        "Compact, whole-body silhouette that reads clearly at small size, "
-        "clear readable facial features, simple consistent palette. "
-        # A neutral, symmetric, at-rest stance makes the cleanest identity anchor
-        "Neutral front-facing standing pose, upright and symmetric, arms/limbs "
-        "relaxed at the sides, feet together on the ground, any cape/accessories "
-        "hanging straight and still."
-        f"{nudge} "
-        f"{_BACKGROUND}{style_hint(style)}"
-    )
-
-
-def build_row_prompt(state: str, frame_count: int, concept: str, *, style: str | None = "auto") -> str:
-    """A row strip: *frame_count* poses of the SAME character, left→right.
-
-    The attached base image is the identity source of truth; the prompt locks
-    species, palette, face, and props to it.
-    """
-    action = STATE_ACTIONS.get(state, "a simple idle pose")
-    concept = (concept or "the mascot").strip()
-    pose_px, gap_px = _spacing_spec(frame_count)
-    return (
-        f"Using the attached reference image as the exact same character "
-        f"(same species, face, colors, markings, proportions, and props), "
-        "preserving the same emotional tone/mood (e.g., scary stays scary, cute stays cute), "
-        f"draw a single WIDE horizontal strip of {frame_count} animation frames showing {action}. "
-        f"LAYOUT: arrange {frame_count} poses in ONE horizontal row at equal spacing, "
-        "each pose centered in its own imaginary equal region. Draw NO panel borders, "
-        "NO comic cells, NO boxes, NO vertical divider/gutter lines, NO grid, NO frame "
-        "outlines between poses — the backdrop is one unbroken flat field behind all of them. "
-        "Fill the WHOLE strip with the SAME single flat chroma-key color as the attached "
-        "reference image's background (identical hue in every frame, no per-pose color shifts). "
-        f"SPACING (critical): draw each pose at a consistent, healthy, clearly "
-        f"visible size (roughly {pose_px}px wide on a {_ASSUMED_STRIP_WIDTH}px "
-        f"strip) — do NOT shrink it tiny — but keep its ENTIRE silhouette "
-        f"(wings, tail, halo, horns, cape, every appendage) fully INSIDE its own "
-        f"cell. Leave at least {gap_px}px of empty chroma-key background between "
-        f"neighboring silhouettes at their closest point (wingtip to wingtip), and "
-        f"the same empty margin before the first pose and after the last. If a wing, "
-        f"cape, or tail would reach into a neighbor, FOLD or angle it inward rather "
-        f"than letting it cross the gap. Silhouettes must NEVER touch, overlap, "
-        f"share a shadow, share a ground line, share motion trails, or merge into "
-        f"one connected shape. "
-        # Registration: a clean sprite sheet keeps the character locked in place
-        # so only the action moves — this is what stops the loop sliding/pulsing.
-        "REGISTRATION (critical): the character is the SAME height and SAME width "
-        "in every frame, drawn at the SAME scale, centered over the SAME point, "
-        "with all feet aligned to the SAME invisible horizontal baseline across the "
-        "whole strip — this baseline is conceptual ONLY: draw NO ground line, floor, "
-        "platform, horizon, or contact shadow beneath the feet. Keep the body's center, size, and stance fixed frame to "
-        "frame — ONLY the limbs/features the action needs may move. Capes, cloaks, "
-        "bags, and scarves stay in the SAME place and shape every frame (no "
-        "swinging, flowing, or drifting) unless the action itself requires it. No "
-        "pose is cropped at the strip edges. "
-        f"{_BACKGROUND}{style_hint(style)}"
-    )
--- a/agent/pet/manifest.py
+++ b/agent/pet/manifest.py
@@ -1,165 +0,0 @@
-"""Fetch the public petdex manifest.
-
-``https://petdex.dev/api/manifest`` 307-redirects to a JSON document on R2:
-
-    {
-      "generatedAt": "...",
-      "total": 2926,
-      "pets": [
-        {"slug": "boba", "displayName": "Boba", "kind": "creature",
-         "submittedBy": "railly",
-         "spritesheetUrl": "https://assets.petdex.dev/.../spritesheet.webp",
-         "petJsonUrl": "https://assets.petdex.dev/.../pet.json",
-         "zipUrl": "https://assets.petdex.dev/.../boba.zip"},
-        ...
-      ]
-    }
-
-Read-only and unauthenticated; no credentials involved.
-"""
-
-from __future__ import annotations
-
-import logging
-import threading
-import time
-from dataclasses import dataclass
-
-logger = logging.getLogger(__name__)
-
-MANIFEST_URL = "https://petdex.dev/api/manifest"
-
-_DEFAULT_TIMEOUT = 10.0
-
-# In-process cache for the (large, slow, identical-per-call) manifest. The list
-# is a static CDN object that barely changes, yet a single session can ask for
-# it many times — every gallery open, plus a full re-fetch per install/select
-# (``find_entry``). A short TTL collapses those into one network hit without
-# going stale for long. Cleared by :func:`clear_cache` (tests).
-_MANIFEST_TTL = 300.0
-_cache: tuple[float, list[ManifestEntry]] | None = None
-
-_prefetch_lock = threading.Lock()
-_prefetching = False
-
-
-def clear_cache() -> None:
-    """Drop the cached manifest (forces the next fetch to hit the network)."""
-    global _cache
-    _cache = None
-
-
-def _cache_is_warm() -> bool:
-    return _cache is not None and time.monotonic() - _cache[0] < _MANIFEST_TTL
-
-
-def prefetch(*, timeout: float = _DEFAULT_TIMEOUT) -> None:
-    """Warm the manifest cache in a daemon thread — idempotent, never blocks.
-
-    The desktop picker calls this when it loads the (instant) local-only gallery
-    so the full petdex catalog is usually cached by the time it's requested,
-    without ever holding up the user's own pets on a network round-trip.
-    """
-    global _prefetching
-
-    if _cache_is_warm():
-        return
-
-    with _prefetch_lock:
-        if _prefetching:
-            return
-        _prefetching = True
-
-    def _run() -> None:
-        global _prefetching
-        try:
-            fetch_manifest(timeout=timeout)
-        except Exception as exc:  # noqa: BLE001 - best-effort warm
-            logger.debug("petdex manifest prefetch failed: %s", exc)
-        finally:
-            _prefetching = False
-
-    threading.Thread(target=_run, name="petdex-prefetch", daemon=True).start()
-
-
-@dataclass(frozen=True)
-class ManifestEntry:
-    """A single pet's row in the manifest."""
-
-    slug: str
-    display_name: str
-    kind: str
-    submitted_by: str
-    spritesheet_url: str
-    pet_json_url: str
-    zip_url: str
-
-    @classmethod
-    def from_dict(cls, data: dict) -> "ManifestEntry":
-        return cls(
-            slug=str(data.get("slug", "")).strip(),
-            display_name=str(data.get("displayName", "") or data.get("slug", "")),
-            kind=str(data.get("kind", "") or "pet"),
-            submitted_by=str(data.get("submittedBy", "") or ""),
-            spritesheet_url=str(data.get("spritesheetUrl", "") or ""),
-            pet_json_url=str(data.get("petJsonUrl", "") or ""),
-            zip_url=str(data.get("zipUrl", "") or ""),
-        )
-
-
-class ManifestError(RuntimeError):
-    """Raised when the manifest can't be fetched or parsed."""
-
-
-def fetch_manifest(*, timeout: float = _DEFAULT_TIMEOUT, force: bool = False) -> list[ManifestEntry]:
-    """Return every approved pet from the public manifest.
-
-    Cached in-process for ``_MANIFEST_TTL`` seconds (pass ``force=True`` to
-    bypass). Follows the 307 redirect to R2.  Raises :class:`ManifestError` on
-    any network/parse failure so callers can surface a clean message.
-    """
-    global _cache
-
-    if not force and _cache is not None and time.monotonic() - _cache[0] < _MANIFEST_TTL:
-        return _cache[1]
-
-    try:
-        import httpx
-    except ImportError as exc:  # pragma: no cover - httpx is a core dep
-        raise ManifestError("httpx is required to fetch the petdex manifest") from exc
-
-    try:
-        resp = httpx.get(
-            MANIFEST_URL,
-            timeout=timeout,
-            follow_redirects=True,
-            headers={"User-Agent": "hermes-agent-petdex"},
-        )
-        resp.raise_for_status()
-        payload = resp.json()
-    except Exception as exc:  # noqa: BLE001 - normalize to one error type
-        raise ManifestError(f"could not fetch petdex manifest: {exc}") from exc
-
-    pets = payload.get("pets") if isinstance(payload, dict) else None
-    if not isinstance(pets, list):
-        raise ManifestError("petdex manifest had no 'pets' array")
-
-    entries: list[ManifestEntry] = []
-    for raw in pets:
-        if not isinstance(raw, dict):
-            continue
-        entry = ManifestEntry.from_dict(raw)
-        if entry.slug and entry.spritesheet_url:
-            entries.append(entry)
-
-    _cache = (time.monotonic(), entries)
-    return entries
-
-
-def find_entry(slug: str, *, timeout: float = _DEFAULT_TIMEOUT) -> ManifestEntry | None:
-    """Return the manifest entry for *slug*, or ``None`` if not listed."""
-    slug = slug.strip().lower()
-    for entry in fetch_manifest(timeout=timeout):
-        if entry.slug.lower() == slug:
-            return entry
-    return None
--- a/agent/pet/render.py
+++ b/agent/pet/render.py
@@ -1,618 +0,0 @@
-"""Decode a pet spritesheet and encode frames for a terminal.
-
-Shared by the base CLI (writes the escape bytes to its own stdout) and the
-TUI (``tui_gateway`` ships the encoded bytes to Ink, which writes them) so the
-decode + capability-detection + protocol-encoding logic exists exactly once.
-
-Supported output modes, in fidelity order:
-
- ``kitty``   — the kitty graphics protocol (kitty, Ghostty, WezTerm).
- ``iterm``   — iTerm2 inline images (iTerm2, WezTerm).
- ``sixel``   — DEC sixel (xterm -ti vt340, foot, mlterm, WezTerm, …).
- ``unicode`` — 24-bit half-block downscale; works in any truecolor terminal.
-
-Frame decoding requires Pillow (a core Hermes dependency).  If Pillow or the
-spritesheet is unavailable the renderer degrades to ``unicode`` text or an
-empty string rather than raising.
-"""
-
-from __future__ import annotations
-
-import base64
-import io
-import logging
-import os
-import sys
-from functools import lru_cache
-from pathlib import Path
-
-from agent.pet.constants import (
-    DEFAULT_SCALE,
-    FRAME_H,
-    FRAME_W,
-    FRAMES_PER_STATE,
-    PetState,
-    state_row_index,
-)
-
-logger = logging.getLogger(__name__)
-
-# Public render-mode names accepted by ``display.pet.render_mode``.
-RENDER_MODES = ("auto", "kitty", "iterm", "sixel", "unicode", "off")
-
-
-# ─────────────────────────────────────────────────────────────────────────
-# Terminal capability detection
-# ─────────────────────────────────────────────────────────────────────────
-
-def detect_terminal_graphics() -> str:
-    """Best-effort detection of the richest graphics protocol available.
-
-    Env-based (non-blocking — we never issue a DA1/terminal query that could
-    hang a pipe).  Returns one of ``kitty`` / ``iterm`` / ``sixel`` /
-    ``unicode``.  Conservative: unknown terminals get ``unicode``, which works
-    anywhere with truecolor.
-    """
-    term = os.environ.get("TERM", "").lower()
-    term_program = os.environ.get("TERM_PROGRAM", "").lower()
-
-    # The VS Code / Cursor integrated terminal sets TERM_PROGRAM=vscode
-    # authoritatively but does NOT scrub the terminal env vars it inherits when
-    # launched from another emulator (ITERM_SESSION_ID, KITTY_WINDOW_ID, …).
-    # Trusting those leaks emits an image protocol the embedded xterm.js can't
-    # display — you get a blank frame. Inline images there are opt-in
-    # (terminal.integrated.enableImages), so default to half-blocks, which
-    # always render in its truecolor grid. Users who enabled images can pin
-    # display.pet.render_mode explicitly.
-    if term_program == "vscode":
-        return "unicode"
-
-    # kitty graphics protocol
-    if os.environ.get("KITTY_WINDOW_ID") or "kitty" in term or "ghostty" in term:
-        return "kitty"
-    if term_program in {"ghostty"}:
-        return "kitty"
-
-    # WezTerm speaks both kitty and iterm; prefer kitty (richer placement).
-    if term_program == "wezterm" or os.environ.get("WEZTERM_PANE"):
-        return "kitty"
-
-    # iTerm2 inline images
-    if term_program == "iterm.app" or os.environ.get("ITERM_SESSION_ID"):
-        return "iterm"
-
-    # sixel-capable terminals (env heuristics only)
-    if term_program in {"mintty"} or "foot" in term or "mlterm" in term:
-        return "sixel"
-    if "sixel" in term:
-        return "sixel"
-
-    return "unicode"
-
-
-def resolve_mode(configured: str | None, *, stream=None) -> str:
-    """Resolve the effective render mode from config + the environment.
-
-    ``configured`` is ``display.pet.render_mode`` (``auto`` → detect).  Returns
-    ``off`` when not attached to a TTY (no point emitting graphics into a pipe
-    or logfile).
-    """
-    mode = (configured or "auto").strip().lower()
-    if mode not in RENDER_MODES:
-        mode = "auto"
-    if mode == "off":
-        return "off"
-
-    stream = stream or sys.stdout
-    try:
-        if not (hasattr(stream, "isatty") and stream.isatty()):
-            return "off"
-    except (ValueError, OSError):
-        return "off"
-
-    if mode == "auto":
-        return detect_terminal_graphics()
-    return mode
-
-
-# ─────────────────────────────────────────────────────────────────────────
-# Frame decoding
-# ─────────────────────────────────────────────────────────────────────────
-
-def _open_sheet(path: Path):
-    from PIL import Image
-
-    img = Image.open(path)
-    return img.convert("RGBA")
-
-
-# Max alpha at/below which a frame counts as blank padding.  petdex sheets are
-# left-packed: a state with fewer real frames than ``FRAMES_PER_STATE`` fills
-# the trailing columns with fully transparent cells.  Animating into one flashes
-# the pet blank, so we stop the row at the first such gap.
-_BLANK_ALPHA = 8
-
-
-def _frame_is_blank(frame) -> bool:
-    """True if *frame* has no meaningfully opaque pixel (transparent padding)."""
-    return frame.getchannel("A").getextrema()[1] <= _BLANK_ALPHA
-
-
-@lru_cache(maxsize=16)
-def _raw_frames(
-    sheet_path: str,
-    state_value: str,
-    frame_w: int,
-    frame_h: int,
-    frames_per_state: int,
-) -> tuple:
-    """Cropped, padding-trimmed RGBA frames for one state row (unscaled).
-
-    Steps across the row until the first blank column so pets with ragged
-    per-state frame counts never animate into empty padding.  Cached; returns
-    ``()`` on any decode failure.
-    """
-    try:
-        sheet = _open_sheet(Path(sheet_path))
-        cols = max(1, sheet.width // frame_w)
-        rows = max(1, sheet.height // frame_h)
-        row = state_row_index(state_value, rows)
-        top = row * frame_h
-        # Clamp the row to the sheet (some pets ship fewer rows than the 8 the
-        # taxonomy reserves).
-        if top + frame_h > sheet.height:
-            top = max(0, sheet.height - frame_h)
-
-        frames = []
-        for i in range(min(frames_per_state, cols)):
-            left = i * frame_w
-            frame = sheet.crop((left, top, left + frame_w, top + frame_h))
-            if _frame_is_blank(frame):
-                break  # trailing transparent padding — real frames end here
-            frames.append(frame)
-        return tuple(frames)
-    except Exception as exc:  # noqa: BLE001 - cosmetic feature, never fatal
-        logger.debug("pet frame decode failed (%s, %s): %s", sheet_path, state_value, exc)
-        return ()
-
-
-@lru_cache(maxsize=8)
-def _frames_for(
-    sheet_path: str,
-    state_value: str,
-    frame_w: int,
-    frame_h: int,
-    frames_per_state: int,
-    scale_w: int,
-    scale_h: int,
-):
-    """Return padding-trimmed RGBA frames for one state row, scaled.
-
-    Thin scaling layer over :func:`_raw_frames`; both are cached so repeated
-    frame requests during animation are free.
-    """
-    raw = _raw_frames(sheet_path, state_value, frame_w, frame_h, frames_per_state)
-    if not raw or (scale_w, scale_h) == (frame_w, frame_h):
-        return list(raw)
-    from PIL import Image
-
-    return [f.resize((scale_w, scale_h), Image.LANCZOS) for f in raw]
-
-
-def state_frame_counts(
-    sheet_path: str | Path,
-    *,
-    frame_w: int = FRAME_W,
-    frame_h: int = FRAME_H,
-    frames_per_state: int = FRAMES_PER_STATE,
-) -> dict[str, int]:
-    """Map each driven :class:`PetState` → its real (padding-trimmed) frame count.
-
-    The single source of truth for "how many frames does this state actually
-    have?".  The CLI/TUI consume the trimmed frame lists directly; the gateway
-    ships this map to the desktop canvas, which steps its own loop.
-    """
-    return {
-        state.value: len(
-            _raw_frames(str(sheet_path), state.value, frame_w, frame_h, frames_per_state)
-        )
-        for state in PetState
-    }
-
-
-# ─────────────────────────────────────────────────────────────────────────
-# Encoders
-# ─────────────────────────────────────────────────────────────────────────
-
-def _png_bytes(frame) -> bytes:
-    buf = io.BytesIO()
-    frame.save(buf, format="PNG")
-    return buf.getvalue()
-
-
-def _kitty_apc(ctrl: str, data: str) -> str:
-    """Emit a kitty APC escape for *data*, chunked into ≤4096-byte ``m`` pieces."""
-    chunk = 4096
-    if len(data) <= chunk:
-        return f"\x1b_G{ctrl},m=0;{data}\x1b\\"
-    out = [f"\x1b_G{ctrl},m=1;{data[:chunk]}\x1b\\"]
-    rest = data[chunk:]
-    while rest:
-        piece, rest = rest[:chunk], rest[chunk:]
-        out.append(f"\x1b_Gm={1 if rest else 0};{piece}\x1b\\")
-    return "".join(out)
-
-
-def _encode_kitty(frame, *, cell_cols: int | None = None, cell_rows: int | None = None) -> str:
-    """Encode one frame via the kitty graphics protocol (transmit + display).
-
-    ``a=T`` transmits & displays at the cursor; ``c``/``r`` request a display
-    box in terminal cells so successive frames overwrite the same area.
-    """
-    ctrl = "f=100,a=T,q=2"
-    if cell_cols:
-        ctrl += f",c={cell_cols}"
-    if cell_rows:
-        ctrl += f",r={cell_rows}"
-    return _kitty_apc(ctrl, base64.standard_b64encode(_png_bytes(frame)).decode("ascii"))
-
-
-# ─────────────────────────────────────────────────────────────────────────
-# kitty Unicode placeholders
-#
-# Ink (the TUI's React-for-terminal layer) owns the screen and measures every
-# cell's width, so it can't host raw kitty image escapes (no width to count,
-# clobbered on the next repaint). kitty's *Unicode placeholder* protocol is the
-# grid-safe path: transmit the image once (q=2, virtual placement U=1), then the
-# host app prints ordinary-width placeholder cells (U+10EEEE + diacritics) whose
-# foreground color encodes the image id. Ink counts those as width-1 text, so
-# layout stays correct and the terminal paints the image underneath.
-#   https://sw.kovidgoyal.net/kitty/graphics-protocol/#unicode-placeholders
-# ─────────────────────────────────────────────────────────────────────────
-
-_KITTY_PLACEHOLDER = "\U0010eeee"
-
-# Row/column diacritics, in order (index → diacritic). Verbatim from kitty's
-# gen/rowcolumn-diacritics.txt (Unicode 6.0.0, combining class 230). Index i is
-# the diacritic that encodes the number i; we only ever need the row index.
-_ROWCOL_DIACRITICS: tuple[int, ...] = (
-    0x0305, 0x030D, 0x030E, 0x0310, 0x0312, 0x033D, 0x033E, 0x033F, 0x0346, 0x034A,
-    0x034B, 0x034C, 0x0350, 0x0351, 0x0352, 0x0357, 0x035B, 0x0363, 0x0364, 0x0365,
-    0x0366, 0x0367, 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F,
-    0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0592, 0x0593, 0x0594, 0x0595, 0x0597,
-    0x0598, 0x0599, 0x059C, 0x059D, 0x059E, 0x059F, 0x05A0, 0x05A1, 0x05A8, 0x05A9,
-    0x05AB, 0x05AC, 0x05AF, 0x05C4, 0x0610, 0x0611, 0x0612, 0x0613, 0x0614, 0x0615,
-    0x0616, 0x0617, 0x0657, 0x0658, 0x0659, 0x065A, 0x065B, 0x065D, 0x065E, 0x06D6,
-    0x06D7, 0x06D8, 0x06D9, 0x06DA, 0x06DB, 0x06DC, 0x06DF, 0x06E0, 0x06E1, 0x06E2,
-    0x06E4, 0x06E7, 0x06E8, 0x06EB, 0x06EC, 0x0730, 0x0732, 0x0733, 0x0735, 0x0736,
-    0x073A, 0x073D, 0x073F, 0x0740, 0x0741, 0x0743, 0x0745, 0x0747, 0x0749, 0x074A,
-    0x07EB, 0x07EC, 0x07ED, 0x07EE, 0x07EF, 0x07F0, 0x07F1, 0x07F3, 0x0816, 0x0817,
-    0x0818, 0x0819, 0x081B, 0x081C, 0x081D, 0x081E, 0x081F, 0x0820, 0x0821, 0x0822,
-    0x0823, 0x0825, 0x0826, 0x0827, 0x0829, 0x082A, 0x082B, 0x082C, 0x082D, 0x0951,
-    0x0953, 0x0954, 0x0F82, 0x0F83, 0x0F86, 0x0F87, 0x135D, 0x135E, 0x135F, 0x17DD,
-    0x193A, 0x1A17, 0x1A75, 0x1A76, 0x1A77, 0x1A78, 0x1A79, 0x1A7A, 0x1A7B, 0x1A7C,
-    0x1B6B, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73, 0x1CD0, 0x1CD1,
-    0x1CD2, 0x1CDA, 0x1CDB, 0x1CE0, 0x1DC0, 0x1DC1, 0x1DC3, 0x1DC4, 0x1DC5, 0x1DC6,
-    0x1DC7, 0x1DC8, 0x1DC9, 0x1DCB, 0x1DCC, 0x1DD1, 0x1DD2, 0x1DD3, 0x1DD4, 0x1DD5,
-    0x1DD6, 0x1DD7, 0x1DD8, 0x1DD9, 0x1DDA, 0x1DDB, 0x1DDC, 0x1DDD, 0x1DDE, 0x1DDF,
-    0x1DE0, 0x1DE1, 0x1DE2, 0x1DE3, 0x1DE4, 0x1DE5, 0x1DE6, 0x1DFE, 0x20D0, 0x20D1,
-    0x20D4, 0x20D5, 0x20D6, 0x20D7, 0x20DB, 0x20DC, 0x20E1, 0x20E7, 0x20E9, 0x20F0,
-    0x2CEF, 0x2CF0, 0x2CF1, 0x2DE0, 0x2DE1, 0x2DE2, 0x2DE3, 0x2DE4, 0x2DE5, 0x2DE6,
-    0x2DE7, 0x2DE8, 0x2DE9, 0x2DEA, 0x2DEB, 0x2DEC, 0x2DED, 0x2DEE, 0x2DEF, 0x2DF0,
-    0x2DF1, 0x2DF2, 0x2DF3, 0x2DF4, 0x2DF5, 0x2DF6, 0x2DF7, 0x2DF8, 0x2DF9, 0x2DFA,
-    0x2DFB, 0x2DFC, 0x2DFD, 0x2DFE, 0x2DFF, 0xA66F, 0xA67C, 0xA67D, 0xA6F0, 0xA6F1,
-    0xA8E0, 0xA8E1, 0xA8E2, 0xA8E3, 0xA8E4, 0xA8E5, 0xA8E6, 0xA8E7, 0xA8E8, 0xA8E9,
-    0xA8EA, 0xA8EB, 0xA8EC, 0xA8ED, 0xA8EE, 0xA8EF, 0xA8F0, 0xA8F1, 0xAAB0, 0xAAB2,
-    0xAAB3, 0xAAB7, 0xAAB8, 0xAABE, 0xAABF, 0xAAC1, 0xFE20, 0xFE21, 0xFE22, 0xFE23,
-    0xFE24, 0xFE25, 0xFE26, 0x10A0F, 0x10A38, 0x1D185, 0x1D186, 0x1D187, 0x1D188,
-    0x1D189, 0x1D1AA, 0x1D1AB, 0x1D1AC, 0x1D1AD, 0x1D242, 0x1D243, 0x1D244,
-)
-
-
-def kitty_image_id(slug: str) -> int:
-    """Stable per-pet image id in ``[1, 0x7FFF]``.
-
-    The id is encoded in the placeholder's 24-bit foreground color, so it must
-    be non-zero and fit comfortably under ``0xFFFFFF``. A small CRC keeps it
-    deterministic per slug (so re-renders reuse the same terminal-side image)
-    while making collisions between two different pets unlikely.
-    """
-    import zlib
-
-    return (zlib.crc32(slug.encode("utf-8")) % 0x7FFE) + 1
-
-
-def kitty_color_hex(image_id: int) -> str:
-    """Hex foreground color (``#rrggbb``) that encodes *image_id* for kitty."""
-    return "#%06x" % (image_id & 0xFFFFFF)
-
-
-def kitty_placeholder_rows(cols: int, rows: int) -> list[str]:
-    """Build the placeholder text grid for an *rows*×*cols* image.
-
-    Each line is one row of the grid: the first cell carries the row diacritic
-    (column defaults to 0), and the remaining ``cols-1`` bare placeholders let
-    the terminal auto-increment the column. The foreground color (the image id)
-    is applied by the caller / Ink, not embedded here.
-    """
-    cols = max(1, cols)
-    out: list[str] = []
-    for r in range(max(1, rows)):
-        idx = min(r, len(_ROWCOL_DIACRITICS) - 1)
-        first = _KITTY_PLACEHOLDER + chr(_ROWCOL_DIACRITICS[idx])
-        out.append(first + _KITTY_PLACEHOLDER * (cols - 1))
-    return out
-
-
-def _encode_kitty_virtual(frame, *, image_id: int, cols: int, rows: int) -> str:
-    """Transmit a frame as a kitty *virtual* placement for Unicode placeholders.
-
-    ``a=T`` transmits and creates the placement in one shot; ``U=1`` marks it
-    virtual (no on-screen output, cursor untouched); ``q=2`` suppresses the
-    terminal's OK/error replies that would otherwise corrupt the host app's
-    output. Re-sending with the same ``i`` replaces the image, so the static
-    placeholder cells animate underneath.
-    """
-    ctrl = f"a=T,U=1,i={image_id},c={cols},r={rows},f=100,q=2"
-    return _kitty_apc(ctrl, base64.standard_b64encode(_png_bytes(frame)).decode("ascii"))
-
-
-def _encode_iterm(frame, *, cell_cols: int | None = None, cell_rows: int | None = None) -> str:
-    """Encode one frame as an iTerm2 inline image (OSC 1337 File)."""
-    payload = base64.standard_b64encode(_png_bytes(frame)).decode("ascii")
-    size = len(payload)
-    args = [f"inline=1", f"size={size}", "preserveAspectRatio=1"]
-    if cell_cols:
-        args.append(f"width={cell_cols}")
-    if cell_rows:
-        args.append(f"height={cell_rows}")
-    return f"\x1b]1337;File={';'.join(args)}:{payload}\x07"
-
-
-def _encode_sixel(frame) -> str:
-    """Encode one frame as DEC sixel.
-
-    Quantizes to an adaptive palette (≤255 colors) and emits the sixel band
-    stream.  Pillow has no sixel writer, so this is a compact hand-rolled
-    encoder.  Transparent pixels render as background (color register skipped).
-    """
-    from PIL import Image
-
-    rgba = frame
-    # Composite onto transparent-as-skip: track alpha to decide background.
-    pal = rgba.convert("RGB").quantize(colors=255, method=Image.MEDIANCUT)
-    palette = pal.getpalette() or []
-    px = pal.load()
-    alpha = rgba.getchannel("A").load()
-    w, h = pal.size
-
-    out = ["\x1bP0;1;0q", '"1;1;%d;%d' % (w, h)]
-    # Color register definitions (sixel uses 0..100 scale).
-    used = sorted({px[x, y] for y in range(h) for x in range(w)})
-    for idx in used:
-        r = palette[idx * 3] if idx * 3 < len(palette) else 0
-        g = palette[idx * 3 + 1] if idx * 3 + 1 < len(palette) else 0
-        b = palette[idx * 3 + 2] if idx * 3 + 2 < len(palette) else 0
-        out.append("#%d;2;%d;%d;%d" % (idx, r * 100 // 255, g * 100 // 255, b * 100 // 255))
-
-    # Emit in 6-row bands.
-    for band in range(0, h, 6):
-        for color_idx in used:
-            line = ["#%d" % color_idx]
-            run_char = None
-            run_len = 0
-
-            def flush():
-                nonlocal run_char, run_len
-                if run_char is None:
-                    return
-                if run_len > 3:
-                    line.append("!%d%s" % (run_len, run_char))
-                else:
-                    line.append(run_char * run_len)
-                run_char, run_len = None, 0
-
-            for x in range(w):
-                bits = 0
-                for bit in range(6):
-                    y = band + bit
-                    if y < h and alpha[x, y] > 32 and px[x, y] == color_idx:
-                        bits |= 1 << bit
-                ch = chr(63 + bits)
-                if ch == run_char:
-                    run_len += 1
-                else:
-                    flush()
-                    run_char, run_len = ch, 1
-            flush()
-            out.append("".join(line) + "$")  # carriage return within band
-        out.append("-")  # next band
-    out.append("\x1b\\")
-    return "".join(out)
-
-
-_HALF_BLOCK = "▀"
-
-# A single half-block cell: top pixel + bottom pixel as (r, g, b, a) tuples.
-Cell = tuple[tuple[int, int, int, int], tuple[int, int, int, int]]
-
-
-def _downscale_cells(frame, *, target_cols: int) -> list[list[Cell]]:
-    """Downscale a frame to a grid of half-block cells.
-
-    Each cell pairs a top and bottom pixel so one terminal row encodes two
-    pixel rows.  Returns rows of ``((tr,tg,tb,ta),(br,bg,bb,ba))`` — the
-    framework-neutral representation shared by the ANSI encoder (CLI) and the
-    structured ``cells`` API (Ink).
-    """
-    from PIL import Image
-
-    target_cols = max(4, target_cols)
-    aspect = frame.height / max(1, frame.width)
-    target_rows = max(2, int(round(target_cols * aspect * 0.5)) * 2)
-    small = frame.resize((target_cols, target_rows), Image.LANCZOS).convert("RGBA")
-    px = small.load()
-
-    grid: list[list[Cell]] = []
-    for y in range(0, target_rows, 2):
-        row: list[Cell] = []
-        for x in range(target_cols):
-            top = px[x, y]
-            bottom = px[x, y + 1] if y + 1 < target_rows else (0, 0, 0, 0)
-            row.append((top, bottom))
-        grid.append(row)
-    return grid
-
-
-def _encode_unicode(frame, *, target_cols: int) -> str:
-    """Downscale to truecolor ANSI half-blocks (one char = 2 vertical pixels)."""
-    lines: list[str] = []
-    for row in _downscale_cells(frame, target_cols=target_cols):
-        cells: list[str] = []
-        for (tr, tg, tb, ta), (br, bg, bb, ba) in row:
-            if ta < 32 and ba < 32:
-                cells.append("\x1b[0m ")  # fully transparent → blank
-                continue
-            cells.append(f"\x1b[38;2;{tr};{tg};{tb}m\x1b[48;2;{br};{bg};{bb}m{_HALF_BLOCK}")
-        lines.append("".join(cells) + "\x1b[0m")
-    return "\n".join(lines)
-
-
-# ─────────────────────────────────────────────────────────────────────────
-# Public renderer
-# ─────────────────────────────────────────────────────────────────────────
-
-class PetRenderer:
-    """Holds a pet's spritesheet and yields encoded frames per (state, index).
-
-    Construct once per pet, then call :meth:`frame` on an animation timer.
-    Cheap to call repeatedly — decoded frames are cached.
-    """
-
-    def __init__(
-        self,
-        spritesheet: str | Path,
-        *,
-        mode: str = "unicode",
-        scale: float = DEFAULT_SCALE,
-        unicode_cols: int = 20,
-        frame_w: int = FRAME_W,
-        frame_h: int = FRAME_H,
-        frames_per_state: int = FRAMES_PER_STATE,
-    ) -> None:
-        self.spritesheet = str(spritesheet)
-        self.mode = mode if mode in RENDER_MODES else "unicode"
-        self.scale = scale
-        self.unicode_cols = unicode_cols
-        self.frame_w = frame_w
-        self.frame_h = frame_h
-        self.frames_per_state = frames_per_state
-
-    @property
-    def available(self) -> bool:
-        return self.mode != "off" and Path(self.spritesheet).is_file()
-
-    def frame_count(self, state: PetState | str) -> int:
-        return len(self._frames(state))
-
-    def _frames(self, state: PetState | str):
-        value = state.value if isinstance(state, PetState) else str(state)
-        scale_w = max(1, int(self.frame_w * self.scale))
-        scale_h = max(1, int(self.frame_h * self.scale))
-        return _frames_for(
-            self.spritesheet,
-            value,
-            self.frame_w,
-            self.frame_h,
-            self.frames_per_state,
-            scale_w,
-            scale_h,
-        )
-
-    def cells(self, state: PetState | str, index: int, *, cols: int | None = None) -> list[list[Cell]]:
-        """Return one frame as a half-block cell grid (framework-neutral).
-
-        Used by the TUI, which renders the grid with native Ink color props
-        instead of raw ANSI.  Returns ``[]`` when no frame is available.
-        """
-        frames = self._frames(state)
-        if not frames:
-            return []
-        frame = frames[index % len(frames)]
-        return _downscale_cells(frame, target_cols=cols or self.unicode_cols)
-
-    def _cell_box(self, frame) -> tuple[int, int]:
-        """Terminal cell box for a scaled frame (~8×16 px per cell).
-
-        Must match :meth:`frame` graphics sizing — kitty stretches the image to
-        fill ``c``×``r`` cells, so these must reflect the scaled pixel
-        dimensions, not a native-aspect column count (that upscales small pets).
-        """
-        return max(1, frame.width // 8), max(1, frame.height // 16)
-
-    def kitty_payload(self, state: PetState | str, *, image_id: int) -> dict | None:
-        """Build the kitty Unicode-placeholder payload for one state.
-
-        Returns ``{cols, rows, placeholder, frames}`` where ``frames`` is a
-        list of transmit escapes (one per animation frame, all reusing
-        ``image_id``) and ``placeholder`` is the static text grid Ink paints.
-        Placement geometry is derived from the scaled frame pixels (via
-        :meth:`_cell_box`), not ``unicode_cols`` — kitty upscales to fill
-        ``c``×``r`` cells. ``None`` when no frame is available.
-        """
-        frames = self._frames(state)
-        if not frames:
-            return None
-        cols, rows = self._cell_box(frames[0])
-        return {
-            "cols": cols,
-            "rows": rows,
-            "placeholder": kitty_placeholder_rows(cols, rows),
-            "frames": [
-                _encode_kitty_virtual(f, image_id=image_id, cols=cols, rows=rows) for f in frames
-            ],
-        }
-
-    def frame(self, state: PetState | str, index: int) -> str:
-        """Return the encoded escape string for one frame, or ``""``.
-
-        ``index`` is taken modulo the available frame count so callers can pass
-        a free-running counter.
-        """
-        if self.mode == "off":
-            return ""
-        frames = self._frames(state)
-        if not frames:
-            return ""
-        frame = frames[index % len(frames)]
-        cell_cols, cell_rows = self._cell_box(frame)
-
-        try:
-            if self.mode == "kitty":
-                return _encode_kitty(frame, cell_cols=cell_cols, cell_rows=cell_rows)
-            if self.mode == "iterm":
-                return _encode_iterm(frame, cell_cols=cell_cols, cell_rows=cell_rows)
-            if self.mode == "sixel":
-                return _encode_sixel(frame)
-            return _encode_unicode(frame, target_cols=self.unicode_cols)
-        except Exception as exc:  # noqa: BLE001 - degrade silently
-            logger.debug("pet frame encode failed (mode=%s): %s", self.mode, exc)
-            return ""
-
-
-def build_renderer(
-    spritesheet: str | Path,
-    *,
-    configured_mode: str | None = None,
-    scale: float = DEFAULT_SCALE,
-    unicode_cols: int = 20,
-    stream=None,
-) -> PetRenderer:
-    """Convenience factory: resolve the mode from config+env, then construct."""
-    mode = resolve_mode(configured_mode, stream=stream)
-    return PetRenderer(
-        spritesheet,
-        mode=mode,
-        scale=scale,
-        unicode_cols=unicode_cols,
-    )
--- a/agent/pet/state.py
+++ b/agent/pet/state.py
@@ -1,81 +0,0 @@
-"""Map agent activity → a :class:`PetState`.
-
-This is the one place the "what is the agent doing right now?" → "which
-animation row?" decision lives.  Each surface feeds it the signals it already
-tracks:
-
- CLI    — ``KawaiiSpinner`` waiting/thinking state + tool outcomes.
- TUI    — gateway ``tool.start/complete`` + ``message.delta/complete`` events.
- Desktop — the ``$busy``/``$awaitingResponse``/tool-event nanostores
-            (re-implemented in TS, but mirroring this priority order).
-
-Keeping the priority order here (and documenting it) lets the TypeScript
-mirror stay faithful without a second design.
-"""
-
-from __future__ import annotations
-
-from collections.abc import Iterable
-from typing import Any
-
-from agent.pet.constants import PetState
-
-
-def todos_all_done(todos: Iterable[Any] | None) -> bool:
-    """True iff there's ≥1 todo and every one is completed/cancelled.
-
-    The "celebrate" beat (``JUMP``) fires when a plan finishes; this mirrors
-    the TUI's ``isTodoDone`` so the trigger is defined once across surfaces.
-    Accepts dicts (``{"status": ...}``) or objects with a ``status`` attr.
-    """
-    items = list(todos or [])
-    if not items:
-        return False
-
-    def _status(t: Any) -> Any:
-        return t.get("status") if isinstance(t, dict) else getattr(t, "status", None)
-
-    return all(_status(t) in ("completed", "cancelled") for t in items)
-
-
-def derive_pet_state(
-    *,
-    busy: bool = False,
-    awaiting_input: bool = False,
-    error: bool = False,
-    celebrate: bool = False,
-    just_completed: bool = False,
-    tool_running: bool = False,
-    reasoning: bool = False,
-) -> PetState:
-    """Resolve the animation state from coarse activity signals.
-
-    Priority (highest first) — only one row can show at a time, so the most
-    salient signal wins:
-
-    1. ``error``          → ``FAILED``  (a tool/turn just failed)
-    2. ``celebrate``      → ``JUMP``    (explicit success beat, e.g. todos done)
-    3. ``just_completed`` → ``WAVE``    (turn finished cleanly / greeting)
-    4. ``awaiting_input`` → ``WAITING`` (blocked on the user — a clarify/approval
-       prompt is open; this outranks the in-flight signals below because the turn
-       is paused on *you*, even though a tool is technically mid-call)
-    5. ``tool_running``   → ``RUN``     (a tool is executing)
-    6. ``reasoning``      → ``REVIEW``  (model is thinking / reading)
-    7. ``busy``           → ``RUN``     (turn in flight, unspecified work)
-    8. otherwise          → ``IDLE``
-    """
-    if error:
-        return PetState.FAILED
-    if celebrate:
-        return PetState.JUMP
-    if just_completed:
-        return PetState.WAVE
-    if awaiting_input:
-        return PetState.WAITING
-    if tool_running:
-        return PetState.RUN
-    if reasoning:
-        return PetState.REVIEW
-    if busy:
-        return PetState.RUN
-    return PetState.IDLE
--- a/agent/pet/store.py
+++ b/agent/pet/store.py
@@ -1,503 +0,0 @@
-"""On-disk pet store — install / list / resolve pets.
-
-Pets live under ``get_hermes_home()/pets/<slug>/`` so every profile gets its
-own set (we deliberately do **not** reuse petdex's ``~/.codex/pets`` default —
-that's owned by the petdex npm CLI and isn't profile-aware).  Each installed
-pet directory holds:
-
-    pets/<slug>/
-        pet.json            # {id, displayName, description, spritesheetPath}
-        spritesheet.webp    # (or .png)
-
-The active pet is resolved from the caller-supplied ``display.pet.slug`` config
-value (falling back to the first installed pet), so this module stays free of
-the config loader.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import re
-from dataclasses import dataclass
-from pathlib import Path
-
-from hermes_constants import get_hermes_home
-
-logger = logging.getLogger(__name__)
-
-_DOWNLOAD_TIMEOUT = 60.0
-
-
-class PetStoreError(RuntimeError):
-    """Raised on install/IO failures."""
-
-
-@dataclass(frozen=True)
-class InstalledPet:
-    """A pet present on disk."""
-
-    slug: str
-    display_name: str
-    description: str
-    directory: Path
-    spritesheet: Path
-    created_by: str = ""  # "generator" for pets hatched locally; "" for petdex installs
-
-    @property
-    def exists(self) -> bool:
-        return self.spritesheet.is_file()
-
-    @property
-    def generated(self) -> bool:
-        return self.created_by == "generator"
-
-
-def pets_dir() -> Path:
-    """Return the profile-scoped pets directory (created on demand)."""
-    path = get_hermes_home() / "pets"
-    path.mkdir(parents=True, exist_ok=True)
-    return path
-
-
-def _read_pet_json(directory: Path) -> dict:
-    pet_json = directory / "pet.json"
-    if not pet_json.is_file():
-        return {}
-    try:
-        return json.loads(pet_json.read_text(encoding="utf-8"))
-    except (OSError, ValueError) as exc:
-        logger.debug("unreadable pet.json in %s: %s", directory, exc)
-        return {}
-
-
-def _resolve_spritesheet(directory: Path, meta: dict) -> Path:
-    """Find the spritesheet for a pet dir.
-
-    Honors ``spritesheetPath`` from pet.json, else probes the conventional
-    filenames (``spritesheet.{webp,png}`` and petdex R2's ``sprite.webp``).
-    """
-    declared = str(meta.get("spritesheetPath", "") or "").strip()
-    if declared:
-        candidate = directory / declared
-        if candidate.is_file():
-            return candidate
-    for name in ("spritesheet.webp", "spritesheet.png", "sprite.webp", "sprite.png"):
-        candidate = directory / name
-        if candidate.is_file():
-            return candidate
-    # Default expectation even if missing, so callers get a stable path.
-    return directory / "spritesheet.webp"
-
-
-def _safe_slug(slug: str) -> str:
-    """Normalize a slug to a single bare path segment.
-
-    Pet slugs index into ``pets_dir()/<slug>/`` for load/remove, so a value
-    carrying path separators (``../``, absolute paths) could escape the pets
-    directory. Strip every separator and reject ``.``/``..`` so callers can
-    only ever name a direct child of the pets directory.
-    """
-    segment = Path(str(slug).strip()).name
-    if segment in ("", ".", ".."):
-        return ""
-    return segment
-
-
-def load_pet(slug: str) -> InstalledPet | None:
-    """Return the :class:`InstalledPet` for *slug*, or ``None`` if absent."""
-    slug = _safe_slug(slug)
-    if not slug:
-        return None
-    directory = pets_dir() / slug
-    if not directory.is_dir():
-        return None
-    meta = _read_pet_json(directory)
-    return InstalledPet(
-        slug=slug,
-        display_name=str(meta.get("displayName", "") or slug),
-        description=str(meta.get("description", "") or ""),
-        directory=directory,
-        spritesheet=_resolve_spritesheet(directory, meta),
-        created_by=str(meta.get("createdBy", "") or ""),
-    )
-
-
-def installed_pets() -> list[InstalledPet]:
-    """Return every installed pet (dirs containing a usable spritesheet)."""
-    out: list[InstalledPet] = []
-    for child in sorted(pets_dir().iterdir()):
-        if not child.is_dir():
-            continue
-        pet = load_pet(child.name)
-        if pet and pet.exists:
-            out.append(pet)
-    return out
-
-
-def resolve_active_pet(configured_slug: str | None = None) -> InstalledPet | None:
-    """Resolve which pet to display.
-
-    Precedence: the configured slug (``display.pet.slug``) if it's installed,
-    otherwise the first installed pet alphabetically, otherwise ``None``.
-    """
-    if configured_slug:
-        pet = load_pet(configured_slug.strip())
-        if pet and pet.exists:
-            return pet
-    pets = installed_pets()
-    return pets[0] if pets else None
-
-
-def install_pet(slug: str, *, force: bool = False, timeout: float = _DOWNLOAD_TIMEOUT) -> InstalledPet:
-    """Download *slug* from the manifest into the pets directory.
-
-    Idempotent: a fully-installed pet is returned as-is unless *force*.  Raises
-    :class:`PetStoreError` / :class:`~agent.pet.manifest.ManifestError` on
-    failure.
-    """
-    from agent.pet.manifest import find_entry
-
-    slug = _safe_slug(slug)
-    if not slug:
-        raise PetStoreError("invalid pet slug")
-    existing = load_pet(slug)
-    if existing and existing.exists and not force:
-        return existing
-
-    entry = find_entry(slug, timeout=timeout)
-    if entry is None:
-        raise PetStoreError(f"pet '{slug}' is not in the petdex manifest")
-
-    # Host-pin every asset URL to petdex. The manifest is trusted (HTTPS from
-    # petdex.dev), but pin the asset hosts too so a compromised/spoofed manifest
-    # can't redirect the download at an arbitrary host. Matches thumbnail_png.
-    if not _is_petdex_host(entry.spritesheet_url):
-        raise PetStoreError(f"refusing non-petdex spritesheet host for '{slug}'")
-
-    directory = pets_dir() / slug
-    directory.mkdir(parents=True, exist_ok=True)
-
-    sprite_ext = ".png" if entry.spritesheet_url.lower().split("?")[0].endswith(".png") else ".webp"
-    sprite_path = directory / f"spritesheet{sprite_ext}"
-
-    _download(entry.spritesheet_url, sprite_path, timeout=timeout)
-
-    # Fetch the upstream pet.json if present; otherwise synthesize a minimal
-    # one so the local layout is self-describing.
-    meta: dict = {}
-    if entry.pet_json_url and _is_petdex_host(entry.pet_json_url):
-        try:
-            meta = _download_json(entry.pet_json_url, timeout=timeout)
-        except Exception as exc:  # noqa: BLE001 - non-fatal, fall back below
-            logger.debug("pet.json fetch failed for %s: %s", slug, exc)
-    if not isinstance(meta, dict) or not meta:
-        meta = {"id": slug, "displayName": entry.display_name, "description": ""}
-    meta["spritesheetPath"] = sprite_path.name
-    meta.setdefault("id", slug)
-    meta.setdefault("displayName", entry.display_name)
-    (directory / "pet.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
-
-    pet = load_pet(slug)
-    if pet is None or not pet.exists:
-        raise PetStoreError(f"install of '{slug}' did not produce a spritesheet")
-    return pet
-
-
-def slugify(name: str) -> str:
-    """Lowercase, hyphenate, and strip a display name into a filesystem slug."""
-    slug = re.sub(r"[^a-z0-9]+", "-", (name or "").strip().lower()).strip("-")
-    return slug or "pet"
-
-
-def unique_slug(name: str) -> str:
-    """A :func:`slugify` result that doesn't collide with an existing pet dir."""
-    base = slugify(name)
-    slug = base
-    counter = 2
-    while (pets_dir() / slug).exists():
-        slug = f"{base}-{counter}"
-        counter += 1
-    return slug
-
-
-def _write_spritesheet(source, dest: Path) -> None:
-    """Write *source* (PIL image, bytes, or path) as a lossless WebP at *dest*."""
-    if isinstance(source, (bytes, bytearray)):
-        dest.write_bytes(bytes(source))
-        return
-
-    from PIL import Image
-
-    if isinstance(source, (str, Path)):
-        with Image.open(source) as opened:
-            image = opened.convert("RGBA")
-    else:
-        image = source.convert("RGBA")
-    image.save(dest, format="WEBP", lossless=True, quality=100, method=6, exact=True)
-
-
-def register_local_pet(
-    spritesheet,
-    *,
-    slug: str,
-    display_name: str = "",
-    description: str = "",
-) -> InstalledPet:
-    """Write a locally-generated pet into the store and return it.
-
-    *spritesheet* may be a PIL image, raw WebP/PNG bytes, or a path. The pet
-    appears in :func:`installed_pets` immediately, and because :func:`install_pet`
-    returns an already-on-disk pet before consulting the manifest, it can be
-    adopted (``pet.select`` / ``/pet <slug>``) without a manifest entry.
-    """
-    slug = slugify(slug)
-    directory = pets_dir() / slug
-    directory.mkdir(parents=True, exist_ok=True)
-    sprite_path = directory / "spritesheet.webp"
-    try:
-        _write_spritesheet(spritesheet, sprite_path)
-    except Exception as exc:  # noqa: BLE001 - normalize to one error type
-        raise PetStoreError(f"could not write spritesheet for '{slug}': {exc}") from exc
-
-    meta = {
-        "id": slug,
-        "displayName": display_name or slug,
-        "description": description or "",
-        "spritesheetPath": sprite_path.name,
-        "createdBy": "generator",
-    }
-    (directory / "pet.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
-
-    pet = load_pet(slug)
-    if pet is None or not pet.exists:
-        raise PetStoreError(f"register of generated pet '{slug}' did not produce a spritesheet")
-    return pet
-
-
-def export_pet(slug: str) -> tuple[str, bytes]:
-    """Zip an installed pet's folder (pet.json + spritesheet) → (filename, bytes).
-
-    Dotfiles (cached thumbs, backups) are skipped so the archive is a clean,
-    re-importable pet package. Raises :class:`PetStoreError` if not installed.
-    """
-    import io
-    import zipfile
-
-    root = pets_dir()
-    directory = root / slug.strip()
-    # Guard against traversal: the target must be a direct child of pets_dir.
-    if directory.resolve().parent != root.resolve() or not directory.is_dir():
-        raise PetStoreError(f"pet '{slug}' is not installed")
-
-    name = directory.name
-    buf = io.BytesIO()
-    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as archive:
-        for path in sorted(directory.iterdir()):
-            if path.is_file() and not path.name.startswith("."):
-                archive.write(path, f"{name}/{path.name}")
-    return f"{name}.zip", buf.getvalue()
-
-
-_THUMB_FRAME_W = 192
-_THUMB_FRAME_H = 208
-_THUMB_W = 96  # rendered ~40px; 2x+ keeps it crisp on HiDPI
-
-
-def _thumbs_dir() -> Path:
-    path = pets_dir() / ".thumbs"
-    path.mkdir(parents=True, exist_ok=True)
-    return path
-
-
-def _is_petdex_host(url: str) -> bool:
-    """True only for petdex.dev hosts — bounds server-side fetch (anti-SSRF)."""
-    from urllib.parse import urlparse
-
-    try:
-        host = (urlparse(url).hostname or "").lower()
-    except ValueError:
-        return False
-    return host == "petdex.dev" or host.endswith(".petdex.dev")
-
-
-def thumbnail_png(slug: str, *, source_url: str = "", timeout: float = 30.0) -> bytes | None:
-    """Return a small idle-frame PNG for *slug*, cached on disk.
-
-    Crops the top-left (idle, frame 0) cell of the spritesheet and downsamples
-    it to a thumbnail. Source preference: an installed spritesheet on disk, else
-    *source_url* — but only when it points at petdex (so the gateway never
-    fetches an arbitrary client-supplied URL). Returns ``None`` when there's no
-    usable source or Pillow/network fails; callers render a placeholder.
-
-    Doing this server-side sidesteps the renderer's CSP / R2 hotlink limits that
-    break a direct ``<img src=cdn>`` and lets the result ride the authenticated
-    gateway as a same-origin data URL.
-    """
-    slug = slug.strip()
-    if not slug:
-        return None
-
-    cache = _thumbs_dir() / f"{slug}.png"
-    if cache.is_file():
-        try:
-            return cache.read_bytes()
-        except OSError:
-            pass
-
-    sheet_bytes: bytes | None = None
-    pet = load_pet(slug)
-    if pet and pet.exists:
-        try:
-            sheet_bytes = pet.spritesheet.read_bytes()
-        except OSError:
-            sheet_bytes = None
-
-    if sheet_bytes is None and source_url and _is_petdex_host(source_url):
-        try:
-            import httpx
-
-            resp = httpx.get(
-                source_url,
-                timeout=timeout,
-                follow_redirects=True,
-                headers={"User-Agent": "hermes-agent-petdex"},
-            )
-            resp.raise_for_status()
-            sheet_bytes = resp.content
-        except Exception as exc:  # noqa: BLE001 - cosmetic, degrade to placeholder
-            logger.debug("thumb fetch failed for %s: %s", slug, exc)
-
-    if not sheet_bytes:
-        return None
-
-    try:
-        import io
-
-        from PIL import Image
-
-        with Image.open(io.BytesIO(sheet_bytes)) as im:
-            frame = im.convert("RGBA").crop(
-                (0, 0, min(_THUMB_FRAME_W, im.width), min(_THUMB_FRAME_H, im.height))
-            )
-            height = round(_THUMB_W * _THUMB_FRAME_H / _THUMB_FRAME_W)
-            frame = frame.resize((_THUMB_W, height), Image.NEAREST)
-            buf = io.BytesIO()
-            frame.save(buf, format="PNG")
-            data = buf.getvalue()
-    except Exception as exc:  # noqa: BLE001
-        logger.debug("thumb crop failed for %s: %s", slug, exc)
-        return None
-
-    try:
-        cache.write_bytes(data)
-    except OSError:
-        pass
-    return data
-
-
-def remove_pet(slug: str) -> bool:
-    """Delete an installed pet directory.  Returns True if anything was removed."""
-    import shutil
-
-    slug = _safe_slug(slug)
-    if not slug:
-        return False
-
-    # The cached thumbnail lives in pets/.thumbs/<slug>.png — OUTSIDE the pet
-    # dir, so rmtree won't catch it. Drop it too, or a later pet that reuses this
-    # slug renders this one's stale thumbnail.
-    try:
-        (_thumbs_dir() / f"{slug}.png").unlink(missing_ok=True)
-    except OSError:
-        pass
-
-    directory = pets_dir() / slug
-    if not directory.is_dir():
-        return False
-    shutil.rmtree(directory, ignore_errors=True)
-    return not directory.exists()
-
-
-def rename_pet(slug: str, display_name: str) -> str | None:
-    """Rename a pet's ``displayName`` AND realign its slug/dir to match.
-
-    Generated pets are hatched under a provisional, prompt-derived slug; when
-    the user names the pet on the reveal screen we make that name the real
-    identity so lists/subtitles show what they typed, not the prompt. The dir is
-    renamed to ``slugify(name)`` (and the cached thumbnail moved alongside it)
-    whenever that yields a free, different slug — otherwise the slug is left as
-    is. Returns the resulting slug on success, or ``None`` on failure.
-    """
-    slug = _safe_slug(slug)
-    display_name = (display_name or "").strip()
-    if not slug or not display_name:
-        return None
-    directory = pets_dir() / slug
-    pet_json = directory / "pet.json"
-    if not pet_json.is_file():
-        return None
-    try:
-        meta = json.loads(pet_json.read_text(encoding="utf-8"))
-    except (OSError, ValueError):
-        meta = {}
-    if not isinstance(meta, dict):
-        meta = {}
-    meta["displayName"] = display_name
-
-    new_slug = slug
-    desired = slugify(display_name)
-    if desired and desired != slug and not (pets_dir() / desired).exists():
-        try:
-            directory.rename(pets_dir() / desired)
-            try:
-                (_thumbs_dir() / f"{slug}.png").rename(_thumbs_dir() / f"{desired}.png")
-            except OSError:
-                pass
-            directory = pets_dir() / desired
-            pet_json = directory / "pet.json"
-            new_slug = desired
-            meta["id"] = new_slug
-        except OSError:
-            new_slug = slug  # keep the provisional slug if the move fails
-
-    try:
-        pet_json.write_text(json.dumps(meta, indent=2), encoding="utf-8")
-    except OSError:
-        return None
-    return new_slug
-
-
-def _download(url: str, dest: Path, *, timeout: float) -> None:
-    import httpx
-
-    try:
-        with httpx.stream(
-            "GET",
-            url,
-            timeout=timeout,
-            follow_redirects=True,
-            headers={"User-Agent": "hermes-agent-petdex"},
-        ) as resp:
-            resp.raise_for_status()
-            tmp = dest.with_suffix(dest.suffix + ".part")
-            with tmp.open("wb") as fh:
-                for chunk in resp.iter_bytes():
-                    fh.write(chunk)
-            tmp.replace(dest)
-    except Exception as exc:  # noqa: BLE001
-        raise PetStoreError(f"download failed for {url}: {exc}") from exc
-
-
-def _download_json(url: str, *, timeout: float) -> dict:
-    import httpx
-
-    resp = httpx.get(
-        url,
-        timeout=timeout,
-        follow_redirects=True,
-        headers={"User-Agent": "hermes-agent-petdex"},
-    )
-    resp.raise_for_status()
-    data = resp.json()
-    return data if isinstance(data, dict) else {}
--- a/agent/process_bootstrap.py
+++ b/agent/process_bootstrap.py
@@ -26,7 +26,7 @@ from __future__ import annotations
 import os
 import sys
 import urllib.request
-from typing import Any, Optional
+from typing import Optional

 from utils import base_url_hostname, normalize_proxy_url

@@ -142,46 +142,6 @@ def _get_proxy_for_base_url(base_url: Optional[str]) -> Optional[str]:
    return proxy


-def build_keepalive_http_client(
-    base_url: str = "",
-    *,
-    async_mode: bool = False,
-) -> Optional[Any]:
-    """Build an httpx client for OpenAI SDK calls with env-only proxy policy.
-
-    Uses explicit ``HTTPS_PROXY`` / ``NO_PROXY`` env vars via
-    ``_get_proxy_for_base_url``. A custom transport disables httpx's default
-    ``trust_env`` path, so macOS system proxy settings from
-    ``urllib.request.getproxies()`` (which omit the ExceptionsList) are not
-    applied. Mirrors ``AIAgent._build_keepalive_http_client``.
-    """
-    try:
-        import httpx
-        import socket
-
-        if "api.githubcopilot.com" in str(base_url or "").lower():
-            client_cls = httpx.AsyncClient if async_mode else httpx.Client
-            return client_cls()
-
-        sock_opts = [(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)]
-        if hasattr(socket, "TCP_KEEPIDLE"):
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 30))
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 10))
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3))
-        elif hasattr(socket, "TCP_KEEPALIVE"):
-            sock_opts.append((socket.IPPROTO_TCP, socket.TCP_KEEPALIVE, 30))
-
-        proxy = _get_proxy_for_base_url(base_url)
-        transport_cls = httpx.AsyncHTTPTransport if async_mode else httpx.HTTPTransport
-        client_cls = httpx.AsyncClient if async_mode else httpx.Client
-        return client_cls(
-            transport=transport_cls(socket_options=sock_opts),
-            proxy=proxy,
-        )
-    except Exception:
-        return None
-
-
 def _install_safe_stdio() -> None:
    """Wrap stdout/stderr so best-effort console output cannot crash the agent."""
    for stream_name in ("stdout", "stderr"):
@@ -204,5 +164,4 @@ __all__ = [
    "_install_safe_stdio",
    "_get_proxy_from_env",
    "_get_proxy_for_base_url",
-    "build_keepalive_http_client",
 ]
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -243,10 +243,7 @@ KANBAN_GUIDANCE = (
    "- **Workspace.** `cd $HERMES_KANBAN_WORKSPACE` first. For a `worktree` kind "
    "with no `.git`, `git worktree add <path> "
    "${HERMES_KANBAN_BRANCH:-wt/$HERMES_KANBAN_TASK}` from the main repo, then "
-    "cd there. For a project-linked task the workspace is a fresh "
-    "`<repo>/.worktrees/<task-id>` and `$HERMES_KANBAN_BRANCH` a deterministic "
-    "`<project-slug>/<task-id>` — the main repo is two levels up, so run "
-    "`git worktree add` from there.\n"
+    "cd there.\n"
    "- **Deliverables.** Files a human wants go in "
    "`kanban_complete(artifacts=[<absolute paths>])` (top-level param; paths in "
    "`metadata` are NOT uploaded). Files must exist at completion.\n"
@@ -617,12 +614,7 @@ DEVELOPER_ROLE_MODELS = ("gpt-5", "codex")
 PLATFORM_HINTS = {
    "whatsapp": (
        "You are on a text messaging communication platform, WhatsApp. "
-        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
-        "`code`, ```code blocks```, [links](url)) is auto-converted to "
-        "WhatsApp's native syntax (*bold*, _italic_, ~strike~, monospace) — "
-        "feel free to write in markdown, and use bullet lists ('- item') "
-        "freely. Tables are NOT supported — prefer bullet lists or labeled "
-        "key:value pairs. "
+        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. The file "
        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
@@ -687,11 +679,7 @@ PLATFORM_HINTS = {
    ),
    "signal": (
        "You are on a text messaging communication platform, Signal. "
-        "Standard markdown (**bold**, *italic*, ~~strike~~, # headers, "
-        "`code`, ```code blocks```) is auto-converted to Signal's native "
-        "rich formatting — feel free to write in markdown, and use bullet "
-        "lists ('- item') freely (they render as • bullets). Tables are NOT "
-        "supported — prefer bullet lists or labeled key:value pairs. "
+        "Please do not use markdown as it does not render. "
        "You can send media files natively: to deliver a file to the user, "
        "include MEDIA:/absolute/path/to/file in your response. Images "
        "(.png, .jpg, .webp) appear as photos, audio as attachments, and other "
@@ -721,24 +709,7 @@ PLATFORM_HINTS = {
        "(those are only intercepted on messaging platforms like Telegram, "
        "Discord, Slack, etc.; on the CLI they render as literal text). "
        "When referring to a file you created or changed, just state its "
-        "absolute path in plain text; the user can open it from there. "
-        "Cron jobs scheduled from this session are LOCAL-ONLY: their output is "
-        "saved (viewable via cronjob action='list') but is NOT delivered back "
-        "into this terminal — there is no live-delivery channel here. If the "
-        "user wants to be notified when a job runs, the job's `deliver` must "
-        "target a gateway-connected messaging platform (e.g. deliver='telegram' "
-        "or 'all'). Do not promise the user that a deliver='origin' or "
-        "default-deliver cron job will message them in this session."
-    ),
-    "tui": (
-        "You are running in the Hermes terminal UI (TUI). "
-        "Cron jobs scheduled from this session are LOCAL-ONLY: their output is "
-        "saved (viewable via cronjob action='list') but is NOT delivered back "
-        "into this TUI session — there is no live-delivery channel here. If the "
-        "user wants to be notified when a job runs, the job's `deliver` must "
-        "target a gateway-connected messaging platform (e.g. deliver='telegram' "
-        "or 'all'). Do not promise the user that a deliver='origin' or "
-        "default-deliver cron job will message them in this session."
+        "absolute path in plain text; the user can open it from there."
    ),
    "sms": (
        "You are communicating via SMS. Keep responses concise and use plain text "
@@ -926,7 +897,8 @@ def _probe_remote_backend(env_type: str) -> str | None:
    try:
        # Import locally: tools/ imports are heavy and only relevant when a
        # non-local backend is actually configured.
-        from tools.terminal_tool import _create_environment, _get_env_config  # type: ignore
+        from tools.terminal_tool import _get_env_config  # type: ignore
+        from tools.environments import get_environment  # type: ignore
    except Exception as e:
        logger.debug("Backend probe unavailable (import failed): %s", e)
        _BACKEND_PROBE_CACHE[cache_key] = ""
@@ -934,59 +906,7 @@ def _probe_remote_backend(env_type: str) -> str | None:

    try:
        config = _get_env_config()
-        # Build the environment the same way tools/terminal_tool.py does for a
-        # live command: select the backend image, then assemble ssh/container
-        # config from the env-derived dict. (There is no `get_environment`
-        # factory — the real entry point is `_create_environment`.)
-        if env_type == "docker":
-            image = config.get("docker_image", "")
-        elif env_type == "singularity":
-            image = config.get("singularity_image", "")
-        elif env_type == "modal":
-            image = config.get("modal_image", "")
-        elif env_type == "daytona":
-            image = config.get("daytona_image", "")
-        else:
-            image = ""
-
-        ssh_config = None
-        if env_type == "ssh":
-            ssh_config = {
-                "host": config.get("ssh_host", ""),
-                "user": config.get("ssh_user", ""),
-                "port": config.get("ssh_port", 22),
-                "key": config.get("ssh_key", ""),
-                "persistent": config.get("ssh_persistent", False),
-            }
-
-        container_config = None
-        if env_type in {"docker", "singularity", "modal", "daytona"}:
-            container_config = {
-                "container_cpu": config.get("container_cpu", 1),
-                "container_memory": config.get("container_memory", 5120),
-                "container_disk": config.get("container_disk", 51200),
-                "container_persistent": config.get("container_persistent", True),
-                "modal_mode": config.get("modal_mode", "auto"),
-                "docker_volumes": config.get("docker_volumes", []),
-                "docker_mount_cwd_to_workspace": config.get("docker_mount_cwd_to_workspace", False),
-                "docker_forward_env": config.get("docker_forward_env", []),
-                "docker_env": config.get("docker_env", {}),
-                "docker_run_as_host_user": config.get("docker_run_as_host_user", False),
-                "docker_extra_args": config.get("docker_extra_args", []),
-                "docker_persist_across_processes": config.get("docker_persist_across_processes", True),
-                "docker_orphan_reaper": config.get("docker_orphan_reaper", True),
-            }
-
-        env = _create_environment(
-            env_type=env_type,
-            image=image,
-            cwd=config.get("cwd", ""),
-            timeout=config.get("timeout", 180),
-            ssh_config=ssh_config,
-            container_config=container_config,
-            task_id="prompt-backend-probe",
-            host_cwd=config.get("host_cwd"),
-        )
+        env = get_environment(config)
        # Single-line POSIX probe — works on any Unixy backend. Wrapped in
        # `2>/dev/null` so a missing binary doesn't pollute the output.
        probe_cmd = (
--- a/agent/reasoning_timeouts.py
+++ b/agent/reasoning_timeouts.py
@@ -1,216 +0,0 @@
-"""Per-reasoning-model stale-timeout floor for known reasoning models.
-
-Reasoning models (those that emit extended thinking blocks before their
-first content token) routinely exceed Hermes's default chat-model
-stale detectors:
-
-* Stream stale detector:   ``HERMES_STREAM_STALE_TIMEOUT``     default 180s
-                           ``agent/chat_completion_helpers.py:2544``
-* Non-stream stale detector: ``HERMES_API_CALL_STALE_TIMEOUT``  default 90s
-                           ``run_agent.py:1140``
-
-For NVIDIA Nemotron 3 Ultra on the hosted NIM gateway the empirical
-upstream idle kill is ~120s (first-party reproduction at
-NVIDIA/NemoClaw#4846 — TTFB ~31s, stream dies at 120s). The same
-failure mode exists on OpenAI o1/o3, Anthropic Opus 4.x thinking,
-DeepSeek R1, Qwen QwQ, xAI Grok reasoning — every cloud reasoning
-model hits upstream-proxies / load-balancers with idle timeouts
-shorter than the model's thinking phase. Result: the stale detector
-kills the connection mid-think, surfacing as
-``BrokenPipeError``/``RemoteProtocolError`` on the next read.
-
-This module provides a floor that the existing stale-detector scaling
-blocks consult via :func:`get_reasoning_stale_timeout_floor` and
-apply as ``max(default, floor)``. It is a FLOOR:
-
-* Never overrides explicit user config (``providers.<id>.models.<model>.stale_timeout_seconds``
-  or ``request_timeout_seconds`` already wins — this code never runs
-  in that branch).
-* Never lowers an existing threshold.
-* Has zero effect on non-reasoning models — they are not in the
-  allowlist and the resolver returns ``None``.
-
-Matching uses start-anchored regex on the slug-only component of
-the model name (after stripping any aggregator prefix like
-``openai/``, ``x-ai/``, ``anthropic/``).  The right-anchor matches
-end-of-string or a ``-``/``.``/``_`` slug separator, so ``qwen3-235b``
-matches the ``qwen3`` family entry (a future model slug would be
-``qwen3-235b-instruct`` and would also match) but ``some-other-qwen3``
-does NOT match ``qwen3`` (the ``-qwen3`` is not at start of slug).
-
-The ``o1`` case is the most delicate: a model named
-``llama-4-70b-o1-preview`` is a hypothetical community derivative that
-should NOT trigger the reasoning-model floor for the user (the user
-chose a non-OpenAI model, not a reasoning model).  The start-of-slug
-anchor naturally excludes this — the matched ``o1-preview`` is at
-position 11 of the slug, not at position 0.  The previous substring-
-with-trailing-hyphen design would have over-matched here, which is
-why start-of-slug anchoring is the right shape.
-
-Fixes #52217.
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Optional
-
-
-# (slug, floor_seconds).  Each slug is matched as a discrete
-# word-boundary component via the wrapper regex in ``_match_any``
-# below.  Order is irrelevant — the first regex match wins.
-_REASONING_STALE_TIMEOUT_FLOORS: tuple[tuple[str, int], ...] = (
-    # NVIDIA Nemotron — reasoning models behind hosted NIM with
-    # documented 60-180s upstream idle kill (NVIDIA/NemoClaw#4846:
-    # 120s measured).
-    ("nemotron-3-ultra", 600),
-    ("nemotron-3-super", 600),
-    ("nemotron-3-nano",  300),
-    # DeepSeek — R1 reasoning model on hosted NIM / DeepSeek direct.
-    ("deepseek-r1", 600),
-    ("deepseek-reasoner", 600),
-    # Qwen — QwQ reasoning + Qwen3 thinking variants.  QwQ-32B
-    # preview is the stable slug; ``qwen3`` covers the family of
-    # thinking-mode Qwen3 models (qwen3-235b-a22b, qwen3-32b, etc.)
-    # without over-matching every Qwen3 instruct variant — the
-    # right-anchor requires the slug to be at the start of the
-    # remaining model name, so ``qwen3-235b-instruct`` (instruct is
-    # NOT a thinking variant) would still match.  Acceptable
-    # trade-off: instruct variants of qwen3 get the 180s floor
-    # even though they don't reason.  The cost is a slightly longer
-    # wait on a hung provider; the alternative (matching only
-    # ``qwen3-.*-thinking``) breaks the moment NVIDIA or Alibaba
-    # ships a slightly different naming shape.
-    ("qwq-32b", 300),
-    ("qwen3", 180),
-    # OpenAI o-series — known multi-minute TTFB.  Each variant
-    # enumerated explicitly so bare ``o1`` doesn't over-match
-    # ``olmo-1`` or hypothetical future community derivatives.
-    ("o1", 600),
-    ("o1-mini", 600),
-    ("o1-pro", 600),
-    ("o1-preview", 600),
-    ("o3", 600),
-    ("o3-pro", 600),
-    ("o3-mini", 300),
-    ("o4-mini", 300),
-    # Anthropic Claude 4.x thinking variants.  Anchored at
-    # ``claude-opus-4`` so non-thinking Claude 3.x or future
-    # non-reasoning Claude variants don't match.
-    ("claude-opus-4", 240),
-    ("claude-sonnet-4.5", 180),
-    ("claude-sonnet-4.6", 180),
-    # xAI Grok reasoning variants.  Explicit reasoning-only keys
-    # plus one for the ``non-reasoning`` variant so users picking
-    # the fast variant don't get the 300s floor.  Bare ``grok-3``,
-    # ``grok-4`` etc. don't match — only the explicit reasoning /
-    # non-reasoning pairs.
-    ("grok-4-fast-reasoning", 300),
-    ("grok-4.20-reasoning", 300),
-    ("grok-4-fast-non-reasoning", 180),
-)
-
-
-# Pre-compile each pattern.  Wrapper = start-of-slug + slug + end-or-
-# separator, where ``start-of-slug`` means start-of-string OR
-# immediately after the last ``/`` (aggregator separator) and
-# ``end-or-separator`` means end-of-string OR a ``-``/``.``/``_``.
-#
-# Why start-of-slug and not start-of-string: aggregator prefixes
-# like ``openai/`` should not affect matching — the slug identity is
-# the part after the last ``/``.  Stripping the aggregator prefix in
-# :func:`get_reasoning_stale_timeout_floor` before regex matching
-# gives the wrapper a clean start-of-string anchor.
-#
-# Why end-or-separator on the right: ``openai/o3-mini`` must match
-# the ``o3-mini`` slug (the right anchor is end-of-string).  And
-# ``openai/o3-mini-2025-01-31`` must also match ``o3-mini`` (the right
-# anchor is the ``-`` separator).  But ``openai/o3-mini-fork`` should
-# NOT match ``o3-mini`` if we wanted to exclude forks — though the
-# pattern ``o3-mini-fork`` would be matched as a derivative anyway,
-# so we accept that community forks inheriting the same prefix are
-# treated as reasoning models (a reasonable default — the upstream
-# gateway timing is the same).
-_PATTERN_CACHE: dict[str, re.Pattern[str]] = {}
-
-
-def _get_pattern(slug: str) -> re.Pattern[str]:
-    compiled = _PATTERN_CACHE.get(slug)
-    if compiled is None:
-        compiled = re.compile(
-            r"^"
-            + re.escape(slug)
-            + r"(?:$|[\-._])"
-        )
-        _PATTERN_CACHE[slug] = compiled
-    return compiled
-
-
-def _match_any(model_lower: str) -> Optional[float]:
-    """Return the floor for the first matching slug, else None.
-
-    Each table entry is matched as a start-of-slug prefix with the
-    slug-separator-or-end-of-string right-anchor.  Table iteration
-    order is irrelevant: longest slug wins (so ``o3-mini`` beats
-    ``o3`` on a model like ``openai/o3-mini``).
-    """
-    # Sort by slug length descending so longer / more-specific slugs
-    # win on shared prefixes (o3-mini beats o3).
-    sorted_floors = sorted(
-        _REASONING_STALE_TIMEOUT_FLOORS, key=lambda kv: -len(kv[0])
-    )
-    for slug, floor in sorted_floors:
-        if _get_pattern(slug).search(model_lower):
-            return float(floor)
-    return None
-
-
-def get_reasoning_stale_timeout_floor(model: object) -> Optional[float]:
-    """Return the stale-timeout floor (seconds) for a known reasoning model.
-
-    Returns ``None`` when the model is not in the allowlist or the
-    argument is empty / not a string.  Matching uses
-    word-boundary-anchored regex on the lowercased model name, so
-    ``openai/o3-mini`` matches the ``o3-mini`` slug but
-    ``olmo-1`` does NOT match ``o1`` (the ``o1`` substring is not
-    at a word boundary inside ``olmo-1``).
-
-    Aggregator prefixes (``openai/``, ``x-ai/``, ``anthropic/`` etc.)
-    are preserved through matching — the ``/`` is itself a word
-    boundary, so ``openai/o3-mini`` matches ``o3-mini`` because the
-    ``/`` before ``o3-mini`` satisfies the left-anchor alternation.
-
-    This is a FLOOR — callers must apply it as ``max(default, floor)``
-    and only when no explicit user-configured per-model
-    ``stale_timeout_seconds`` exists.
-
-    >>> get_reasoning_stale_timeout_floor("nvidia/nemotron-3-ultra-550b-a55b")
-    600.0
-    >>> get_reasoning_stale_timeout_floor("openai/o3-mini")
-    300.0
-    >>> get_reasoning_stale_timeout_floor("deepseek/deepseek-r1")
-    600.0
-    >>> get_reasoning_stale_timeout_floor("qwen/qwen3-235b-a22b-thinking")
-    180.0
-    >>> get_reasoning_stale_timeout_floor("x-ai/grok-4-fast-reasoning")
-    300.0
-    >>> get_reasoning_stale_timeout_floor("anthropic/claude-opus-4-6")
-    240.0
-    >>> get_reasoning_stale_timeout_floor("gpt-4o") is None
-    True
-    >>> get_reasoning_stale_timeout_floor("olmo-1") is None
-    True
-    >>> get_reasoning_stale_timeout_floor(None) is None
-    True
-    """
-    if not model or not isinstance(model, str):
-        return None
-    name = model.strip().lower()
-    if not name:
-        return None
-    # Strip aggregator prefix (everything before and including the
-    # last ``/``).  The wrapper regex anchors at start-of-string, so
-    # the slug identity is the bare model name.
-    if "/" in name:
-        name = name.rsplit("/", 1)[1]
-    return _match_any(name)
--- a/agent/redact.py
+++ b/agent/redact.py
@@ -10,7 +10,6 @@ the first 6 and last 4 characters for debuggability.
 import logging
 import os
 import re
-import shlex

 logger = logging.getLogger(__name__)

@@ -108,60 +107,12 @@ _PREFIX_PATTERNS = [
    r"ntn_[A-Za-z0-9]{10,}",            # Notion internal integration token
 ]

-# ENV assignment patterns: KEY=value where KEY contains a secret-like name.
-# Uppercase keys tolerate spaces around "=" (e.g. ``FOO_SECRET = bar``) because
-# an all-caps key is almost never prose/code.
+# ENV assignment patterns: KEY=value where KEY contains a secret-like name
 _SECRET_ENV_NAMES = r"(?:API_?KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIAL|AUTH)"
 _ENV_ASSIGN_RE = re.compile(
    rf"([A-Z0-9_]{{0,50}}{_SECRET_ENV_NAMES}[A-Z0-9_]{{0,50}})\s*=\s*(['\"]?)(\S+)\2",
 )

-# Lowercase / dotted / hyphenated config keys from config files
-# (application.properties, .env, YAML-ish dumps): ``spring.datasource.password=secret``,
-# ``app.api.key=xyz``, ``password=secret``. The uppercase _ENV_ASSIGN_RE above
-# never matched these, so config-file passwords leaked verbatim (issue #16413).
-#
-# These run only in a config-file context, NOT in prose, code, or URLs — three
-# carve-outs preserved from the original design (#4367 + the documented
-# web-URL passthrough below):
-#   1. The value is bounded by ``[^\s&]`` (stops at whitespace AND ``&``) so
-#      form-urlencoded bodies are handled pair-by-pair (by _redact_form_body),
-#      not greedily swallowed.
-#   2. _CFG_DOTTED_RE only matches when the key is NAMESPACED (contains a dot),
-#      which is unambiguously a config key — never a prose word.
-#   3. _CFG_ANCHORED_RE matches a bare secret-word key only at line start
-#      (optionally after ``export``), so conversational ``I have password=foo``
-#      mid-sentence is left alone.
-# The colon-form URL guard (skip when ``://`` present) lives at the call site.
-_SECRET_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential|auth)"
-_CFG_VALUE = r"(['\"]?)([^\s&]+?)\2(?=[\s&]|$)"
-# Namespaced (dotted) key: the secret word may sit anywhere in a dotted path.
-_CFG_DOTTED_RE = re.compile(
-    rf"((?:[A-Za-z0-9_\-]+\.)+[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*"
-    rf"|[A-Za-z0-9_.\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_.\-]*\.[A-Za-z0-9_.\-]+)"
-    rf"={_CFG_VALUE}",
-    re.IGNORECASE,
-)
-# Line-anchored bare key: ``password=…`` / ``export api_key=…`` at start of line.
-_CFG_ANCHORED_RE = re.compile(
-    rf"(^[ \t]*(?:export[ \t]+)?[A-Za-z0-9_\-]*{_SECRET_CFG_NAMES}[A-Za-z0-9_\-]*)={_CFG_VALUE}",
-    re.IGNORECASE | re.MULTILINE,
-)
-
-# Unquoted YAML / colon config (e.g. ``password: secret``,
-# ``spring.datasource.password: hunter2``). The secret keyword must be part of
-# the KEY (anchored to the start of the line/indent), and the value is a single
-# whitespace-free token — so prose like ``note: secret meeting`` (keyword in the
-# value) and ``error: token expired`` are left alone. Bare ``auth`` is excluded
-# from the key set so ``Authorization:`` / ``author:`` don't match (the former
-# is masked by _AUTH_HEADER_RE); ``auth_token``/``auth-token`` still match via
-# the ``token`` keyword. Quoted values defer to _JSON_FIELD_RE via the lookahead.
-_YAML_CFG_NAMES = r"(?:api[ _.\-]?key|token|secret|passwd|password|credential)"
-_YAML_ASSIGN_RE = re.compile(
-    rf"(^[ \t]*[A-Za-z0-9_.\-]*{_YAML_CFG_NAMES}[A-Za-z0-9_.\-]*)(:[ \t]*)(?!['\"])([^\s&]+)",
-    re.IGNORECASE | re.MULTILINE,
-)
-
 # JSON field patterns: "apiKey": "value", "token": "value", etc.
 _JSON_KEY_NAMES = r"(?:api_?[Kk]ey|token|secret|password|access_token|refresh_token|auth_token|bearer|secret_value|raw_secret|secret_input|key_material)"
 _JSON_FIELD_RE = re.compile(
@@ -174,15 +125,8 @@ _JSON_FIELD_RE = re.compile(
 # while the header name and scheme word are preserved for debuggability. The
 # previous rule only matched ``Bearer``, so ``Basic <base64 user:pass>`` and
 # ``token <pat>`` leaked verbatim into logs/transcripts.
-#
-# The credential class excludes quote characters (``"`` / ``'``): a token sitting
-# flush against a closing quote (``"Authorization: Bearer sk-..."``) must not pull
-# that quote into the match, or masking turns value corruption into *syntax*
-# corruption — the closing quote vanishes and the command/string no longer parses
-# (unterminated quote → shell EOF / Python SyntaxError). Real credentials never
-# contain ``"`` or ``'``, so excluding them is safe. See #43083.
 _AUTH_HEADER_RE = re.compile(
-    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?([^\s\"']+)",
+    r"((?:Proxy-)?Authorization:\s*)([A-Za-z][\w.+-]*\s+)?(\S+)",
    re.IGNORECASE,
 )

@@ -210,15 +154,9 @@ _PRIVATE_KEY_RE = re.compile(
 )

 # Database connection strings: protocol://user:PASSWORD@host
-# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password.
-# The userinfo and password groups forbid whitespace ([^:\s]+ / [^@\s]+) so the
-# match can never span a line break. A real DSN password never contains
-# whitespace; without this bound the greedy [^@]+ would scan past the end of a
-# code line to the next stray "@" (e.g. a Python decorator), swallowing
-# intervening lines and corrupting tool OUTPUT for any source containing a
-# postgresql:// f-string template. See issue #33801.
+# Catches postgres, mysql, mongodb, redis, amqp URLs and redacts the password
 _DB_CONNSTR_RE = re.compile(
-    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:\s]+:)([^@\s]+)(@)",
+    r"((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp)://[^:]+:)([^@]+)(@)",
    re.IGNORECASE,
 )

@@ -402,40 +340,7 @@ def _redact_form_body(text: str) -> str:
    return _redact_query_string(text.strip())


-def _mask_token_nonreusable(token: str) -> str:
-    """Redact a prefix-matched credential to a NON-REUSABLE sentinel.
-
-    Unlike :func:`_mask_token` (which keeps head/tail chars — fine for logs
-    that are never fed back into a config), this emits a marker that:
-
-    * cannot be mistaken for a usable-but-truncated key, so an agent that
-      reads it from a config file and writes it back does NOT corrupt the
-      stored credential into a dead 13-char string (issue #35519); and
-    * still does not leak the secret material (no head/tail chars).
-
-    The vendor prefix label is preserved for debuggability so the agent can
-    still tell *which* credential is present (e.g. a GitHub PAT vs an OpenAI
-    key) without seeing any of its bytes.
-    """
-    if not token:
-        return "«redacted-secret»"
-    # Preserve only the recognizable vendor prefix label (e.g. "ghp_", "sk-"),
-    # never any of the random secret body.
-    label = ""
-    for sub in _PREFIX_SUBSTRINGS:
-        if token.startswith(sub):
-            label = sub
-            break
-    return f"«redacted:{label}…»" if label else "«redacted-secret»"
-
-
-def redact_sensitive_text(
-    text: str,
-    *,
-    force: bool = False,
-    code_file: bool = False,
-    file_read: bool = False,
-) -> str:
+def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = False) -> str:
    """Apply all redaction patterns to a block of text.

    Safe to call on any string -- non-matching text passes through unchanged.
@@ -448,17 +353,6 @@ def redact_sensitive_text(
    constants, "apiKey": "test" fixtures). Prefix patterns, auth headers,
    private keys, DB connstrings, JWTs, and URL secrets are still redacted.

-    Set file_read=True for file *content* returned to the agent (read_file /
-    search_files / cat). Secrets are STILL redacted — they are never exposed —
-    but prefix-matched credentials are replaced with a non-reusable sentinel
-    (``«redacted:ghp_…»``) instead of a head/tail-preserving mask
-    (``ghp_S1...Pn2T``). The old mask looked like a real-but-truncated key, so
-    an agent reading it from config.yaml and writing it back silently corrupted
-    the stored credential into a dead 13-char value → 401 (issue #35519). The
-    sentinel is syntactically invalid as a token, so it can't be mistaken for a
-    usable key or written back as one. Implies code_file=True (config/data
-    files shouldn't trigger the source-code ENV/JSON false-positive paths).
-
    Performance: each regex pattern is gated behind a cheap substring
    pre-check (e.g. ``"=" in text`` for ENV assignments, ``"://" in text``
    for URLs, ``"eyJ" in text`` for JWTs). On a typical hermes log line
@@ -477,15 +371,9 @@ def redact_sensitive_text(
    if not (force or _REDACT_ENABLED):
        return text

-    # file_read content shouldn't hit the source-code ENV/JSON false-positive
-    # paths either (it's config/data, not log lines).
-    if file_read:
-        code_file = True
-
    # Known prefixes (sk-, ghp_, etc.) — gate on substring presence
    if _has_known_prefix_substring(text):
-        _prefix_sub = _mask_token_nonreusable if file_read else _mask_token
-        text = _PREFIX_RE.sub(lambda m: _prefix_sub(m.group(1)), text)
+        text = _PREFIX_RE.sub(lambda m: _mask_token(m.group(1)), text)

    # ENV assignments: OPENAI_API_KEY=***  (skip for code files — false positives)
    if not code_file:
@@ -494,13 +382,6 @@ def redact_sensitive_text(
                name, quote, value = m.group(1), m.group(2), m.group(3)
                return f"{name}={quote}{_mask_token(value)}{quote}"
            text = _ENV_ASSIGN_RE.sub(_redact_env, text)
-            # Lowercase/dotted config keys (issue #16413). Skip URLs entirely —
-            # web-URL query params are intentionally passed through (see note
-            # near the bottom of this function); _DB_CONNSTR_RE still guards
-            # connection-string passwords.
-            if "://" not in text:
-                text = _CFG_DOTTED_RE.sub(_redact_env, text)
-                text = _CFG_ANCHORED_RE.sub(_redact_env, text)

        # JSON fields: "apiKey": "***"  (skip for code files — false positives)
        if ":" in text and '"' in text:
@@ -509,15 +390,6 @@ def redact_sensitive_text(
                return f'{key}: "{_mask_token(value)}"'
            text = _JSON_FIELD_RE.sub(_redact_json, text)

-        # Unquoted YAML / colon config: password: ***  (after JSON so quoted
-        # values are handled there; the lookahead in _YAML_ASSIGN_RE skips
-        # quotes). Skip URLs — web-URL query params pass through by design.
-        if ":" in text and "://" not in text:
-            def _redact_yaml(m):
-                key, sep, value = m.group(1), m.group(2), m.group(3)
-                return f"{key}{sep}{_mask_token(value)}"
-            text = _YAML_ASSIGN_RE.sub(_redact_yaml, text)
-
    # Authorization headers — _AUTH_HEADER_RE matches any scheme after
    # "[Proxy-]Authorization:" case-insensitively, so "uthorization" is the
    # cheapest substring gate that covers every casing without a casefold().
@@ -547,22 +419,9 @@ def redact_sensitive_text(
    if "BEGIN" in text and "-----" in text:
        text = _PRIVATE_KEY_RE.sub("[REDACTED PRIVATE KEY]", text)

-    # Database connection string passwords. With code_file=True, a password
-    # group that is a pure ``{...}`` brace expression is an f-string template
-    # reference (e.g. f"postgresql://{user}:{pass}@{host}"), not a literal
-    # credential — preserve it. Literal passwords are still redacted. The regex
-    # forbids whitespace in the password group, so a single-line template's
-    # group(2) is exactly the brace expression. See issue #33801.
+    # Database connection string passwords
    if "://" in text:
-        if code_file:
-            def _redact_db(m):
-                pw = m.group(2)
-                if pw.startswith("{") and pw.endswith("}"):
-                    return m.group(0)
-                return f"{m.group(1)}***{m.group(3)}"
-            text = _DB_CONNSTR_RE.sub(_redact_db, text)
-        else:
-            text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
+        text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)

    # JWT tokens (eyJ... — base64-encoded JSON headers)
    if "eyJ" in text:
@@ -593,66 +452,6 @@ def redact_sensitive_text(
    return text


-# Commands whose stdout is an environment-variable dump (KEY=value lines),
-# NOT source code. For these, terminal-output redaction must run the
-# ENV-assignment pass (code_file=False) so opaque tokens with no recognized
-# vendor prefix (e.g. ``MY_SERVICE_TOKEN=abc123randomstring``) are still
-# masked. For all other commands, code_file=True is used to avoid mangling
-# legitimate source/config dumps (``MAX_TOKENS=100``, ``"apiKey": "x"``
-# fixtures, ``postgresql://{user}`` f-string templates). See issue #43025.
-_ENV_DUMP_COMMANDS = frozenset({"env", "printenv", "set", "export", "declare"})
-
-
-def is_env_dump_command(command: str | None) -> bool:
-    """Return True if ``command`` dumps environment variables to stdout.
-
-    Detects ``env`` / ``printenv`` / ``set`` / ``export`` / ``declare`` as the
-    first token of any segment in a pipeline or sequence (``;`` / ``&&`` /
-    ``||`` / ``|``). Conservative: a parse failure or anything unrecognized
-    returns False (callers then fall back to the safer code_file=True path,
-    which still masks prefix-shaped keys).
-    """
-    if not command or not isinstance(command, str):
-        return False
-    # Split on shell separators, then inspect the first token of each segment.
-    segments = re.split(r"[|;&]+", command)
-    for seg in segments:
-        seg = seg.strip()
-        if not seg:
-            continue
-        try:
-            tokens = shlex.split(seg)
-        except ValueError:
-            tokens = seg.split()
-        if tokens and tokens[0] in _ENV_DUMP_COMMANDS:
-            return True
-    return False
-
-
-def redact_terminal_output(
-    output: str, command: str | None = None, *, force: bool = False
-) -> str:
-    """Redact secrets from terminal/process stdout.
-
-    Single redaction policy for ALL terminal-output surfaces — foreground
-    ``terminal`` results AND background ``process(action=poll/log/wait)``
-    output — so they can't diverge. Picks ``code_file`` based on whether
-    ``command`` is an environment dump:
-
-    - env-dump command (``env``/``printenv``/``set``/``export``/``declare``)
-      → ``code_file=False`` so the ENV-assignment pass masks opaque tokens.
-    - anything else (or unknown command) → ``code_file=True`` to avoid
-      false positives on source/config dumps.
-
-    ``force=True`` bypasses the global ``security.redact_secrets`` preference
-    for safety boundaries that must never emit raw credentials.
-    """
-    if not output:
-        return output
-    code_file = not is_env_dump_command(command or "")
-    return redact_sensitive_text(output, force=force, code_file=code_file)
-
-
 # Substrings used to gate ``_PREFIX_RE`` execution. If none of these appear in
 # the input string, the prefix regex cannot match anything, so we skip it.
 # False positives are fine (they just run the regex, which then matches
--- a/agent/replay_cleanup.py
+++ b/agent/replay_cleanup.py
@@ -1,140 +0,0 @@
-"""Replay-history sanitization shared across resume code paths.
-
-When a session's last turn dies mid-tool-loop — the process is killed by a
-restart/shutdown command, a stale-timeout fires, or an interrupt lands before
-the tool result is written — the persisted transcript can end with a dangling
-``assistant(tool_calls)`` (no matching ``tool`` answer) or an interrupted
-``assistant→tool`` block.  On resume the model sees that broken tail and
-re-issues the unanswered call, producing an endless "thinking"/reboot loop
-(#49201, #29086).
-
-These pure helpers strip those tails before the history is replayed to the
-model.  They were originally local to ``gateway/run.py`` (which fixed the
-messaging-gateway path) and are extracted here so every resume surface — the
-messaging gateway AND the TUI/WebUI gateway — shares the same cleanup instead
-of the WebUI path silently skipping it.
-"""
-
-from __future__ import annotations
-
-import logging
-from typing import Any, Dict, List
-
-logger = logging.getLogger(__name__)
-
-
-def is_interrupted_tool_result(content: Any) -> bool:
-    """Return True if a tool result indicates the tool was interrupted."""
-    if not isinstance(content, str):
-        return False
-    lowered = content.lower()
-    if "[command interrupted]" in lowered:
-        return True
-    if "exit_code" in lowered and ("130" in lowered or "-1" in lowered):
-        return "interrupt" in lowered
-    return False
-
-
-def strip_interrupted_tool_tails(
-    agent_history: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """Strip interrupted assistant→tool sequences from replay history.
-
-    Older interrupted gateway turns can be followed by a queued real user
-    message, so the interrupted assistant/tool block is not necessarily the
-    final tail by the time we rebuild replay history.  Remove any contiguous
-    assistant(tool_calls) + tool-result block that contains an interrupted tool
-    result, while preserving successful tool-call sequences intact.
-    """
-    if not agent_history:
-        return agent_history
-
-    cleaned: List[Dict[str, Any]] = []
-    i = 0
-    n = len(agent_history)
-    while i < n:
-        msg = agent_history[i]
-        if msg.get("role") == "assistant" and "tool_calls" in msg:
-            j = i + 1
-            tool_results: List[Dict[str, Any]] = []
-            while j < n and agent_history[j].get("role") == "tool":
-                tool_results.append(agent_history[j])
-                j += 1
-            if tool_results and any(
-                is_interrupted_tool_result(m.get("content", ""))
-                for m in tool_results
-            ):
-                logger.debug(
-                    "Stripping interrupted assistant→tool replay block "
-                    "(indices %d–%d, tool_results=%d)",
-                    i, j - 1, len(tool_results),
-                )
-                i = j
-                continue
-        if msg.get("role") == "tool" and is_interrupted_tool_result(msg.get("content", "")):
-            logger.debug("Stripping orphan interrupted tool result from replay history")
-            i += 1
-            continue
-        cleaned.append(msg)
-        i += 1
-
-    return cleaned
-
-
-def strip_dangling_tool_call_tail(
-    agent_history: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
-
-    When a tool call itself kills the gateway process (``docker restart``,
-    ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
-    is terminated by SIGKILL *mid-call* — before the tool result is ever
-    written and before the orderly shutdown rewind
-    (``_drop_trailing_empty_response_scaffolding``) can run.  The last thing
-    persisted is the ``assistant`` message that issued the ``tool_calls``,
-    with zero matching ``tool`` rows.
-
-    On resume the model sees an unanswered tool call at the tail and naturally
-    re-issues it — which restarts the gateway again, producing the infinite
-    reboot loop in #49201.  ``strip_interrupted_tool_tails`` does not catch
-    this because there is no tool result to inspect for an interrupt marker.
-
-    This strips that dangling tail at the source so there is nothing for the
-    model to re-execute.  It only acts when the tail is an
-    ``assistant(tool_calls)`` whose calls have NO corresponding ``tool``
-    results — a completed assistant→tool pair (any tool answers present) is
-    left untouched so genuine mid-progress tool loops still resume.
-    """
-    if not agent_history:
-        return agent_history
-
-    last = agent_history[-1]
-    if not (
-        isinstance(last, dict)
-        and last.get("role") == "assistant"
-        and last.get("tool_calls")
-    ):
-        return agent_history
-
-    logger.debug(
-        "Stripping dangling unanswered assistant(tool_calls) tail "
-        "(%d call(s)) — process likely killed mid-tool-call by a "
-        "restart/shutdown command (#49201)",
-        len(last.get("tool_calls") or []),
-    )
-    return agent_history[:-1]
-
-
-def sanitize_replay_history(
-    agent_history: List[Dict[str, Any]],
-) -> List[Dict[str, Any]]:
-    """Apply both replay-tail strippers in the canonical order.
-
-    Convenience entry point for resume code paths: removes interrupted
-    assistant→tool blocks anywhere in the history, then removes a dangling
-    unanswered ``assistant(tool_calls)`` tail.  Returns the same list object
-    when there is nothing to strip.
-    """
-    if not agent_history:
-        return agent_history
-    return strip_dangling_tool_call_tail(strip_interrupted_tool_tails(agent_history))
--- a/agent/retry_utils.py
+++ b/agent/retry_utils.py
@@ -8,7 +8,6 @@ rate-limited provider concurrently.
 import random
 import threading
 import time
-from typing import Any

 # Monotonic counter for jitter seed uniqueness within the same process.
 # Protected by a lock to avoid race conditions in concurrent retry paths
@@ -16,14 +15,6 @@ from typing import Any
 _jitter_counter = 0
 _jitter_lock = threading.Lock()

-# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
-# ("The service may be temporarily overloaded...") for otherwise valid
-# Hermes requests. Short retries tend to hammer the same overloaded window;
-# after a few normal retries, progressively widen the wait window. Keep the
-# cap interactive-friendly: a simple TUI message should fail visibly in minutes,
-# not sit silent for 20+ minutes.
-_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
-

 def jittered_backoff(
    attempt: int,
@@ -64,66 +55,3 @@ def jittered_backoff(
    jitter = rng.uniform(0, jitter_ratio * delay)

    return delay + jitter
-
-
-def _error_text(error: Any) -> str:
-    """Best-effort flattened provider error text for retry classification."""
-    parts = [
-        error,
-        getattr(error, "message", None),
-        getattr(error, "body", None),
-        getattr(error, "response", None),
-    ]
-    return " ".join(str(part) for part in parts if part is not None).lower()
-
-
-def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
-    """Return True for Z.AI Coding Plan transient overload 429s.
-
-    The coding-plan endpoint reports overload as HTTP 429 with body code 1305
-    and message "The service may be temporarily overloaded...". Treat only
-    that narrow shape specially so ordinary quota/billing 429s still fail fast
-    through the existing classifier.
-    """
-    base = (base_url or "").lower()
-    model_name = (model or "").lower()
-    status = getattr(error, "status_code", None)
-    text = _error_text(error)
-    return (
-        status == 429
-        and "api.z.ai/api/coding/paas/v4" in base
-        and "glm-5.2" in model_name
-        and ("1305" in text or "temporarily overloaded" in text)
-    )
-
-
-def adaptive_rate_limit_backoff(
-    attempt: int,
-    *,
-    base_url: str | None,
-    model: str | None,
-    error: Any,
-    default_wait: float,
-    short_attempts: int = 3,
-) -> tuple[float, str | None]:
-    """Provider-aware rate-limit backoff.
-
-    For most providers this returns ``default_wait`` unchanged. For Z.AI
-    Coding Plan GLM-5.2 overloads, keep the first ``short_attempts`` retries on
-    the normal short exponential schedule, then switch to progressively longer
-    waits (30s → 60s → 90s → 120s, capped) plus light jitter.
-
-    ``attempt`` is 1-based, matching the retry loop's logged attempt number.
-    Returns ``(wait_seconds, reason_label)`` where ``reason_label`` is suitable
-    for status/log decoration when a provider-specific policy fired.
-    """
-    if not is_zai_coding_overload_error(base_url=base_url, model=model, error=error):
-        return default_wait, None
-    if attempt <= short_attempts:
-        return default_wait, "zai_coding_overload_short"
-
-    idx = min(attempt - short_attempts - 1, len(_ZAI_CODING_OVERLOAD_LONG_BACKOFF) - 1)
-    base_delay = _ZAI_CODING_OVERLOAD_LONG_BACKOFF[idx]
-    # A smaller jitter ratio keeps long waits readable while still avoiding
-    # synchronized retry storms across concurrent Hermes sessions.
-    return jittered_backoff(1, base_delay=base_delay, max_delay=base_delay, jitter_ratio=0.2), "zai_coding_overload_long"
--- a/agent/shell_hooks.py
+++ b/agent/shell_hooks.py
@@ -122,8 +122,6 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Set, Tuple

-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
-
 try:
    import fcntl  # POSIX only; Windows falls back to best-effort without flock.
 except ImportError:  # pragma: no cover
@@ -443,7 +441,6 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
        return result

    t0 = time.monotonic()
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        proc = subprocess.run(
            argv,
@@ -452,7 +449,6 @@ def _spawn(spec: ShellHookSpec, stdin_json: str) -> Dict[str, Any]:
            timeout=spec.timeout,
            text=True,
            shell=False,
-            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        result["timed_out"] = True
--- a/agent/skill_preprocessing.py
+++ b/agent/skill_preprocessing.py
@@ -5,8 +5,6 @@ import re
 import subprocess
 from pathlib import Path

-from hermes_cli._subprocess_compat import IS_WINDOWS, windows_hide_flags
-
 logger = logging.getLogger(__name__)

 # Matches ${HERMES_SKILL_DIR} / ${HERMES_SESSION_ID} tokens in SKILL.md.
@@ -68,7 +66,6 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
    Failures return a short ``[inline-shell error: ...]`` marker instead of
    raising, so one bad snippet can't wreck the whole skill message.
    """
-    _popen_kwargs = {"creationflags": windows_hide_flags()} if IS_WINDOWS else {}
    try:
        completed = subprocess.run(
            ["bash", "-c", command],
@@ -78,7 +75,6 @@ def run_inline_shell(command: str, cwd: Path | None, timeout: int) -> str:
            timeout=max(1, int(timeout)),
            check=False,
            stdin=subprocess.DEVNULL,
-            **_popen_kwargs,
        )
    except subprocess.TimeoutExpired:
        return f"[inline-shell timeout after {timeout}s: {command}]"
--- a/agent/skill_utils.py
+++ b/agent/skill_utils.py
@@ -507,34 +507,6 @@ def get_all_skills_dirs() -> List[Path]:
    return dirs


-def _resolve_for_skill_ownership(path) -> Path:
-    path_obj = path if isinstance(path, Path) else Path(str(path))
-    try:
-        return path_obj.expanduser().resolve()
-    except (OSError, RuntimeError):
-        return path_obj.expanduser().absolute()
-
-
-def is_external_skill_path(path) -> bool:
-    """Return True when ``path`` lives under a configured external skills dir.
-
-    ``skills.external_dirs`` are externally owned: Hermes can discover and view
-    their skills, and foreground user-directed tool calls may still edit them,
-    but autonomous lifecycle maintenance must treat them as read-only. This
-    helper centralizes the ownership boundary so curator/reporting/tool paths do
-    not each need to re-interpret the config.
-    """
-    candidate = _resolve_for_skill_ownership(path)
-    for root in get_external_skills_dirs():
-        resolved_root = _resolve_for_skill_ownership(root)
-        try:
-            candidate.relative_to(resolved_root)
-            return True
-        except ValueError:
-            continue
-    return False
-
-
 # ── Condition extraction ──────────────────────────────────────────────────


--- a/agent/telemetry/init.py
+++ b/agent/telemetry/init.py
@@ -1,30 +0,0 @@
-"""Hermes telemetry & observability.
-
-Local-first observability, on by default. The ``telemetry`` plugin registers Hermes
-lifecycle hooks and hands typed events to the fire-and-forget ``emitter`` (queue ->
-background writer -> JSONL + state.db ``tel_*`` index). The emitter never blocks or
-raises into a model/tool call (the hot-path invariant).
-
-Events record the observed model ids, provider names, and tool names. ``metrics``
-derives rollups for /usage and /insights; ``rollup`` builds the per-run summaries shown
-by ``hermes telemetry preview``. ``redaction`` + ``exporter_bulk`` + ``otlp_exporter``
-handle export to an operator-chosen destination. ``policy`` holds the consent
-constants and the aggregate upload gate (no uploader ships).
-"""
-
-from __future__ import annotations
-
-from . import emitter, events, metrics, policy, spans
-
-emit = emitter.emit
-get_emitter = emitter.get_emitter
-
-__all__ = [
-    "emitter",
-    "events",
-    "metrics",
-    "policy",
-    "spans",
-    "emit",
-    "get_emitter",
-]
--- a/agent/telemetry/emitter.py
+++ b/agent/telemetry/emitter.py
@@ -1,318 +0,0 @@
-"""Local telemetry emitter: fire-and-forget queue + background writer.
-
-The emitter is the single seam between instrumentation (the telemetry plugin's hook
-callbacks) and durable storage. Its contract is the hot-path invariant:
-
-    ``emit()`` MUST return in O(microseconds), MUST NOT block on disk/network, and
-    MUST NEVER raise into the caller. A telemetry failure is logged locally and
-    dropped — it can never affect a model call, a tool call, or a session.
-
-Mechanism:
-  * ``emit(event)`` does a non-blocking ``queue.put_nowait`` wrapped in a bare except.
-    On a full queue it drops the *oldest* event and counts the drop.
-  * A daemon thread drains the queue and writes each event to two places:
-      1. the append-only JSONL log (source of truth)
-      2. the ``tel_*`` SQLite tables in state.db (rebuildable index)
-  * The writer uses its own sqlite connection to state.db, separate from SessionDB,
-    so telemetry writes never contend with or corrupt session writes.
-
-Local telemetry only. Nothing here uploads anywhere.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-import queue
-import sqlite3
-import threading
-import time
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-logger = logging.getLogger(__name__)
-
-_MAX_QUEUE = 10_000  # ring-buffer depth; oldest dropped when full
-_DRAIN_BATCH = 256
-
-
-def _default_dir() -> Path:
-    """Resolve the telemetry dir under the active HERMES_HOME (profile-safe)."""
-    from hermes_constants import get_hermes_home
-    return get_hermes_home() / "telemetry"
-
-
-def _default_db_path() -> Path:
-    """Resolve state.db under the active HERMES_HOME (profile-safe)."""
-    from hermes_constants import get_hermes_home
-    return get_hermes_home() / "state.db"
-
-
-# Map a telemetry event dict (its "event" tag) to (table, column-ordered insert).
-# Only the columns the indexer knows about are written; unknown keys are ignored,
-# so an event carrying extra fields never breaks the insert.
-_TABLE_COLUMNS: Dict[str, tuple] = {
-    "run": (
-        "tel_runs",
-        ("run_id", "trace_id", "session_id", "entrypoint",
-         "platform", "start_ns", "end_ns", "end_reason",
-         "model_call_count", "tool_call_count", "error_count"),
-    ),
-    "span": (
-        "tel_spans",
-        ("span_id", "trace_id", "run_id", "parent_span_id", "name", "kind",
-         "start_ns", "end_ns", "status"),
-    ),
-    "model_call": (
-        "tel_model_calls",
-        ("span_id", "run_id", "provider", "model", "base_url",
-         "input_tokens", "output_tokens", "cache_read_tokens",
-         "cache_write_tokens", "reasoning_tokens", "latency_ms"),
-    ),
-    "tool_call": (
-        "tel_tool_calls",
-        ("span_id", "run_id", "tool_name", "duration_ms", "result_class"),
-    ),
-    "error": (
-        "tel_error_events",
-        ("run_id", "error_class", "subsystem", "recovery", "ts_ns"),
-    ),
-}
-
-
-class TelemetryEmitter:
-    """Owns the queue, the writer thread, and the telemetry sqlite connection."""
-
-    def __init__(
-        self,
-        *,
-        events_path: Optional[Path] = None,
-        db_path: Optional[Path] = None,
-        enabled: bool = True,
-    ) -> None:
-        self._dir = (events_path.parent if events_path else _default_dir())
-        self._events_path = events_path or (self._dir / "events.jsonl")
-        self._db_path = db_path or _default_db_path()
-        self._enabled = enabled
-        self._q: "queue.Queue[Dict[str, Any]]" = queue.Queue(maxsize=_MAX_QUEUE)
-        self._dropped = 0
-        self._written = 0
-        self._stop = threading.Event()
-        self._started = False
-        self._lock = threading.Lock()
-        self._conn: Optional[sqlite3.Connection] = None
-        self._thread: Optional[threading.Thread] = None
-        # Optional live subscribers (e.g. OTLP exporter). Called from the writer
-        # thread AFTER durable writes, fully fail-isolated — a subscriber that
-        # raises or blocks can never affect the JSONL/SQLite source of truth or
-        # the hot path. Each subscriber is callable(batch: list[dict]).
-        self._subscribers: list = []
-
-    # ── public API (hot path) ───────────────────────────────────────────────
-    def emit(self, event: Any) -> None:
-        """Enqueue an event. Never blocks, never raises.
-
-        ``event`` may be a dataclass with ``to_dict()`` or a plain dict.
-        """
-        if not self._enabled:
-            return
-        try:
-            payload = event.to_dict() if hasattr(event, "to_dict") else dict(event)
-            payload.setdefault("ts_ns", time.time_ns())
-            self._ensure_started()
-            try:
-                self._q.put_nowait(payload)
-            except queue.Full:
-                # Drop oldest to make room — bounded memory, newest-wins.
-                try:
-                    self._q.get_nowait()
-                    self._dropped += 1
-                    self._q.put_nowait(payload)
-                except Exception:
-                    self._dropped += 1
-        except Exception:  # the hot-path invariant: never propagate
-            logger.debug("telemetry emit failed", exc_info=True)
-
-    # ── lifecycle ───────────────────────────────────────────────────────────
-    def _ensure_started(self) -> None:
-        if self._started:
-            return
-        with self._lock:
-            if self._started:
-                return
-            try:
-                self._dir.mkdir(parents=True, exist_ok=True)
-            except Exception:
-                logger.debug("telemetry dir create failed", exc_info=True)
-            self._thread = threading.Thread(
-                target=self._run, name="hermes-telemetry-writer", daemon=True
-            )
-            self._thread.start()
-            self._started = True
-
-    def _open_conn(self) -> Optional[sqlite3.Connection]:
-        if self._conn is not None:
-            return self._conn
-        try:
-            conn = sqlite3.connect(str(self._db_path), isolation_level=None, timeout=5.0)
-            conn.execute("PRAGMA journal_mode=WAL")
-            conn.execute("PRAGMA busy_timeout=5000")
-            self._conn = conn
-        except Exception:
-            logger.debug("telemetry db open failed", exc_info=True)
-            self._conn = None
-        return self._conn
-
-    def _run(self) -> None:
-        while not self._stop.is_set():
-            try:
-                first = self._q.get(timeout=0.5)
-            except queue.Empty:
-                continue
-            batch = [first]
-            while len(batch) < _DRAIN_BATCH:
-                try:
-                    batch.append(self._q.get_nowait())
-                except queue.Empty:
-                    break
-            self._write_batch(batch)
-
-    def _write_batch(self, batch) -> None:
-        # JSONL append (source of truth) — best effort.
-        try:
-            with open(self._events_path, "a", encoding="utf-8") as fh:
-                for ev in batch:
-                    fh.write(json.dumps(ev, ensure_ascii=False) + "\n")
-        except Exception:
-            logger.debug("telemetry jsonl append failed", exc_info=True)
-
-        # SQLite index — best effort, per-event so one bad row can't lose the batch.
-        conn = self._open_conn()
-        if conn is None:
-            return
-        for ev in batch:
-            try:
-                self._index_one(conn, ev)
-                self._written += 1
-            except Exception:
-                logger.debug("telemetry index row failed", exc_info=True)
-
-        # Live fan-out (e.g. OTLP) — AFTER durable writes, fully fail-isolated.
-        # A slow/raising subscriber never affects JSONL/SQLite or the hot path.
-        for sub in self._subscribers:
-            try:
-                sub(batch)
-            except Exception:
-                logger.debug("telemetry subscriber failed", exc_info=True)
-
-    def subscribe(self, callback) -> None:
-        """Register a live batch subscriber (callable(batch: list[dict])).
-
-        Called from the writer thread after durable writes. Used by the OTLP
-        exporter for continuous streaming. Fail-isolated; never on the hot path.
-        """
-        if callback not in self._subscribers:
-            self._subscribers.append(callback)
-
-    def unsubscribe(self, callback) -> None:
-        try:
-            self._subscribers.remove(callback)
-        except ValueError:
-            pass
-
-    def _index_one(self, conn: sqlite3.Connection, ev: Dict[str, Any]) -> None:
-        kind = ev.get("event")
-        spec = _TABLE_COLUMNS.get(kind)
-        if spec is None:
-            return
-        table, cols = spec
-        values = [ev.get(c) for c in cols]
-        placeholders = ", ".join("?" for _ in cols)
-        collist = ", ".join(cols)
-        conn.execute(
-            f"INSERT OR REPLACE INTO {table} ({collist}) VALUES ({placeholders})",
-            values,
-        )
-
-    # ── introspection / shutdown (tests, CLI) ───────────────────────────────
-    def flush(self, timeout: float = 2.0) -> None:
-        """Block until the queue drains (test/CLI helper, NOT the hot path)."""
-        deadline = time.monotonic() + timeout
-        while time.monotonic() < deadline:
-            if self._q.empty():
-                # give the writer a tick to finish the in-flight batch
-                time.sleep(0.05)
-                if self._q.empty():
-                    return
-            time.sleep(0.02)
-
-    def stats(self) -> Dict[str, int]:
-        return {
-            "queued": self._q.qsize(),
-            "written": self._written,
-            "dropped": self._dropped,
-        }
-
-    def close(self) -> None:
-        self._stop.set()
-        if self._thread is not None:
-            self._thread.join(timeout=2.0)
-        if self._conn is not None:
-            try:
-                self._conn.close()
-            except Exception:
-                pass
-            self._conn = None
-        self._started = False
-
-
-# ── process-wide singleton ──────────────────────────────────────────────────
-_EMITTER: Optional[TelemetryEmitter] = None
-_EMITTER_LOCK = threading.Lock()
-
-
-def get_emitter() -> TelemetryEmitter:
-    """Return the process-wide emitter, honoring telemetry.local config."""
-    global _EMITTER
-    if _EMITTER is not None:
-        return _EMITTER
-    with _EMITTER_LOCK:
-        if _EMITTER is None:
-            enabled = _local_enabled()
-            _EMITTER = TelemetryEmitter(enabled=enabled)
-    return _EMITTER
-
-
-def _local_enabled() -> bool:
-    try:
-        from hermes_cli.config import load_config
-        cfg = load_config()
-        tel = cfg.get("telemetry") if isinstance(cfg, dict) else {}
-        return bool((tel or {}).get("local", True))
-    except Exception:
-        return True
-
-
-def emit(event: Any) -> None:
-    """Module-level convenience: emit via the singleton."""
-    get_emitter().emit(event)
-
-
-def reset_emitter_for_tests(emitter: Optional[TelemetryEmitter] = None) -> None:
-    """Swap the singleton (tests only)."""
-    global _EMITTER
-    with _EMITTER_LOCK:
-        if _EMITTER is not None and emitter is not _EMITTER:
-            try:
-                _EMITTER.close()
-            except Exception:
-                pass
-        _EMITTER = emitter
-
-
-__all__ = [
-    "TelemetryEmitter",
-    "get_emitter",
-    "emit",
-    "reset_emitter_for_tests",
-]
--- a/agent/telemetry/events.py
+++ b/agent/telemetry/events.py
@@ -1,111 +0,0 @@
-"""Typed local telemetry events.
-
-These dataclasses are the rows written to the local JSONL log and the ``tel_*``
-SQLite tables. They record the values observed for each run — model id, provider, tool
-name, token counts, durations — and stay on the machine unless explicitly exported.
-"""
-
-from __future__ import annotations
-
-import time
-from dataclasses import dataclass, field, asdict
-from typing import Any, Dict, Optional
-
-# ── local telemetry events (real values) ────────────────────────────────────
-
-
-def _now_ns() -> int:
-    return time.time_ns()
-
-
-@dataclass(slots=True)
-class RunEvent:
-    """One top-level workflow execution (a trace root). A run spans one session."""
-    run_id: str
-    trace_id: str
-    entrypoint: str
-    session_id: Optional[str] = None
-    platform: Optional[str] = None
-    start_ns: int = field(default_factory=_now_ns)
-    end_ns: Optional[int] = None
-    end_reason: Optional[str] = None
-    model_call_count: int = 0
-    tool_call_count: int = 0
-    error_count: int = 0
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {"event": "run", **asdict(self)}
-
-
-@dataclass(slots=True)
-class ModelCallEvent:
-    span_id: str
-    run_id: str
-    provider: Optional[str] = None        # raw provider, e.g. "anthropic"
-    model: Optional[str] = None           # raw model id, e.g. "claude-opus-4"
-    base_url: Optional[str] = None
-    input_tokens: int = 0
-    output_tokens: int = 0
-    cache_read_tokens: int = 0
-    cache_write_tokens: int = 0
-    reasoning_tokens: int = 0
-    latency_ms: Optional[int] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {"event": "model_call", **asdict(self)}
-
-
-@dataclass(slots=True)
-class ToolCallEvent:
-    span_id: str
-    run_id: str
-    tool_name: Optional[str] = None       # raw tool name, e.g. "web_search"
-    duration_ms: Optional[int] = None
-    result_class: Optional[str] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {"event": "tool_call", **asdict(self)}
-
-
-@dataclass(slots=True)
-class SpanEvent:
-    """A timed span — the timing/lineage backbone of a trace.
-
-    One row per run (the root, ``parent_span_id=None``) and one per model/tool call
-    (``parent_span_id`` = the run's root span). Detail rows in ``tel_model_calls`` /
-    ``tel_tool_calls`` share the ``span_id`` and are joined here for ordering and
-    placement on a timeline.
-    """
-    span_id: str
-    trace_id: str
-    run_id: str
-    name: str
-    kind: str                              # "run" | "model" | "tool"
-    start_ns: int
-    end_ns: Optional[int] = None
-    parent_span_id: Optional[str] = None
-    status: Optional[str] = None
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {"event": "span", **asdict(self)}
-
-
-@dataclass(slots=True)
-class ErrorEvent:
-    run_id: Optional[str]
-    error_class: str
-    subsystem: str
-    recovery: Optional[str] = None
-    ts_ns: int = field(default_factory=_now_ns)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {"event": "error", **asdict(self)}
-
-
-__all__ = [
-    "RunEvent",
-    "ModelCallEvent",
-    "ToolCallEvent",
-    "SpanEvent",
-    "ErrorEvent",
-]
--- a/agent/telemetry/exporter_bulk.py
+++ b/agent/telemetry/exporter_bulk.py
@@ -1,139 +0,0 @@
-"""Export telemetry (and optionally session content) to a file or stream.
-
-Two data domains, both written to an operator-chosen destination:
-
-  * Telemetry: the tel_* rows + events.jsonl (structural observability).
-  * Content (opt-in via telemetry.trajectories): sessions + messages, with every
-    content field (message body, reasoning, raw tool-call args) passed through the
-    redaction pipeline (secrets always stripped; PII per content_redaction).
-
-Formats: ndjson (default) and json. OTLP streaming export lives in otlp_exporter.py.
-
-Content export is gated by ``redaction.content_export_enabled``.
-"""
-
-from __future__ import annotations
-
-import json
-import sqlite3
-from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, TextIO
-
-from . import redaction
-
-_TEL_TABLES = (
-    "tel_runs", "tel_model_calls", "tel_tool_calls", "tel_error_events",
-)
-
-
-def _open(db_path: Optional[Path]) -> sqlite3.Connection:
-    if db_path is None:
-        from hermes_constants import get_hermes_home
-        db_path = get_hermes_home() / "state.db"
-    c = sqlite3.connect(str(db_path), timeout=5.0)
-    c.row_factory = sqlite3.Row
-    return c
-
-
-def _iter_telemetry(conn: sqlite3.Connection, since_ns: Optional[int]) -> Iterator[Dict[str, Any]]:
-    for table in _TEL_TABLES:
-        # only tel_runs has start_ns; window the rest by run join when needed.
-        if table == "tel_runs" and since_ns:
-            rows = conn.execute(
-                f"SELECT * FROM {table} WHERE start_ns >= ?", (int(since_ns),)
-            ).fetchall()
-        else:
-            rows = conn.execute(f"SELECT * FROM {table}").fetchall()
-        for r in rows:
-            d = dict(r)
-            d["_kind"] = table
-            yield d
-
-
-def _iter_content(
-    db_path: Optional[Path],
-    *,
-    config: Optional[Dict[str, Any]],
-    include_content: bool,
-) -> Iterator[Dict[str, Any]]:
-    """Yield session records. Message bodies included only when trajectories on."""
-    from hermes_state import SessionDB
-
-    content_mode = redaction.content_mode_for(config)
-    db = SessionDB(db_path=db_path) if db_path else SessionDB()
-    try:
-        for session in db.export_all():
-            msgs = session.get("messages", []) or []
-            red_msgs = [
-                redaction.redact_message(
-                    m, content_mode=content_mode, include_content=include_content
-                )
-                for m in msgs
-            ]
-            # Session-level metadata is structural; keep ids/model/counts, drop
-            # any free-text title only when content is excluded.
-            out = {
-                "_kind": "session",
-                "id": session.get("id"),
-                "source": session.get("source"),
-                "model": session.get("model"),
-                "started_at": session.get("started_at"),
-                "ended_at": session.get("ended_at"),
-                "message_count": session.get("message_count"),
-                "tool_call_count": session.get("tool_call_count"),
-                "messages": red_msgs,
-            }
-            if include_content and session.get("title"):
-                out["title"] = redaction.redact_for_export(
-                    session["title"], content_mode=content_mode
-                )
-            yield out
-    finally:
-        db.close()
-
-
-def export(
-    out: TextIO,
-    *,
-    fmt: str = "ndjson",
-    since_ns: Optional[int] = None,
-    include_content: bool = False,
-    config: Optional[Dict[str, Any]] = None,
-    db_path: Optional[Path] = None,
-) -> Dict[str, int]:
-    """Write telemetry (+ optional content) to ``out``. Returns counts.
-
-    ``include_content`` is honored only when telemetry.trajectories is enabled in
-    ``config``; otherwise content is forced off and only structural data is written.
-    """
-    # Trajectories gate: a flag cannot override the config setting.
-    content_allowed = include_content and redaction.content_export_enabled(config)
-    counts = {"telemetry": 0, "sessions": 0, "content_included": int(content_allowed)}
-
-    conn = _open(db_path)
-    records: List[Dict[str, Any]] = []
-    try:
-        for rec in _iter_telemetry(conn, since_ns):
-            counts["telemetry"] += 1
-            if fmt == "ndjson":
-                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
-            else:
-                records.append(rec)
-    finally:
-        conn.close()
-
-    # Content/session domain (separate connection via SessionDB).
-    for rec in _iter_content(db_path, config=config, include_content=content_allowed):
-        counts["sessions"] += 1
-        if fmt == "ndjson":
-            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
-        else:
-            records.append(rec)
-
-    if fmt != "ndjson":
-        json.dump({"records": records}, out, ensure_ascii=False, indent=2)
-
-    return counts
-
-
-__all__ = ["export"]
--- a/agent/telemetry/metrics.py
+++ b/agent/telemetry/metrics.py
@@ -1,219 +0,0 @@
-"""Derive metric rollups from the local telemetry tables.
-
-Reads the ``tel_*`` tables in state.db and returns aggregates for /usage, /insights,
-and local dashboards. Metrics are computed by querying the event log rather than being
-emitted on the hot path.
-
-Each function accepts either an open caller-owned ``conn`` (reused, not closed) or a
-``db_path`` (opened and closed internally). InsightsEngine passes its existing
-connection; a standalone dashboard passes a path.
-"""
-
-from __future__ import annotations
-
-import sqlite3
-from contextlib import contextmanager
-from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional
-
-
-@contextmanager
-def _cursor(
-    conn: Optional[sqlite3.Connection], db_path: Optional[Path]
-) -> Iterator[sqlite3.Connection]:
-    """Yield a Row-factory connection. Closes it only if we opened it."""
-    if conn is not None:
-        prev_factory = conn.row_factory
-        conn.row_factory = sqlite3.Row
-        try:
-            yield conn
-        finally:
-            conn.row_factory = prev_factory
-        return
-    if db_path is None:
-        from hermes_constants import get_hermes_home
-        db_path = get_hermes_home() / "state.db"
-    c = sqlite3.connect(str(db_path), timeout=5.0)
-    c.row_factory = sqlite3.Row
-    try:
-        yield c
-    finally:
-        c.close()
-
-
-def _since_clause(since_ns: Optional[int], col: str = "start_ns") -> str:
-    return f" WHERE {col} >= {int(since_ns)}" if since_ns else ""
-
-
-def workflow_summary(
-    db_path: Optional[Path] = None,
-    since_ns: Optional[int] = None,
-    *,
-    conn: Optional[sqlite3.Connection] = None,
-) -> Dict[str, Any]:
-    """Run-level counters + duration percentiles (local telemetry, exact)."""
-    with _cursor(conn, db_path) as c:
-        where = _since_clause(since_ns)
-        total = c.execute(f"SELECT COUNT(*) n FROM tel_runs{where}").fetchone()["n"]
-        by_reason = {
-            r["end_reason"] or "unknown": r["n"]
-            for r in c.execute(
-                f"SELECT end_reason, COUNT(*) n FROM tel_runs{where} GROUP BY end_reason"
-            ).fetchall()
-        }
-        by_entry = {
-            r["entrypoint"] or "unknown": r["n"]
-            for r in c.execute(
-                f"SELECT entrypoint, COUNT(*) n FROM tel_runs{where} GROUP BY entrypoint"
-            ).fetchall()
-        }
-        dur_where = (where + " AND end_ns IS NOT NULL") if where else " WHERE end_ns IS NOT NULL"
-        durations = [
-            (r["end_ns"] - r["start_ns"]) / 1e6
-            for r in c.execute(
-                f"SELECT start_ns, end_ns FROM tel_runs{dur_where}"
-            ).fetchall()
-        ]
-        return {
-            "total_runs": total,
-            "by_end_reason": by_reason,
-            "by_entrypoint": by_entry,
-            "duration_ms_p50": _pct(durations, 50),
-            "duration_ms_p95": _pct(durations, 95),
-            "success_rate": round(by_reason.get("completed", 0) / total, 4) if total else 0.0,
-        }
-
-
-def model_call_summary(
-    db_path: Optional[Path] = None,
-    since_ns: Optional[int] = None,
-    *,
-    conn: Optional[sqlite3.Connection] = None,
-) -> Dict[str, Any]:
-    with _cursor(conn, db_path) as c:
-        rows = c.execute(
-            "SELECT provider, model, COUNT(*) n, "
-            "SUM(input_tokens) inp, SUM(output_tokens) outp, "
-            "SUM(cache_read_tokens) cache, AVG(latency_ms) avg_latency "
-            "FROM tel_model_calls GROUP BY provider, model"
-        ).fetchall()
-        by_provider: Dict[str, int] = {}
-        by_model: Dict[str, int] = {}
-        tokens = {"input": 0, "output": 0, "cache_read": 0}
-        breakdown: List[Dict[str, Any]] = []
-        for r in rows:
-            prov = r["provider"] or "unknown"
-            mdl = r["model"] or "unknown"
-            by_provider[prov] = by_provider.get(prov, 0) + r["n"]
-            by_model[mdl] = by_model.get(mdl, 0) + r["n"]
-            tokens["input"] += r["inp"] or 0
-            tokens["output"] += r["outp"] or 0
-            tokens["cache_read"] += r["cache"] or 0
-            breakdown.append({
-                "provider": r["provider"],
-                "model": r["model"],
-                "calls": r["n"],
-                "avg_latency_ms": round(r["avg_latency"] or 0, 1),
-            })
-        cache_total = tokens["cache_read"] + tokens["input"]
-        return {
-            "by_provider": by_provider,
-            "by_model": by_model,
-            "tokens": tokens,
-            "cache_hit_rate": round(tokens["cache_read"] / cache_total, 4) if cache_total else 0.0,
-            "breakdown": breakdown,
-        }
-
-
-def tool_call_summary(
-    db_path: Optional[Path] = None,
-    *,
-    conn: Optional[sqlite3.Connection] = None,
-) -> Dict[str, Any]:
-    with _cursor(conn, db_path) as c:
-        by_tool = {
-            r["tool_name"] or "unknown": r["n"]
-            for r in c.execute(
-                "SELECT tool_name, COUNT(*) n FROM tel_tool_calls GROUP BY tool_name"
-            ).fetchall()
-        }
-        fails = {
-            r["tool_name"] or "unknown": r["n"]
-            for r in c.execute(
-                "SELECT tool_name, COUNT(*) n FROM tel_tool_calls "
-                "WHERE result_class IN ('error','timeout','blocked') GROUP BY tool_name"
-            ).fetchall()
-        }
-        total = sum(by_tool.values())
-        total_fail = sum(fails.values())
-        return {
-            "by_tool": by_tool,
-            "failures_by_tool": fails,
-            "total": total,
-            "failure_rate": round(total_fail / total, 4) if total else 0.0,
-        }
-
-
-def error_summary(
-    db_path: Optional[Path] = None,
-    *,
-    conn: Optional[sqlite3.Connection] = None,
-) -> Dict[str, Any]:
-    with _cursor(conn, db_path) as c:
-        return {
-            "by_class": {
-                r["error_class"] or "unknown": r["n"]
-                for r in c.execute(
-                    "SELECT error_class, COUNT(*) n FROM tel_error_events GROUP BY error_class"
-                ).fetchall()
-            },
-        }
-
-
-def _pct(values: List[float], p: int) -> float:
-    if not values:
-        return 0.0
-    s = sorted(values)
-    k = (len(s) - 1) * (p / 100)
-    lo = int(k)
-    hi = min(lo + 1, len(s) - 1)
-    frac = k - lo
-    return round(s[lo] + (s[hi] - s[lo]) * frac, 2)
-
-
-def overview(
-    db_path: Optional[Path] = None,
-    since_ns: Optional[int] = None,
-    *,
-    conn: Optional[sqlite3.Connection] = None,
-) -> Dict[str, Any]:
-    """One call for a dashboard: all the rollups."""
-    return {
-        "workflows": workflow_summary(db_path, since_ns, conn=conn),
-        "model_calls": model_call_summary(db_path, since_ns, conn=conn),
-        "tool_calls": tool_call_summary(db_path, conn=conn),
-        "errors": error_summary(db_path, conn=conn),
-    }
-
-
-def has_data(
-    db_path: Optional[Path] = None,
-    *,
-    conn: Optional[sqlite3.Connection] = None,
-) -> bool:
-    """True when any telemetry runs exist (cheap guard for /insights rendering)."""
-    try:
-        with _cursor(conn, db_path) as c:
-            return c.execute("SELECT 1 FROM tel_runs LIMIT 1").fetchone() is not None
-    except Exception:
-        return False
-
-
-__all__ = [
-    "workflow_summary",
-    "model_call_summary",
-    "tool_call_summary",
-    "error_summary",
-    "overview",
-    "has_data",
-]
--- a/agent/telemetry/otlp_exporter.py
+++ b/agent/telemetry/otlp_exporter.py
@@ -1,289 +0,0 @@
-"""Export telemetry to an OpenTelemetry Collector over OTLP/HTTP.
-
-Maps the local tel_* events to OTel spans and sends them to the endpoint configured
-under ``telemetry.export.otlp``. Lets an operator stream Hermes telemetry into their
-own observability stack.
-
-Notes:
-  * The destination is operator-configured; this module only sends to that endpoint.
-    It does not import or interact with any aggregate-metrics path.
-  * ``opentelemetry-sdk`` + ``opentelemetry-exporter-otlp-proto-http`` are an optional
-    extra (``pip install hermes-agent[otlp]``), imported lazily so the dependency is
-    only required when OTLP export is actually used.
-  * ``headers_env`` maps a header name to an environment variable name; values are read
-    from the environment at export time and never logged or stored.
-  * The continuous subscriber runs in the emitter's writer thread after durable writes
-    and is fail-isolated, so an export error cannot affect a run.
-
-Each event is exported as a span carrying its recorded attributes (provider, model,
-tokens, duration, etc.). The timing/parent linkage captured in tel_spans
-(trace_id/span_id/parent_span_id/start_ns/end_ns) is not yet reconstructed into OTel
-SpanContexts here, so spans currently arrive as independent records rather than a
-connected trace tree; building the connected-trace projection is tracked separately.
-
-Spans carry structural telemetry by default. Message content is included only when
-trajectories is enabled, and always passes through the export redaction pipeline.
-"""
-
-from __future__ import annotations
-
-import logging
-import os
-import sqlite3
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-logger = logging.getLogger(__name__)
-
-
-class OTLPUnavailable(RuntimeError):
-    """Raised when the optional OpenTelemetry SDK isn't installed."""
-
-
-def _require_sdk(*, auto_install: bool = True, prompt: bool = True):
-    """Import the OTel SDK, lazily installing it on first use if needed.
-
-    Routes through tools.lazy_deps (feature 'export.otlp') so a missing SDK
-    triggers the standard venv install flow — same as every other optional
-    backend — gated by security.allow_lazy_installs and TTY-prompted. Falls back
-    to OTLPUnavailable (with a manual install hint) when the SDK can't be made
-    importable (lazy installs disabled, install failed, or auto_install=False).
-
-    ``auto_install``: attempt the lazy install when missing (default True).
-    ``prompt``: ask before installing when interactive (default True); pass
-    False from non-interactive contexts like the continuous streamer.
-    """
-    if auto_install:
-        try:
-            from tools.lazy_deps import ensure as _lazy_ensure
-            _lazy_ensure("export.otlp", prompt=prompt)
-        except ImportError:
-            pass  # lazy_deps unavailable — fall through to the import attempt
-        except Exception:
-            # FeatureUnavailable (lazy installs disabled / declined / failed) —
-            # fall through; the import below raises OTLPUnavailable with the hint.
-            pass
-    try:
-        from opentelemetry.sdk.trace import TracerProvider
-        from opentelemetry.sdk.trace.export import BatchSpanProcessor
-        from opentelemetry.sdk.resources import Resource
-        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
-            OTLPSpanExporter,
-        )
-        from opentelemetry.trace import SpanKind
-        return {
-            "TracerProvider": TracerProvider,
-            "BatchSpanProcessor": BatchSpanProcessor,
-            "Resource": Resource,
-            "OTLPSpanExporter": OTLPSpanExporter,
-            "SpanKind": SpanKind,
-        }
-    except Exception as e:  # ImportError or partial install
-        raise OTLPUnavailable(
-            "OTLP export requires the optional dependency. Install with:\n"
-            "    pip install 'hermes-agent[otlp]'\n"
-            f"(import error: {e})"
-        )
-
-
-def _resolve_headers(headers_env: Optional[Dict[str, str]]) -> Dict[str, str]:
-    """Resolve {header_name: ENV_VAR_NAME} -> {header_name: value} from env.
-
-    The config stores environment variable names, not secret values; values are read
-    from the environment here. Missing variables are skipped (and noted at debug level
-    without the value).
-    """
-    resolved: Dict[str, str] = {}
-    for header_name, env_name in (headers_env or {}).items():
-        val = os.environ.get(str(env_name))
-        if val:
-            resolved[str(header_name)] = val
-        else:
-            logger.debug("OTLP header %s: env var %s not set; skipping",
-                         header_name, env_name)
-    return resolved
-
-
-def _otlp_config(config: Dict[str, Any]) -> Dict[str, Any]:
-    tel = (config or {}).get("telemetry") or {}
-    export = tel.get("export") or {}
-    return export.get("otlp") or {}
-
-
-def build_exporter(config: Dict[str, Any]):
-    """Construct an OTLP span exporter from config. Raises OTLPUnavailable if no SDK."""
-    sdk = _require_sdk()
-    otlp = _otlp_config(config)
-    endpoint = otlp.get("endpoint")
-    if not endpoint:
-        raise ValueError("telemetry.export.otlp.endpoint is not set")
-    headers = _resolve_headers(otlp.get("headers_env"))
-    return sdk["OTLPSpanExporter"](endpoint=endpoint, headers=headers or None)
-
-
-def _make_provider(config: Dict[str, Any]):
-    sdk = _require_sdk()
-    resource = sdk["Resource"].create({
-        "service.name": "hermes-agent",
-        "telemetry.scope": "local",  # never aggregate metrics
-    })
-    provider = sdk["TracerProvider"](resource=resource)
-    processor = sdk["BatchSpanProcessor"](build_exporter(config))
-    provider.add_span_processor(processor)
-    return provider, processor
-
-
-# ── event -> span attribute mapping (real values) ───────────────────────────
-def _span_attrs(ev: Dict[str, Any]) -> Dict[str, Any]:
-    """Span attributes for an event — the real recorded values (local telemetry)."""
-    kind = ev.get("event")
-    attrs: Dict[str, Any] = {"hermes.event": kind or "unknown"}
-    keep_by_kind = {
-        "run": ("entrypoint", "platform", "end_reason",
-                "model_call_count", "tool_call_count", "error_count"),
-        "span": ("trace_id", "run_id", "parent_span_id", "name", "kind",
-                 "start_ns", "end_ns", "status"),
-        "model_call": ("provider", "model", "base_url",
-                       "input_tokens", "output_tokens", "cache_read_tokens",
-                       "cache_write_tokens", "reasoning_tokens", "latency_ms"),
-        "tool_call": ("tool_name", "duration_ms", "result_class"),
-        "error": ("error_class", "subsystem", "recovery"),
-    }
-    for col in keep_by_kind.get(kind, ()):  # type: ignore[arg-type]
-        v = ev.get(col)
-        if v is not None:
-            attrs[f"hermes.{col}"] = v
-    return attrs
-
-
-def export_batch(provider, batch: List[Dict[str, Any]]) -> int:
-    """Map a batch of events to OTel spans. Returns spans created."""
-    tracer = provider.get_tracer("hermes.telemetry")
-    n = 0
-    for ev in batch:
-        try:
-            name = f"hermes.{ev.get('event', 'event')}"
-            span = tracer.start_span(name, attributes=_span_attrs(ev))
-            span.end()
-            n += 1
-        except Exception:
-            logger.debug("OTLP span map failed", exc_info=True)
-    return n
-
-
-# ── one-shot drain (export current local rows) ──────────────────────────────
-def export_once(
-    config: Dict[str, Any],
-    *,
-    db_path: Optional[Path] = None,
-    since_ns: Optional[int] = None,
-) -> int:
-    """Drain the local tel_* tables to the configured OTLP endpoint once."""
-    provider, processor = _make_provider(config)
-    try:
-        rows = _read_events(db_path, since_ns)
-        total = export_batch(provider, rows)
-        processor.force_flush()
-        return total
-    finally:
-        try:
-            provider.shutdown()
-        except Exception:
-            pass
-
-
-def _read_events(db_path: Optional[Path], since_ns: Optional[int]) -> List[Dict[str, Any]]:
-    if db_path is None:
-        from hermes_constants import get_hermes_home
-        db_path = get_hermes_home() / "state.db"
-    c = sqlite3.connect(str(db_path), timeout=5.0)
-    c.row_factory = sqlite3.Row
-    out: List[Dict[str, Any]] = []
-    try:
-        table_event = {
-            "tel_runs": "run", "tel_spans": "span",
-            "tel_model_calls": "model_call",
-            "tel_tool_calls": "tool_call", "tel_error_events": "error",
-        }
-        for table, evkind in table_event.items():
-            where = ""
-            if table == "tel_runs" and since_ns:
-                where = f" WHERE start_ns >= {int(since_ns)}"
-            for r in c.execute(f"SELECT * FROM {table}{where}").fetchall():
-                d = dict(r)
-                d["event"] = evkind
-                out.append(d)
-    finally:
-        c.close()
-    return out
-
-
-# ── continuous streaming subscriber ─────────────────────────────────────────
-class OTLPStreamer:
-    """A live subscriber that pushes each emitter batch to OTLP as it lands.
-
-    Register with ``emitter.subscribe(streamer)``. Fail-isolated by the emitter.
-    """
-
-    def __init__(self, config: Dict[str, Any]):
-        self._provider, self._processor = _make_provider(config)
-        self.exported = 0
-
-    def __call__(self, batch: List[Dict[str, Any]]) -> None:
-        self.exported += export_batch(self._provider, batch)
-
-    def shutdown(self) -> None:
-        try:
-            self._processor.force_flush()
-            self._provider.shutdown()
-        except Exception:
-            pass
-
-
-def is_available() -> bool:
-    """True when the OTel SDK is already importable. Does NOT auto-install —
-    this is a pure check (e.g. for status display)."""
-    try:
-        _require_sdk(auto_install=False)
-        return True
-    except OTLPUnavailable:
-        return False
-
-
-def is_enabled(config: Dict[str, Any]) -> bool:
-    otlp = _otlp_config(config)
-    return bool(otlp.get("enabled") and otlp.get("endpoint"))
-
-
-def start_streaming(config: Dict[str, Any]) -> Optional[OTLPStreamer]:
-    """If OTLP is enabled, attach a streamer to the singleton emitter.
-
-    Non-interactive context (startup): attempts a lazy install with prompt=False
-    so a configured-but-missing SDK is installed once (gated by
-    security.allow_lazy_installs), then streams. If it still can't load, logs and
-    no-ops — never blocks or raises into startup.
-    """
-    if not is_enabled(config):
-        return None
-    try:
-        _require_sdk(prompt=False)
-    except OTLPUnavailable:
-        logger.warning("telemetry.export.otlp.enabled but the OTel SDK could not "
-                       "be installed/imported; install 'hermes-agent[otlp]'")
-        return None
-    from agent.telemetry.emitter import get_emitter
-    streamer = OTLPStreamer(config)
-    get_emitter().subscribe(streamer)
-    return streamer
-
-
-__all__ = [
-    "OTLPUnavailable",
-    "OTLPStreamer",
-    "build_exporter",
-    "export_once",
-    "export_batch",
-    "is_available",
-    "is_enabled",
-    "start_streaming",
-]
--- a/agent/telemetry/policy.py
+++ b/agent/telemetry/policy.py
@@ -1,72 +0,0 @@
-"""Telemetry consent posture and the aggregate-metrics gate.
-
-Consent is a single config field, ``telemetry.consent_state``:
-
-  * "unknown" — no choice recorded; never uploads (the default).
-  * "local"   — declined aggregate metrics; local telemetry only.
-  * "aggregate" — opted in to aggregate metrics.
-
-The config file is the source of truth: set ``telemetry.consent_state`` with
-``hermes config set`` (or a managed-scope pin). Callers that gate behavior read
-``telemetry.*`` directly from config; this module only provides the consent
-constants, the install-id helper, and the upload gate a future uploader must
-consult.
-
-``allow_aggregate`` is the hard gate. An administrator pins
-``telemetry.allow_aggregate: false`` through the managed-scope layer
-(``/etc/hermes/config.yaml``), which takes precedence over the user's config; when
-it is false, aggregate metrics are off regardless of ``consent_state``.
-"""
-
-from __future__ import annotations
-
-import uuid
-from typing import Any, Dict
-
-CONSENT_UNKNOWN = "unknown"
-CONSENT_LOCAL = "local"
-CONSENT_AGGREGATE = "aggregate"
-VALID_CONSENT_STATES = {CONSENT_UNKNOWN, CONSENT_LOCAL, CONSENT_AGGREGATE}
-
-
-def _telemetry_cfg(config: Dict[str, Any]) -> Dict[str, Any]:
-    cfg = config.get("telemetry") if isinstance(config, dict) else None
-    return cfg if isinstance(cfg, dict) else {}
-
-
-def ensure_install_id(config: Dict[str, Any]) -> str:
-    """Return a stable install id, minting one if the config slot is empty.
-
-    Does not persist — the caller writes the returned value back to config.yaml. A
-    fresh uuid4 is used; clearing ``telemetry.install_id`` (e.g. with
-    ``hermes config set telemetry.install_id ""``) causes the next call to mint anew.
-    """
-    tel = _telemetry_cfg(config)
-    existing = tel.get("install_id")
-    if isinstance(existing, str) and existing.strip():
-        return existing
-    return str(uuid.uuid4())
-
-
-def may_upload_aggregate(config: Dict[str, Any]) -> bool:
-    """Whether aggregate metrics may upload — the gate a future uploader consults.
-
-    Aggregate metrics are derived from the local telemetry tables, so they require
-    local telemetry to be on. True only when local telemetry is enabled, the admin
-    hard gate allows it, and the user has opted in via ``telemetry.consent_state``.
-    """
-    tel = _telemetry_cfg(config)
-    local_enabled = bool(tel.get("local", True))
-    allow_aggregate = bool(tel.get("allow_aggregate", True))
-    state = tel.get("consent_state", CONSENT_UNKNOWN)
-    return local_enabled and allow_aggregate and state == CONSENT_AGGREGATE
-
-
-__all__ = [
-    "CONSENT_UNKNOWN",
-    "CONSENT_LOCAL",
-    "CONSENT_AGGREGATE",
-    "VALID_CONSENT_STATES",
-    "may_upload_aggregate",
-    "ensure_install_id",
-]
--- a/agent/telemetry/redaction.py
+++ b/agent/telemetry/redaction.py
@@ -1,187 +0,0 @@
-"""Redaction applied to telemetry data on export.
-
-Two independent controls:
-
-  * Secrets are always redacted, on every export and in every mode; no setting
-    disables this. Wraps ``agent/redact.py::redact_sensitive_text(force=True)``.
-
-  * Whether message bodies, reasoning, and raw tool arguments are exportable at all is
-    governed by the trajectories setting (``telemetry.trajectories.enabled``, default
-    off, admin-pinnable), not by a redaction mode. With trajectories off, content is
-    dropped. With it on, content is exportable and ``content_redaction`` (none|pii)
-    controls how much is scrubbed; secrets are still always stripped.
-
-This applies to the local and trajectory export paths. It is unrelated to any
-aggregate-metrics path.
-"""
-
-from __future__ import annotations
-
-import re
-from typing import Any, Dict, List, Optional
-
-# Content-redaction strengths for any content that IS exported.
-CONTENT_NONE = "none"   # drop content entirely (structural telemetry only)
-CONTENT_PII = "pii"     # codec-aware PII redaction on exported content
-CONTENT_MODES = {CONTENT_NONE, CONTENT_PII}
-
-# ── PII patterns (applied only in CONTENT_PII mode, on content that is exported) ──
-_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
-# E.164-ish and common separators; conservative to avoid nuking code/IDs.
-_PHONE_RE = re.compile(
-    r"(?<!\w)(?:\+?\d{1,3}[\s.\-]?)?(?:\(\d{2,4}\)[\s.\-]?)?\d{3}[\s.\-]?\d{3,4}(?:[\s.\-]?\d{2,4})?(?!\w)"
-)
-# Long opaque hex/uuid-ish user identifiers.
-_UUID_RE = re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b")
-
-
-def _secret_redact(text: Optional[str]) -> Optional[str]:
-    """Always-on secret redaction. force=True so user config can't disable it."""
-    if text is None:
-        return None
-    try:
-        from agent.redact import redact_sensitive_text
-        return redact_sensitive_text(str(text), force=True)
-    except Exception:
-        # Fail CLOSED: if the redactor can't run, do not emit the raw string.
-        return "[redaction-unavailable]"
-
-
-def _pii_redact(text: str) -> str:
-    text = _EMAIL_RE.sub("[email]", text)
-    text = _UUID_RE.sub("[id]", text)
-    text = _PHONE_RE.sub("[phone]", text)
-    return text
-
-
-def redact_for_export(
-    text: Optional[str],
-    *,
-    content_mode: str = CONTENT_NONE,
-) -> Optional[str]:
-    """Redact a single content string for export.
-
-    Secrets are ALWAYS stripped. Then PII is stripped when content_mode is 'pii'.
-    Callers gate *whether content is exported at all* via telemetry.trajectories
-    (see ``content_export_enabled``); this function only scrubs content that the
-    caller has already decided to export.
-    """
-    redacted = _secret_redact(text)
-    if redacted is None:
-        return None
-    if content_mode == CONTENT_PII:
-        redacted = _pii_redact(redacted)
-    return redacted
-
-
-def content_export_enabled(config: Optional[Dict[str, Any]]) -> bool:
-    """True only when telemetry.trajectories is explicitly enabled.
-
-    This is the consent gate for exporting message bodies / reasoning / raw tool
-    args. Default off. Admin-pinnable via managed scope (telemetry.trajectories.enabled).
-    """
-    try:
-        tel = (config or {}).get("telemetry") or {}
-        traj = tel.get("trajectories") or {}
-        return bool(traj.get("enabled", False))
-    except Exception:
-        return False
-
-
-def content_mode_for(config: Optional[Dict[str, Any]]) -> str:
-    try:
-        tel = (config or {}).get("telemetry") or {}
-        mode = tel.get("content_redaction", CONTENT_NONE)
-        return mode if mode in CONTENT_MODES else CONTENT_NONE
-    except Exception:
-        return CONTENT_NONE
-
-
-# ── Codec-aware message redaction (NeMo pattern) ─────────────────────────────
-# Redact the right fields of a provider message shape rather than regex-blasting
-# the whole blob. Structure (roles, names, counts) is preserved; only the
-# free-text content fields are scrubbed.
-
-def redact_message(
-    msg: Dict[str, Any],
-    *,
-    content_mode: str = CONTENT_NONE,
-    include_content: bool = False,
-) -> Dict[str, Any]:
-    """Redact one chat message dict for export.
-
-    When include_content is False (trajectories off), content/reasoning/tool-arg
-    fields are dropped — only structural fields (role, tool name, counts) remain.
-    When True, those fields are kept but passed through redact_for_export.
-    """
-    role = msg.get("role")
-    out: Dict[str, Any] = {"role": role}
-
-    # Always-structural fields.
-    if msg.get("tool_name") is not None:
-        out["tool_name"] = msg.get("tool_name")
-    if msg.get("name") is not None:
-        out["name"] = msg.get("name")
-
-    if not include_content:
-        # Structural only: record presence/size, not bytes.
-        c = msg.get("content")
-        if c is not None:
-            out["content_chars"] = len(str(c))
-        if msg.get("reasoning_content"):
-            out["reasoning_chars"] = len(str(msg["reasoning_content"]))
-        if msg.get("tool_calls"):
-            out["tool_call_count"] = _count_tool_calls(msg["tool_calls"])
-        return out
-
-    # Content included (trajectories enabled): scrub then keep.
-    if msg.get("content") is not None:
-        out["content"] = redact_for_export(msg["content"], content_mode=content_mode)
-    if msg.get("reasoning_content"):
-        out["reasoning_content"] = redact_for_export(
-            msg["reasoning_content"], content_mode=content_mode
-        )
-    if msg.get("tool_calls"):
-        out["tool_calls"] = _redact_tool_calls(msg["tool_calls"], content_mode=content_mode)
-    return out
-
-
-def _count_tool_calls(tool_calls: Any) -> int:
-    try:
-        import json
-        tc = json.loads(tool_calls) if isinstance(tool_calls, str) else tool_calls
-        return len(tc) if isinstance(tc, list) else (1 if tc else 0)
-    except Exception:
-        return 0
-
-
-def _redact_tool_calls(tool_calls: Any, *, content_mode: str) -> Any:
-    """Redact raw tool-call arguments (free text) while keeping function names."""
-    import json
-    try:
-        tc = json.loads(tool_calls) if isinstance(tool_calls, str) else tool_calls
-    except Exception:
-        return "[unparseable-tool-calls]"
-    if not isinstance(tc, list):
-        return []
-    out: List[Dict[str, Any]] = []
-    for call in tc:
-        if not isinstance(call, dict):
-            continue
-        fn = (call.get("function") or {}) if isinstance(call.get("function"), dict) else {}
-        name = fn.get("name") or call.get("name")
-        args = fn.get("arguments")
-        red_args = redact_for_export(args, content_mode=content_mode) if args is not None else None
-        out.append({"name": name, "arguments": red_args})
-    return out
-
-
-__all__ = [
-    "CONTENT_NONE",
-    "CONTENT_PII",
-    "CONTENT_MODES",
-    "redact_for_export",
-    "content_export_enabled",
-    "content_mode_for",
-    "redact_message",
-]
--- a/agent/telemetry/rollup.py
+++ b/agent/telemetry/rollup.py
@@ -1,144 +0,0 @@
-"""Build per-run summary events from the local telemetry tables.
-
-Reads the ``tel_*`` tables and projects each completed run into a summary dict holding
-the recorded values: provider, models used, tool names, token totals, duration, and
-cost. Powers ``hermes telemetry preview``. No aggregation or bucketing is applied here.
-"""
-
-from __future__ import annotations
-
-import platform
-import sqlite3
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-
-def _os_family() -> str:
-    s = platform.system().lower()
-    if s.startswith("lin"):
-        return "linux"
-    if s == "darwin":
-        return "macos"
-    if s.startswith("win"):
-        return "windows"
-    return "other"
-
-
-def _hermes_version() -> str:
-    try:
-        from hermes_cli import __version__
-        return str(__version__)
-    except Exception:
-        return "0.0.0"
-
-
-def _open(db_path: Optional[Path], conn: Optional[sqlite3.Connection]):
-    if conn is not None:
-        prev = conn.row_factory
-        conn.row_factory = sqlite3.Row
-        return conn, prev, False
-    if db_path is None:
-        from hermes_constants import get_hermes_home
-        db_path = get_hermes_home() / "state.db"
-    c = sqlite3.connect(str(db_path), timeout=5.0)
-    c.row_factory = sqlite3.Row
-    return c, None, True
-
-
-def _run_events(c: sqlite3.Connection, since_ns: Optional[int]) -> List[Dict[str, Any]]:
-    """Project completed runs into per-run summary dicts."""
-    where = " WHERE end_ns IS NOT NULL"
-    if since_ns:
-        where += f" AND start_ns >= {int(since_ns)}"
-    rows = c.execute(
-        "SELECT run_id, entrypoint, platform, end_reason, start_ns, end_ns, "
-        "model_call_count, tool_call_count, error_count "
-        "FROM tel_runs" + where
-    ).fetchall()
-
-    events: List[Dict[str, Any]] = []
-    for r in rows:
-        # Models actually used in this run (real ids), with token totals.
-        models = [
-            {"provider": m["provider"], "model": m["model"],
-             "calls": m["n"], "input_tokens": int(m["inp"] or 0),
-             "output_tokens": int(m["outp"] or 0)}
-            for m in c.execute(
-                "SELECT provider, model, COUNT(*) n, SUM(input_tokens) inp, "
-                "SUM(output_tokens) outp FROM tel_model_calls WHERE run_id = ? "
-                "GROUP BY provider, model ORDER BY n DESC",
-                (r["run_id"],),
-            ).fetchall()
-        ]
-        tools = [
-            row["tool_name"]
-            for row in c.execute(
-                "SELECT DISTINCT tool_name FROM tel_tool_calls WHERE run_id = ?",
-                (r["run_id"],),
-            ).fetchall()
-            if row["tool_name"]
-        ]
-        trow = c.execute(
-            "SELECT SUM(input_tokens) inp, SUM(output_tokens) outp "
-            "FROM tel_model_calls WHERE run_id = ?",
-            (r["run_id"],),
-        ).fetchone()
-        duration_ms = (r["end_ns"] - r["start_ns"]) / 1e6 if r["end_ns"] else None
-        events.append({
-            "event_name": "workflow_completed",
-            "run_id": r["run_id"],
-            "entrypoint": r["entrypoint"] or "cli",
-            "platform": r["platform"],
-            "end_reason": r["end_reason"] or "completed",
-            "models_used": models,
-            "tools_used": tools,
-            "model_call_count": r["model_call_count"] or 0,
-            "tool_call_count": r["tool_call_count"] or 0,
-            "error_count": r["error_count"] or 0,
-            "duration_ms": round(duration_ms, 1) if duration_ms is not None else None,
-            "input_tokens": int((trow["inp"] if trow else 0) or 0),
-            "output_tokens": int((trow["outp"] if trow else 0) or 0),
-        })
-    return events
-
-
-def build_aggregate_events(
-    *,
-    install_id: str,
-    db_path: Optional[Path] = None,
-    since_ns: Optional[int] = None,
-    conn: Optional[sqlite3.Connection] = None,
-    include_heartbeat: bool = True,
-) -> List[Dict[str, Any]]:
-    """Return per-run summary events plus an optional heartbeat."""
-    c, prev_factory, owned = _open(db_path, conn)
-    try:
-        events = _run_events(c, since_ns)
-        if include_heartbeat:
-            events.append({
-                "event_name": "heartbeat",
-                "install_id": install_id,
-                "hermes_version": _hermes_version(),
-                "os_family": _os_family(),
-                "entrypoint": "cli",
-            })
-        return events
-    finally:
-        if owned:
-            c.close()
-        elif prev_factory is not None:
-            c.row_factory = prev_factory
-
-
-def summarize(events: List[Dict[str, Any]]) -> Dict[str, Any]:
-    """Counts by event_name + field coverage, for status/preview output."""
-    by_name: Dict[str, int] = {}
-    fields = set()
-    for e in events:
-        name = e.get("event_name", "?")
-        by_name[name] = by_name.get(name, 0) + 1
-        fields.update(e.keys())
-    return {"total": len(events), "by_event_name": by_name, "fields_present": sorted(fields)}
-
-
-__all__ = ["build_aggregate_events", "summarize"]
--- a/agent/telemetry/spans.py
+++ b/agent/telemetry/spans.py
@@ -1,83 +0,0 @@
-"""Trace / run / span id propagation via contextvars.
-
-Telemetry events share IDs so a workflow can be reconstructed: one ``trace_id`` per
-workflow, one ``run_id`` per top-level execution, ``span_id`` per timed operation, and
-``parent_span_id`` for nesting. These live in contextvars so async tool calls and
-spawned subagents inherit the lineage automatically.
-
-Provides helpers to start/clear a run context and mint child span ids. The telemetry
-plugin sets the run context on session start and reads it in each hook callback.
-Nothing here writes to storage — it only carries ids.
-"""
-
-from __future__ import annotations
-
-import contextvars
-import uuid
-from dataclasses import dataclass
-from typing import Optional
-
-_trace_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
-    "hermes_tel_trace_id", default=None
-)
-_run_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
-    "hermes_tel_run_id", default=None
-)
-_parent_span_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
-    "hermes_tel_parent_span_id", default=None
-)
-
-
-def new_id() -> str:
-    return uuid.uuid4().hex
-
-
-@dataclass(slots=True)
-class RunContext:
-    trace_id: str
-    run_id: str
-
-
-def start_run(trace_id: Optional[str] = None, run_id: Optional[str] = None) -> RunContext:
-    """Begin a run context, minting ids when not supplied. Sets contextvars."""
-    tid = trace_id or new_id()
-    rid = run_id or new_id()
-    _trace_id.set(tid)
-    _run_id.set(rid)
-    _parent_span_id.set(None)
-    return RunContext(trace_id=tid, run_id=rid)
-
-
-def current_trace_id() -> Optional[str]:
-    return _trace_id.get()
-
-
-def current_run_id() -> Optional[str]:
-    return _run_id.get()
-
-
-def current_parent_span_id() -> Optional[str]:
-    return _parent_span_id.get()
-
-
-def new_span_id() -> str:
-    """Mint a span id (does not alter the parent pointer)."""
-    return new_id()
-
-
-def clear_run() -> None:
-    _trace_id.set(None)
-    _run_id.set(None)
-    _parent_span_id.set(None)
-
-
-__all__ = [
-    "RunContext",
-    "new_id",
-    "start_run",
-    "current_trace_id",
-    "current_run_id",
-    "current_parent_span_id",
-    "new_span_id",
-    "clear_run",
-]
--- a/agent/thinking_timeout_guidance.py
+++ b/agent/thinking_timeout_guidance.py
@@ -1,136 +0,0 @@
-"""Thinking-timeout detection and user-facing guidance for reasoning models.
-
-When a known reasoning model (NVIDIA Nemotron 3 Ultra, OpenAI o1/o3,
-Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ, xAI Grok reasoning)
-hits a transport-layer error before the first content token arrives, the
-upstream proxy has almost certainly idle-killed a long thinking stream —
-not a true context overflow or a configuration error.  The user needs
-distinct guidance for this case:
-
-    "The model's thinking phase exceeded the upstream proxy's idle
-     timeout before the first content token arrived.  This is a known
-     issue with reasoning models behind cloud gateways (NVIDIA NIM,
-     OpenAI, Anthropic, DeepSeek).  Workarounds in priority order:
-     1. Set `providers.<provider>.models.<model>.stale_timeout_seconds: 900`
-        in `~/.hermes/config.yaml` to extend the per-call timeout...
-     2. Lower `reasoning_budget` or set `reasoning_effort: medium`...
-     3. Use a smaller / faster reasoning model..."
-
-The existing `_is_stream_drop` guidance at
-``agent/conversation_loop.py:3464-3486`` fires for large-file-write
-stream drops ("try execute_code with Python's open() for large files")
-which is the WRONG advice for the thinking-timeout case.  This module
-provides the detection and the message as standalone helpers so the
-detection logic is unit-testable without driving the full retry loop,
-and the message text can be regression-tested for spelling and accuracy.
-
-Part 2 of Fixes #52310.
-"""
-
-from __future__ import annotations
-
-from typing import Optional
-
-
-# Substring set that identifies a transport-layer failure on the
-# response stream.  Same shape as the existing
-# ``_SERVER_DISCONNECT_PATTERNS`` in ``agent/error_classifier.py:394``
-# but extended to also catch the OSS-level error signature
-# (``broken pipe`` / ``errno 32``) that the upstream kill surfaces
-# to the OpenAI SDK wrapper.
-_THINKING_TIMEOUT_SUBSTRINGS: tuple[str, ...] = (
-    "broken pipe",
-    "errno 32",
-    "remote protocol",
-    "connection reset",
-    "connection lost",
-    "peer closed",
-    "server disconnected",
-)
-
-
-def is_thinking_timeout(classified: object, model: str, error_msg: str) -> bool:
-    """Return True when a reasoning model's thinking phase hit a transport kill.
-
-    Args:
-        classified: a :class:`agent.error_classifier.ClassifiedError` instance
-            (duck-typed here to avoid an import cycle in unit tests).
-        model: the model slug at failure time (e.g.
-            ``"nvidia/nemotron-3-ultra-550b-a55b"``).
-        error_msg: lowercased string representation of the underlying
-            exception (typically ``str(api_error).lower()``).
-
-    Returns True when ALL conditions hold:
-        1. ``classified.reason == FailoverReason.timeout`` (the classifier
-           override at ``agent/error_classifier.py:720-738`` ensures this
-           is the case for reasoning models even on large sessions).
-        2. ``api_error`` has no ``.status_code`` attribute set (transport
-           disconnect, not an HTTP error).
-        3. ``model`` is in the reasoning-model allowlist (reuses
-           ``agent.reasoning_timeouts.get_reasoning_stale_timeout_floor``).
-        4. ``error_msg`` contains one of the transport-kill substrings.
-
-    Non-reasoning models always return False.  Non-transport errors
-    (billing / rate_limit / auth / context_overflow / format_error)
-    always return False.  HTTP-status errors always return False.
-    """
-    # Import here (not at module top) to keep this helper cheap to
-    # import even from callers that don't need it.  ``agent.reasoning_timeouts``
-    # is small and dependency-free.
-    from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor
-
-    # Condition 1: classifier says timeout.  Use a string/value check
-    # rather than importing FailoverReason so this module has zero
-    # import cycles from the error_classifier package.
-    reason = getattr(classified, "reason", None)
-    reason_value = getattr(reason, "value", None)
-    if reason_value != "timeout":
-        return False
-
-    # Condition 2: no HTTP status code (transport, not API error).
-    # Caller is expected to gate on ``getattr(api_error, "status_code", None) is None``
-    # before calling this helper; the surface here is just the post-gate
-    # boolean so the caller can pass an already-prepped error_msg.
-
-    # Condition 3: reasoning model allowlist.
-    if get_reasoning_stale_timeout_floor(model) is None:
-        return False
-
-    # Condition 4: transport-kill substring in the error message.
-    error_msg_lower = (error_msg or "").lower()
-    return any(p in error_msg_lower for p in _THINKING_TIMEOUT_SUBSTRINGS)
-
-
-def build_thinking_timeout_guidance(
-    provider: str, model: str, model_label: Optional[str] = None,
-) -> str:
-    """Return the user-facing guidance string appended to ``_final_response``.
-
-    Args:
-        provider: provider slug (e.g. ``"nvidia"``, ``"openai"``).
-        model: bare model slug the user would put in their config
-            (e.g. ``"nemotron-3-ultra-550b-a55b"`` if the user uses
-            NVIDIA direct, or the full ``"nvidia/nemotron-3-ultra-550b-a55b"``
-            if they go through an aggregator).  Used verbatim in the
-            config snippet so the user can copy-paste.
-        model_label: optional short label for the model name in the
-            prose (e.g. ``"Nemotron 3 Ultra"``).  Falls back to the
-            slug if not provided.
-    """
-    label = model_label or model
-    return (
-        "\n\nThe model's thinking phase exceeded the upstream proxy's "
-        "idle timeout before the first content token arrived. This is a "
-        f"known issue with reasoning models (like {label}) behind cloud "
-        "gateways (NVIDIA NIM, OpenAI, Anthropic, DeepSeek). Workarounds "
-        "in priority order:\n"
-        f"1. Set `providers.{provider}.models.{model}.stale_timeout_seconds: 900` "
-        "in `~/.hermes/config.yaml` to extend the per-call timeout. "
-        "(Hermes's built-in floor is 600s for known reasoning models — "
-        "if you still see this after raising, the upstream cap is even "
-        "shorter.)\n"
-        "2. Lower `reasoning_budget` or set `reasoning_effort: medium` on this "
-        "model if the provider supports it.\n"
-        "3. Use a smaller / faster reasoning model if the task doesn't "
-        "require deep thinking."
-    )
--- a/agent/tool_dispatch_helpers.py
+++ b/agent/tool_dispatch_helpers.py
@@ -11,8 +11,7 @@ Pure module-level utilities extracted from ``run_agent.py``:
  ``_append_subdir_hint_to_multimodal`` — envelope helpers for the
  ``{"_multimodal": True, "content": [...], "text_summary": ...}`` dict
  shape returned by tools like ``computer_use``.
-* ``_extract_file_mutation_targets`` / ``_extract_landed_file_mutation_paths`` /
-  ``_extract_error_preview`` —
+* ``_extract_file_mutation_targets`` / ``_extract_error_preview`` —
  per-turn file-mutation verifier inputs.
 * ``_trajectory_normalize_msg`` — strip image blobs from a message for
  trajectory saving.
@@ -270,35 +269,6 @@ def _extract_file_mutation_targets(tool_name: str, args: Dict[str, Any]) -> List
    return []


-def _extract_landed_file_mutation_paths(
-    tool_name: str,
-    args: Dict[str, Any],
-    result: Any,
-) -> List[str]:
-    """Return the concrete file paths a successful mutation reports."""
-    targets = _extract_file_mutation_targets(tool_name, args)
-    if tool_name not in _FILE_MUTATING_TOOLS or not isinstance(result, str):
-        return targets
-    try:
-        data = json.loads(result.strip())
-    except Exception:
-        return targets
-    if not isinstance(data, dict):
-        return targets
-
-    files = data.get("files_modified")
-    if isinstance(files, list):
-        landed = [str(p) for p in files if p]
-        if landed:
-            return landed
-
-    resolved = data.get("resolved_path")
-    if resolved:
-        return [str(resolved)]
-
-    return targets
-
-
 def _extract_error_preview(result: Any, max_len: int = 180) -> str:
    """Pull a one-line error summary out of a tool result for footer display."""
    text = _multimodal_text_summary(result) if result is not None else ""
@@ -441,7 +411,6 @@ __all__ = [
    "_multimodal_text_summary",
    "_append_subdir_hint_to_multimodal",
    "_extract_file_mutation_targets",
-    "_extract_landed_file_mutation_paths",
    "_extract_error_preview",
    "_trajectory_normalize_msg",
    "make_tool_result_message",
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@@ -26,7 +26,6 @@ from agent.display import (
    build_tool_preview as _build_tool_preview,
    get_cute_tool_message as _get_cute_tool_message_impl,
    get_tool_emoji as _get_tool_emoji,
-    redact_tool_args_for_display as _redact_tool_args_for_display,
    _detect_tool_failure,
 )
 from agent.tool_guardrails import ToolGuardrailDecision
@@ -70,35 +69,12 @@ def _budget_for_agent(agent) -> BudgetConfig:
 _MAX_TOOL_WORKERS = 8


-def _flush_session_db_after_tool_progress(
-    agent,
-    messages: list,
-    *,
-    stage: str,
-) -> None:
-    """Best-effort incremental SessionDB flush for tool-call progress.
-
-    Tool execution can perform side effects that terminate or restart the
-    current Hermes process before the normal turn-end persistence path runs.
-    Flush the already-appended assistant/tool messages immediately so the
-    transcript survives destructive-but-valid tool calls.
-    """
-    try:
-        agent._flush_messages_to_session_db(messages)
-    except Exception as exc:
-        logger.warning("Incremental tool-call persistence failed after %s: %s", stage, exc)
-
-
 def _ra():
    """Lazy reference to ``run_agent`` so patches like ``run_agent._set_interrupt`` work."""
    import run_agent
    return run_agent


-def _is_interpreter_shutdown_submit_error(exc: RuntimeError) -> bool:
-    return "cannot schedule new futures after interpreter shutdown" in str(exc)
-
-
 def _emit_terminal_post_tool_call(
    agent,
    *,
@@ -303,11 +279,6 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
                f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
                tc.id,
            ))
-            _flush_session_db_after_tool_progress(
-                agent,
-                messages,
-                stage=f"cancelled tool result {tc.function.name}",
-            )
        return

    # ── Parse args + pre-execution bookkeeping ───────────────────────
@@ -470,11 +441,10 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
    if not agent.quiet_mode and getattr(agent, "tool_progress_mode", "all") != "off":
        print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
        for i, (tc, name, args, middleware_trace, block_result, blocked_by_guardrail) in enumerate(parsed_calls, 1):
-            display_args = _redact_tool_args_for_display(name, args) or args
-            args_str = json.dumps(display_args, ensure_ascii=False)
+            args_str = json.dumps(args, ensure_ascii=False)
            if agent.verbose_logging:
-                print(f"  📞 Tool {i}: {name}({list(display_args.keys())})")
-                print(agent._wrap_verbose("Args: ", json.dumps(display_args, indent=2, ensure_ascii=False)))
+                print(f"  📞 Tool {i}: {name}({list(args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
@@ -484,9 +454,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            continue
        if agent.tool_progress_callback:
            try:
-                display_args = _redact_tool_args_for_display(name, args) or args
-                preview = _build_tool_preview(name, display_args)
-                agent.tool_progress_callback("tool.started", name, preview, display_args)
+                preview = _build_tool_preview(name, args)
+                agent.tool_progress_callback("tool.started", name, preview, args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

@@ -495,8 +464,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            continue
        if agent.tool_start_callback:
            try:
-                display_args = _redact_tool_args_for_display(name, args) or args
-                agent.tool_start_callback(tc.id, name, display_args)
+                agent.tool_start_callback(tc.id, name, args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

@@ -613,40 +581,13 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
        if runnable_calls:
            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                for submit_index, (i, tc, name, args) in enumerate(runnable_calls):
+                for i, tc, name, args in runnable_calls:
                    # Propagate the agent turn's ContextVars (e.g.
                    # _approval_session_key) AND thread-local approval/sudo
                    # callbacks into the worker thread; clears callbacks on exit.
-                    try:
-                        f = executor.submit(
-                            propagate_context_to_thread(_run_tool), i, tc, name, args, parsed_calls[i][3]
-                        )
-                    except RuntimeError as submit_error:
-                        if not _is_interpreter_shutdown_submit_error(submit_error):
-                            raise
-                        skipped_calls = runnable_calls[submit_index:]
-                        logger.warning(
-                            "interpreter shutdown while scheduling concurrent tools; "
-                            "skipping %d unsubmitted tool(s)",
-                            len(skipped_calls),
-                        )
-                        for skipped_i, _tc, skipped_name, skipped_args in skipped_calls:
-                            if results[skipped_i] is None:
-                                middleware_trace = parsed_calls[skipped_i][3]
-                                result = (
-                                    f"Error executing tool '{skipped_name}': "
-                                    "Python interpreter is shutting down; tool was not started"
-                                )
-                                results[skipped_i] = (
-                                    skipped_name,
-                                    skipped_args,
-                                    result,
-                                    0.0,
-                                    True,
-                                    False,
-                                    middleware_trace,
-                                )
-                        break
+                    f = executor.submit(
+                        propagate_context_to_thread(_run_tool), i, tc, name, args, parsed_calls[i][3]
+                    )
                    futures.append(f)

                # Wait for all to complete with periodic heartbeats so the
@@ -796,8 +737,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe

        if not blocked and agent.tool_complete_callback:
            try:
-                display_args = _redact_tool_args_for_display(name, args) or args
-                agent.tool_complete_callback(tc.id, name, display_args, function_result)
+                agent.tool_complete_callback(tc.id, name, args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

@@ -828,11 +768,6 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
        # String results pass through unchanged.
        _tool_content = agent._tool_result_content_for_active_model(name, function_result)
        messages.append(make_tool_result_message(name, _tool_content, tc.id))
-        _flush_session_db_after_tool_progress(
-            agent,
-            messages,
-            stage=f"tool result {name}",
-        )

        # ── Per-tool /steer drain ───────────────────────────────────
        # Same as the sequential path: drain between each collected
@@ -868,16 +803,13 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                agent._vprint(f"{agent.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
            for skipped_tc in remaining_calls:
                skipped_name = skipped_tc.function.name
-                messages.append(make_tool_result_message(
-                    skipped_name,
-                    f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
-                    skipped_tc.id,
-                ))
-                _flush_session_db_after_tool_progress(
-                    agent,
-                    messages,
-                    stage=f"cancelled tool result {skipped_name}",
-                )
+                skip_msg = {
+                    "role": "tool",
+                    "name": skipped_name,
+                    "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
+                    "tool_call_id": skipped_tc.id,
+                }
+                messages.append(skip_msg)
            break

        function_name = tool_call.function.name
@@ -959,11 +891,10 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            agent._iters_since_skill = 0

        if not agent.quiet_mode and getattr(agent, "tool_progress_mode", "all") != "off":
-            display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-            args_str = json.dumps(display_args, ensure_ascii=False)
+            args_str = json.dumps(function_args, ensure_ascii=False)
            if agent.verbose_logging:
-                print(f"  📞 Tool {i}: {function_name}({list(display_args.keys())})")
-                print(agent._wrap_verbose("Args: ", json.dumps(display_args, indent=2, ensure_ascii=False)))
+                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
+                print(agent._wrap_verbose("Args: ", json.dumps(function_args, indent=2, ensure_ascii=False)))
            else:
                args_preview = args_str[:agent.log_prefix_chars] + "..." if len(args_str) > agent.log_prefix_chars else args_str
                print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
@@ -984,16 +915,14 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe

        if not _execution_blocked and agent.tool_progress_callback:
            try:
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args)
-                agent.tool_progress_callback("tool.started", function_name, preview, display_args)
+                preview = _build_tool_preview(function_name, function_args)
+                agent.tool_progress_callback("tool.started", function_name, preview, function_args)
            except Exception as cb_err:
                logging.debug(f"Tool progress callback error: {cb_err}")

        if not _execution_blocked and agent.tool_start_callback:
            try:
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                agent.tool_start_callback(tool_call.id, function_name, display_args)
+                agent.tool_start_callback(tool_call.id, function_name, function_args)
            except Exception as cb_err:
                logging.debug(f"Tool start callback error: {cb_err}")

@@ -1223,8 +1152,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args) or function_name
+                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _ce_result = None
@@ -1257,8 +1185,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args) or function_name
+                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _mem_result = None
@@ -1289,8 +1216,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
            if agent._should_emit_quiet_tool_messages() and agent._should_start_quiet_spinner():
                face = random.choice(KawaiiSpinner.get_waiting_faces())
                emoji = _get_tool_emoji(function_name)
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                preview = _build_tool_preview(function_name, display_args) or function_name
+                preview = _build_tool_preview(function_name, function_args) or function_name
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots', print_fn=agent._print_fn)
                spinner.start()
            _spinner_result = None
@@ -1452,8 +1378,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe

        if not _execution_blocked and agent.tool_complete_callback:
            try:
-                display_args = _redact_tool_args_for_display(function_name, function_args) or function_args
-                agent.tool_complete_callback(tool_call.id, function_name, display_args, function_result)
+                agent.tool_complete_callback(tool_call.id, function_name, function_args, function_result)
            except Exception as cb_err:
                logging.debug(f"Tool complete callback error: {cb_err}")

@@ -1477,11 +1402,6 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
        # (see parallel path for rationale). String results pass through.
        _tool_content = agent._tool_result_content_for_active_model(function_name, function_result)
        messages.append(make_tool_result_message(function_name, _tool_content, tool_call.id))
-        _flush_session_db_after_tool_progress(
-            agent,
-            messages,
-            stage=f"tool result {function_name}",
-        )

        # ── Per-tool /steer drain ───────────────────────────────────
        # Drain pending steer BETWEEN individual tool calls so the
@@ -1508,11 +1428,6 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                    f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
                    skipped_tc.id,
                ))
-                _flush_session_db_after_tool_progress(
-                    agent,
-                    messages,
-                    stage=f"skipped tool result {skipped_name}",
-                )
            break

        if agent.tool_delay > 0 and i < len(assistant_message.tool_calls):
--- a/agent/transports/codex.py
+++ b/agent/transports/codex.py
@@ -5,47 +5,12 @@ This transport owns format conversion and normalization — NOT client lifecycle
 streaming, or the _run_codex_stream() call path.
 """

-import hashlib
-import json
 from typing import Any, Dict, List, Optional

 from agent.transports.base import ProviderTransport
 from agent.transports.types import NormalizedResponse, ToolCall


-def _content_cache_key(instructions: str, tools: Optional[List[Dict[str, Any]]]) -> Optional[str]:
-    """Content-address the prompt cache key from the static request prefix.
-
-    Returns ``pck_<sha256[:24]>`` of (instructions + sorted tool schemas), or
-    None when there is nothing static to key on. The cache key is a routing
-    hint only — never a correctness boundary — so two requests sharing a system
-    prompt and tool set intentionally resolve to the same warm prefix bucket.
-
-    The fix this exists for: recurring cron jobs build session_id as
-    ``cron_<id>_<timestamp>``, so using session_id as the cache key made every
-    fire cache-cold. The static prefix (identity + tools) is identical across
-    fires, so hashing it gives a stable key that stays warm within the
-    provider's cache TTL. Sorting tools by name keeps the hash insertion-order
-    independent.
-    """
-    if not instructions and not tools:
-        return None
-    tools_part = ""
-    if tools:
-        sorted_tools = sorted(
-            (t for t in tools if isinstance(t, dict)),
-            key=lambda t: str(t.get("name") or t.get("type") or ""),
-        )
-        tools_part = json.dumps(
-            sorted_tools, sort_keys=True, ensure_ascii=False, separators=(",", ":")
-        )
-    # \x00 separator so instructions ending in the tool JSON can't collide with
-    # a request whose instructions contain that JSON and whose tools are empty.
-    content = f"{instructions or ''}\x00{tools_part}"
-    digest = hashlib.sha256(content.encode("utf-8", errors="replace")).hexdigest()[:24]
-    return f"pck_{digest}"
-
-
 class ResponsesApiTransport(ProviderTransport):
    """Transport for api_mode='codex_responses'.

@@ -106,10 +71,7 @@ class ResponsesApiTransport(ProviderTransport):
        params:
            instructions: str — system prompt (extracted from messages[0] if not given)
            reasoning_config: dict | None — {effort, enabled}
-            session_id: str | None — transcript/session id; drives the xAI
-                x-grok-conv-id header and the Codex cache-scope headers, and is
-                the fallback prompt_cache_key when there is no static prefix to
-                content-address
+            session_id: str | None — used for prompt_cache_key + xAI conv header
            max_tokens: int | None — max_output_tokens
            timeout: float | None — per-request timeout forwarded to the SDK
            request_overrides: dict | None — extra kwargs merged in
@@ -250,17 +212,10 @@ class ResponsesApiTransport(ProviderTransport):
            kwargs["parallel_tool_calls"] = True

        session_id = params.get("session_id")
-        # prompt_cache_key is content-addressed from the static prefix
-        # (instructions + tools), NOT session_id — recurring cron jobs carry a
-        # per-fire timestamp in session_id (cron_<id>_<ts>) that made every run
-        # cache-cold. session_id is left untouched for transcript isolation and
-        # the cache-scope routing headers below. Falls back to session_id when
-        # there is no static content to hash.
-        cache_key = _content_cache_key(instructions, response_tools) or session_id
        # xAI Responses takes prompt_cache_key in extra_body (set further
        # down); GitHub Models opts out of cache-key routing entirely.
-        if not is_github_responses and not is_xai_responses and cache_key:
-            kwargs["prompt_cache_key"] = cache_key
+        if not is_github_responses and not is_xai_responses and session_id:
+            kwargs["prompt_cache_key"] = session_id

        if reasoning_enabled and is_xai_responses:
            from agent.model_metadata import grok_supports_reasoning_effort
@@ -371,7 +326,7 @@ class ResponsesApiTransport(ProviderTransport):
            merged_extra_body: Dict[str, Any] = {}
            if isinstance(existing_extra_body, dict):
                merged_extra_body.update(existing_extra_body)
-            merged_extra_body.setdefault("prompt_cache_key", cache_key)
+            merged_extra_body.setdefault("prompt_cache_key", session_id)
            kwargs["extra_body"] = merged_extra_body

        return kwargs
--- a/agent/turn_context.py
+++ b/agent/turn_context.py
@@ -28,12 +28,8 @@ import uuid
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional

-from agent.conversation_compression import conversation_history_after_compression
 from agent.iteration_budget import IterationBudget
-from agent.model_metadata import (
-    estimate_messages_tokens_rough,
-    estimate_request_tokens_rough,
-)
+from agent.model_metadata import estimate_request_tokens_rough

 logger = logging.getLogger(__name__)

@@ -61,34 +57,6 @@ def _compression_made_progress(
    return orig_tokens > 0 and new_tokens < orig_tokens * 0.95


-def _should_run_preflight_estimate(
-    messages: List[Dict[str, Any]],
-    protect_first_n: int,
-    protect_last_n: int,
-    threshold_tokens: int,
-) -> bool:
-    """Cheap gate for the (expensive) full preflight token estimate.
-
-    Returns ``True`` when either:
-      (a) message count exceeds the protected ranges (the historical gate), or
-      (b) a cheap char-based estimate already crosses the configured threshold
-          — the few-but-huge case from issue #27405 that the count-only gate
-          would silently skip (a handful of very large messages never trips
-          the count condition, so compression was never attempted and the
-          turn hit a hard context-overflow error).
-
-    Branch (b) uses ``estimate_messages_tokens_rough`` (the shared char-based
-    estimator) so a single large base64 image isn't mistaken for ~250K tokens.
-    It intentionally undercounts vs. the full request estimate — it omits the
-    system prompt and tool schemas — because it is only a *hint* deciding
-    whether to pay for the authoritative ``estimate_request_tokens_rough``,
-    which (together with ``should_compress``) makes the real decision.
-    """
-    if len(messages) > protect_first_n + protect_last_n + 1:
-        return True
-    return estimate_messages_tokens_rough(messages) >= threshold_tokens
-
-
@dataclass
 class TurnContext:
    """Values produced by the turn prologue and consumed by the turn loop."""
@@ -143,13 +111,7 @@ def build_turn_context(
    # Guard stdio against OSError from broken pipes (systemd/headless/daemon).
    install_safe_stdio()

-    # NOTE: the DB session row is created later, AFTER the system prompt is
-    # restored/built (see _ensure_db_session() below the system-prompt block).
-    # Creating it here — before _cached_system_prompt is populated — inserts a
-    # row with system_prompt=NULL on a fresh API/gateway agent that carries
-    # client-managed history, which then trips the "stored system prompt is
-    # null; rebuilding from scratch" warning and a needless first-turn prefix
-    # cache miss. (Issue #45499.)
+    agent._ensure_db_session()

    # Tell auxiliary_client what the live main provider/model are for this turn.
    try:
@@ -316,11 +278,6 @@ def build_turn_context(

    active_system_prompt = agent._cached_system_prompt

-    # Create the DB session row now that _cached_system_prompt is populated, so
-    # the persisted snapshot is written non-NULL on the first turn (Issue
-    # #45499). Idempotent: _ensure_db_session() no-ops once the row exists.
-    agent._ensure_db_session()
-
    # Crash-resilience: persist the inbound user turn as soon as the session row exists.
    try:
        agent._persist_session(messages, conversation_history)
@@ -332,14 +289,10 @@ def build_turn_context(
        )

    # ── Preflight context compression ──
-    # Gate the (expensive) full token estimate behind a cheap pre-check.
-    # See ``_should_run_preflight_estimate`` for the OR semantics that fix
-    # issue #27405 (a few very large messages slipping past the count gate).
-    if agent.compression_enabled and _should_run_preflight_estimate(
-        messages,
-        agent.context_compressor.protect_first_n,
-        agent.context_compressor.protect_last_n,
-        agent.context_compressor.threshold_tokens,
+    if (
+        agent.compression_enabled
+        and len(messages) > agent.context_compressor.protect_first_n
+                            + agent.context_compressor.protect_last_n + 1
    ):
        _preflight_tokens = estimate_request_tokens_rough(
            messages,
@@ -401,9 +354,7 @@ def build_turn_context(
                    _orig_len, len(messages), _orig_tokens, _preflight_tokens
                ):
                    break  # Cannot compress further: neither rows nor tokens moved
-                conversation_history = conversation_history_after_compression(
-                    agent, messages
-                )
+                conversation_history = None
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
                agent._last_content_with_tools = None
@@ -441,8 +392,6 @@ def build_turn_context(

    # Per-turn file-mutation verifier state.
    agent._turn_failed_file_mutations = {}
-    agent._turn_file_mutation_paths = set()
-    agent._verification_stop_nudges = 0

    # Record the execution thread so interrupt()/clear_interrupt() can scope
    # the tool-level interrupt signal to THIS agent's thread only.
--- a/agent/turn_finalizer.py
+++ b/agent/turn_finalizer.py
@@ -166,25 +166,6 @@ def finalize_turn(
    # same empty-response loop again.
    try:
        agent._drop_trailing_empty_response_scaffolding(messages)
-
-        # When the turn was interrupted and the last message is a tool
-        # result, append a synthetic assistant message to close the
-        # tool-call sequence. Without this, the session persists a
-        # ``tool → user`` alternation that strict providers (Gemini,
-        # Claude) reject, causing them to hallucinate a continuation of
-        # the user's message on the next turn (#48879).
-        #
-        # ``_drop_trailing_empty_response_scaffolding`` only rewinds the
-        # tool tail when an empty-response scaffolding flag is present; a
-        # clean ``/stop`` interrupt after a successful tool sets no such
-        # flag, so the tool result survives as the tail and we close it
-        # here instead. On an interrupt ``final_response`` is typically
-        # empty, so fall back to an explicit placeholder rather than
-        # persisting an empty-content assistant turn.
-        if interrupted:
-            from agent.message_sanitization import close_interrupted_tool_sequence
-            close_interrupted_tool_sequence(messages, final_response)
-
        agent._persist_session(messages, conversation_history)
    except Exception as _persist_err:
        _cleanup_errors.append(f"persist_session: {_persist_err}")
@@ -289,14 +270,7 @@ def finalize_turn(
                    and len(_stripped) <= 24
                    and _stripped[-1:] not in {".", "!", "?", "。", "！", "？", "`", ")"}
                )
-                _is_partial_stream_recovery = (
-                    str(_turn_exit_reason) == "partial_stream_recovery"
-                )
-                if (
-                    _is_empty_terminal
-                    or _is_partial_fragment
-                    or _is_partial_stream_recovery
-                ):
+                if _is_empty_terminal or _is_partial_fragment:
                    _explanation = agent._format_turn_completion_explanation(
                        _turn_exit_reason
                    )
--- a/agent/turn_retry_state.py
+++ b/agent/turn_retry_state.py
@@ -67,11 +67,6 @@ class TurnRetryState:
    # ── Restart signals (read by the outer loop after the attempt) ───────
    restart_with_compressed_messages: bool = False
    restart_with_length_continuation: bool = False
-    # Set when a content-filter stream stall (e.g. MiniMax "new_sensitive")
-    # has been escalated to the fallback chain: the partial-stream content
-    # was rolled back off ``messages`` and the loop should re-issue the API
-    # call against the newly-activated provider (#32421).
-    restart_with_rebuilt_messages: bool = False

    def __iter__(self):
        # Convenience for debugging / tests: iterate (name, value) pairs.
--- a/agent/verification_evidence.py
+++ b/agent/verification_evidence.py
@@ -1,618 +0,0 @@
-"""Coding verification evidence ledger.
-
-This module records what the agent actually proved while working in a code
-workspace. It is deliberately passive: it never decides to run a suite, never
-blocks completion, and never upgrades targeted checks into "repo green".
-"""
-
-from __future__ import annotations
-
-import json
-import re
-import shlex
-import sqlite3
-import tempfile
-import threading
-from dataclasses import dataclass
-from datetime import datetime, timedelta, timezone
-from pathlib import Path
-from typing import Any, Optional
-
-from hermes_constants import get_hermes_home
-
-
-_DB_LOCK = threading.Lock()
-_MAX_OUTPUT_SUMMARY_CHARS = 2000
-_MAX_EVIDENCE_AGE_DAYS = 30
-_MAX_EVENTS_PER_SESSION_ROOT = 100
-_MAX_TOTAL_UNREFERENCED_EVENTS = 10_000
-_AD_HOC_SCRIPT_NAME_PREFIXES = ("hermes-verify-", "hermes-ad-hoc-")
-_VERIFY_SCHEMA_VERSION = 1
-_SHELL_SPLIT_RE = re.compile(r"\s*(?:&&|\|\||;)\s*")
-
-
-@dataclass(frozen=True)
-class VerificationEvidence:
-    """A classified command result worth recording."""
-
-    command: str
-    canonical_command: str
-    kind: str
-    scope: str
-    status: str
-    exit_code: int
-    cwd: str
-    root: str
-    session_id: str
-    output_summary: str = ""
-
-
-def _utc_now() -> str:
-    return datetime.now(timezone.utc).isoformat()
-
-
-def _retention_cutoff() -> str:
-    return (datetime.now(timezone.utc) - timedelta(days=_MAX_EVIDENCE_AGE_DAYS)).isoformat()
-
-
-def _db_path() -> Path:
-    return get_hermes_home() / "verification_evidence.db"
-
-
-def _connect() -> sqlite3.Connection:
-    path = _db_path()
-    path.parent.mkdir(parents=True, exist_ok=True)
-    conn = sqlite3.connect(path)
-    conn.execute("PRAGMA journal_mode=WAL")
-    conn.execute("PRAGMA busy_timeout=5000")
-    conn.row_factory = sqlite3.Row
-    _ensure_schema(conn)
-    return conn
-
-
-def _ensure_schema(conn: sqlite3.Connection) -> None:
-    conn.execute(
-        """
-        CREATE TABLE IF NOT EXISTS meta (
-            key TEXT PRIMARY KEY,
-            value TEXT NOT NULL
-        )
-        """
-    )
-    conn.execute(
-        """
-        CREATE TABLE IF NOT EXISTS verification_events (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            created_at TEXT NOT NULL,
-            session_id TEXT NOT NULL,
-            cwd TEXT NOT NULL,
-            root TEXT NOT NULL,
-            command TEXT NOT NULL,
-            canonical_command TEXT NOT NULL,
-            kind TEXT NOT NULL,
-            scope TEXT NOT NULL,
-            status TEXT NOT NULL,
-            exit_code INTEGER NOT NULL,
-            output_summary TEXT NOT NULL
-        )
-        """
-    )
-    conn.execute(
-        """
-        CREATE TABLE IF NOT EXISTS verification_state (
-            session_id TEXT NOT NULL,
-            root TEXT NOT NULL,
-            last_event_id INTEGER,
-            last_edit_at TEXT,
-            changed_paths_json TEXT NOT NULL DEFAULT '[]',
-            PRIMARY KEY (session_id, root)
-        )
-        """
-    )
-    conn.execute(
-        """
-        CREATE INDEX IF NOT EXISTS idx_verification_events_session_root
-        ON verification_events(session_id, root, id DESC)
-        """
-    )
-    conn.execute(
-        "INSERT OR REPLACE INTO meta(key, value) VALUES ('schema_version', ?)",
-        (str(_VERIFY_SCHEMA_VERSION),),
-    )
-    conn.commit()
-
-
-def _split_segment_tokens(command: str) -> list[list[str]]:
-    segments: list[list[str]] = []
-    for segment in _SHELL_SPLIT_RE.split(command.strip()):
-        if not segment:
-            continue
-        try:
-            tokens = shlex.split(segment)
-        except ValueError:
-            continue
-        if tokens:
-            segments.append(tokens)
-    return segments
-
-
-def _clean_token(token: str) -> str:
-    token = token.strip()
-    while token.startswith("./"):
-        token = token[2:]
-    return token
-
-
-def _canonical_tokens(canonical: str) -> list[str]:
-    try:
-        return [_clean_token(t) for t in shlex.split(canonical) if t]
-    except ValueError:
-        return []
-
-
-def _find_subsequence(tokens: list[str], needle: list[str]) -> Optional[int]:
-    if not tokens or not needle or len(needle) > len(tokens):
-        return None
-    cleaned = [_clean_token(t) for t in tokens]
-    for idx in range(0, len(cleaned) - len(needle) + 1):
-        if cleaned[idx:idx + len(needle)] == needle:
-            return idx
-    return None
-
-
-def _strip_command_prefix(tokens: list[str]) -> list[str]:
-    """Remove harmless command prefixes before matching canonical commands."""
-    remaining = list(tokens)
-    if remaining and remaining[0] == "env":
-        remaining = remaining[1:]
-    while remaining and "=" in remaining[0] and not remaining[0].startswith("-"):
-        remaining = remaining[1:]
-    while remaining and remaining[0] in {"command", "time", "noglob"}:
-        remaining = remaining[1:]
-    return remaining
-
-
-def _equivalent_needles(needle: list[str]) -> list[list[str]]:
-    """Return command spellings equivalent to the detected canonical command."""
-    candidates = [needle]
-    if len(needle) >= 3 and needle[1] == "run":
-        package_manager = needle[0]
-        script_name = needle[2]
-        if package_manager in {"npm", "pnpm", "yarn", "bun"}:
-            candidates.append([package_manager, script_name])
-    if len(needle) == 1 and "/" in needle[0]:
-        candidates.extend([["bash", needle[0]], ["sh", needle[0]]])
-    if needle == ["pytest"]:
-        candidates.extend(
-            [
-                ["python", "-m", "pytest"],
-                ["python3", "-m", "pytest"],
-                ["uv", "run", "pytest"],
-                ["poetry", "run", "pytest"],
-                ["pipenv", "run", "pytest"],
-            ]
-        )
-    return candidates
-
-
-def _find_canonical_match(command: str, canonical_commands: list[str]) -> Optional[tuple[str, list[str]]]:
-    """Return ``(canonical, trailing_args)`` for the first detected command."""
-
-    segments = _split_segment_tokens(command)
-    for canonical in canonical_commands:
-        needle = _canonical_tokens(canonical)
-        if not needle:
-            continue
-        for tokens in segments:
-            candidate_tokens = _strip_command_prefix(tokens)
-            for candidate in _equivalent_needles(needle):
-                if candidate_tokens[:len(candidate)] == candidate:
-                    return canonical, candidate_tokens[len(candidate):]
-    return None
-
-
-def _kind_for_command(canonical: str) -> str:
-    lowered = canonical.lower()
-    if any(word in lowered for word in ("lint", "eslint", "ruff")):
-        return "lint"
-    if any(word in lowered for word in ("typecheck", "tsc", "mypy", "pyright", "ty")):
-        return "typecheck"
-    if "build" in lowered:
-        return "build"
-    if "fmt" in lowered or "format" in lowered:
-        return "format"
-    if "check" in lowered and "test" not in lowered:
-        return "check"
-    return "test"
-
-
-def _looks_like_target(arg: str) -> bool:
-    if not arg or arg.startswith("-") or "=" in arg:
-        return False
-    return (
-        "/" in arg
-        or "\\" in arg
-        or "::" in arg
-        or arg.endswith((".py", ".js", ".jsx", ".ts", ".tsx", ".rs", ".go", ".java"))
-        or arg.startswith(("test_", "tests", "spec", "__tests__"))
-    )
-
-
-def _scope_for_args(args: list[str]) -> str:
-    return "targeted" if any(_looks_like_target(arg) for arg in args) else "full"
-
-
-def _is_under_temp_dir(token: str) -> bool:
-    if not token or token.startswith("-"):
-        return False
-    try:
-        path = Path(token).expanduser()
-        if not path.is_absolute():
-            return False
-        resolved = path.resolve()
-        temp_root = Path(tempfile.gettempdir()).resolve()
-        return resolved == temp_root or temp_root in resolved.parents
-    except Exception:
-        return False
-
-
-def _is_under_root(token: str, root: str | Path | None) -> bool:
-    if not root:
-        return False
-    try:
-        path = Path(token).expanduser().resolve()
-        root_path = Path(root).expanduser().resolve()
-        return path == root_path or root_path in path.parents
-    except Exception:
-        return False
-
-
-def _is_temp_script_path(token: str, root: str | Path | None) -> bool:
-    try:
-        name = Path(token).expanduser().name
-    except Exception:
-        return False
-    return (
-        name.startswith(_AD_HOC_SCRIPT_NAME_PREFIXES)
-        and _is_under_temp_dir(token)
-        and not _is_under_root(token, root)
-    )
-
-
-def _ad_hoc_script_args(tokens: list[str], root: str | Path | None) -> Optional[list[str]]:
-    candidate_tokens = _strip_command_prefix(tokens)
-    if not candidate_tokens:
-        return None
-    command = candidate_tokens[0]
-    if _is_temp_script_path(command, root):
-        return candidate_tokens[1:]
-    if command in {"python", "python3", "node", "bash", "sh", "ruby", "perl"}:
-        for idx, token in enumerate(candidate_tokens[1:], start=1):
-            if token == "--":
-                continue
-            if _is_temp_script_path(token, root):
-                return candidate_tokens[idx + 1:]
-            if not token.startswith("-"):
-                return None
-    return None
-
-
-def _find_ad_hoc_match(command: str, root: str | Path | None) -> Optional[list[str]]:
-    for tokens in _split_segment_tokens(command):
-        trailing_args = _ad_hoc_script_args(tokens, root)
-        if trailing_args is not None:
-            return trailing_args
-    return None
-
-
-def _summarize_output(output: str) -> str:
-    text = (output or "").strip()
-    if len(text) <= _MAX_OUTPUT_SUMMARY_CHARS:
-        return text
-    head = _MAX_OUTPUT_SUMMARY_CHARS // 3
-    tail = _MAX_OUTPUT_SUMMARY_CHARS - head
-    return (
-        text[:head]
-        + f"\n... [{len(text) - _MAX_OUTPUT_SUMMARY_CHARS} chars omitted] ...\n"
-        + text[-tail:]
-    )
-
-
-def _prune_old_events(conn: sqlite3.Connection, *, session_id: str, root: str) -> None:
-    """Bound ledger growth without deleting the current state pointer."""
-    cutoff = _retention_cutoff()
-    conn.execute(
-        """
-        DELETE FROM verification_events
-        WHERE session_id = ?
-          AND root = ?
-          AND id NOT IN (
-              SELECT id FROM verification_events
-              WHERE session_id = ? AND root = ?
-              ORDER BY id DESC
-              LIMIT ?
-          )
-        """,
-        (session_id, root, session_id, root, _MAX_EVENTS_PER_SESSION_ROOT),
-    )
-    conn.execute(
-        """
-        DELETE FROM verification_state
-        WHERE (
-            last_edit_at IS NOT NULL
-            AND last_edit_at < ?
-        )
-        OR (
-            last_edit_at IS NULL
-            AND last_event_id IN (
-                SELECT id FROM verification_events
-                WHERE created_at < ?
-            )
-        )
-        """,
-        (cutoff, cutoff),
-    )
-    conn.execute(
-        """
-        DELETE FROM verification_events
-        WHERE created_at < ?
-          AND id NOT IN (
-              SELECT last_event_id FROM verification_state
-              WHERE last_event_id IS NOT NULL
-          )
-        """,
-        (cutoff,),
-    )
-    conn.execute(
-        """
-        DELETE FROM verification_events
-        WHERE id NOT IN (
-            SELECT id FROM verification_events
-            ORDER BY id DESC
-            LIMIT ?
-        )
-          AND id NOT IN (
-              SELECT last_event_id FROM verification_state
-              WHERE last_event_id IS NOT NULL
-          )
-        """,
-        (_MAX_TOTAL_UNREFERENCED_EVENTS,),
-    )
-
-
-def classify_verification_command(
-    command: str,
-    *,
-    cwd: str | Path | None = None,
-    session_id: str | None = None,
-    exit_code: int = 0,
-    output: str = "",
-) -> Optional[VerificationEvidence]:
-    """Classify a terminal command as verification evidence, if applicable."""
-
-    if not command or not isinstance(command, str):
-        return None
-    try:
-        from agent.coding_context import project_facts_for
-
-        facts = project_facts_for(cwd)
-    except Exception:
-        facts = None
-    if not facts:
-        return None
-
-    verify_commands = list(facts.get("verifyCommands") or [])
-    match = _find_canonical_match(command, verify_commands)
-    is_ad_hoc = False
-    if match is None and not verify_commands:
-        ad_hoc_args = _find_ad_hoc_match(command, facts.get("root"))
-        if ad_hoc_args is not None:
-            match = ("ad-hoc verification script", ad_hoc_args)
-            is_ad_hoc = True
-    if match is None:
-        return None
-
-    canonical, trailing_args = match
-    return VerificationEvidence(
-        command=command,
-        canonical_command=canonical,
-        kind="ad_hoc" if is_ad_hoc else _kind_for_command(canonical),
-        scope="targeted" if is_ad_hoc else _scope_for_args(trailing_args),
-        status="passed" if int(exit_code) == 0 else "failed",
-        exit_code=int(exit_code),
-        cwd=str(Path(cwd or ".").resolve()),
-        root=str(facts.get("root") or Path(cwd or ".").resolve()),
-        session_id=str(session_id or "default"),
-        output_summary=_summarize_output(output),
-    )
-
-
-def record_terminal_result(
-    *,
-    command: str,
-    cwd: str | Path | None,
-    session_id: str | None,
-    exit_code: int,
-    output: str = "",
-) -> Optional[dict[str, Any]]:
-    """Record a foreground terminal result when it is verification evidence."""
-
-    evidence = classify_verification_command(
-        command,
-        cwd=cwd,
-        session_id=session_id,
-        exit_code=exit_code,
-        output=output,
-    )
-    if evidence is None:
-        return None
-
-    created_at = _utc_now()
-    with _DB_LOCK:
-        with _connect() as conn:
-            cur = conn.execute(
-                """
-                INSERT INTO verification_events(
-                    created_at, session_id, cwd, root, command, canonical_command,
-                    kind, scope, status, exit_code, output_summary
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-                """,
-                (
-                    created_at,
-                    evidence.session_id,
-                    evidence.cwd,
-                    evidence.root,
-                    evidence.command,
-                    evidence.canonical_command,
-                    evidence.kind,
-                    evidence.scope,
-                    evidence.status,
-                    evidence.exit_code,
-                    evidence.output_summary,
-                ),
-            )
-            if cur.lastrowid is None:
-                raise RuntimeError("verification event insert did not return an id")
-            event_id = int(cur.lastrowid)
-            conn.execute(
-                """
-                INSERT INTO verification_state(
-                    session_id, root, last_event_id, last_edit_at, changed_paths_json
-                ) VALUES (?, ?, ?, NULL, '[]')
-                ON CONFLICT(session_id, root) DO UPDATE SET
-                    last_event_id = excluded.last_event_id,
-                    last_edit_at = NULL,
-                    changed_paths_json = '[]'
-                """,
-                (evidence.session_id, evidence.root, event_id),
-            )
-            _prune_old_events(conn, session_id=evidence.session_id, root=evidence.root)
-            conn.commit()
-
-    return {"id": event_id, **evidence.__dict__, "created_at": created_at}
-
-
-def mark_workspace_edited(
-    *,
-    session_id: str | None,
-    cwd: str | Path | None,
-    paths: list[str] | tuple[str, ...] | None = None,
-) -> Optional[dict[str, Any]]:
-    """Mark verification evidence stale after a successful file edit."""
-
-    try:
-        from agent.coding_context import project_facts_for
-
-        facts = project_facts_for(cwd)
-    except Exception:
-        facts = None
-    if not facts:
-        return None
-
-    sid = str(session_id or "default")
-    root = str(facts.get("root") or Path(cwd or ".").resolve())
-    changed_paths = sorted({str(p) for p in (paths or []) if p})
-    edited_at = _utc_now()
-
-    with _DB_LOCK:
-        with _connect() as conn:
-            row = conn.execute(
-                """
-                SELECT changed_paths_json FROM verification_state
-                WHERE session_id = ? AND root = ?
-                """,
-                (sid, root),
-            ).fetchone()
-            existing: set[str] = set()
-            if row is not None:
-                try:
-                    existing = set(json.loads(row["changed_paths_json"] or "[]"))
-                except (TypeError, ValueError):
-                    existing = set()
-            merged = sorted((existing | set(changed_paths)))[-200:]
-            conn.execute(
-                """
-                INSERT INTO verification_state(
-                    session_id, root, last_event_id, last_edit_at, changed_paths_json
-                ) VALUES (?, ?, NULL, ?, ?)
-                ON CONFLICT(session_id, root) DO UPDATE SET
-                    last_edit_at = excluded.last_edit_at,
-                    changed_paths_json = excluded.changed_paths_json
-                """,
-                (sid, root, edited_at, json.dumps(merged)),
-            )
-            conn.commit()
-
-    return {"session_id": sid, "root": root, "last_edit_at": edited_at, "changed_paths": changed_paths}
-
-
-def verification_status(
-    *,
-    session_id: str | None,
-    cwd: str | Path | None,
-) -> dict[str, Any]:
-    """Return the best known verification state for a session/workspace."""
-
-    try:
-        from agent.coding_context import project_facts_for
-
-        facts = project_facts_for(cwd)
-    except Exception:
-        facts = None
-    if not facts:
-        return {"status": "not_applicable", "evidence": None}
-
-    sid = str(session_id or "default")
-    root = str(facts.get("root") or Path(cwd or ".").resolve())
-    with _DB_LOCK:
-        with _connect() as conn:
-            state = conn.execute(
-                """
-                SELECT last_event_id, last_edit_at, changed_paths_json
-                FROM verification_state
-                WHERE session_id = ? AND root = ?
-                """,
-                (sid, root),
-            ).fetchone()
-            if state is None:
-                return {
-                    "status": "unverified",
-                    "evidence": None,
-                    "root": root,
-                    "session_id": sid,
-                    "changed_paths": [],
-                }
-            event = None
-            if state["last_event_id"] is not None:
-                event = conn.execute(
-                    "SELECT * FROM verification_events WHERE id = ?",
-                    (state["last_event_id"],),
-                ).fetchone()
-
-    changed_paths: list[str] = []
-    try:
-        changed_paths = json.loads(state["changed_paths_json"] or "[]")
-    except (TypeError, ValueError):
-        changed_paths = []
-
-    if event is None:
-        return {
-            "status": "unverified",
-            "evidence": None,
-            "root": root,
-            "session_id": sid,
-            "changed_paths": changed_paths,
-        }
-
-    evidence = dict(event)
-    if state["last_edit_at"] and state["last_edit_at"] > evidence["created_at"]:
-        status = "stale"
-    else:
-        status = evidence["status"]
-    return {
-        "status": status,
-        "evidence": evidence,
-        "root": root,
-        "session_id": sid,
-        "changed_paths": changed_paths,
-    }
--- a/agent/verification_stop.py
+++ b/agent/verification_stop.py
@@ -1,304 +0,0 @@
-"""Turn-end verification guard for coding edits.
-
-This module is intentionally policy-only. It never runs checks itself; it turns
-the passive verification ledger into a bounded follow-up when the model tries to
-finish immediately after editing code without fresh evidence.
-"""
-
-from __future__ import annotations
-
-import os
-import tempfile
-from pathlib import Path
-from typing import Any, Iterable
-
-
-_MAX_CHANGED_PATHS_IN_NUDGE = 8
-
-# Non-code file extensions whose edits carry no verifiable runtime behavior:
-# documentation, prose, and data/markup that no test/build exercises. When a
-# turn touches ONLY these, verify-on-stop has nothing to check, so the nudge is
-# suppressed (this is fix "C" for the doc/markdown/skill false-positive — a
-# SKILL.md or README edit must never demand a /tmp verification script). A turn
-# that edits any non-listed path (a real source/code/config file) still nudges.
-_NON_CODE_VERIFY_EXTENSIONS = frozenset(
-    {
-        ".md",
-        ".markdown",
-        ".mdx",
-        ".rst",
-        ".txt",
-        ".text",
-        ".adoc",
-        ".asciidoc",
-        ".org",
-        ".log",
-        ".csv",
-        ".tsv",
-    }
-)
-
-# Filenames (case-insensitive, extension-less or otherwise) that are pure prose
-# even without a recognized doc extension.
-_NON_CODE_VERIFY_FILENAMES = frozenset(
-    {
-        "license",
-        "licence",
-        "notice",
-        "authors",
-        "contributors",
-        "changelog",
-        "codeowners",
-    }
-)
-
-
-def _is_non_code_path(raw: str) -> bool:
-    """Return True when a changed path is documentation/prose with nothing to verify."""
-    try:
-        p = Path(str(raw))
-    except Exception:
-        return False
-    suffix = p.suffix.lower()
-    if suffix in _NON_CODE_VERIFY_EXTENSIONS:
-        return True
-    if not suffix and p.name.lower() in _NON_CODE_VERIFY_FILENAMES:
-        return True
-    return False
-
-
-def _filter_verifiable_paths(paths: Iterable[str]) -> list[str]:
-    """Drop documentation/prose paths; keep paths that could have verifiable behavior."""
-    return [p for p in paths if p and not _is_non_code_path(p)]
-
-
-# Session identities (platform or source) that are NOT human conversational
-# messaging surfaces: interactive coding surfaces (CLI, TUI, desktop, codex,
-# local, gateway) and programmatic callers (API server, webhooks, tools).
-# Verify-on-stop stays ON by default for these. Any other resolved gateway
-# platform is a conversational messaging surface (Telegram, Discord, WhatsApp,
-# Signal, Slack, etc.) where the verification narrative would reach a human as
-# chat noise, so it defaults OFF. Mirrors LOCAL_SESSION_SOURCE_IDS in
-# apps/desktop/src/lib/session-source.ts; keep roughly in sync when adding a
-# local or programmatic surface. Default-deny by design: an unrecognized
-# identity is treated as messaging (OFF) so a new chat platform never leaks the
-# verification receipt before this set is updated.
-_NON_MESSAGING_SESSION_SURFACES = frozenset(
-    {
-        "",
-        "cli",
-        "codex",
-        "desktop",
-        "gateway",
-        "local",
-        "tui",
-        "tool",
-        "api_server",
-        "webhook",
-        "msgraph_webhook",
-    }
-)
-
-
-def _session_is_messaging_surface() -> bool:
-    """Return whether this turn is delivered over a human messaging channel.
-
-    The gateway binds the platform value (e.g. ``telegram``) to
-    ``HERMES_SESSION_PLATFORM``; the CLI and TUI set ``HERMES_SESSION_SOURCE``
-    (e.g. ``cli``, ``tui``) instead. Both are consulted via the session-context
-    helper (with an ``os.environ`` fallback), alongside the ``HERMES_PLATFORM``
-    override, matching the sibling platform resolution in
-    ``agent/skill_commands.py`` and ``agent/prompt_builder.py``. A turn is a
-    messaging surface when a resolved identity is present and is not a known
-    non-messaging surface.
-    """
-    try:
-        from gateway.session_context import get_session_env
-
-        platform = (
-            os.getenv("HERMES_PLATFORM")
-            or get_session_env("HERMES_SESSION_PLATFORM", "")
-        )
-        source = get_session_env("HERMES_SESSION_SOURCE", "")
-    except Exception:
-        platform = os.getenv("HERMES_PLATFORM", "") or os.environ.get(
-            "HERMES_SESSION_PLATFORM", ""
-        )
-        source = os.environ.get("HERMES_SESSION_SOURCE", "")
-    for identity in (platform, source):
-        identity = str(identity or "").strip().lower()
-        if identity and identity not in _NON_MESSAGING_SESSION_SURFACES:
-            return True
-    return False
-
-
-def verify_on_stop_enabled(config: dict[str, Any] | None = None) -> bool:
-    """Return whether edit -> verify-before-finish behavior is enabled.
-
-    Precedence: an explicit ``HERMES_VERIFY_ON_STOP`` env var wins, then an
-    explicit ``agent.verify_on_stop`` config value. The config default is
-    ``False`` (see ``DEFAULT_CONFIG``) — verify-on-stop is OFF unless the user
-    opts in. The legacy ``"auto"`` sentinel is still honored for anyone who
-    sets it explicitly: it resolves to ON for interactive coding surfaces
-    (CLI, TUI, desktop) and programmatic callers, and OFF for conversational
-    messaging surfaces (Telegram, Discord, etc.). A missing/unknown value
-    falls back to OFF.
-    """
-    env = os.environ.get("HERMES_VERIFY_ON_STOP")
-    if env is not None:
-        return env.strip().lower() not in {"0", "false", "no", "off"}
-    if config is None:
-        try:
-            from hermes_cli.config import load_config
-
-            config = load_config()
-        except Exception:
-            config = {}
-    agent_cfg = (config or {}).get("agent") if isinstance(config, dict) else None
-    cfg_val = agent_cfg.get("verify_on_stop") if isinstance(agent_cfg, dict) else None
-    if isinstance(cfg_val, bool):
-        return cfg_val
-    if isinstance(cfg_val, str):
-        token = cfg_val.strip().lower()
-        if token in {"1", "true", "yes", "on"}:
-            return True
-        if token in {"0", "false", "no", "off"}:
-            return False
-        if token == "auto":
-            # Explicit opt-in to the legacy surface-aware behavior.
-            return not _session_is_messaging_surface()
-    # Missing or unknown value -> OFF (the new default).
-    return False
-
-
-def _candidate_cwds(paths: Iterable[str]) -> list[Path]:
-    candidates: list[Path] = []
-    seen: set[str] = set()
-    for raw in paths:
-        if not raw:
-            continue
-        try:
-            path = Path(raw).expanduser()
-            candidate = path if path.is_dir() else path.parent
-            resolved = str(candidate.resolve())
-        except Exception:
-            continue
-        if resolved not in seen:
-            seen.add(resolved)
-            candidates.append(Path(resolved))
-    return candidates
-
-
-def _verification_snapshot(
-    *,
-    session_id: str | None,
-    changed_paths: list[str],
-) -> tuple[dict[str, Any], dict[str, Any]] | None:
-    """Return ``(status, facts)`` for the first edited workspace needing proof."""
-    try:
-        from agent.coding_context import project_facts_for
-        from agent.verification_evidence import verification_status
-    except Exception:
-        return None
-
-    first_snapshot: tuple[dict[str, Any], dict[str, Any]] | None = None
-    for cwd in _candidate_cwds(changed_paths):
-        facts = project_facts_for(cwd)
-        if not facts:
-            continue
-        status = verification_status(session_id=session_id, cwd=cwd)
-        snapshot = (status, facts)
-        if first_snapshot is None:
-            first_snapshot = snapshot
-        if str(status.get("status") or "unverified") != "passed":
-            return snapshot
-    return first_snapshot
-
-
-def _format_changed_paths(paths: list[str]) -> str:
-    shown = paths[:_MAX_CHANGED_PATHS_IN_NUDGE]
-    lines = [f"- `{path}`" for path in shown]
-    remaining = len(paths) - len(shown)
-    if remaining > 0:
-        lines.append(f"- ... and {remaining} more")
-    return "\n".join(lines)
-
-
-def _status_detail(status: dict[str, Any]) -> str:
-    state = str(status.get("status") or "unverified")
-    evidence = status.get("evidence") if isinstance(status.get("evidence"), dict) else None
-    if not evidence:
-        return state
-
-    command = evidence.get("canonical_command") or evidence.get("command")
-    summary = str(evidence.get("output_summary") or "").strip()
-    parts = [state]
-    if command:
-        parts.append(f"last command `{command}`")
-    if summary:
-        max_summary = 1200
-        if len(summary) > max_summary:
-            summary = summary[:max_summary].rstrip() + "\n... [truncated]"
-        parts.append(f"last output:\n{summary}")
-    return "\n".join(parts)
-
-
-def build_verify_on_stop_nudge(
-    *,
-    session_id: str | None,
-    changed_paths: Iterable[str],
-    attempts: int = 0,
-    max_attempts: int = 2,
-) -> str | None:
-    """Return a synthetic follow-up when edited code lacks fresh verification."""
-    # Drop documentation/prose paths (markdown, skills, README, LICENSE, ...) —
-    # they carry no verifiable behavior, so a turn that touched only those has
-    # nothing to verify and must not nudge.
-    paths = sorted({str(p) for p in _filter_verifiable_paths(changed_paths)})
-    if not paths or attempts >= max_attempts:
-        return None
-
-    snapshot = _verification_snapshot(session_id=session_id, changed_paths=paths)
-    if snapshot is None:
-        return None
-    status, facts = snapshot
-
-    verify_commands = [
-        str(cmd).strip()
-        for cmd in (facts.get("verifyCommands") or [])
-        if str(cmd).strip()
-    ]
-
-    state = str(status.get("status") or "unverified")
-    if state == "passed":
-        return None
-
-    if verify_commands:
-        command_instruction = (
-            "Run the relevant verification command now ("
-            + ", ".join(f"`{cmd}`" for cmd in verify_commands[:3])
-            + (", ..." if len(verify_commands) > 3 else "")
-            + "), read any failure, repair the code, and summarize what passed."
-        )
-    else:
-        temp_dir = tempfile.gettempdir()
-        command_instruction = (
-            "No canonical test/lint/build command was detected. Create a focused "
-            f"temporary verification script under `{temp_dir}` using an OS-safe "
-            "`tempfile` path with a `hermes-verify-` filename prefix, run it "
-            "against the changed behavior, clean it up when possible, and "
-            "summarize it explicitly as ad-hoc verification rather than suite "
-            "green."
-        )
-
-    return (
-        "[System: You edited code in this turn, but the workspace does not have "
-        "fresh passing verification evidence yet.\n\n"
-        f"Verification status: {_status_detail(status)}\n\n"
-        f"Changed paths:\n{_format_changed_paths(paths)}\n\n"
-        f"{command_instruction} If verification is not possible, explain the "
-        "concrete blocker instead of claiming the work is fully verified.]"
-    )
-
-
-__all__ = ["build_verify_on_stop_nudge", "verify_on_stop_enabled"]
--- a/apps/desktop/components.json
+++ b/apps/desktop/components.json
@@ -17,5 +17,5 @@
    "lib": "@/lib",
    "hooks": "@/hooks"
  },
-  "iconLibrary": "tabler"
+  "iconLibrary": "lucide"
 }
--- a/apps/desktop/electron/backend-env.cjs
+++ b/apps/desktop/electron/backend-env.cjs
@@ -61,7 +61,10 @@ function buildDesktopBackendPath({
  const venvBin = venvRoot ? pathModule.join(venvRoot, platform === 'win32' ? 'Scripts' : 'bin') : null
  const saneEntries = platform === 'win32' ? [] : POSIX_SANE_PATH_ENTRIES

-  return appendUniquePathEntries([hermesNodeBin, venvBin, currentPath, saneEntries], { delimiter })
+  return appendUniquePathEntries(
+    [hermesNodeBin, venvBin, currentPath, saneEntries],
+    { delimiter }
+  )
 }

 function normalizeHermesHomeRoot(hermesHome, { pathModule = pathModuleForPlatform(process.platform) } = {}) {
--- a/apps/desktop/electron/backend-env.test.cjs
+++ b/apps/desktop/electron/backend-env.test.cjs
@@ -76,7 +76,10 @@ test('normalizeHermesHomeRoot maps profile homes back to the global Hermes root'
    normalizeHermesHomeRoot('C:\\Users\\test\\AppData\\Local\\hermes\\profiles\\oracle', { pathModule: path.win32 }),
    'C:\\Users\\test\\AppData\\Local\\hermes'
  )
-  assert.equal(normalizeHermesHomeRoot('/Users/test/.hermes', { pathModule: path.posix }), '/Users/test/.hermes')
+  assert.equal(
+    normalizeHermesHomeRoot('/Users/test/.hermes', { pathModule: path.posix }),
+    '/Users/test/.hermes'
+  )
 })

 test('Windows PATH casing and delimiter are preserved without POSIX sane entries', () => {
@@ -101,5 +104,8 @@ test('Windows PATH casing and delimiter are preserved without POSIX sane entries
 })

 test('appendUniquePathEntries drops empty entries and keeps first occurrence', () => {
-  assert.equal(appendUniquePathEntries([':/a::/b', ['/a', '/c']], { delimiter: ':' }), '/a:/b:/c')
+  assert.equal(
+    appendUniquePathEntries([':/a::/b', ['/a', '/c']], { delimiter: ':' }),
+    '/a:/b:/c'
+  )
 })
--- a/apps/desktop/electron/backend-probes.cjs
+++ b/apps/desktop/electron/backend-probes.cjs
@@ -37,18 +37,7 @@ const { execFileSync } = require('node:child_process')
 const PROBE_TIMEOUT_MS = 5000

 /**
- * Return the Python snippet used to verify Hermes can import far enough to
- * launch the CLI. Kept exported for tests so dependency regressions are
- * caught without needing a real broken venv fixture.
- *
- * @returns {string}
- */
-function hermesRuntimeImportProbe() {
-  return 'import yaml; import hermes_cli.config'
-}
-
-/**
- * Return true iff the Hermes runtime import probe exits 0.
+ * Return true iff `python -c "import hermes_cli"` exits 0.
 *
 * Used to gate the "fallback to system Python with hermes_cli installed"
 * rung of resolveHermesBackend. Without this, a system Python 3.11-3.13
@@ -57,20 +46,13 @@ function hermesRuntimeImportProbe() {
 * site-packages -- and the resolver returns a backend that immediately
 * dies on spawn.
 *
- * The probe intentionally imports hermes_cli.config, not just the top-level
- * package: a broken/empty Windows launcher venv can still see the source tree
- * through PYTHONPATH but lack PyYAML, then die on the first real CLI import.
- *
 * @param {string} pythonPath - Absolute path to a python.exe / python.
- * @param {object} [opts]
- * @param {object} [opts.env] - Additional environment for the probe.
 * @returns {boolean}
 */
-function canImportHermesCli(pythonPath, opts = {}) {
+function canImportHermesCli(pythonPath) {
  if (!pythonPath) return false
  try {
-    execFileSync(pythonPath, ['-c', hermesRuntimeImportProbe()], {
-      env: { ...process.env, ...(opts.env || {}) },
+    execFileSync(pythonPath, ['-c', 'import hermes_cli'], {
      stdio: 'ignore',
      timeout: PROBE_TIMEOUT_MS,
      windowsHide: true
@@ -119,7 +101,6 @@ function verifyHermesCli(hermesCommand, opts = {}) {

 module.exports = {
  canImportHermesCli,
-  hermesRuntimeImportProbe,
  verifyHermesCli,
  PROBE_TIMEOUT_MS
 }
--- a/apps/desktop/electron/backend-probes.test.cjs
+++ b/apps/desktop/electron/backend-probes.test.cjs
@@ -11,7 +11,7 @@ const fs = require('node:fs')
 const os = require('node:os')
 const path = require('node:path')

-const { canImportHermesCli, hermesRuntimeImportProbe, verifyHermesCli } = require('./backend-probes.cjs')
+const { canImportHermesCli, verifyHermesCli } = require('./backend-probes.cjs')

 // Resolve the host's own Node binary -- guaranteed to be on disk and
 // runnable. We use it as both a stand-in for "a python that doesn't
@@ -40,12 +40,6 @@ test('canImportHermesCli returns false when binary does not exist', () => {
  assert.equal(canImportHermesCli(ghost), false)
 })

-test('hermes runtime import probe checks config dependencies', () => {
-  const probe = hermesRuntimeImportProbe()
-  assert.match(probe, /\bimport yaml\b/)
-  assert.match(probe, /\bimport hermes_cli\.config\b/)
-})
-
 test('verifyHermesCli returns false when command is falsy', () => {
  assert.equal(verifyHermesCli(''), false)
  assert.equal(verifyHermesCli(null), false)
--- a/apps/desktop/electron/backend-ready.cjs
+++ b/apps/desktop/electron/backend-ready.cjs
@@ -1,5 +1,3 @@
-const fs = require('node:fs')
-
 const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m

 // The announcement clock starts the instant the backend process is spawned —
@@ -96,76 +94,9 @@ function waitForDashboardPort(child, timeoutMs = resolvePortAnnounceTimeoutMs())
  })
 }

-function readDashboardReadyFile(readyFile) {
-  if (!readyFile) return null
-  try {
-    const parsed = JSON.parse(fs.readFileSync(readyFile, 'utf8'))
-    const port = Number(parsed?.port)
-    return Number.isInteger(port) && port > 0 ? port : null
-  } catch {
-    return null
-  }
-}
-
-function waitForDashboardReadyFile(readyFile, child, timeoutMs = resolvePortAnnounceTimeoutMs()) {
-  return new Promise((resolve, reject) => {
-    let done = false
-    let interval = null
-
-    function cleanup() {
-      if (done) return
-      done = true
-      clearTimeout(timer)
-      if (interval) clearInterval(interval)
-      child.off('exit', onExit)
-      child.off('error', onError)
-    }
-
-    function check() {
-      const port = readDashboardReadyFile(readyFile)
-      if (port) {
-        cleanup()
-        resolve(port)
-      }
-    }
-
-    function onExit(code, signal) {
-      cleanup()
-      reject(new Error(`Hermes backend: exited before port announcement (${signal || code})`))
-    }
-
-    function onError(err) {
-      cleanup()
-      reject(err)
-    }
-
-    const timer = setTimeout(() => {
-      cleanup()
-      reject(new Error(`Timed out waiting for Hermes backend port announcement (${timeoutMs}ms)`))
-    }, timeoutMs)
-
-    child.on('exit', onExit)
-    child.on('error', onError)
-    interval = setInterval(check, 50)
-    if (typeof interval.unref === 'function') interval.unref()
-    check()
-  })
-}
-
-function waitForDashboardPortAnnouncement(child, options = {}) {
-  const timeoutMs = options.timeoutMs ?? resolvePortAnnounceTimeoutMs()
-  if (options.readyFile) {
-    return waitForDashboardReadyFile(options.readyFile, child, timeoutMs)
-  }
-  return waitForDashboardPort(child, timeoutMs)
-}
-
 module.exports = {
  waitForDashboardPort,
-  waitForDashboardPortAnnouncement,
-  waitForDashboardReadyFile,
-  readDashboardReadyFile,
  resolvePortAnnounceTimeoutMs,
  DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
-  MIN_PORT_ANNOUNCE_TIMEOUT_MS
+  MIN_PORT_ANNOUNCE_TIMEOUT_MS,
 }
--- a/apps/desktop/electron/backend-ready.test.cjs
+++ b/apps/desktop/electron/backend-ready.test.cjs
@@ -14,18 +14,12 @@
 const test = require('node:test')
 const assert = require('node:assert/strict')
 const { EventEmitter } = require('node:events')
-const fs = require('node:fs')
-const os = require('node:os')
-const path = require('node:path')

 const {
-  readDashboardReadyFile,
  waitForDashboardPort,
-  waitForDashboardPortAnnouncement,
-  waitForDashboardReadyFile,
  resolvePortAnnounceTimeoutMs,
  DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
-  MIN_PORT_ANNOUNCE_TIMEOUT_MS
+  MIN_PORT_ANNOUNCE_TIMEOUT_MS,
 } = require('./backend-ready.cjs')

 // A minimal stand-in for a spawned child process: an EventEmitter with a
@@ -125,75 +119,3 @@ test('a late announcement after timeout does not throw (listeners torn down)', a
    child.stdout.emit('data', 'HERMES_DASHBOARD_READY port=9999\n')
  })
 })
-
-// ---------------------------------------------------------------------------
-// ready-file port announcement
-// ---------------------------------------------------------------------------
-
-function mkTmpReadyFile() {
-  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-ready-test-'))
-  return {
-    dir,
-    file: path.join(dir, 'ready.json'),
-    cleanup: () => fs.rmSync(dir, { recursive: true, force: true })
-  }
-}
-
-test('readDashboardReadyFile returns a valid port from JSON', () => {
-  const tmp = mkTmpReadyFile()
-  try {
-    fs.writeFileSync(tmp.file, JSON.stringify({ port: 4567 }))
-    assert.equal(readDashboardReadyFile(tmp.file), 4567)
-  } finally {
-    tmp.cleanup()
-  }
-})
-
-test('readDashboardReadyFile ignores missing, malformed, or invalid files', () => {
-  const tmp = mkTmpReadyFile()
-  try {
-    assert.equal(readDashboardReadyFile(tmp.file), null)
-    fs.writeFileSync(tmp.file, '{')
-    assert.equal(readDashboardReadyFile(tmp.file), null)
-    fs.writeFileSync(tmp.file, JSON.stringify({ port: 0 }))
-    assert.equal(readDashboardReadyFile(tmp.file), null)
-  } finally {
-    tmp.cleanup()
-  }
-})
-
-test('waitForDashboardReadyFile resolves when the ready file appears', async () => {
-  const tmp = mkTmpReadyFile()
-  const child = makeFakeChild()
-  try {
-    const p = waitForDashboardReadyFile(tmp.file, child, 1000)
-    setTimeout(() => fs.writeFileSync(tmp.file, JSON.stringify({ port: 8765 })), 20)
-    assert.equal(await p, 8765)
-  } finally {
-    tmp.cleanup()
-  }
-})
-
-test('waitForDashboardPortAnnouncement uses ready file when provided', async () => {
-  const tmp = mkTmpReadyFile()
-  const child = makeFakeChild()
-  try {
-    const p = waitForDashboardPortAnnouncement(child, { readyFile: tmp.file, timeoutMs: 1000 })
-    setTimeout(() => fs.writeFileSync(tmp.file, JSON.stringify({ port: 9876 })), 20)
-    assert.equal(await p, 9876)
-  } finally {
-    tmp.cleanup()
-  }
-})
-
-test('waitForDashboardReadyFile rejects when the child exits before file readiness', async () => {
-  const tmp = mkTmpReadyFile()
-  const child = makeFakeChild()
-  try {
-    const p = waitForDashboardReadyFile(tmp.file, child, 1000)
-    child.emit('exit', 1, null)
-    await assert.rejects(p, /exited before port announcement/)
-  } finally {
-    tmp.cleanup()
-  }
-})
--- a/apps/desktop/electron/bootstrap-runner.cjs
+++ b/apps/desktop/electron/bootstrap-runner.cjs
@@ -179,13 +179,7 @@ function downloadInstallScript(commit, destPath) {
  })
 }

-async function resolveInstallScript({
-  installStamp,
-  sourceRepoRoot,
-  hermesHome,
-  emit,
-  _download = downloadInstallScript
-}) {
+async function resolveInstallScript({ installStamp, sourceRepoRoot, hermesHome, emit, _download = downloadInstallScript }) {
  // 1. Dev shortcut: prefer a local checkout's installer so we can iterate
  //    without pushing. SOURCE_REPO_ROOT comes from main.cjs (path.resolve
  //    of APP_ROOT/../..).
@@ -299,19 +293,15 @@ function spawnPowerShell(scriptPath, args, { emit, stageName, abortSignal, herme
    const ps = process.platform === 'win32' ? resolveWindowsPowerShell() : 'pwsh'
    const fullArgs = ['-NoProfile', '-ExecutionPolicy', 'Bypass', '-File', scriptPath, ...args]

-    const child = spawn(
-      ps,
-      fullArgs,
-      hiddenWindowsChildOptions({
-        stdio: ['ignore', 'pipe', 'pipe'],
-        env: {
-          ...process.env,
-          // Pass HERMES_HOME through so install.ps1 respects the caller's
-          // choice rather than re-computing the default.
-          HERMES_HOME: hermesHome || process.env.HERMES_HOME || ''
-        }
-      })
-    )
+    const child = spawn(ps, fullArgs, hiddenWindowsChildOptions({
+      stdio: ['ignore', 'pipe', 'pipe'],
+      env: {
+        ...process.env,
+        // Pass HERMES_HOME through so install.ps1 respects the caller's
+        // choice rather than re-computing the default.
+        HERMES_HOME: hermesHome || process.env.HERMES_HOME || ''
+      }
+    }))

    let stdout = ''
    let stderr = ''
--- a/apps/desktop/electron/connection-config.cjs
+++ b/apps/desktop/electron/connection-config.cjs
@@ -261,7 +261,12 @@ function cookiesHaveSession(cookies) {
 */
 function cookiesHaveLiveSession(cookies) {
  if (!Array.isArray(cookies)) return false
-  return cookies.some(c => c && c.value && (AT_COOKIE_VARIANTS.includes(c.name) || RT_COOKIE_VARIANTS.includes(c.name)))
+  return cookies.some(
+    c =>
+      c &&
+      c.value &&
+      (AT_COOKIE_VARIANTS.includes(c.name) || RT_COOKIE_VARIANTS.includes(c.name))
+  )
 }

 module.exports = {
--- a/apps/desktop/electron/desktop-uninstall.cjs
+++ b/apps/desktop/electron/desktop-uninstall.cjs
@@ -138,7 +138,10 @@ function buildPosixCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot,
  if (pythonPath) {
    lines.push(`export PYTHONPATH=${q(pythonPath)}\${PYTHONPATH:+:$PYTHONPATH}`)
  }
-  lines.push(`cd ${q(agentRoot)} 2>/dev/null || true`, `${q(pythonExe)} ${uninstallArgs.map(q).join(' ')} || true`)
+  lines.push(
+    `cd ${q(agentRoot)} 2>/dev/null || true`,
+    `${q(pythonExe)} ${uninstallArgs.map(q).join(' ')} || true`
+  )
  if (appPath) {
    lines.push(`rm -rf ${q(appPath)} || true`)
  }
@@ -166,15 +169,7 @@ function buildPosixCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot,
 * Removal: even after the desktop PID is gone, Windows releases directory
 * handles lazily, so a single `rmdir /s /q` can half-fail — retry up to 10x.
 */
-function buildWindowsCleanupScript({
-  desktopPid,
-  pythonExe,
-  pythonPath,
-  agentRoot,
-  uninstallArgs,
-  appPath,
-  hermesHome
-}) {
+function buildWindowsCleanupScript({ desktopPid, pythonExe, pythonPath, agentRoot, uninstallArgs, appPath, hermesHome }) {
  const pid = Number(desktopPid) || 0
  // cmd.exe has no string escaping inside quotes; strip embedded quotes (paths
  // under %LOCALAPPDATA% never contain them). `&`/`^` in a path would still be
--- a/apps/desktop/electron/desktop-uninstall.test.cjs
+++ b/apps/desktop/electron/desktop-uninstall.test.cjs
@@ -101,7 +101,10 @@ test('resolveRemovableAppPath uses APPIMAGE on Linux when set', () => {
 })

 test('resolveRemovableAppPath finds the unpacked dir on Linux', () => {
-  assert.equal(resolveRemovableAppPath('/opt/hermes/linux-unpacked/hermes', 'linux', {}), '/opt/hermes/linux-unpacked')
+  assert.equal(
+    resolveRemovableAppPath('/opt/hermes/linux-unpacked/hermes', 'linux', {}),
+    '/opt/hermes/linux-unpacked'
+  )
  // A system-package install (/usr/bin) → null, left to apt/dnf.
  assert.equal(resolveRemovableAppPath('/usr/bin/hermes', 'linux', {}), null)
 })
--- a/apps/desktop/electron/embed-referer.cjs
+++ b/apps/desktop/electron/embed-referer.cjs
@@ -1,48 +0,0 @@
-'use strict'
-
-const { session } = require('electron')
-
-const EMBED_SESSION_PARTITION = 'persist:hermes-embed'
-const EMBED_REFERER = 'https://www.youtube.com/'
-const YOUTUBE_REFERER_HOST_RE =
-  /(^|\.)(youtube\.com|youtube-nocookie\.com|googlevideo\.com|ytimg\.com|youtubei\.googleapis\.com)$/i
-
-function installEmbedRefererForSession(embedSession) {
-  if (!embedSession) {
-    return
-  }
-
-  embedSession.webRequest.onBeforeSendHeaders((details, callback) => {
-    let host = ''
-
-    try {
-      host = new URL(details.url).hostname
-    } catch {
-      host = ''
-    }
-
-    if (!YOUTUBE_REFERER_HOST_RE.test(host)) {
-      callback({ requestHeaders: details.requestHeaders })
-      return
-    }
-
-    const headers = { ...details.requestHeaders }
-
-    if (!headers.Referer && !headers.referer) {
-      headers.Referer = EMBED_REFERER
-    }
-
-    callback({ requestHeaders: headers })
-  })
-}
-
-/** Stamp Referer on YouTube requests in the embed webview partition only. */
-function installEmbedReferer() {
-  try {
-    installEmbedRefererForSession(session.fromPartition(EMBED_SESSION_PARTITION))
-  } catch {
-    // Non-fatal: embeds still render; YouTube may show referer errors.
-  }
-}
-
-module.exports = { installEmbedReferer }
--- a/Show More
+++ b/Show More