change(ci): remove lint PR comment

it's already in the job summary. having it as a comment just makes people ignore it. don't waste sapce.
feat(ci): add CI timing report
2026-06-27 20:34:35 +08:00 · 2026-06-25 19:51:48 -04:00 · 2026-06-25 19:51:48 -04:00 · 2026-06-25 19:47:43 -04:00 · 2026-06-25 19:15:00 -04:00 · 2026-06-25 19:12:49 -04:00
63 changed files with 2602 additions and 1703 deletions
--- a/.envrc
+++ b/.envrc
@@ -1,5 +1,5 @@
 watch_file pyproject.toml uv.lock
 watch_file package-lock.json package.json web/package.json ui-tui/package.json website/package.json apps/shared/package.json apps/desktop/package.json ui-tui/packages/hermes-ink/package.json
-watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix
+watch_file flake.nix flake.lock nix/devShell.nix nix/tui.nix nix/package.nix nix/python.nix nix/hermes-agent.nix nix/desktop.nix

 use flake
--- a/.github/actions/hermes-smoke-test/action.yml
+++ b/.github/actions/hermes-smoke-test/action.yml
@@ -1,50 +0,0 @@
-name: Hermes smoke test
-description: >
-  Run the image's built-in entrypoint against `--help` and `dashboard --help`
-  to catch basic runtime regressions before publishing.  Requires the image
-  to already be loaded into the local Docker daemon under `image`.
-
-  Works identically on amd64 and arm64 runners.
-
-inputs:
-  image:
-    description: Fully-qualified image tag (e.g. nousresearch/hermes-agent:test)
-    required: true
-
-runs:
-  using: composite
-  steps:
-    - name: Ensure /tmp/hermes-test is hermes-writable
-      shell: bash
-      run: |
-        # The image runs as the hermes user (UID 10000).  GitHub Actions
-        # creates /tmp/hermes-test root-owned by default, which hermes
-        # can't write to — chown it to match the in-container UID before
-        # bind-mounting.  Real users doing `docker run -v ~/.hermes:...`
-        # with their own UID hit the same issue and have their own
-        # remediations (HERMES_UID env var, or chown locally).
-        mkdir -p /tmp/hermes-test
-        sudo chown -R 10000:10000 /tmp/hermes-test
-
-    - name: hermes --help
-      shell: bash
-      run: |
-        # Use the image's real ENTRYPOINT (/init + main-wrapper.sh) so
-        # this exercises the actual production startup path. PR #30136
-        # review caught that an --entrypoint override here had been
-        # silently neutered by the s6-overlay migration — stage2-hook
-        # ignores its CMD args, so the smoke test was a no-op.
-        docker run --rm \
-          -v /tmp/hermes-test:/opt/data \
-          "${{ inputs.image }}" --help
-
-    - name: hermes dashboard --help
-      shell: bash
-      run: |
-        # Regression guard for #9153: dashboard was present in source but
-        # missing from the published image.  If this fails, something in
-        # the Dockerfile is excluding the dashboard subcommand from the
-        # installed package.
-        docker run --rm \
-          -v /tmp/hermes-test:/opt/data \
-          "${{ inputs.image }}" dashboard --help
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,6 +20,7 @@ permissions:
  pull-requests: write # needed by lint (PR comment) + supply-chain (PR comment)
  actions: read # needed by osv-scanner (SARIF upload)
  security-events: write # needed by osv-scanner (SARIF upload)
+  packages: write # needed by docker build

 concurrency:
  group: ci-${{ github.ref }}
@@ -32,6 +33,7 @@ jobs:
  # (all lanes true) so post-merge validation is never weakened.
  # ─────────────────────────────────────────────────────────────────────
  detect:
+    name: Detect affected areas
    runs-on: ubuntu-latest
    outputs:
      python: ${{ steps.classify.outputs.python }}
@@ -53,11 +55,15 @@ jobs:
  # Skipped workflows (if condition is false) don't spin up runners.
  # ─────────────────────────────────────────────────────────────────────
  tests:
+    name: Python tests
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/tests.yml
+    with:
+      slice_count: 8

  lint:
+    name: Python lints
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/lint.yml
@@ -65,35 +71,48 @@ jobs:
      event_name: ${{ needs.detect.outputs.event_name }}

  typecheck:
+    name: TypeScript
    needs: detect
    if: needs.detect.outputs.frontend == 'true'
    uses: ./.github/workflows/typecheck.yml

  docs-site:
+    name: Docs Site
    needs: detect
    if: needs.detect.outputs.site == 'true'
    uses: ./.github/workflows/docs-site-checks.yml

  history-check:
+    name: Deny unrelated histories
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request'
    uses: ./.github/workflows/history-check.yml

  contributor-check:
+    name: Check contributors
    needs: detect
    if: needs.detect.outputs.python == 'true'
    uses: ./.github/workflows/contributor-check.yml

  uv-lockfile:
+    name: Check uv.lock
    needs: detect
    uses: ./.github/workflows/uv-lockfile-check.yml

  docker-lint:
+    name: Lint Docker scripts
    needs: detect
    if: needs.detect.outputs.docker_meta == 'true'
    uses: ./.github/workflows/docker-lint.yml

+  docker:
+    name: Build&Test Docker image
+    needs: detect
+    if: needs.detect.outputs.python == 'true' || needs.detect.outputs.frontend == 'true' || needs.detect.outputs.docker_meta == 'true'
+    uses: ./.github/workflows/docker.yml
+
  supply-chain:
+    name: Supply-chain scan
    needs: detect
    if: needs.detect.outputs.event_name == 'pull_request' && (needs.detect.outputs.scan == 'true' || needs.detect.outputs.deps == 'true' || needs.detect.outputs.mcp_catalog == 'true')
    uses: ./.github/workflows/supply-chain-audit.yml
@@ -104,7 +123,7 @@ jobs:
      mcp_catalog: ${{ needs.detect.outputs.mcp_catalog == 'true' }}

  osv-scanner:
-    needs: detect
+    name: OSV scan
    uses: ./.github/workflows/osv-scanner.yml

  # ─────────────────────────────────────────────────────────────────────
@@ -127,6 +146,7 @@ jobs:
      - docker-lint
      - supply-chain
      - osv-scanner
+      - docker
    if: always()
    runs-on: ubuntu-latest
    steps:
@@ -143,3 +163,67 @@ jobs:
              sys.exit(1)
          print('All checks passed (or were skipped)')
          "
+
+  # ─────────────────────────────────────────────────────────────────────
+  # CI timing report: collect per-job/step durations from the GitHub API,
+  # cache them on main (as a baseline), and on PRs generate an HTML diff
+  # report with a gantt chart + per-step breakdown. The report is uploaded
+  # as an artifact and a markdown summary is written to $GITHUB_STEP_SUMMARY.
+  # ─────────────────────────────────────────────────────────────────────
+  ci-timings:
+    name: CI timing report
+    needs: all-checks-pass
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Restore baseline cache (PR only)
+        if: github.event_name == 'pull_request'
+        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        with:
+          path: ci-timings-baseline.json
+          # Prefix-match: exact key will never hit (run_id differs), so
+          # restore-keys finds the most recent baseline from main.
+          key: ci-timings-baseline-never-exact
+          restore-keys: |
+            ci-timings-baseline-
+
+      - name: Collect timings and generate report
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          python3 scripts/ci/timings_report.py \
+            --baseline ci-timings-baseline.json \
+            --output ci-timings-report.html \
+            --json-out ci-timings.json \
+            --summary-out ci-timings-summary.md
+
+      - name: Upload HTML report
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        id: ci-timings-artifact
+        with:
+          name: ci-timings-report
+          path: ci-timings-report.html
+          retention-days: 14
+          archive: false
+
+      - name: Output summary
+        env:
+          REPORT_URL: ${{ steps.ci-timings-artifact.outputs.artifact-url}}
+        run: |
+          echo "# CI Timing report" >> "$GITHUB_STEP_SUMMARY"
+          echo "[View the full interactive report]($REPORT_URL)" >> "$GITHUB_STEP_SUMMARY"
+          cat ci-timings-summary.md >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Save baseline cache (main only)
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        run: cp ci-timings.json ci-timings-baseline.json
+
+      - name: Upload baseline to cache (main only)
+        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        with:
+          path: ci-timings-baseline.json
+          key: ci-timings-baseline-${{ github.run_id }}
--- a/.github/workflows/docker-lint.yml
+++ b/.github/workflows/docker-lint.yml
@@ -2,7 +2,7 @@ name: Docker / shell lint

 # Lints the container build inputs: Dockerfile (via hadolint) and any shell
 # scripts under docker/ (via shellcheck). These catch the class of regression
-# the behavioral docker-publish smoke test can't — unquoted variable
+# the behavioral docker smoke test can't — unquoted variable
 # expansions, silently-failing RUN commands, etc.
 #
 # Rules and ignores are documented in .hadolint.yaml at the repo root.
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,24 +1,9 @@
-name: Docker Build and Publish
+name: Docker Build, Test, and Publish

 on:
-  push:
-    branches: [main]
-    paths:
-      - '**/*.py'
-      - 'pyproject.toml'
-      - 'uv.lock'
-      - 'Dockerfile'
-      - 'docker/**'
-      - '.github/workflows/docker-publish.yml'
-      - '.github/actions/hermes-smoke-test/**'
-
-  # No paths filter — the job must always run so the required check
-  # reports a status (path-gated workflows leave checks "pending" forever
-  # when no matching files change, which blocks merge).
-  pull_request:
-
  release:
    types: [published]
+  workflow_call:

 permissions:
  contents: read
@@ -39,11 +24,7 @@ env:
  IMAGE_NAME: nousresearch/hermes-agent

 jobs:
-  # ---------------------------------------------------------------------------
-  # Build amd64 natively.  This job also runs the smoke tests (basic --help
-  # and the dashboard subcommand regression guard from #9153), because amd64
-  # is the only arch we can `load` into the local daemon on an amd64 runner.
-  # ---------------------------------------------------------------------------
+  # Build, test, and optionally push the amd64 image.
  build-amd64:
    # Only run on the upstream repository, not on forks
    if: github.repository == 'NousResearch/hermes-agent'
@@ -53,24 +34,19 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      # The image build + smoke test + integration tests run ONLY on
-      # push-to-main and release — never on PRs. They are the heaviest jobs
-      # in CI (~15-45 min) and a broken build surfaces on the main push (and
-      # is gated pre-merge by docker-lint + uv-lockfile-check). Every step
-      # below is skipped on PRs, so the job still reports green and the
-      # required check never hangs.
+      # The image build + integration tests run on every event
+      # (PRs, push-to-main, release). Publish steps below are gated to
+      # push-to-main / release only.
      - name: Set up Docker Buildx
-        if: github.event_name != 'pull_request'
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3

-      # Build once, load into the local daemon for smoke testing.  Cached
+      # Build once, load into the local daemon for testing.  Cached
      # to gha with a per-arch scope; the push step below reuses every
      # layer from this build.
-      - name: Build image (amd64, smoke test)
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+      - name: Build image (amd64)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -82,25 +58,12 @@ jobs:
          cache-from: type=gha,scope=docker-amd64
          cache-to: type=gha,mode=max,scope=docker-amd64

-      - name: Smoke test image
-        if: github.event_name != 'pull_request'
-        uses: ./.github/actions/hermes-smoke-test
-        with:
-          image: ${{ env.IMAGE_NAME }}:test
-
-      # ---------------------------------------------------------------------
      # Run the docker-integration test suite against the freshly-built
-      # image already loaded into the local daemon (`:test`).  These tests
-      # are excluded from the sharded `tests.yml :: test` matrix on purpose
-      # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each
-      # shard would otherwise reach the session-scoped ``built_image``
-      # fixture in ``tests/docker/conftest.py`` and start a 3-7min
-      # ``docker build`` — guaranteed to
-      # die in fixture setup.
+      # image already loaded into the local daemon (`:test`).
      #
-      # Piggybacking here avoids a second image build: the smoke test
-      # already proved the image loads + runs, so the daemon has it under
-      # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at
+      # Piggybacking here avoids a second image build: the build step
+      # already loaded the image into the daemon under
+      # `${IMAGE_NAME}:test`, so we just point ``HERMES_TEST_IMAGE`` at
      # that.  The fixture's ``HERMES_TEST_IMAGE`` branch (see
      # tests/docker/conftest.py:62-63) short-circuits the rebuild.
      #
@@ -110,26 +73,20 @@ jobs:
      # cheapest path to coverage on every PR that touches docker code.
      # ---------------------------------------------------------------------
      - name: Install uv (for docker tests)
-        if: github.event_name != 'pull_request'
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Set up Python 3.11 (for docker tests)
-        if: github.event_name != 'pull_request'
        run: uv python install 3.11

      - name: Install Python dependencies (for docker tests)
-        if: github.event_name != 'pull_request'
        run: |
-          uv venv .venv --python 3.11
-          source .venv/bin/activate
          # ``dev`` extra pulls in pytest, pytest-asyncio —
          # everything tests/docker/ needs.  We deliberately avoid ``all``
          # here because the docker tests only drive the container via
          # subprocess and don't import hermes_agent's optional deps.
-          uv pip install -e ".[dev]"
+          uv sync --locked --python 3.11 --extra dev

      - name: Run docker integration tests
-        if: github.event_name != 'pull_request'
        env:
          # Skip rebuild; use the image already loaded by the build step.
          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
@@ -140,11 +97,11 @@ jobs:
          NOUS_API_KEY: ""
        run: |
          source .venv/bin/activate
-          python -m pytest tests/docker/ -v --tb=short
+          python -m pytest -m tests/docker/

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -155,7 +112,7 @@ jobs:
      - name: Push amd64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -179,7 +136,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: digest-amd64
          path: /tmp/digests/*
@@ -187,10 +144,7 @@ jobs:
          retention-days: 1

  # ---------------------------------------------------------------------------
-  # Build arm64 natively on GitHub's free arm64 runner.  This replaces the
-  # previous QEMU-emulated arm64 build, which was ~5-10x slower and shared
-  # a cache scope with amd64.  Matches the amd64 job's shape: build+load,
-  # smoke test, then on push/release push by digest.
+  # Build, test, and optionally push the arm64 image.
  # ---------------------------------------------------------------------------
  build-arm64:
    if: github.repository == 'NousResearch/hermes-agent'
@@ -200,29 +154,26 @@ jobs:
      digest: ${{ steps.push.outputs.digest }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      # arm64 build runs only on push-to-main and release (see build-amd64).
      - name: Set up Docker Buildx
-        if: github.event_name != 'pull_request'
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3

      # Log in to ghcr.io so the registry-backed build cache below can be
      # read (cache-from) on every event and written (cache-to) on
      # push/release.  Uses the workflow's GITHUB_TOKEN, which is valid for
      # the whole job — unlike the gha cache backend's short-lived Azure SAS
      # token, which expired mid-build on slow cold-cache arm64 runs and
-      # crashed the build before the smoke test (the reason the gha cache
+      # crashed the build before the tests ran (the reason the gha cache
      # was removed from arm64 PRs in the first place).
      - name: Log in to ghcr.io (build cache)
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # Build once, load into the local daemon for smoke testing, then push
+      # Build once, load into the local daemon for testing, then push
      # by digest below. Reads AND writes the registry-backed cache so the
      # push reuses layers from this build and the next build starts warm.
      #
@@ -230,9 +181,8 @@ jobs:
      # cache that previously broke here: its credential is the job-lifetime
      # GITHUB_TOKEN, not a short-lived SAS token, so the cold-build-outlives-
      # token failure mode cannot recur.
-      - name: Build image (arm64, smoke test, cached publish)
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+      - name: Build image (arm64, cached publish)
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -244,15 +194,30 @@ jobs:
          cache-from: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64
          cache-to: type=registry,ref=ghcr.io/nousresearch/hermes-agent:buildcache-arm64,mode=max

-      - name: Smoke test image
-        if: github.event_name != 'pull_request'
-        uses: ./.github/actions/hermes-smoke-test
-        with:
-          image: ${{ env.IMAGE_NAME }}:test
+      - name: Install uv for docker tests
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
+
+      - name: Set up Python 3.11 for docker tests
+        run: uv python install 3.11
+
+      - name: Install Python dependencies for docker tests
+        run: |
+          uv sync --locked --python 3.11 --extra dev
+
+      - name: Run docker tests
+        env:
+          # Skip rebuild; use the image already loaded by the build step.
+          HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test
+          OPENROUTER_API_KEY: ""
+          OPENAI_API_KEY: ""
+          NOUS_API_KEY: ""
+        run: |
+          source .venv/bin/activate
+          python -m pytest -m tests/docker/

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -260,7 +225,7 @@ jobs:
      - name: Push arm64 by digest
        id: push
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
        with:
          context: .
          file: Dockerfile
@@ -282,7 +247,7 @@ jobs:

      - name: Upload digest artifact
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: digest-arm64
          path: /tmp/digests/*
@@ -304,17 +269,17 @@ jobs:
    timeout-minutes: 10
    steps:
      - name: Download digests
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          path: /tmp/digests
          pattern: digest-*
          merge-multiple: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3

      - name: Log in to Docker Hub
-        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121  # v4.1.0
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -37,7 +37,7 @@ jobs:
          fetch-depth: 0 # need full history for merge-base + worktree

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Install ruff + ty
        uses: ./.github/actions/retry
@@ -109,46 +109,6 @@ jobs:
            --output    .lint-reports/summary.md
          cat .lint-reports/summary.md >> "$GITHUB_STEP_SUMMARY"

-      - name: Upload reports as artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
-        with:
-          name: lint-reports
-          path: .lint-reports/
-          retention-days: 14
-
-      - name: Post / update PR comment
-        if: inputs.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
-        continue-on-error: true
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7
-        with:
-          script: |
-            const fs = require('fs');
-            const body = fs.readFileSync('.lint-reports/summary.md', 'utf8');
-            const marker = '<!-- lint-diff-summary -->';
-            const fullBody = marker + '\n' + body;
-
-            const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo:  context.repo.repo,
-              issue_number: context.issue.number,
-            });
-            const existing = comments.find(c => c.body && c.body.includes(marker));
-            if (existing) {
-              await github.rest.issues.updateComment({
-                owner: context.repo.owner,
-                repo:  context.repo.repo,
-                comment_id: existing.id,
-                body: fullBody,
-              });
-            } else {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo:  context.repo.repo,
-                issue_number: context.issue.number,
-                body: fullBody,
-              });
-            }
-
  ruff-blocking:
    # Enforce the rules in pyproject.toml [tool.ruff.lint.select]. Currently
    # PLW1514 (unspecified-encoding) — catches bare ``open()`` /
@@ -164,7 +124,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Install ruff
        uses: ./.github/actions/retry
--- a/.github/workflows/skills-index.yml
+++ b/.github/workflows/skills-index.yml
@@ -3,17 +3,17 @@ name: Build Skills Index
 on:
  schedule:
    # Run twice daily: 6 AM and 6 PM UTC
-    - cron: '0 6,18 * * *'
-  workflow_dispatch:  # Manual trigger
+    - cron: "0 6,18 * * *"
+  workflow_dispatch: # Manual trigger
  push:
    branches: [main]
    paths:
-      - 'scripts/build_skills_index.py'
-      - '.github/workflows/skills-index.yml'
+      - "scripts/build_skills_index.py"
+      - ".github/workflows/skills-index.yml"

 permissions:
  contents: read
-  actions: write   # to trigger deploy-site.yml on schedule
+  actions: write # to trigger deploy-site.yml on schedule

 jobs:
  build-index:
@@ -21,11 +21,11 @@ jobs:
    if: github.repository == 'NousResearch/hermes-agent'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
-          python-version: '3.11'
+          python-version: "3.11"

      - name: Install dependencies
        run: pip install httpx==0.28.1 pyyaml==6.0.2
@@ -36,7 +36,7 @@ jobs:
        run: python scripts/build_skills_index.py

      - name: Upload index artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: skills-index
          path: website/static/api/skills-index.json
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -2,6 +2,11 @@ name: Tests

 on:
  workflow_call:
+    inputs:
+      slice_count:
+        description: Number of parallel test slices
+        type: number
+        default: 8

 permissions:
  contents: read
@@ -12,13 +17,29 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  generate:
+    name: "Generate slices"
+    runs-on: ubuntu-latest
+    outputs:
+      slices: ${{ steps.matrix.outputs.slices }}
+      slice_count: ${{ steps.matrix.outputs.slice_count }}
+    steps:
+      - name: Generate test slices
+        id: matrix
+        run: |
+          COUNT="${{ inputs.slice_count }}"
+          SLICES=$(python3 -c "import json; print(json.dumps({'slice': list(range(1, $COUNT + 1))}))")
+          echo "slices=$SLICES" >> "$GITHUB_OUTPUT"
+          echo "slice_count=$COUNT" >> "$GITHUB_OUTPUT"
+
  test:
+    name: Run tests slice
+    needs: generate
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      fail-fast: false
-      matrix:
-        slice: [1, 2, 3, 4, 5, 6]
+      matrix: ${{ fromJSON(needs.generate.outputs.slices) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -49,7 +70,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
@@ -78,8 +99,8 @@ jobs:
        # re-download, keeping the persisted cache small and fast to restore.
        run: uv cache prune --ci

-      - name: Run tests (slice ${{ matrix.slice }}/6)
-        # Per-file isolation via scripts/run_tests_parallel.py: discovers
+      - name: Run tests (slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }})
+        # Per-file isolation via scripts/run_tests.sh: discovers
        # every test_*.py file under tests/ (excluding integration/ + e2e/),
        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
        # with bounded parallelism. No xdist, no shared workers, no
@@ -97,14 +118,14 @@ jobs:
        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
        # the job with cleaner semantics.
        #
-        # Matrix slicing (--slice I/N): files are distributed across 6
+        # Matrix slicing (--slice I/N): files are distributed across N
        # jobs by cached duration (LPT algorithm) so each job gets
        # roughly equal wall time. Without a cache, files default to 2s
        # estimate and get split roughly evenly by count — still correct,
        # just not perfectly balanced.
        run: |
          source .venv/bin/activate
-          python scripts/run_tests_parallel.py --slice ${{ matrix.slice }}/6
+          scripts/run_tests.sh --slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }}
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@@ -173,7 +194,7 @@ jobs:
          rg --version

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0
        with:
          # Persist uv's download/wheel cache (~/.cache/uv) across runs.
          # Keyed on the dependency manifests, so the cache is reused until
--- a/.github/workflows/typecheck.yml
+++ b/.github/workflows/typecheck.yml
@@ -6,6 +6,7 @@ on:

 jobs:
  typecheck:
+    name: Check TypeScript
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -22,8 +23,7 @@ jobs:
      # native builds. Skipping install scripts drops node-pty's node-gyp
      # header fetch — the transient flake that killed this job pre-`tsc` — and
      # is faster. retry covers the remaining registry blips.
-      - 
-        uses: ./.github/actions/retry
+      - uses: ./.github/actions/retry
        with:
          command: npm ci --ignore-scripts
      - run: npm run --prefix ${{ matrix.package }} typecheck
@@ -35,6 +35,7 @@ jobs:
  # users build apps/desktop from source on install/update. Run the real
  # `vite build` here so that class of break fails in CI instead.
  desktop-build:
+    name: Build desktop app
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -44,8 +45,7 @@ jobs:
          cache: npm
      # Keep install scripts here: the production build may need node-pty's
      # native binary. retry handles the transient install-time fetch flakes.
-      - 
-        uses: ./.github/actions/retry
+      - uses: ./.github/actions/retry
        with:
          command: npm ci
      - run: npm run --prefix apps/desktop build
--- a/.github/workflows/upload_to_pypi.yml
+++ b/.github/workflows/upload_to_pypi.yml
@@ -5,11 +5,11 @@ name: Publish to PyPI
 on:
  push:
    tags:
-      - 'v20*'  # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
+      - "v20*" # CalVer tags: v2026.5.15, v2026.5.15.2, etc.
  workflow_dispatch:
    inputs:
      confirm_tag:
-        description: 'Tag to publish (e.g. v2026.5.15). Must already exist.'
+        description: "Tag to publish (e.g. v2026.5.15). Must already exist."
        required: true
        type: string

@@ -27,7 +27,7 @@ jobs:
    name: Build distribution 📦
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
          # On workflow_dispatch, check out the confirmed tag.
@@ -43,17 +43,17 @@ jobs:
          fi

      - name: Set up Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
-          python-version: '3.13'
+          python-version: "3.13"

      - name: Install uv
-        uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e  # v6
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
        with:
-          node-version: '22'
+          node-version: "22"

      - name: Build web dashboard
        run: cd web && npm ci && npm run build
@@ -81,7 +81,7 @@ jobs:
        run: uv build --sdist --wheel

      - name: Upload distribution artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7
        with:
          name: python-package-distributions
          path: dist/
@@ -94,17 +94,17 @@ jobs:
      name: pypi
      url: https://pypi.org/p/hermes-agent
    permissions:
-      id-token: write  # OIDC trusted publishing
+      id-token: write # OIDC trusted publishing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b  # v1.14.0
+        uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
        with:
          skip-existing: true

@@ -116,12 +116,12 @@ jobs:
    needs: publish
    runs-on: ubuntu-latest
    permissions:
-      contents: write   # attach assets to the existing release
-      id-token: write   # sigstore signing
+      contents: write # attach assets to the existing release
+      id-token: write # sigstore signing

    steps:
      - name: Download distribution artifacts
-        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
        with:
          name: python-package-distributions
          path: dist/
@@ -145,7 +145,7 @@ jobs:

      - name: Sign with Sigstore
        if: env.skip_sign != 'true'
-        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc  # v3.3.0
+        uses: sigstore/gh-action-sigstore-python@04cffa1d795717b140764e8b640de88853c92acc # v3.3.0
        with:
          inputs: >-
            ./dist/*.tar.gz
--- a/.github/workflows/uv-lockfile-check.yml
+++ b/.github/workflows/uv-lockfile-check.yml
@@ -4,7 +4,7 @@ name: uv.lock check
 # that modify pyproject.toml without regenerating uv.lock (or vice versa)
 # must not merge, because the Docker build's `uv sync --frozen` step will
 # fail on a stale lockfile and we'd rather catch it here than in the
-# docker-publish workflow on main.
+# docker workflow on main.
 #
 # ─────────────────────────────────────────────────────────────────────────
 # IMPORTANT: this check runs against the MERGED state, not just your branch
@@ -63,7 +63,7 @@ jobs:
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
+        uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # 8.2.0

      # `uv lock --check` re-resolves the project from pyproject.toml and
      # compares the result to uv.lock, exiting non-zero if they disagree.
@@ -100,7 +100,7 @@ jobs:

          This check is blocking because the Docker image build uses
          `uv sync --frozen --extra all`, which rejects stale lockfiles
-          — catching it here avoids a ~15 min failed docker-publish run
+          — catching it here avoids a ~15 min failed docker run
          on `main` post-merge.
          EOF
            echo "::error title=uv.lock out of sync::Run \`uv lock\` locally and commit the result. If on a PR, sync with main first."
--- a/28
+++ b/28
@@ -189,7 +189,13 @@ RUN cd web && npm run build && \

 # ---------- Source code ----------
 # .dockerignore excludes node_modules, so the installs above survive.
-COPY . .
+# --link decouples this layer from parents for cache purposes; --chmod bakes
+# the final read-only permissions at copy time so we skip the separate
+# `chmod -R` pass that previously walked ~30k files across the venv +
+# node_modules + source (21s amd64 / 222s arm64 — #49113).  `a+rX,go-w`
+# gives the non-root hermes user read + traverse but no write; root retains
+# write so the build steps below don't need chmod u+w dances.
+COPY --link --chmod=a+rX,go-w . .

 # ---------- Permissions ----------
 # Link hermes-agent itself (editable). Deps are already installed in the
@@ -197,19 +203,15 @@ COPY . .
 # resolution or downloads.
 RUN uv pip install --no-cache-dir --no-deps -e "."

-# Keep /opt/hermes immutable for the runtime hermes user. Hosted/container
-# instances must not be able to self-edit the installed source or venv; user
-# data, skills, plugins, config, logs, and dashboard uploads live under
-# /opt/data instead. Root can still repair the image during build/boot, but
-# supervised Hermes processes drop to the non-root hermes user.
+# Wire the exec shim and install-method stamp.  Files under /opt/hermes are
+# already root-owned (COPY, uv sync, npm install all run as root) and
+# read-only for the hermes user (go-w from the --chmod above).
+
 USER root
 RUN mkdir -p /opt/hermes/bin && \
    cp /opt/hermes/docker/hermes-exec-shim.sh /opt/hermes/bin/hermes && \
    chmod 0755 /opt/hermes/bin/hermes && \
-    printf 'docker\n' > /opt/hermes/.install_method && \
-    chown -R root:root /opt/hermes && \
-    chmod -R a+rX /opt/hermes && \
-    chmod -R a-w /opt/hermes
+    printf 'docker\n' > /opt/hermes/.install_method
 # The ``.install_method`` stamp is baked next to the running code (the install
 # tree), NOT into $HERMES_HOME. $HERMES_HOME (/opt/data) is a shared data
 # volume that is commonly bind-mounted from the host and even shared with a
@@ -236,13 +238,11 @@ RUN mkdir -p /opt/hermes/bin && \
 #
 # The arg is optional — local `docker build` without --build-arg simply
 # omits the file, and the runtime falls back to live-git lookup.  CI
-# (.github/workflows/docker-publish.yml) passes ${{ github.sha }} so
+# (.github/workflows/docker.yml) passes ${{ github.sha }} so
 # every published image has it.
 ARG HERMES_GIT_SHA=
 RUN if [ -n "${HERMES_GIT_SHA}" ]; then \
-        chmod u+w /opt/hermes && \
-        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha && \
-        chmod a-w /opt/hermes /opt/hermes/.hermes_build_sha; \
+        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha; \
    fi

 # ---------- s6-overlay service wiring ----------
--- a/nix/devShell.nix
+++ b/nix/devShell.nix
@@ -25,10 +25,12 @@
    in
    {
      devShells.default = pkgs.mkShell {
-        inputsFrom = packages;
-        packages = with pkgs; [
-          uv
-        ];
+        packages =
+          with pkgs;
+          [
+            uv
+          ]
+          ++ self'.packages.default.passthru.devDeps;
        shellHook = ''
          echo "Hermes Agent dev shell"
          ${combinedNonNpm}
--- a/nix/hermes-agent.nix
+++ b/nix/hermes-agent.nix
@@ -37,10 +37,14 @@
 }:
 let
  nodejs = nodejs_22;
-  hermesVenv = callPackage ./python.nix {
-    inherit uv2nix pyproject-nix pyproject-build-systems;
-    dependency-groups = [ "all" ] ++ extraDependencyGroups;
-  };
+  mkHermesVenv =
+    extraDependencyGroups:
+    callPackage ./python.nix {
+      inherit uv2nix pyproject-nix pyproject-build-systems;
+      dependency-groups = [ "all" ] ++ extraDependencyGroups;
+    };
+
+  hermesVenv = mkHermesVenv extraDependencyGroups;

  hermesNpmLib = callPackage ./lib.nix {
    inherit npm-lockfile-fix nodejs;
@@ -106,12 +110,6 @@ let

  pythonPath = lib.makeSearchPath sitePackagesPath allExtraPythonPackages;

-  pyprojectHash = builtins.hashString "sha256" (builtins.readFile ../pyproject.toml);
-  uvLockHash =
-    if builtins.pathExists ../uv.lock then
-      builtins.hashString "sha256" (builtins.readFile ../uv.lock)
-    else
-      "none";
  checkPackageCollisions = ''
    import pathlib, sys, re

@@ -223,21 +221,10 @@ stdenv.mkDerivation (finalAttrs: {
    };

    devShellHook = ''
-      STAMP=".nix-stamps/hermes-agent"
-      STAMP_VALUE="${pyprojectHash}:${uvLockHash}"
-      if [ ! -f "$STAMP" ] || [ "$(cat "$STAMP")" != "$STAMP_VALUE" ]; then
-        echo "hermes-agent: installing Python dependencies..."
-        uv venv .venv --python ${python312}/bin/python3 2>/dev/null || true
-        source .venv/bin/activate
-        uv pip install -e ".[all]"
-        [ -d mini-swe-agent ] && uv pip install -e ./mini-swe-agent 2>/dev/null || true
-        mkdir -p .nix-stamps
-        echo "$STAMP_VALUE" > "$STAMP"
-      else
-        source .venv/bin/activate
-        export HERMES_PYTHON=${hermesVenv}/bin/python3
-      fi
+      export HERMES_PYTHON=${hermesVenv}/bin/python3
    '';
+
+    devDeps = runtimeDeps ++ [ (mkHermesVenv (extraDependencyGroups ++ [ "dev" ])) ];
  };

  meta = with lib; {
--- a/nix/packages.nix
+++ b/nix/packages.nix
@@ -2,54 +2,62 @@
 { inputs, ... }:
 {
  perSystem =
-    { pkgs, lib, inputs', ... }:
+    {
+      pkgs,
+      lib,
+      inputs',
+      ...
+    }:
    let
-      hermesAgent = pkgs.callPackage ./hermes-agent.nix {
+      minimal = pkgs.callPackage ./hermes-agent.nix {
        inherit (inputs) uv2nix pyproject-nix pyproject-build-systems;
        npm-lockfile-fix = inputs'.npm-lockfile-fix.packages.default;
        # Only embed clean revs — dirtyRev doesn't represent any upstream
        # commit, so comparing it would always claim "update available".
        rev = inputs.self.rev or null;
      };
+
+      # All platform-portable optional integrations pre-built.
+      full = minimal.override {
+        extraDependencyGroups = [
+          "anthropic"
+          "azure-identity"
+          "bedrock"
+          "daytona"
+          "dingtalk"
+          "edge-tts"
+          "exa"
+          "fal"
+          "feishu"
+          "firecrawl"
+          "hindsight"
+          "honcho"
+          "messaging"
+          "modal"
+          "parallel-web"
+          "tts-premium"
+          "voice"
+        ]
+        # matrix is Linux-only (oqs/liboqs lacks aarch64-darwin wheels).
+        ++ lib.optionals pkgs.stdenv.isLinux [ "matrix" ];
+      };
    in
    {
      packages = {
-        default = hermesAgent;
+        default = full;
+
+        inherit minimal;

        # Ships discord.py + python-telegram-bot + slack-sdk so a plain
        # `nix profile install .#messaging` connects to Discord/Telegram/Slack
        # on first run — lazy-install can't write to the read-only /nix/store.
-        messaging = hermesAgent.override {
+        messaging = minimal.override {
          extraDependencyGroups = [ "messaging" ];
        };

-        # All platform-portable optional integrations pre-built.
-        # matrix is Linux-only (oqs/liboqs lacks aarch64-darwin wheels).
-        full = hermesAgent.override {
-          extraDependencyGroups = [
-            "anthropic"
-            "azure-identity"
-            "bedrock"
-            "daytona"
-            "dingtalk"
-            "edge-tts"
-            "exa"
-            "fal"
-            "feishu"
-            "firecrawl"
-            "hindsight"
-            "honcho"
-            "messaging"
-            "modal"
-            "parallel-web"
-            "tts-premium"
-            "voice"
-          ] ++ lib.optionals pkgs.stdenv.isLinux [ "matrix" ];
-        };
-
-        tui = hermesAgent.hermesTui;
-        web = hermesAgent.hermesWeb;
-        desktop = hermesAgent.hermesDesktop;
+        tui = full.hermesTui;
+        web = full.hermesWeb;
+        desktop = full.hermesDesktop;
      };
    };
 }
--- a/scripts/ci/timings_report.py
+++ b/scripts/ci/timings_report.py
@@ -0,0 +1,782 @@
+#!/usr/bin/env python3
+"""Collect CI job/step timings from the GitHub API and generate an HTML diff report.
+
+In CI, the script reads GITHUB_TOKEN, GITHUB_REPOSITORY, GITHUB_RUN_ID, and
+GITHUB_SHA from the environment to collect timings via the REST API.
+
+If a baseline JSON file (ci-timings-baseline.json by default) exists, the
+report includes a diff with per-job and per-step deltas, plus a gantt chart
+overlaying current vs baseline bars.
+
+Usage:
+    # Collect from API (CI mode):
+    python scripts/ci/timings_report.py
+
+    # Regenerate HTML from saved JSON (testing):
+    python scripts/ci/timings_report.py --from-json ci-timings.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from html import escape
+
+API_BASE = "https://api.github.com"
+
+
+# ---------------------------------------------------------------------------
+# GitHub API helpers
+# ---------------------------------------------------------------------------
+
+def api_get(path: str, token: str, params: dict | None = None,
+            list_key: str | None = None) -> list | dict:
+    """Authenticated GitHub API GET with automatic pagination.
+
+    For list endpoints, pass list_key to extract items from the paginated
+    wrapper response (e.g. list_key='jobs' for {'total_count': N, 'jobs': [...]}).
+    When list_key is omitted, a non-list response is returned as-is (single object).
+    """
+    url = f"{API_BASE}{path}"
+    if params:
+        url += "?" + urllib.parse.urlencode(params)
+
+    results: list = []
+    while url:
+        req = urllib.request.Request(url, headers={
+            "Authorization": f"Bearer {token}",
+            "Accept": "application/vnd.github+json",
+            "X-GitHub-Api-Version": "2022-11-28",
+            "User-Agent": "ci-timings-report",
+        })
+        with urllib.request.urlopen(req) as resp:
+            data = json.loads(resp.read())
+            link_header = resp.headers.get("Link", "")
+
+        if list_key:
+            results.extend(data.get(list_key, []))
+        elif isinstance(data, list):
+            results.extend(data)
+        else:
+            return data
+
+        next_url = None
+        for part in link_header.split(","):
+            part = part.strip()
+            if 'rel="next"' in part:
+                next_url = part[part.find("<") + 1:part.find(">")]
+                break
+        url = next_url
+
+    return results
+
+
+def parse_ts(ts: str | None) -> datetime | None:
+    if not ts:
+        return None
+    return datetime.fromisoformat(ts.replace("Z", "+00:00"))
+
+
+def dur_s(started: str | None, completed: str | None) -> float | None:
+    s = parse_ts(started)
+    e = parse_ts(completed)
+    if not s or not e:
+        return None
+    return (e - s).total_seconds()
+
+
+# ---------------------------------------------------------------------------
+# Timings collection
+# ---------------------------------------------------------------------------
+
+def _normalize_job(raw: dict) -> dict:
+    steps = []
+    for step in (raw.get("steps") or []):
+        steps.append({
+            "name": step.get("name", ""),
+            "number": step.get("number", 0),
+            "status": step.get("status", ""),
+            "conclusion": step.get("conclusion", ""),
+            "started_at": step.get("started_at"),
+            "completed_at": step.get("completed_at"),
+            "duration_s": dur_s(step.get("started_at"), step.get("completed_at")),
+        })
+    return {
+        "name": raw.get("name", "unknown"),
+        "workflow_name": raw.get("_workflow_name", ""),
+        "job_id": raw.get("id"),
+        "status": raw.get("status", ""),
+        "conclusion": raw.get("conclusion", ""),
+        "started_at": raw.get("started_at"),
+        "completed_at": raw.get("completed_at"),
+        "duration_s": dur_s(raw.get("started_at"), raw.get("completed_at")),
+        "html_url": raw.get("html_url", ""),
+        "steps": steps,
+    }
+
+
+def collect_timings(token: str, repo: str, run_id: str, head_sha: str) -> dict:
+    """Collect job/step timings from the GitHub API.
+
+    1. Get orchestrator run's direct jobs (detect, all-checks-pass, etc.).
+       Skip workflow-call placeholder jobs (step name starts with "Run ./.github/").
+    2. Find sub-workflow runs via head_sha + event=workflow_call.
+    3. Get each sub-workflow run's jobs with full step timing.
+    """
+    owner, repo_name = repo.split("/")
+
+    # Orchestrator run info
+    run_info = api_get(f"/repos/{owner}/{repo_name}/actions/runs/{run_id}", token)
+    created_at = run_info.get("created_at", "")
+
+    # Orchestrator direct jobs
+    orch_jobs = api_get(f"/repos/{owner}/{repo_name}/actions/runs/{run_id}/jobs",
+                        token, list_key="jobs")
+
+    direct = []
+    for job in orch_jobs:
+        steps = job.get("steps") or []
+        if any(s.get("name", "").startswith("Run ./.github/") for s in steps):
+            continue  # workflow-call placeholder
+        if job.get("status") in ("in_progress", "queued"):
+            continue  # skip self / unfinished
+        direct.append(job)
+
+    # Sub-workflow runs
+    sub_runs = api_get(f"/repos/{owner}/{repo_name}/actions/runs", token, params={
+        "head_sha": head_sha,
+        "event": "workflow_call",
+        "per_page": 100,
+    }, list_key="workflow_runs")
+    sub_runs = [r for r in sub_runs if r.get("created_at", "") >= created_at]
+
+    sub_jobs_raw = []
+    for sr in sub_runs:
+        sr_id = sr["id"]
+        sr_name = sr.get("name", "")
+        sr_jobs = api_get(f"/repos/{owner}/{repo_name}/actions/runs/{sr_id}/jobs",
+                          token, list_key="jobs")
+        for j in sr_jobs:
+            j["_workflow_name"] = sr_name
+            j["_workflow_run_id"] = sr_id
+            sub_jobs_raw.append(j)
+
+    # Normalize + sort
+    all_jobs = [_normalize_job(j) for j in direct + sub_jobs_raw]
+    all_jobs = [j for j in all_jobs if j["status"] not in ("in_progress", "queued")]
+    all_jobs.sort(key=lambda j: j.get("started_at") or "")
+
+    return {
+        "run_id": run_id,
+        "head_sha": head_sha,
+        "created_at": created_at,
+        "jobs": all_jobs,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Formatting helpers
+# ---------------------------------------------------------------------------
+
+def fmt_dur(seconds: float | None) -> str:
+    if seconds is None:
+        return "—"
+    if seconds < 60:
+        return f"{seconds:.1f}s"
+    m = int(seconds // 60)
+    s = seconds % 60
+    if s == 0:
+        return f"{m}m"
+    return f"{m}m{s:.0f}s"
+
+
+def fmt_delta(current: float | None, baseline: float | None) -> tuple[str, str]:
+    """Return (text, css_class) for a delta."""
+    if current is None or baseline is None:
+        return ("—", "neutral")
+    delta = current - baseline
+    if baseline == 0:
+        pct_str = "new" if delta > 0 else "0%"
+    else:
+        pct = (delta / baseline) * 100
+        pct_str = f"{pct:+.1f}%"
+    if abs(delta) < 1.0:
+        cls = "neutral"
+    elif delta > 0:
+        cls = "slower"
+    else:
+        cls = "faster"
+    sign = "+" if delta >= 0 else ""
+    return (f"{sign}{delta:.1f}s ({pct_str})", cls)
+
+
+def nice_ticks(max_seconds: float, num_ticks: int = 8) -> list[int]:
+    if max_seconds <= 0:
+        return [0]
+    raw = max_seconds / num_ticks
+    for nice in [5, 10, 15, 30, 60, 120, 180, 300, 600, 900, 1800, 3600, 7200]:
+        if nice >= raw:
+            step = nice
+            break
+    else:
+        step = max(int(raw), 3600)
+    return list(range(0, int(max_seconds) + step + 1, step))
+
+
+def fmt_tick(seconds: int) -> str:
+    if seconds < 60:
+        return f"{seconds}s"
+    m, s = divmod(seconds, 60)
+    if s == 0:
+        return f"{m}m"
+    return f"{m}m{s}s"
+
+
+# ---------------------------------------------------------------------------
+# Stats computation
+# ---------------------------------------------------------------------------
+
+def compute_stats(timings: dict, baseline: dict | None = None) -> dict:
+    jobs = timings.get("jobs", [])
+    bl_jobs = {j["name"]: j for j in (baseline or {}).get("jobs", [])}
+
+    # Wall time
+    starts = [s for s in (parse_ts(j.get("started_at")) for j in jobs) if s is not None]
+    ends = [e for e in (parse_ts(j.get("completed_at")) for j in jobs) if e is not None]
+    wall = (max(ends) - min(starts)).total_seconds() if starts and ends else 0
+    compute = sum(j.get("duration_s") or 0 for j in jobs)
+
+    # Baseline wall/compute
+    bl_wall = None
+    bl_compute = None
+    if baseline:
+        bl_starts = [s for s in (parse_ts(j.get("started_at")) for j in baseline.get("jobs", [])) if s is not None]
+        bl_ends = [e for e in (parse_ts(j.get("completed_at")) for j in baseline.get("jobs", [])) if e is not None]
+        if bl_starts and bl_ends:
+            bl_wall = (max(bl_ends) - min(bl_starts)).total_seconds()
+        bl_compute = sum(j.get("duration_s") or 0 for j in baseline.get("jobs", []))
+
+    # Per-job deltas
+    faster = 0
+    slower = 0
+    unchanged = 0
+    no_baseline = 0
+    for j in jobs:
+        bl = bl_jobs.get(j["name"])
+        if not bl:
+            no_baseline += 1
+            continue
+        cur_d = j.get("duration_s") or 0
+        bl_d = bl.get("duration_s") or 0
+        if abs(cur_d - bl_d) < 1.0:
+            unchanged += 1
+        elif cur_d > bl_d:
+            slower += 1
+        else:
+            faster += 1
+
+    return {
+        "wall": wall,
+        "compute": compute,
+        "bl_wall": bl_wall,
+        "bl_compute": bl_compute,
+        "faster": faster,
+        "slower": slower,
+        "unchanged": unchanged,
+        "no_baseline": no_baseline,
+        "total_jobs": len(jobs),
+    }
+
+
+# ---------------------------------------------------------------------------
+# HTML generation
+# ---------------------------------------------------------------------------
+
+CSS = """
+* { box-sizing: border-box; margin: 0; padding: 0; }
+body {
+  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
+  background: #0d1117; color: #e6edf3; line-height: 1.5; padding: 24px;
+}
+h1 { font-size: 24px; border-bottom: 1px solid #30363d; padding-bottom: 12px; margin-bottom: 8px; }
+.meta { color: #8b949e; font-size: 13px; margin-bottom: 24px; }
+h2 { font-size: 18px; margin: 32px 0 12px; }
+
+/* Stats cards */
+.stats { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 24px; }
+.stat-card {
+  background: #161b22; border: 1px solid #30363d; border-radius: 8px;
+  padding: 14px 18px; min-width: 140px;
+}
+.stat-label { font-size: 12px; color: #8b949e; text-transform: uppercase; letter-spacing: 0.5px; }
+.stat-value { font-size: 22px; font-weight: 600; margin: 4px 0; }
+.stat-delta { font-size: 13px; }
+.faster { color: #3fb950; }
+.slower { color: #f85149; }
+.neutral { color: #8b949e; }
+
+/* Gantt */
+.gantt-wrap { overflow-x: auto; }
+.gantt { min-width: 700px; }
+.gantt-row { display: flex; align-items: center; height: 28px; }
+.gantt-label {
+  width: 220px; padding-right: 12px; text-align: right;
+  font-size: 12px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis;
+}
+.gantt-track { flex: 1; position: relative; height: 100%; border-left: 1px solid #21262d; }
+.gantt-bar {
+  position: absolute; height: 18px; border-radius: 3px;
+  display: flex; align-items: center; justify-content: center;
+  font-size: 10px; color: transparent; overflow: hidden;
+  transition: color 0.15s;
+}
+.gantt-bar:hover { color: #fff; z-index: 10; }
+.gantt-bar.current { background: #1f6feb; top: 5px; z-index: 2; }
+.gantt-bar.baseline {
+  background: transparent; border: 1px dashed #8b949e; top: 2px; height: 24px; z-index: 1;
+}
+.gantt-axis { display: flex; height: 20px; position: relative; border-top: 1px solid #30363d; margin-top: 4px; }
+.gantt-tick { position: absolute; font-size: 10px; color: #8b949e; transform: translateX(-50%); top: 4px; }
+.gantt-tick::before { content: ''; position: absolute; top: -4px; left: 50%; width: 1px; height: 4px; background: #30363d; }
+.legend { display: flex; gap: 16px; margin-top: 8px; font-size: 12px; color: #8b949e; }
+.legend-swatch { display: inline-block; width: 16px; height: 10px; border-radius: 2px; margin-right: 4px; vertical-align: middle; }
+
+/* Tables */
+table { border-collapse: collapse; width: 100%; font-size: 13px; margin-bottom: 16px; }
+th, td { border: 1px solid #30363d; padding: 6px 10px; text-align: left; }
+th { background: #161b22; font-weight: 600; position: sticky; top: 0; }
+tr:hover td { background: #161b22; }
+.num { text-align: right; font-variant-numeric: tabular-nums; }
+.job-name { font-weight: 500; }
+
+/* Step details */
+details { margin-bottom: 8px; background: #161b22; border: 1px solid #30363d; border-radius: 6px; }
+summary { padding: 8px 12px; cursor: pointer; font-weight: 500; font-size: 14px; user-select: none; }
+summary:hover { background: #21262d; }
+details[open] summary { border-bottom: 1px solid #30363d; }
+details table { border: none; margin: 0; }
+details td, details th { font-size: 12px; }
+
+/* Worst regressions */
+.regressions { margin-bottom: 24px; }
+.regressions table { font-size: 13px; }
+.tag {
+  display: inline-block; padding: 1px 6px; border-radius: 3px; font-size: 11px; font-weight: 500;
+}
+.tag.slow { background: rgba(248,81,73,0.15); color: #f85149; }
+.tag.fast { background: rgba(63,185,80,0.15); color: #3fb950; }
+"""
+
+
+def _gantt_bars(timings: dict, baseline: dict | None) -> str:
+    """Render the gantt chart HTML.
+
+    Both current and baseline timelines are normalized to start at t=0
+    (relative to each run's earliest job start). The axis scale spans
+    0..max_end across both runs so bars are directly comparable.
+    """
+    jobs = [j for j in timings.get("jobs", []) if j.get("started_at") and j.get("completed_at")]
+    bl_map = {j["name"]: j for j in (baseline or {}).get("jobs", [])}
+
+    # Current run: relative offsets from earliest start
+    cur_starts = [s for s in (parse_ts(j.get("started_at")) for j in jobs) if s is not None]
+    cur_ends = [e for e in (parse_ts(j.get("completed_at")) for j in jobs) if e is not None]
+    if not cur_starts or not cur_ends:
+        return '<p style="color:#8b949e">No timing data available.</p>'
+    cur_t0 = min(cur_starts)
+    cur_max = (max(cur_ends) - cur_t0).total_seconds()
+
+    # Baseline run: relative offsets from its earliest start
+    bl_t0 = None
+    bl_max = 0.0
+    bl_jobs_timed = []
+    for bl_j in bl_map.values():
+        s = parse_ts(bl_j.get("started_at"))
+        e = parse_ts(bl_j.get("completed_at"))
+        if s is not None and e is not None:
+            bl_jobs_timed.append((bl_j, s, e))
+            if bl_t0 is None or s < bl_t0:
+                bl_t0 = s
+            rel_end = (e - s).total_seconds() + (s - (bl_t0 or s)).total_seconds()
+    if bl_t0 is not None:
+        bl_max = max((e - bl_t0).total_seconds() for _, _, e in bl_jobs_timed) if bl_jobs_timed else 0
+
+    total_s = max(cur_max, bl_max)
+    if total_s <= 0:
+        total_s = 1
+
+    rows = []
+    for j in jobs:
+        s = parse_ts(j.get("started_at"))
+        e = parse_ts(j.get("completed_at"))
+        if s is None or e is None:
+            continue
+        left = (s - cur_t0).total_seconds() / total_s * 100
+        width = max((e - s).total_seconds() / total_s * 100, 0.5)  # min 0.5% for visibility
+        dur = j.get("duration_s") or 0
+
+        bl = bl_map.get(j["name"])
+        bl_bar = ""
+        if bl and bl_t0 is not None:
+            bl_s = parse_ts(bl.get("started_at"))
+            bl_e = parse_ts(bl.get("completed_at"))
+            if bl_s is not None and bl_e is not None:
+                bl_left = (bl_s - bl_t0).total_seconds() / total_s * 100
+                bl_width = max((bl_e - bl_s).total_seconds() / total_s * 100, 0.5)
+                bl_dur = bl.get("duration_s") or 0
+                bl_bar = (
+                    f'<div class="gantt-bar baseline" '
+                    f'style="left:{bl_left:.2f}%;width:{bl_width:.2f}%" '
+                    f'title="baseline: {fmt_dur(bl_dur)}"></div>'
+                )
+
+        name_display = escape(j["name"])
+        if j.get("workflow_name"):
+            name_display = f'{escape(j["workflow_name"])} / {escape(j["name"])}'
+
+        delta_info = ""
+        if bl and bl.get("duration_s") is not None:
+            d_text, d_cls = fmt_delta(dur, bl.get("duration_s"))
+            delta_info = f' — {d_text}'
+
+        rows.append(
+            f'<div class="gantt-row">'
+            f'<div class="gantt-label" title="{escape(j["name"])}">{name_display}</div>'
+            f'<div class="gantt-track">'
+            f'{bl_bar}'
+            f'<div class="gantt-bar current" '
+            f'style="left:{left:.2f}%;width:{width:.2f}%" '
+            f'title="{escape(j["name"])}: {fmt_dur(dur)}{delta_info}"></div>'
+            f'</div></div>'
+        )
+
+    # Axis
+    ticks = nice_ticks(total_s)
+    tick_html = "".join(
+        f'<span class="gantt-tick" style="left:{(t / total_s * 100):.1f}%">{fmt_tick(t)}</span>'
+        for t in ticks
+    )
+    axis = f'<div class="gantt-axis"><div class="gantt-track">{tick_html}</div></div>'
+
+    legend = (
+        '<div class="legend">'
+        '<span><span class="legend-swatch" style="background:#1f6feb"></span>Current</span>'
+    )
+    if baseline:
+        legend += '<span><span class="legend-swatch" style="border:1px dashed #8b949e"></span>Baseline (main)</span>'
+    legend += '</div>'
+
+    return f'<div class="gantt-wrap"><div class="gantt">{"".join(rows)}{axis}</div></div>{legend}'
+
+
+def _stats_cards(stats: dict) -> str:
+    wall_text = fmt_dur(stats["wall"])
+    wall_delta = ""
+    if stats["bl_wall"] is not None:
+        d, cls = fmt_delta(stats["wall"], stats["bl_wall"])
+        wall_delta = f'<span class="stat-delta {cls}">{d}</span>'
+
+    compute_text = fmt_dur(stats["compute"])
+    compute_delta = ""
+    if stats["bl_compute"] is not None:
+        d, cls = fmt_delta(stats["compute"], stats["bl_compute"])
+        compute_delta = f'<span class="stat-delta {cls}">{d}</span>'
+
+    cards = [
+        f'<div class="stat-card"><span class="stat-label">Wall Time</span>'
+        f'<div class="stat-value">{wall_text}</div>{wall_delta}</div>',
+        f'<div class="stat-card"><span class="stat-label">Total Compute</span>'
+        f'<div class="stat-value">{compute_text}</div>{compute_delta}</div>',
+        f'<div class="stat-card"><span class="stat-label">Jobs Faster</span>'
+        f'<div class="stat-value faster">{stats["faster"]}</div></div>',
+        f'<div class="stat-card"><span class="stat-label">Jobs Slower</span>'
+        f'<div class="stat-value slower">{stats["slower"]}</div></div>',
+        f'<div class="stat-card"><span class="stat-label">Unchanged</span>'
+        f'<div class="stat-value neutral">{stats["unchanged"]}</div></div>',
+        f'<div class="stat-card"><span class="stat-label">No Baseline</span>'
+        f'<div class="stat-value neutral">{stats["no_baseline"]}</div></div>',
+    ]
+    return f'<div class="stats">{"".join(cards)}</div>'
+
+
+def _job_table(timings: dict, baseline: dict | None) -> str:
+    bl_map = {j["name"]: j for j in (baseline or {}).get("jobs", [])}
+    rows = []
+    for j in timings.get("jobs", []):
+        dur = j.get("duration_s")
+        bl = bl_map.get(j["name"])
+        bl_dur = bl.get("duration_s") if bl else None
+        delta_text, delta_cls = fmt_delta(dur, bl_dur)
+
+        name = escape(j["name"])
+        if j.get("workflow_name"):
+            name = f'{escape(j["workflow_name"])} / {escape(j["name"])}'
+
+        concl = j.get("conclusion", "")
+        concl_icon = {"success": "✓", "failure": "✗", "skipped": "⊘"}.get(concl, "?")
+        concl_cls = {"success": "faster", "failure": "slower", "skipped": "neutral"}.get(concl, "neutral")
+
+        rows.append(
+            f'<tr>'
+            f'<td class="job-name">{name}</td>'
+            f'<td class="num">{fmt_dur(dur)}</td>'
+            f'<td class="num">{fmt_dur(bl_dur)}</td>'
+            f'<td class="num {delta_cls}">{delta_text}</td>'
+            f'<td class="{concl_cls}" style="text-align:center">{concl_icon}</td>'
+            f'</tr>'
+        )
+
+    return (
+        '<table><thead><tr>'
+        '<th>Job</th><th class="num">Current</th><th class="num">Baseline</th>'
+        '<th class="num">Delta</th><th>Status</th>'
+        '</tr></thead><tbody>' + "".join(rows) + '</tbody></table>'
+    )
+
+
+def _step_details(timings: dict, baseline: dict | None) -> str:
+    bl_map = {j["name"]: j for j in (baseline or {}).get("jobs", [])}
+    blocks = []
+    for j in timings.get("jobs", []):
+        if not j.get("steps"):
+            continue
+        bl = bl_map.get(j["name"], {})
+        bl_steps = {s["name"]: s for s in bl.get("steps", [])}
+
+        dur = j.get("duration_s") or 0
+        bl_dur = bl.get("duration_s") if bl else None
+        delta_text, delta_cls = fmt_delta(dur, bl_dur)
+
+        summary_text = f'{escape(j["name"])} — {fmt_dur(dur)}'
+        if bl_dur is not None:
+            summary_text += f' <span class="{delta_cls}">({delta_text})</span>'
+
+        step_rows = []
+        for s in j["steps"]:
+            s_dur = s.get("duration_s")
+            bl_s = bl_steps.get(s["name"])
+            bl_s_dur = bl_s.get("duration_s") if bl_s else None
+            s_delta, s_cls = fmt_delta(s_dur, bl_s_dur)
+
+            step_rows.append(
+                f'<tr>'
+                f'<td>{escape(s["name"])}</td>'
+                f'<td class="num">{fmt_dur(s_dur)}</td>'
+                f'<td class="num">{fmt_dur(bl_s_dur)}</td>'
+                f'<td class="num {s_cls}">{s_delta}</td>'
+                f'</tr>'
+            )
+
+        blocks.append(
+            f'<details><summary>{summary_text}</summary>'
+            f'<table><thead><tr>'
+            '<th>Step</th><th class="num">Current</th><th class="num">Baseline</th>'
+            '<th class="num">Delta</th>'
+            f'</tr></thead><tbody>{"".join(step_rows)}</tbody></table>'
+            f'</details>'
+        )
+
+    return "".join(blocks) if blocks else '<p style="color:#8b949e">No step data available.</p>'
+
+
+def _regressions(timings: dict, baseline: dict | None) -> str:
+    """Show top 10 biggest absolute regressions/improvements across all steps."""
+    if not baseline:
+        return ""
+    bl_map = {j["name"]: j for j in baseline.get("jobs", [])}
+
+    deltas = []  # (abs_delta, job_name, step_name, current, baseline, is_slower)
+    for j in timings.get("jobs", []):
+        bl = bl_map.get(j["name"])
+        if not bl:
+            continue
+        bl_steps = {s["name"]: s for s in bl.get("steps", [])}
+        for s in j.get("steps", []):
+            bl_s = bl_steps.get(s["name"])
+            if not bl_s:
+                continue
+            cur = s.get("duration_s") or 0
+            bl_d = bl_s.get("duration_s") or 0
+            diff = cur - bl_d
+            if abs(diff) < 1.0:
+                continue
+            deltas.append((abs(diff), diff, j["name"], s["name"], cur, bl_d))
+
+    deltas.sort(key=lambda x: x[0], reverse=True)
+    top = deltas[:10]
+    if not top:
+        return ""
+
+    rows = []
+    for _, diff, job, step, cur, bl_d in top:
+        cls = "slower" if diff > 0 else "faster"
+        tag = f'<span class="tag {"slow" if diff > 0 else "fast"}">{"+" if diff > 0 else ""}{diff:.1f}s</span>'
+        rows.append(
+            f'<tr>'
+            f'<td class="job-name">{escape(job)}</td>'
+            f'<td>{escape(step)}</td>'
+            f'<td class="num">{fmt_dur(cur)}</td>'
+            f'<td class="num">{fmt_dur(bl_d)}</td>'
+            f'<td>{tag}</td>'
+            f'</tr>'
+        )
+
+    return (
+        '<div class="regressions">'
+        '<table><thead><tr>'
+        '<th>Job</th><th>Step</th><th class="num">Current</th><th class="num">Baseline</th>'
+        '<th>Delta</th>'
+        '</tr></thead><tbody>' + "".join(rows) + '</tbody></table>'
+        '</div>'
+    )
+
+
+def generate_html(timings: dict, baseline: dict | None = None) -> str:
+    stats = compute_stats(timings, baseline)
+
+    sha_short = (timings.get("head_sha") or "")[:7]
+    run_id = timings.get("run_id", "?")
+    created = timings.get("created_at", "")
+
+    bl_info = ""
+    if baseline:
+        bl_sha = (baseline.get("head_sha") or "")[:7]
+        bl_info = f' | Baseline: <code>{bl_sha}</code> (main)'
+
+    html = (
+        f'<!DOCTYPE html>\n<html lang="en">\n<head>\n'
+        f'<meta charset="utf-8">\n'
+        f'<meta name="viewport" content="width=device-width, initial-scale=1">\n'
+        f'<title>CI Timing Report — {sha_short}</title>\n'
+        f'<style>{CSS}</style>\n'
+        f'</head>\n<body>\n'
+        f'<h1>CI Timing Report</h1>\n'
+        f'<div class="meta">Run <code>{escape(run_id)}</code> | SHA <code>{sha_short}</code>'
+        f' | Generated {escape(created)}{bl_info}</div>\n'
+    )
+
+    html += '<h2>Global Stats</h2>\n'
+    html += _stats_cards(stats)
+
+    if baseline:
+        html += '<h2>Top Regressions & Improvements</h2>\n'
+        html += _regressions(timings, baseline)
+
+    html += '<h2>Gantt Chart</h2>\n'
+    html += _gantt_bars(timings, baseline)
+
+    html += '<h2>Per-Job Comparison</h2>\n'
+    html += _job_table(timings, baseline)
+
+    html += '<h2>Step Details</h2>\n'
+    html += _step_details(timings, baseline)
+
+    html += '</body>\n</html>\n'
+    return html
+
+
+# ---------------------------------------------------------------------------
+# Markdown summary for $GITHUB_STEP_SUMMARY
+# ---------------------------------------------------------------------------
+
+def generate_summary(timings: dict, baseline: dict | None = None) -> str:
+    stats = compute_stats(timings, baseline)
+    bl_map = {j["name"]: j for j in (baseline or {}).get("jobs", [])}
+
+    lines = ["## CI Timing Summary\n"]
+
+    # Global stats table
+    lines.append("| Metric | Current | Baseline | Delta |")
+    lines.append("|--------|---------|----------|-------|")
+
+    wall_d = ""
+    if stats["bl_wall"] is not None:
+        d, _ = fmt_delta(stats["wall"], stats["bl_wall"])
+        wall_d = d
+    lines.append(f"| Wall time | {fmt_dur(stats['wall'])} | {fmt_dur(stats['bl_wall'])} | {wall_d} |")
+
+    compute_d = ""
+    if stats["bl_compute"] is not None:
+        d, _ = fmt_delta(stats["compute"], stats["bl_compute"])
+        compute_d = d
+    lines.append(f"| Total compute | {fmt_dur(stats['compute'])} | {fmt_dur(stats['bl_compute'])} | {compute_d} |")
+
+    lines.append(f"| Jobs faster | {stats['faster']} | — | — |")
+    lines.append(f"| Jobs slower | {stats['slower']} | — | — |")
+    lines.append(f"| Jobs unchanged | {stats['unchanged']} | — | — |")
+    lines.append(f"| Jobs without baseline | {stats['no_baseline']} | — | — |")
+    lines.append("")
+
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def expect_env(var: str) -> str:
+    val = os.environ.get(var)
+    if not val:
+        raise ValueError(f"missing environment variable {var}")
+    return val
+
+def main():
+    parser = argparse.ArgumentParser(description="Collect CI timings and generate HTML report")
+    parser.add_argument("--from-json", help="Read timings from JSON instead of API")
+    parser.add_argument("--baseline", default="ci-timings-baseline.json",
+                        help="Baseline JSON path (default: ci-timings-baseline.json)")
+    parser.add_argument("--output", default="ci-timings-report.html",
+                        help="HTML output path (default: ci-timings-report.html)")
+    parser.add_argument("--json-out", default="ci-timings.json",
+                        help="JSON output path (default: ci-timings.json)")
+    parser.add_argument("--summary-out", default="ci-timings-summary.md",
+                        help="Markdown summary output path (default: ci-timings-summary.md)")
+    args = parser.parse_args()
+
+    # Collect or load timings
+    if args.from_json:
+        with open(args.from_json, encoding="utf-8") as f:
+            timings = json.load(f)
+    else:
+        token = expect_env("GITHUB_TOKEN")
+        repo = expect_env("GITHUB_REPOSITORY")
+        run_id = expect_env("GITHUB_RUN_ID")
+        head_sha = expect_env("GITHUB_SHA")
+
+    timings = collect_timings(token, repo, run_id, head_sha)
+
+    # Save JSON
+    with open(args.json_out, "w", encoding="utf-8") as f:
+        json.dump(timings, f, indent=2)
+    print(f"Saved timings to {args.json_out} ({len(timings.get('jobs', []))} jobs)")
+
+    # Load baseline
+    baseline = None
+    if os.path.exists(args.baseline):
+        with open(args.baseline, encoding="utf-8") as f:
+            baseline = json.load(f)
+        print(f"Loaded baseline from {args.baseline}")
+    else:
+        print(f"No baseline file at {args.baseline} — generating current-only report")
+
+    # Generate HTML
+    html = generate_html(timings, baseline)
+    with open(args.output, "w", encoding="utf-8") as f:
+        f.write(html)
+    print(f"Generated HTML report: {args.output}")
+
+    # Write summary
+    summary = generate_summary(timings, baseline)
+    with open(args.summary_out, "a", encoding="utf-8") as f:
+        f.write(summary)
+        print(f"Wrote summary to {args.summary_out}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/run_tests_parallel.py
+++ b/scripts/run_tests_parallel.py
@@ -58,7 +58,7 @@ _DEFAULT_ROOTS = ["tests"]
 #
 #   tests/e2e/         — .github/workflows/tests.yml :: e2e job
 #   tests/integration/ — historical; legacy --ignore flags
-#   tests/docker/      — .github/workflows/docker-publish.yml ::
+#   tests/docker/      — .github/workflows/docker.yml ::
 #                        build-amd64 job (runs against the freshly-loaded
 #                        nousresearch/hermes-agent:test image, via
 #                        ``HERMES_TEST_IMAGE`` so the fixture skips
@@ -81,7 +81,7 @@ _DURATIONS_FILE = "test_durations.json"


 def _count_tests(
-    files: List[Path], repo_root: Path, pytest_passthrough: List[str]
+    files: List[Path], repo_root: Path
 ) -> dict[Path, int]:
    """Run ``pytest --co -q`` once to count individual tests per file.

@@ -113,7 +113,6 @@ def _count_tests(
        "--co", "-q",
        *ignore_args,
        *[str(f) for f in files],
-        *pytest_passthrough,
    ]
    try:
        result = subprocess.run(
@@ -697,7 +696,7 @@ def main() -> int:
        return 1

    # Count individual tests per file via a single pytest --co pass.
-    test_counts = _count_tests(files, repo_root, pytest_passthrough)
+    test_counts = _count_tests(files, repo_root)
    total_tests = sum(test_counts.values())

    # Apply slicing if requested — distribute files across CI jobs by
--- a/tests/acp/test_events.py
+++ b/tests/acp/test_events.py
@@ -410,8 +410,8 @@ class TestSendUpdate:

        assert created["coro"] is not None
        assert created["coro"].cr_frame is None
-        # Only count warnings about THIS test's coroutine; other tests in the
-        # same xdist worker (or stdlib mock internals) may emit unrelated
+        # Only count warnings about THIS test's coroutine; other tests
+        #  may emit unrelated
        # "coroutine was never awaited" warnings that bleed through.
        runtime_warnings = [
            w for w in caught
--- a/tests/agent/test_async_utils.py
+++ b/tests/agent/test_async_utils.py
@@ -20,8 +20,7 @@ def _no_unawaited_warnings(caught, *, coro_name: str = "") -> bool:
    """Return True if no "X was never awaited" warning slipped through.

    When *coro_name* is provided, only warnings naming that coroutine are
-    counted — xdist workers may emit unrelated unawaited-coroutine warnings
-    (e.g. ``AsyncMockMixin._execute_mock_call``) from concurrent tests.
+    counted
    """
    bad = [
        w for w in caught
--- a/tests/agent/test_skill_commands_reload.py
+++ b/tests/agent/test_skill_commands_reload.py
@@ -39,10 +39,9 @@ def _write_skill(skills_dir: Path, name: str, description: str = "") -> Path:
 def hermes_home(monkeypatch):
    """Isolate HERMES_HOME for ``reload_skills`` tests.

-    Rather than popping cache-bearing modules from ``sys.modules`` (which
-    races against pytest-xdist's parallel workers), we monkeypatch the
-    module-level ``HERMES_HOME`` / ``SKILLS_DIR`` constants in place so the
-    isolation is local to this fixture's scope.
+    Rather than popping cache-bearing modules from ``sys.modules``,
+    we monkeypatch the module-level ``HERMES_HOME`` / ``SKILLS_DIR``
+    constants in place so the isolation is local to this fixture's scope.
    """
    td = tempfile.mkdtemp(prefix="hermes-reload-skills-")
    monkeypatch.setenv("HERMES_HOME", td)
--- a/tests/cli/test_cli_provider_resolution.py
+++ b/tests/cli/test_cli_provider_resolution.py
@@ -13,7 +13,7 @@ from hermes_cli import main as hermes_main
 # ---------------------------------------------------------------------------
 # Module isolation: _import_cli() wipes tools.* / cli / run_agent from
 # sys.modules so it can re-import cli fresh.  Without cleanup the wiped
-# modules leak into subsequent tests on the same xdist worker, breaking
+# modules leak into subsequent tests, breaking
 # mock patches that target "tools.file_tools._get_file_ops" etc.
 # ---------------------------------------------------------------------------

--- a/tests/cli/test_quick_commands.py
+++ b/tests/cli/test_quick_commands.py
@@ -184,8 +184,7 @@ class TestGatewayQuickCommands:
        from gateway.run import GatewayRunner

        # Ensure redaction is active regardless of host HERMES_REDACT_SECRETS state
-        # or test ordering (the module snapshots env at import time, so other
-        # tests in the same xdist worker can flip the flag).
+        # or test ordering
        monkeypatch.setattr("agent.redact._REDACT_ENABLED", True)

        runner = GatewayRunner.__new__(GatewayRunner)
--- a/tests/docker/conftest.py
+++ b/tests/docker/conftest.py
@@ -8,15 +8,13 @@ Override the image with ``HERMES_TEST_IMAGE`` env var to point at a pre-built
 image (faster local iteration); otherwise the ``built_image`` fixture builds
 the repo's Dockerfile once per session.

-Docker tests need longer timeouts than the suite default (30s), so every
-test under this directory is granted a 180s default via
-``pytest.mark.timeout`` applied at collection time.
 """
 from __future__ import annotations

 import os
 import shutil
 import subprocess
+import time
 from collections.abc import Iterator

 import pytest
@@ -43,11 +41,9 @@ def pytest_collection_modifyitems(config, items):  # noqa: D401 - pytest hook
    skip_docker = pytest.mark.skip(
        reason="Docker not available or daemon not running",
    )
-    extend_timeout = pytest.mark.timeout(180)
    for item in items:
        if "tests/docker/" not in str(item.fspath).replace(os.sep, "/"):
            continue
-        item.add_marker(extend_timeout)
        if not docker_ok:
            item.add_marker(skip_docker)

@@ -137,3 +133,181 @@ def docker_exec_sh(
    return docker_exec(
        container, "sh", "-c", command, user=user, timeout=timeout,
    )
+
+
+def wait_for_container_ready(
+    container: str,
+    *,
+    deadline_s: float = 30.0,
+    interval_s: float = 0.25,
+) -> None:
+    """Poll until the container has finished s6 cont-init (stage2 + reconcile).
+
+    The readiness signal is ``profile=default`` appearing in
+    ``/opt/data/logs/container-boot.log``, which the 02-reconcile-profiles
+    cont-init script writes on every boot. That log entry fires AFTER
+    stage2-hook.sh completes, so by the time it appears the full
+    cont-init chain (UID remap, chown, config seeding, skills sync,
+    browser discovery, config migration) has run.
+
+    Raises ``TimeoutError`` if the container never becomes ready — much
+    better than a fixed ``time.sleep()`` that either wastes time on fast
+    machines or flakes on slow ones.
+    """
+    end = time.monotonic() + deadline_s
+    while time.monotonic() < end:
+        r = docker_exec(
+            container,
+            "sh", "-c",
+            "cat /opt/data/logs/container-boot.log 2>/dev/null",
+            timeout=5,
+        )
+        if r.returncode == 0 and "profile=default" in r.stdout:
+            return
+        time.sleep(interval_s)
+    raise TimeoutError(
+        f"container {container} did not finish cont-init within {deadline_s}s"
+    )
+
+
+def start_container(
+    image: str,
+    name: str,
+    *env: str,
+    cmd: str = "sleep infinity",
+    timeout: int = 60,
+) -> str:
+    """Start a detached container and wait for cont-init to finish.
+
+    Args:
+        image: Docker image to run.
+        name: Container name (cleanup is the caller's responsibility —
+            typically handled by the ``container_name`` fixture).
+        env: Env vars as ``KEY=VALUE`` strings, each passed via ``-e``.
+        cmd: Container CMD (default ``sleep infinity``).
+        timeout: ``docker run`` subprocess timeout.
+
+    Returns the container name. Raises on ``docker run`` failure or if
+    the container never finishes cont-init within 30s.
+    """
+    args = ["docker", "run", "-d", "--name", name]
+    for e in env:
+        args.extend(["-e", e])
+    args.extend([image, *cmd.split()])
+    subprocess.run(args, check=True, capture_output=True, timeout=timeout)
+    wait_for_container_ready(name)
+    return name
+
+
+def restart_container(container: str, timeout: int = 60) -> None:
+    """Restart a container and wait for cont-init to finish.
+
+    Equivalent to ``docker restart <container>`` followed by
+    :func:`wait_for_container_ready`.
+
+    The readiness signal (``profile=default`` in
+    ``/opt/data/logs/container-boot.log``) is append-only and persists
+    across restarts, so we truncate it BEFORE restarting — otherwise
+    ``wait_for_container_ready`` would match the stale line from the
+    previous boot and return before cont-init runs on the new boot.
+    """
+    docker_exec(container, "sh", "-c",
+                "truncate -s 0 /opt/data/logs/container-boot.log 2>/dev/null || true",
+                user="root", timeout=5)
+    subprocess.run(
+        ["docker", "restart", container],
+        check=True, capture_output=True, timeout=timeout,
+    )
+    wait_for_container_ready(container)
+
+
+def poll_container(
+    container: str,
+    probe: str,
+    *,
+    deadline_s: float = 30.0,
+    interval_s: float = 0.5,
+    user: str = "hermes",
+) -> tuple[bool, str]:
+    """Repeatedly run ``probe`` inside the container until it exits 0 or
+    ``deadline_s`` elapses.
+
+    Returns ``(success, last_stdout)``. Useful for waiting on a process
+    to appear, a port to open, a file to contain a string, etc.
+    """
+    end = time.monotonic() + deadline_s
+    last = ""
+    while time.monotonic() < end:
+        r = docker_exec_sh(container, probe, user=user, timeout=10)
+        last = r.stdout
+        if r.returncode == 0:
+            return True, last
+        time.sleep(interval_s)
+    return False, last
+
+
+def wait_for_path(
+    container: str,
+    path: str,
+    *,
+    kind: str = "f",
+    deadline_s: float = 30.0,
+    interval_s: float = 0.25,
+) -> bool:
+    """Poll ``test -<kind> <path>`` inside the container until success or timeout.
+
+    ``kind`` is the ``test`` flag: ``'f'`` for file, ``'d'`` for directory,
+    ``'e'`` for existence. Returns ``True`` on success, ``False`` on timeout.
+    """
+    return poll_container(
+        container, f"test -{kind} {path}",
+        deadline_s=deadline_s, interval_s=interval_s,
+    )[0]
+
+
+def wait_for_log(
+    container: str,
+    log_path: str,
+    needle: str,
+    *,
+    deadline_s: float = 30.0,
+    interval_s: float = 0.25,
+) -> str:
+    """Poll until a log file inside the container contains ``needle``.
+
+    Returns the full log on success.
+    """
+    end = time.monotonic() + deadline_s
+    last = ""
+    while time.monotonic() < end:
+        r = docker_exec_sh(
+            container, f"cat {log_path} 2>/dev/null", timeout=5,
+        )
+        if r.returncode == 0:
+            last = r.stdout
+            if needle in last:
+                return last
+        time.sleep(interval_s)
+    raise AssertionError(f"Didn't see `{needle}` in {log_path} within {deadline_s} in container {container}")
+
+
+
+def wait_for_docker_logs(
+    container: str, needle: str, *, deadline_s: float = 30.0, interval_s: float = 0.5,
+) -> str:
+    """Poll ``docker logs`` until ``needle`` appears or deadline expires.
+
+    Returns the full docker logs on success.
+    """
+    end = time.monotonic() + deadline_s
+    last = ""
+    while time.monotonic() < end:
+        r = subprocess.run(
+            ["docker", "logs", container],
+            capture_output=True, text=True, timeout=10,
+        )
+        last = r.stdout + r.stderr
+        if needle in last:
+            return last
+        time.sleep(interval_s)
+    raise AssertionError(f"Didn't see `{needle}` in docker logs within {deadline_s} in container {container}")
--- a/tests/docker/test_config_migration.py
+++ b/tests/docker/test_config_migration.py
@@ -0,0 +1,69 @@
+"""Runtime smoke test for Docker config-schema migration on boot.
+
+Build the real image and verify: a config.yaml present in $HERMES_HOME
+is migrated by docker_config_migrate.py on boot, running as the hermes
+user.
+"""
+from __future__ import annotations
+
+from tests.docker.conftest import docker_exec, docker_exec_sh, start_container
+
+
+def test_config_migration_runs_on_boot(
+    built_image: str, container_name: str,
+) -> None:
+    """A config.yaml in $HERMES_HOME must be migrated on boot by
+    docker_config_migrate.py, running as the hermes user."""
+    # Start container
+    start_container(built_image, container_name)
+
+    # Verify config.yaml exists (should be seeded by stage2 if not present)
+    r = docker_exec_sh(
+        container_name,
+        "test -f /opt/data/config.yaml && echo EXISTS || echo MISSING",
+        timeout=10,
+    )
+    assert "EXISTS" in r.stdout, (
+        f"config.yaml not found in $HERMES_HOME: {r.stdout}"
+    )
+
+    # Verify the migration script exists in the image
+    r = docker_exec_sh(
+        container_name,
+        "test -f /opt/hermes/scripts/docker_config_migrate.py && "
+        "echo SCRIPT_EXISTS || echo SCRIPT_MISSING",
+        timeout=10,
+    )
+    assert "SCRIPT_EXISTS" in r.stdout, (
+        f"docker_config_migrate.py not found in image: {r.stdout}"
+    )
+
+    # Verify config.yaml is owned by hermes (migration ran as hermes)
+    r = docker_exec_sh(
+        container_name,
+        'stat -c "%U" /opt/data/config.yaml',
+        timeout=10,
+    )
+    assert r.stdout.strip() == "hermes", (
+        f"config.yaml not owned by hermes (migration may have run as root): "
+        f"{r.stdout.strip()}"
+    )
+
+
+def test_config_migration_opt_out_env_var_respected(
+    built_image: str, container_name: str,
+) -> None:
+    """HERMES_SKIP_CONFIG_MIGRATION=1 must skip the migration."""
+    start_container(
+        built_image, container_name, "HERMES_SKIP_CONFIG_MIGRATION=1",
+    )
+
+    # config.yaml should still be seeded (seeding is separate from migration)
+    r = docker_exec_sh(
+        container_name,
+        "test -f /opt/data/config.yaml && echo EXISTS || echo MISSING",
+        timeout=10,
+    )
+    assert "EXISTS" in r.stdout, (
+        f"config.yaml should be seeded even with migration skipped: {r.stdout}"
+    )
--- a/tests/docker/test_container_restart.py
+++ b/tests/docker/test_container_restart.py
@@ -21,7 +21,7 @@ import time

 import pytest

-from tests.docker.conftest import docker_exec, docker_exec_sh
+from tests.docker.conftest import docker_exec, docker_exec_sh, wait_for_path, wait_for_log, wait_for_docker_logs, poll_container


 def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]:
@@ -32,41 +32,8 @@ def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]:
    )


-def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
-    return docker_exec(container, *args, timeout=timeout)


-def _sh(container: str, cmd: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
-    return docker_exec_sh(container, cmd, timeout=timeout)
-
-
-def _wait_for_path(
-    container: str,
-    path: str,
-    *,
-    kind: str = "f",
-    deadline_s: float = 30.0,
-    interval_s: float = 0.25,
-) -> bool:
-    """Poll `test -<kind> <path>` inside container until success or timeout.
-
-    `kind` is the `test` flag: 'f' for file, 'd' for directory, 'e' for
-    existence. Returns True on success, False on timeout. Strictly
-    better than a fixed `time.sleep()` because:
-
-      * we don't wait the full budget when the path appears early, and
-      * the test fails with a precise "waited N seconds" assertion
-        instead of a confusing one-line failure mid-test when the
-        sleep was too short.
-    """
-    end = time.monotonic() + deadline_s
-    while time.monotonic() < end:
-        r = _sh(container, f"test -{kind} {path}", timeout=5)
-        if r.returncode == 0:
-            return True
-        time.sleep(interval_s)
-    return False
-

 def _wait_for_reconcile_log_mention(
    container: str,
@@ -76,23 +43,8 @@ def _wait_for_reconcile_log_mention(
    interval_s: float = 0.25,
 ) -> str:
    """Poll until /opt/data/logs/container-boot.log mentions `profile`.
-
-    Returns the matching log content on success. On timeout, returns
-    the last observed contents so the assertion can render a
-    meaningful diagnostic. The container-boot.log is the explicit
-    signal that the reconciler has finished — much more reliable
-    than a fixed sleep that hopes 8 seconds is enough.
    """
-    end = time.monotonic() + deadline_s
-    last = ""
-    while time.monotonic() < end:
-        r = _sh(container, "cat /opt/data/logs/container-boot.log", timeout=5)
-        if r.returncode == 0:
-            last = r.stdout
-            if f"profile={profile}" in last:
-                return last
-        time.sleep(interval_s)
-    return last
+    return wait_for_log(container, "/opt/data/logs/container-boot.log",  f"profile={profile}")


@pytest.fixture
@@ -117,23 +69,7 @@ def restart_container(request, built_image: str):
    # it starts issuing commands. The reconciler always writes one
    # 'default' line on every boot (PR #30136 item I1) — that's our
    # readiness signal.
-    deadline = time.monotonic() + 30.0
-    while time.monotonic() < deadline:
-        r = _docker(
-            "exec", "-u", "hermes", name, "sh", "-c",
-            "cat /opt/data/logs/container-boot.log 2>/dev/null",
-            timeout=5,
-        )
-        if r.returncode == 0 and "profile=default" in r.stdout:
-            break
-        time.sleep(0.25)
-    else:
-        # Defensive: surface a timeout from the fixture itself so the
-        # test failure points at "container never finished cont-init"
-        # rather than mid-test where the symptom would be obscure.
-        raise RuntimeError(
-            f"container {name} did not finish cont-init within 30s"
-        )
+    wait_for_log(name, "/opt/data/logs/container-boot.log", "profile=default")
    yield name
    _docker("rm", "-f", name)
    _docker("volume", "rm", "-f", volume)
@@ -145,20 +81,14 @@ def test_running_gateway_survives_container_restart(restart_container: str) -> N
    # Create the profile + start its gateway. The Phase 4 hooks
    # register the s6 service slot during create and the dispatch
    # path brings it up via s6-svc -u.
-    r = _exec(container, "hermes", "profile", "create", "coder")
+    r = docker_exec(container, "hermes", "profile", "create", "coder")
    assert r.returncode == 0, f"profile create failed: {r.stderr}"

-    r = _exec(container, "hermes", "-p", "coder", "gateway", "start", timeout=60)
+    r = docker_exec(container, "hermes", "-p", "coder", "gateway", "start", timeout=60)
    assert r.returncode == 0, f"gateway start failed: {r.stderr}"

    # Give the service time to actually come up under supervision.
-    deadline = time.monotonic() + 15.0
-    while time.monotonic() < deadline:
-        r = _sh(container, "/command/s6-svstat /run/service/gateway-coder")
-        if r.returncode == 0 and "up " in r.stdout:
-            break
-        time.sleep(0.5)
-    assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}"
+    poll_container(container, "/command/s6-svstat /run/service/gateway-coder | grep -q 'up '")

    # Persist state so the reconciler will treat the slot as 'running'
    # post-restart. The gateway process itself writes gateway_state.json
@@ -170,7 +100,7 @@ def test_running_gateway_survives_container_restart(restart_container: str) -> N
        "p = pathlib.Path('/opt/data/profiles/coder/gateway_state.json'); "
        "p.write_text(json.dumps({'gateway_state': 'running', 'timestamp': 1}))"
    )
-    _exec(container, "python3", "-c", write_state, timeout=10).check_returncode()
+    docker_exec(container, "python3", "-c", write_state, timeout=10).check_returncode()

    # Restart. After this, /run/service/ is empty until cont-init.d
    # runs the reconciler. We need to wait long enough for the
@@ -179,25 +109,22 @@ def test_running_gateway_survives_container_restart(restart_container: str) -> N
    # restored slot. Polling the boot log gives us the first signal.
    _docker("restart", container, timeout=60).check_returncode()
    log = _wait_for_reconcile_log_mention(container, "coder", deadline_s=30.0)
-    assert "profile=coder" in log, (
-        f"reconciler never logged coder after restart: {log!r}"
-    )
    assert "action=started" in log

    # Service slot exists.
-    assert _wait_for_path(
+    assert wait_for_path(
        container, "/run/service/gateway-coder", kind="d", deadline_s=10.0,
    ), "slot not recreated after restart"

    # No `down` marker — we asked for auto-start.
-    r = _sh(container, "test -f /run/service/gateway-coder/down")
+    r = docker_exec_sh(container, "test -f /run/service/gateway-coder/down")
    assert r.returncode != 0, "down marker present despite prior_state=running"


 def test_stopped_gateway_stays_stopped_after_restart(restart_container: str) -> None:
    container = restart_container

-    _exec(container, "hermes", "profile", "create", "writer").check_returncode()
+    docker_exec(container, "hermes", "profile", "create", "writer").check_returncode()

    # Write 'stopped' directly so we don't have to race against the
    # gateway's own state writes.
@@ -206,19 +133,18 @@ def test_stopped_gateway_stays_stopped_after_restart(restart_container: str) ->
        "p = pathlib.Path('/opt/data/profiles/writer/gateway_state.json'); "
        "p.write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1}))"
    )
-    _exec(container, "python3", "-c", write_state, timeout=10).check_returncode()
+    docker_exec(container, "python3", "-c", write_state, timeout=10).check_returncode()

    _docker("restart", container, timeout=60).check_returncode()
-    log = _wait_for_reconcile_log_mention(container, "writer", deadline_s=30.0)
-    assert "profile=writer" in log
+    _wait_for_reconcile_log_mention(container, "writer", deadline_s=30.0)

    # Slot exists.
-    assert _wait_for_path(
+    assert wait_for_path(
        container, "/run/service/gateway-writer", kind="d", deadline_s=10.0,
    )

    # Down marker present.
-    r = _sh(container, "test -f /run/service/gateway-writer/down")
+    r = docker_exec_sh(container, "test -f /run/service/gateway-writer/down")
    assert r.returncode == 0, "down marker missing despite prior_state=stopped"


@@ -229,7 +155,7 @@ def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None
    process-mismatch checks."""
    container = restart_container

-    _exec(container, "hermes", "profile", "create", "ghost").check_returncode()
+    docker_exec(container, "hermes", "profile", "create", "ghost").check_returncode()

    # Stamp stale runtime files alongside a 'running' state so the
    # reconciler walks this profile.
@@ -240,15 +166,15 @@ def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None
        "(p / 'gateway.pid').write_text(json.dumps({'pid': 99999, 'host': 'old'})); "
        "(p / 'processes.json').write_text('[]')"
    )
-    _exec(container, "python3", "-c", stamp, timeout=10).check_returncode()
+    docker_exec(container, "python3", "-c", stamp, timeout=10).check_returncode()

    _docker("restart", container, timeout=60).check_returncode()
    _wait_for_reconcile_log_mention(container, "ghost", deadline_s=30.0)

    # Stale runtime files swept.
-    r = _sh(container, "test -f /opt/data/profiles/ghost/gateway.pid")
+    r = docker_exec_sh(container, "test -f /opt/data/profiles/ghost/gateway.pid")
    assert r.returncode != 0, "stale gateway.pid survived restart"
-    r = _sh(container, "test -f /opt/data/profiles/ghost/processes.json")
+    r = docker_exec_sh(container, "test -f /opt/data/profiles/ghost/processes.json")
    assert r.returncode != 0, "stale processes.json survived restart"


@@ -271,37 +197,20 @@ def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp(
    """
    container = restart_container

-    _exec(container, "hermes", "profile", "create", "live").check_returncode()
-    r = _exec(container, "hermes", "-p", "live", "gateway", "start", timeout=60)
+    docker_exec(container, "hermes", "profile", "create", "live").check_returncode()
+    r = docker_exec(container, "hermes", "-p", "live", "gateway", "start", timeout=60)
    assert r.returncode == 0, f"gateway start failed: {r.stderr}"

    # Wait for the gateway to actually come up under supervision AND write
    # its own gateway_state=running (we do NOT stamp it ourselves).
-    deadline = time.monotonic() + 20.0
-    while time.monotonic() < deadline:
-        r = _sh(container, "/command/s6-svstat /run/service/gateway-live")
-        if r.returncode == 0 and "up " in r.stdout:
-            break
-        time.sleep(0.5)
-    assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}"
+    poll_container(container, "/command/s6-svstat /run/service/gateway-live |  grep -q 'up '")

-    # Confirm the gateway persisted its own 'running' state (sanity: we're
-    # testing the real write path, not a stamped fixture).
-    deadline = time.monotonic() + 15.0
-    state = ""
-    while time.monotonic() < deadline:
-        r = _sh(
-            container,
-            "cat /opt/data/profiles/live/gateway_state.json 2>/dev/null",
-        )
-        if r.returncode == 0 and '"gateway_state"' in r.stdout:
-            state = r.stdout
-            if '"running"' in state:
-                break
-        time.sleep(0.5)
-    assert '"running"' in state, (
-        f"gateway never persisted running state pre-restart: {state!r}"
-    )
+    # Confirm the gateway persisted its own 'running' state. The gateway has
+    # to boot Python, discover ~50 plugins, construct GatewayRunner, and
+    # reach write_runtime_status("running") at run.py start() — on a loaded
+    # CI runner with parallel docker test containers competing for CPU, this
+    # can take a while.
+    wait_for_log(container, "/opt/data/profiles/live/gateway_state.json", '"running"', deadline_s=45, interval_s=1)

    # Real restart — Docker sends SIGTERM to PID 1; s6 propagates it to the
    # supervised gateway. No planned-stop marker is written (this is not an
@@ -309,9 +218,6 @@ def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp(
    _docker("restart", container, timeout=60).check_returncode()

    log = _wait_for_reconcile_log_mention(container, "live", deadline_s=30.0)
-    assert "profile=live" in log, (
-        f"reconciler never logged live after restart: {log!r}"
-    )
    # The crux: the reconciler must AUTO-START it, not register it down.
    assert "action=started" in log, (
        f"gateway did NOT auto-start after a real restart (issue #42675 "
@@ -319,10 +225,10 @@ def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp(
    )

    # Slot recreated, and NO down marker (we expect auto-start).
-    assert _wait_for_path(
+    assert wait_for_path(
        container, "/run/service/gateway-live", kind="d", deadline_s=10.0,
    ), "slot not recreated after restart"
-    r = _sh(container, "test -f /run/service/gateway-live/down")
+    r = docker_exec_sh(container, "test -f /run/service/gateway-live/down")
    assert r.returncode != 0, (
        "down marker present despite a live gateway being restarted — "
        "the signal-initiated shutdown wrongly persisted 'stopped' (#42675)"
--- a/tests/docker/test_dashboard.py
+++ b/tests/docker/test_dashboard.py
@@ -13,39 +13,16 @@ the realistic runtime context. See the conftest module docstring.
 from __future__ import annotations

 import json
-import subprocess
 import time

-from tests.docker.conftest import docker_exec, docker_exec_sh
-
-
-def _poll(container: str, probe: str, *, deadline_s: float = 30.0,
-          interval_s: float = 0.5) -> tuple[bool, str]:
-    """Repeatedly run ``probe`` inside the container until it exits 0 or
-    ``deadline_s`` elapses. Returns (success, last stdout)."""
-    end = time.monotonic() + deadline_s
-    last = ""
-    while time.monotonic() < end:
-        r = docker_exec_sh(container, probe, timeout=10)
-        last = r.stdout
-        if r.returncode == 0:
-            return True, last
-        time.sleep(interval_s)
-    return False, last
+from tests.docker.conftest import docker_exec, docker_exec_sh, start_container, poll_container


 def test_dashboard_not_running_by_default(
    built_image: str, container_name: str,
 ) -> None:
    """Without HERMES_DASHBOARD, no dashboard process should be running."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "60"],
-        check=True, capture_output=True, timeout=30,
-    )
-    # Give the entrypoint enough time to finish bootstrap; if a dashboard
-    # were going to start it'd be visible by now.
-    time.sleep(5)
+    start_container(built_image, container_name, cmd="sleep 60")
    r = docker_exec(container_name, "pgrep", "-f", "hermes dashboard")
    # pgrep exits non-zero when no match found
    assert r.returncode != 0, (
@@ -64,12 +41,7 @@ def test_dashboard_slot_reports_down_when_disabled(
    writes a `down` marker file in the live service-dir when
    HERMES_DASHBOARD is unset, so the slot reflects reality.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "60"],
-        check=True, capture_output=True, timeout=30,
-    )
-    time.sleep(5)
+    start_container(built_image, container_name, cmd="sleep 60")
    # /command/ isn't on PATH for docker-exec sessions, so call by
    # absolute path.
    r = docker_exec(
@@ -86,56 +58,42 @@ def test_dashboard_slot_reports_up_when_enabled(
    built_image: str, container_name: str,
 ) -> None:
    """Symmetry: with HERMES_DASHBOARD=1, s6-svstat reports the slot as up."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1",
-         # The default dashboard host is 0.0.0.0, which now engages the
-         # OAuth auth gate. Without a provider registered (no
-         # HERMES_DASHBOARD_OAUTH_CLIENT_ID in this test env), start_server
-         # would fail closed and the slot would never come up. Pin the
-         # explicit insecure opt-in to keep this test focused on the s6
-         # supervision contract, not the auth gate.
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
-         built_image, "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
+    # The default dashboard host is 0.0.0.0, which now engages the
+    # OAuth auth gate. Without a provider registered (no
+    # HERMES_DASHBOARD_OAUTH_CLIENT_ID in this test env), start_server
+    # would fail closed and the slot would never come up. Pin the
+    # explicit insecure opt-in to keep this test focused on the s6
+    # supervision contract, not the auth gate.
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
+        "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
+        cmd="sleep 120",
    )
    # uvicorn takes a moment to bind; poll svstat.
-    deadline = time.monotonic() + 30.0
-    last = ""
-    while time.monotonic() < deadline:
-        r = docker_exec(
-            container_name, "/command/s6-svstat", "/run/service/dashboard",
-        )
-        last = r.stdout
-        if r.returncode == 0 and "up " in r.stdout:
-            return  # success
-        time.sleep(0.5)
-    raise AssertionError(
-        f"Dashboard slot never reached up state; last svstat: {last!r}"
-    )
+    poll_container(container_name, "/command/s6-svstat /run/service/dashboard | grep -q 'up '")


 def test_dashboard_opt_in_starts(
    built_image: str, container_name: str,
 ) -> None:
    """With HERMES_DASHBOARD=1, a dashboard process should be visible."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1",
-         # Default bind is 0.0.0.0, which engages the auth gate. Register the
-         # bundled basic password provider so the gate has a provider and the
-         # dashboard binds (vs fail-closed). Keeps the test focused on s6
-         # supervision, not auth.
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
-         built_image, "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
+    # Default bind is 0.0.0.0, which engages the auth gate. Register the
+    # bundled basic password provider so the gate has a provider and the
+    # dashboard binds (vs fail-closed). Keeps the test focused on s6
+    # supervision, not auth.
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
+        "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
+        cmd="sleep 120",
    )
    # Poll for the dashboard subprocess to appear — the entrypoint
    # backgrounds it and bootstrap (skills sync etc.) can take a few
    # seconds before the python process actually launches.
-    ok, _ = _poll(
+    ok, _ = poll_container(
        container_name, "pgrep -f 'hermes dashboard'", deadline_s=30.0,
    )
    assert ok, "Dashboard should be running with HERMES_DASHBOARD=1"
@@ -145,22 +103,22 @@ def test_dashboard_port_override(
    built_image: str, container_name: str,
 ) -> None:
    """HERMES_DASHBOARD_PORT changes the dashboard's listen port."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1", "-e", "HERMES_DASHBOARD_PORT=9120",
-         # Default bind is 0.0.0.0; register the basic password provider so
-         # the auth gate has a provider and the dashboard binds. See
-         # test_dashboard_slot_reports_up_when_enabled for the full rationale.
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
-         built_image, "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
+    # Default bind is 0.0.0.0; register the basic password provider so
+    # the auth gate has a provider and the dashboard binds. See
+    # test_dashboard_slot_reports_up_when_enabled for the full rationale.
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        "HERMES_DASHBOARD_PORT=9120",
+        "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
+        "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
+        cmd="sleep 120",
    )
    # The dashboard process appearing in pgrep doesn't mean it's bound
    # to the port yet — uvicorn takes another second or two to come up.
    # The image doesn't ship ss/netstat, so probe /proc/net/tcp directly:
    # port 9120 = 0x23A0, state 0A = LISTEN.
-    ok, stdout = _poll(
+    ok, stdout = poll_container(
        container_name,
        "grep -E ' 0+:23A0 .* 0A ' /proc/net/tcp /proc/net/tcp6 "
        "2>/dev/null",
@@ -180,20 +138,19 @@ def test_dashboard_restarts_after_crash(
    dashboard runs as a longrun s6-rc service and s6-supervise restarts
    it after a ~1s backoff (the default).
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1",
-         # Default bind is 0.0.0.0; register the basic password provider so
-         # the auth gate has a provider and the supervised dashboard binds.
-         # See test_dashboard_slot_reports_up_when_enabled for the full
-         # rationale.
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
-         "-e", "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
-         built_image, "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
+    # Default bind is 0.0.0.0; register the basic password provider so
+    # the auth gate has a provider and the supervised dashboard binds.
+    # See test_dashboard_slot_reports_up_when_enabled for the full
+    # rationale.
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        "HERMES_DASHBOARD_BASIC_AUTH_USERNAME=admin",
+        "HERMES_DASHBOARD_BASIC_AUTH_PASSWORD=test-dashboard-pw",
+        cmd="sleep 120",
    )
    # Wait for the first dashboard to come up.
-    ok, _ = _poll(
+    ok, _ = poll_container(
        container_name, "pgrep -f 'hermes dashboard'", deadline_s=30.0,
    )
    assert ok, "Dashboard never started initially"
@@ -338,13 +295,12 @@ def test_dashboard_oauth_gate_engages_on_non_loopback_bind(
       responds 200 without a cookie under both gates, so it cannot
       distinguish "gate on" from "gate off".
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1",
-         "-e", "HERMES_DASHBOARD_HOST=0.0.0.0",
-         "-e", "HERMES_DASHBOARD_OAUTH_CLIENT_ID=agent:test-instance",
-         built_image, "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        "HERMES_DASHBOARD_HOST=0.0.0.0",
+        "HERMES_DASHBOARD_OAUTH_CLIENT_ID=agent:test-instance",
+        cmd="sleep 120",
    )

    # (1) Provider registry visible via the public bootstrap endpoint.
@@ -398,18 +354,17 @@ def test_dashboard_insecure_env_var_no_longer_bypasses_gate(
    public-dashboard escape hatch is gone: there is no env that serves the
    dashboard on a public bind without an auth provider.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1",
-         "-e", "HERMES_DASHBOARD_HOST=0.0.0.0",
-         "-e", "HERMES_DASHBOARD_INSECURE=1",
-         built_image, "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        "HERMES_DASHBOARD_HOST=0.0.0.0",
+        "HERMES_DASHBOARD_INSECURE=1",
+        cmd="sleep 120",
    )
    # Fail-closed: the dashboard process must NOT successfully serve. Probe
    # for a few seconds; /api/status should never become reachable because
    # start_server raised SystemExit before binding.
-    ok, _ = _poll(
+    ok, _ = poll_container(
        container_name,
        "curl -fsS -m 2 http://127.0.0.1:9119/api/status >/dev/null 2>&1",
        deadline_s=12.0,
--- a/tests/docker/test_docker_exec_privilege_drop.py
+++ b/tests/docker/test_docker_exec_privilege_drop.py
@@ -22,6 +22,7 @@ These tests verify:
 """

 from __future__ import annotations
+from tests.docker.conftest import docker_exec

 import subprocess
 import time
@@ -36,8 +37,8 @@ _RUN_READY_TIMEOUT_S = 20

 def _wait_for_init(container: str) -> None:
    """Block until /init is up enough that `docker exec` is responsive."""
-    deadline = time.time() + _RUN_READY_TIMEOUT_S
-    while time.time() < deadline:
+    deadline = time.monotonic() + _RUN_READY_TIMEOUT_S
+    while time.monotonic() < deadline:
        r = subprocess.run(
            ["docker", "exec", container, "true"],
            capture_output=True, timeout=5,
@@ -287,4 +288,4 @@ def test_e2e_login_then_supervised_gateway_can_read_auth(
        "Files written by `docker exec` are unreadable to the hermes user "
        f"(supervised gateway UID): {unreadable}. The shim failed to drop "
        "privileges before the write."
-    )
+    )
--- a/tests/docker/test_dump_build_sha.py
+++ b/tests/docker/test_dump_build_sha.py
@@ -6,7 +6,7 @@ fails inside the published image and ``hermes dump`` used to report
 ``$HERMES_GIT_SHA`` build-arg to ``/opt/hermes/.hermes_build_sha`` and
 ``hermes_cli/build_info.py`` reads it as a fallback.

-CI (``.github/workflows/docker-publish.yml``) always sets the build-arg
+CI (``.github/workflows/docker.yml``) always sets the build-arg
 to ``${{ github.sha }}``.  Local ``docker build`` (the ``built_image``
 fixture in ``tests/docker/conftest.py``) does NOT — so locally the file
 is absent and ``hermes dump`` correctly falls back to ``(unknown)``.
--- a/tests/docker/test_gateway_bootstrap_state.py
+++ b/tests/docker/test_gateway_bootstrap_state.py
@@ -0,0 +1,157 @@
+"""Runtime smoke tests for Docker gateway_state.json bootstrap seeding.
+
+Build the real image and verify the actual runtime behavior:
+
+  1. HERMES_GATEWAY_BOOTSTRAP_STATE=running on a fresh volume seeds
+     gateway_state.json with running state
+  2. An existing gateway_state.json is never clobbered (first-boot-only)
+  3. No env var = no seed (default down-on-first-boot preserved)
+  4. Only literal "running" is honored; other values are ignored
+"""
+from __future__ import annotations
+
+import json
+import subprocess
+
+from tests.docker.conftest import docker_exec, docker_exec_sh, wait_for_container_ready
+
+
+def _start_container(
+    built_image: str, name: str, *env: str,
+) -> str:
+    """Start a container with given env vars, return its name."""
+    args = ["docker", "run", "-d", "--name", name]
+    for e in env:
+        args.extend(["-e", e])
+    args.extend([built_image, "sleep", "infinity"])
+    subprocess.run(args, check=True, capture_output=True, timeout=60)
+    wait_for_container_ready(name)
+    return name
+
+
+def test_seeds_running_state_on_blank_volume(
+    built_image: str, container_name: str,
+) -> None:
+    """HERMES_GATEWAY_BOOTSTRAP_STATE=running on a fresh volume must
+    seed gateway_state.json with a valid running state."""
+    _start_container(
+        built_image, container_name,
+        "HERMES_GATEWAY_BOOTSTRAP_STATE=running",
+    )
+
+    r = docker_exec_sh(
+        container_name,
+        "cat /opt/data/gateway_state.json 2>/dev/null || echo NONE",
+        timeout=10,
+    )
+    assert r.stdout.strip() != "NONE", (
+        f"gateway_state.json not seeded on fresh volume: {r.stdout}"
+    )
+    state = json.loads(r.stdout.strip())
+    assert state.get("gateway_state") == "running", (
+        f"expected gateway_state=running, got: {state}"
+    )
+
+
+def test_does_not_clobber_existing_state(
+    built_image: str, container_name: str,
+) -> None:
+    """An existing gateway_state.json must never be overwritten by the
+    seed, even when the bootstrap env var says running.
+
+    We use a named volume so we can pre-create the state file before
+    the container boots. The [ ! -f ] guard in stage2 must skip seeding
+    because the file already exists. We check the file immediately after
+    boot — before the gateway service has a chance to write its own
+    state — by reading it as fast as possible after container start.
+    """
+    import json as _json
+
+    volume = f"{container_name}-vol"
+    subprocess.run(
+        ["docker", "volume", "create", volume],
+        check=True, capture_output=True, timeout=10,
+    )
+
+    # Pre-create the state file via a throwaway container
+    existing = _json.dumps({"gateway_state": "stopped", "pid": 123})
+    subprocess.run(
+        ["docker", "run", "--rm", "-v", f"{volume}:/opt/data",
+         "--entrypoint", "sh", built_image,
+         "-c", f"printf '{existing}\\n' > /opt/data/gateway_state.json"],
+        check=True, capture_output=True, timeout=30,
+    )
+
+    # Boot with the env var set — stage2 must NOT clobber the existing file
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name,
+         "-v", f"{volume}:/opt/data",
+         "-e", "HERMES_GATEWAY_BOOTSTRAP_STATE=running",
+         built_image, "sleep", "infinity"],
+        check=True, capture_output=True, timeout=60,
+    )
+    # Read the file as quickly as possible — the gateway service may
+    # start and write its own state, but the stage2 [ ! -f ] guard runs
+    # during cont-init (before any service starts), so the file must
+    # still be our "stopped" state at this point.
+    wait_for_container_ready(container_name)
+    r = docker_exec_sh(
+        container_name, "cat /opt/data/gateway_state.json", timeout=10,
+    )
+    state = _json.loads(r.stdout.strip())
+    assert state.get("gateway_state") == "stopped", (
+        f"existing state was clobbered by bootstrap seed: {state}"
+    )
+
+    # Cleanup
+    subprocess.run(
+        ["docker", "rm", "-f", container_name],
+        capture_output=True, timeout=10,
+    )
+    subprocess.run(
+        ["docker", "volume", "rm", "-f", volume],
+        capture_output=True, timeout=10,
+    )
+
+
+def test_no_seed_when_env_unset(
+    built_image: str, container_name: str,
+) -> None:
+    """No HERMES_GATEWAY_BOOTSTRAP_STATE = no seed file written."""
+    _start_container(built_image, container_name)
+
+    r = docker_exec_sh(
+        container_name,
+        "test -f /opt/data/gateway_state.json && "
+        "echo EXISTS || echo ABSENT",
+        timeout=10,
+    )
+    assert "ABSENT" in r.stdout, (
+        f"gateway_state.json was seeded without the env var: {r.stdout}"
+    )
+
+
+def test_non_running_value_ignored(
+    built_image: str, container_name: str,
+) -> None:
+    """Only literal 'running' is honored; any other value is ignored."""
+    for bogus in ("stopped", "Running", "1", "true", "starting"):
+        # Need a fresh container per iteration
+        name = f"{container_name}-{bogus}"
+        _start_container(
+            built_image, name,
+            f"HERMES_GATEWAY_BOOTSTRAP_STATE={bogus}",
+        )
+        r = docker_exec_sh(
+            name,
+            "test -f /opt/data/gateway_state.json && "
+            "echo EXISTS || echo ABSENT",
+            timeout=10,
+        )
+        assert "ABSENT" in r.stdout, (
+            f"bogus value {bogus!r} should not seed a state file: {r.stdout}"
+        )
+        subprocess.run(
+            ["docker", "rm", "-f", name],
+            capture_output=True, timeout=10,
+        )
--- a/tests/docker/test_gateway_run_supervised.py
+++ b/tests/docker/test_gateway_run_supervised.py
@@ -23,15 +23,15 @@ from __future__ import annotations
 import subprocess
 import time

-from tests.docker.conftest import docker_exec_sh
-
-
-def _sh(container: str, command: str, timeout: int = 30):
-    return docker_exec_sh(container, command, timeout=timeout)
+from tests.docker.conftest import (
+    docker_exec_sh,
+    start_container,
+    wait_for_docker_logs,
+)


 def _svstat(container: str, slot: str = "gateway-default") -> str:
-    r = _sh(container, f"/command/s6-svstat /run/service/{slot}")
+    r = docker_exec_sh(container, f"/command/s6-svstat /run/service/{slot}")
    return r.stdout if r.returncode == 0 else ""


@@ -46,6 +46,43 @@ def _svstat_wants_up(container: str, slot: str = "gateway-default") -> bool:
    return "want up" in state


+def _wait_for_gateway_or_exit(
+    container: str,
+    *,
+    deadline_s: float = 60.0,
+) -> str:
+    """Poll until the container is either running a foreground gateway
+    process or has exited.  Returns the final container status.
+
+    Used by the ``--no-supervise`` tests where the gateway runs as the
+    CMD process (not supervised by s6).  Under CI load the gateway can
+    take well over 6s to finish Python imports and reach the gateway
+    entrypoint — a fixed ``time.sleep(6)`` races.  Polling for
+    ``pgrep -f 'hermes.*gateway'`` (the gateway is running) or
+    ``docker inspect`` returning ``exited`` is both faster on quick
+    machines and flake-free on slow ones.
+    """
+    end = time.monotonic() + deadline_s
+    while time.monotonic() < end:
+        r = subprocess.run(
+            ["docker", "inspect", "-f", "{{.State.Status}}", container],
+            capture_output=True, text=True, timeout=10,
+        )
+        status = r.stdout.strip()
+        if status == "exited":
+            return "exited"
+        if status == "running":
+            # Check if the gateway process is actually running in the
+            # foreground (the no-supervise path).  If it is, we're done.
+            pgrep = docker_exec_sh(
+                container, "pgrep -f 'hermes.*gateway' >/dev/null 2>&1",
+            )
+            if pgrep.returncode == 0:
+                return "running"
+        time.sleep(0.5)
+    return status
+
+
 def test_gateway_run_redirects_to_supervised(
    built_image: str, container_name: str,
 ) -> None:
@@ -64,15 +101,27 @@ def test_gateway_run_redirects_to_supervised(
    # exit immediately (which is what would happen pre-this-PR on the
    # s6 image — the foreground gateway would crash without config,
    # the CMD would exit, /init would shut down).
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "gateway", "run"],
-        check=True, capture_output=True, timeout=30,
-    )
+    start_container(built_image, container_name, cmd="gateway run")

-    # Give /init time to run cont-init.d, the wrapper time to dispatch
-    # the redirect, and s6-supervise time to spin up the slot.
-    time.sleep(5)
+    # Wait for the redirect breadcrumb to appear in docker logs.
+    # Under heavy parallel load (32-way docker test fan-out), the CMD
+    # process (main-wrapper.sh → python → hermes gateway run) can take
+    # well over 5s to reach the redirect logic. The breadcrumb is the
+    # definitive signal that the redirect fired — polling for it is
+    # both faster on quick machines and flake-free on slow ones.
+    # Under heavy parallel docker load (32-way fan-out), the CMD process
+    # (main-wrapper.sh → python → hermes gateway run) can take well over
+    # 30s to import the codebase, load config, and reach the redirect
+    # logic. 60s matches the deadline other boot-readiness polls use.
+    logs = wait_for_docker_logs(
+        container_name, "s6 supervision", deadline_s=60.0,
+    )
+    assert "s6 supervision" in logs, (
+        f"expected loud breadcrumb in docker logs; got:\n{logs}"
+    )
+    assert "--no-supervise" in logs, (
+        f"breadcrumb missing opt-out hint; got:\n{logs}"
+    )

    # Container should still be running. If the redirect didn't fire,
    # the foreground gateway would have crashed and the container
@@ -83,7 +132,7 @@ def test_gateway_run_redirects_to_supervised(
    )
    assert r.returncode == 0 and r.stdout.strip() == "running", (
        f"container exited prematurely: {r.stdout!r}; "
-        f"docker logs:\n{subprocess.run(['docker', 'logs', container_name], capture_output=True, text=True).stdout}"
+        f"docker logs:\n{logs}"
    )

    # s6's intent for the default-profile gateway slot should be up.
@@ -96,26 +145,24 @@ def test_gateway_run_redirects_to_supervised(
    )

    # The CMD process (PID under /init that the wrapper exec'd into)
-    # should be sleeping, not the gateway. We grep `ps` for the
-    # `sleep infinity` heartbeat.
-    r = _sh(container_name, "ps -eo pid,cmd | grep -v grep | grep 'sleep infinity'")
-    assert r.returncode == 0 and "sleep infinity" in r.stdout, (
-        f"expected `sleep infinity` heartbeat process; got ps:\n{r.stdout}\n"
-        f"stderr: {r.stderr}"
+    # should be sleeping, not the gateway. We count `sleep infinity`
+    # processes parented to the CMD wrapper (main-wrapper.sh / rc.init
+    # top), NOT the static main-hermes service's sleep — a bare grep
+    # for `sleep infinity` would false-positive on the main-hermes
+    # sleep and pass even before the redirect fires.
+    r = docker_exec_sh(
+        container_name,
+        "ps -eo pid,ppid,cmd | grep -v grep | awk "
+        "'/main-wrapper.sh|rc.init top/ { wrapper_pid=$1 } "
+        "$3==\"sleep\" && $4==\"infinity\" && $2==wrapper_pid { c++ } "
+        "END { print c+0 }'",
    )
-
-    # And the loud breadcrumb should be in `docker logs` so users see
-    # the upgrade explanation.
-    r = subprocess.run(
-        ["docker", "logs", container_name],
-        capture_output=True, text=True, timeout=10,
-    )
-    logs = r.stdout + r.stderr
-    assert "s6 supervision" in logs, (
-        f"expected loud breadcrumb in docker logs; got:\n{logs}"
-    )
-    assert "--no-supervise" in logs, (
-        f"breadcrumb missing opt-out hint; got:\n{logs}"
+    assert r.returncode == 0
+    redirected_sleeps = int(r.stdout.strip() or 0)
+    assert redirected_sleeps == 1, (
+        f"expected one `sleep infinity` heartbeat parented to the CMD "
+        f"wrapper (the redirect); found {redirected_sleeps}. "
+        f"ps:\n{docker_exec_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
    )


@@ -139,25 +186,13 @@ def test_gateway_run_no_supervise_flag_preserves_legacy_behavior(
      * The ``gateway-default`` s6 service slot is NOT created.
      * No supervision-redirect breadcrumb appears in docker logs.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "gateway", "run", "--no-supervise"],
-        check=True, capture_output=True, timeout=30,
-    )
-    # Give startup time. The unconfigured-profile case used to fail
-    # fast; with a config bind-mounted profile (and a real volume on
-    # most realistic deployments) the gateway just runs.
-    time.sleep(6)
+    start_container(built_image, container_name, cmd="gateway run --no-supervise")

-    # Container should still be running OR have exited cleanly with
-    # the gateway's status code. Either is correct for pre-s6
-    # semantics — what's NOT correct is the supervised behavior
-    # (sleep infinity heartbeat + supervised gateway slot).
-    inspect = subprocess.run(
-        ["docker", "inspect", "-f", "{{.State.Status}}", container_name],
-        capture_output=True, text=True, timeout=10,
-    )
-    status = inspect.stdout.strip()
+    # Wait for the gateway to start in the foreground or the container
+    # to exit (no-config crash is also valid pre-s6 semantics).
+    # A fixed time.sleep(6) races under CI parallel docker load —
+    # the gateway can take well over 6s to finish Python imports.
+    status = _wait_for_gateway_or_exit(container_name, deadline_s=60.0)

    # No redirect breadcrumb anywhere.
    logs = subprocess.run(
@@ -175,7 +210,7 @@ def test_gateway_run_no_supervise_flag_preserves_legacy_behavior(
    if status == "running":
        # Gateway running in foreground — the CMD process should be
        # the gateway itself, NOT a sleep-infinity heartbeat.
-        r = _sh(
+        r = docker_exec_sh(
            container_name,
            "ps -eo pid,ppid,cmd | grep -v grep | awk '/main-wrapper.sh|rc.init top/ { wrapper_pid=$1 } "
            "$3==\"sleep\" && $4==\"infinity\" && $2==wrapper_pid { c++ } END { print c+0 }'",
@@ -186,7 +221,7 @@ def test_gateway_run_no_supervise_flag_preserves_legacy_behavior(
            f"--no-supervise: expected NO `sleep infinity` parented to "
            f"the CMD wrapper (foreground gateway should be the CMD), "
            f"found {redirected_sleeps}. "
-            f"ps:\n{_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
+            f"ps:\n{docker_exec_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
        )

        # The gateway-default s6 slot exists (the cont-init.d
@@ -211,13 +246,15 @@ def test_gateway_run_no_supervise_env_var(
    Useful when users can't easily change their `docker run` args
    (orchestration templates, K8s manifests) but can set env vars.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_GATEWAY_NO_SUPERVISE=1",
-         built_image, "gateway", "run"],
-        check=True, capture_output=True, timeout=30,
+    start_container(
+        built_image, container_name,
+        "HERMES_GATEWAY_NO_SUPERVISE=1",
+        cmd="gateway run",
    )
-    time.sleep(6)
+
+    # Same as the CLI-flag test: wait for the gateway to start or
+    # the container to exit, instead of a blind time.sleep(6).
+    status = _wait_for_gateway_or_exit(container_name, deadline_s=60.0)

    logs = subprocess.run(
        ["docker", "logs", container_name],
@@ -231,11 +268,7 @@ def test_gateway_run_no_supervise_env_var(

    # Same as the CLI-flag test: the slot exists (reconciler creates
    # it) but should not have want-state up.
-    inspect = subprocess.run(
-        ["docker", "inspect", "-f", "{{.State.Status}}", container_name],
-        capture_output=True, text=True, timeout=10,
-    )
-    if inspect.stdout.strip() == "running":
+    if status == "running":
        assert not _svstat_wants_up(container_name, "gateway-default"), (
            "HERMES_GATEWAY_NO_SUPERVISE=1: gateway-default has "
            "want-state up, implying the redirect dispatched `start` "
@@ -260,25 +293,33 @@ def test_supervised_gateway_does_not_recurse(
    supervised gateway). Two or more would imply recursive spawning
    via the redirect → start → run → redirect → ... loop.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "gateway", "run"],
-        check=True, capture_output=True, timeout=30,
-    )
-    time.sleep(6)
+    start_container(built_image, container_name, cmd="gateway run")

-    # Count python processes running `hermes gateway run`. If the
-    # recursion guard fails, s6 would respawn fresh `gateway run`
-    # processes on every cycle, leaving multiple Python-process
-    # descendants under the gateway-default supervise tree.
-    r = _sh(container_name, "ps -eo pid,cmd | grep -v grep | grep -E 'python.*hermes.*gateway run' | wc -l")
+    # Wait for the redirect to fire by polling for the breadcrumb.
+    # Under CI parallel docker test fan-out, the CMD process
+    # (main-wrapper.sh → python → hermes gateway run) can take well
+    # over 6s to reach the redirect logic. A fixed sleep would race:
+    # if we check too early, the CMD process hasn't exec'd into
+    # `sleep infinity` yet and the s6-supervised gateway hasn't
+    # started either — so we'd see the CMD's `hermes gateway run`
+    # AND the supervised one (2 processes) and falsely conclude
+    # recursion. Polling the breadcrumb is the definitive signal
+    # that the redirect fired and the CMD process is now `sleep`.
+    wait_for_docker_logs(container_name, "s6 supervision")
+
+    # Now that the redirect fired, count python processes running
+    # `hermes gateway run`. If the recursion guard fails, s6 would
+    # respawn fresh `gateway run` processes on every cycle, leaving
+    # multiple Python-process descendants under the gateway-default
+    # supervise tree.
+    r = docker_exec_sh(container_name, "ps -eo pid,cmd | grep -v grep | grep -E 'python.*hermes.*gateway run' | wc -l")
    assert r.returncode == 0
    n = int(r.stdout.strip() or 0)
    assert n <= 1, (
        f"expected at most one supervised python `hermes gateway run` "
        f"process (the legitimately-supervised gateway); found {n}. "
        f"Recursion guard may have failed. "
-        f"ps:\n{_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
+        f"ps:\n{docker_exec_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
    )

    # Stronger positive assertion: there should be exactly one
@@ -286,7 +327,7 @@ def test_supervised_gateway_does_not_recurse(
    # CMD process (PID 17 typically). The static `main-hermes`
    # service has its own `sleep infinity` child; THAT one is fine
    # and unrelated to our redirect.
-    r = _sh(
+    r = docker_exec_sh(
        container_name,
        # Find PID of the CMD process (main-wrapper.sh or its sh
        # parent), then count `sleep infinity` children.
@@ -298,7 +339,7 @@ def test_supervised_gateway_does_not_recurse(
    assert redirected == 1, (
        f"expected exactly one `sleep infinity` parented to the CMD "
        f"wrapper (the redirect heartbeat); found {redirected}. "
-        f"ps:\n{_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
+        f"ps:\n{docker_exec_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}"
    )


@@ -312,20 +353,47 @@ def test_dashboard_supervised_when_env_set(
    redirect: one container = supervised gateway + supervised
    dashboard, with zero extra user effort.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name,
-         "-e", "HERMES_DASHBOARD=1",
-         built_image, "gateway", "run"],
-        check=True, capture_output=True, timeout=30,
+    start_container(
+        built_image, container_name,
+        "HERMES_DASHBOARD=1",
+        cmd="gateway run",
    )
-    time.sleep(5)

-    # Both slots should report want-up.
-    assert _svstat_wants_up(container_name, "gateway-default"), (
-        f"gateway-default slot not up: {_svstat(container_name)!r}"
+    # Wait for the redirect to fire (the breadcrumb appears in docker
+    # logs when the CMD process reaches the redirect logic). This is
+    # the same signal the other gateway-run tests use.
+    # A fixed time.sleep(5) was racing: start_container returns when
+    # cont-init finishes, but the redirect (which creates the
+    # gateway-default s6 slot) happens later in the CMD process.
+    wait_for_docker_logs(
+        container_name, "s6 supervision", deadline_s=60.0,
    )
-    assert _svstat_wants_up(container_name, "dashboard"), (
-        f"dashboard slot not up: {_svstat(container_name, 'dashboard')!r}"
+
+    # Poll for both slots to report want-up, using the same
+    # _svstat_wants_up helper the other tests use. A simple
+    # `grep 'want up'` is wrong: when the service is already up,
+    # s6-svstat output is "up (pid ...) Ns" with no literal "want up"
+    # — the want-up intent is implied by the absence of "want down".
+    ok_gateway = False
+    end = time.monotonic() + 30.0
+    while time.monotonic() < end:
+        if _svstat_wants_up(container_name, "gateway-default"):
+            ok_gateway = True
+            break
+        time.sleep(0.5)
+    assert ok_gateway, (
+        f"gateway-default slot not want-up: {_svstat(container_name)!r}"
+    )
+
+    ok_dash = False
+    end = time.monotonic() + 30.0
+    while time.monotonic() < end:
+        if _svstat_wants_up(container_name, "dashboard"):
+            ok_dash = True
+            break
+        time.sleep(0.5)
+    assert ok_dash, (
+        f"dashboard slot not want-up: {_svstat(container_name, 'dashboard')!r}"
    )


@@ -354,14 +422,17 @@ def test_supervised_gateway_stdout_reaches_docker_logs(
    Python-logging output, so its presence in ``docker logs`` proves
    the stdout-tee is working.
    """
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "gateway", "run"],
-        check=True, capture_output=True, timeout=30,
-    )
-    # Banner is printed during gateway startup — give it time to
-    # initialize past the imports + config-load phase.
-    time.sleep(8)
+    start_container(built_image, container_name, cmd="gateway run")
+
+    # Poll docker logs for the banner glyph (⚕) or "Hermes Gateway
+    # Starting" — the gateway's rich-console startup banner. A fixed
+    # sleep(8) races under CI parallel docker test fan-out: the
+    # supervised gateway can take well over 8s to finish imports +
+    # config-load + banner print under load, and the assertion would
+    # fail not because the stdout-tee is broken but because we checked
+    # too early. Polling with a generous deadline is both faster on
+    # quick machines and flake-free on slow ones.
+    wait_for_docker_logs(container_name, "⚕", deadline_s=60.0)

    logs = subprocess.run(
        ["docker", "logs", container_name],
@@ -377,14 +448,14 @@ def test_supervised_gateway_stdout_reaches_docker_logs(
        "This means the `1` action directive in _render_log_run isn't "
        "forwarding stdout to /init. "
        f"docker logs (last 2000 chars):\n{combined[-2000:]}\n"
-        f"file contents:\n{_sh(container_name, 'cat /opt/data/logs/gateways/default/current').stdout}"
+        f"file contents:\n{docker_exec_sh(container_name, 'cat /opt/data/logs/gateways/default/current').stdout}"
    )

    # Cross-check: the same banner must also be in the rotated log
    # file (we kept the file destination, just added stdout). The
    # file version has s6-log's ISO 8601 timestamp prefix; the
    # docker logs version is raw.
-    file_contents = _sh(
+    file_contents = docker_exec_sh(
        container_name, "cat /opt/data/logs/gateways/default/current",
    ).stdout
    assert "⚕" in file_contents or "Hermes Gateway Starting" in file_contents, (
@@ -392,4 +463,3 @@ def test_supervised_gateway_stdout_reaches_docker_logs(
        "destination may have been dropped by the new s6-log script. "
        f"File contents:\n{file_contents}"
    )
-
--- a/tests/docker/test_home_override_scripts.py
+++ b/tests/docker/test_home_override_scripts.py
@@ -0,0 +1,169 @@
+"""Runtime smoke tests for Docker HOME overrides and script behavior.
+
+Build the real image and verify the actual runtime behavior:
+
+  1. main-wrapper preserves the Docker ``-w`` working directory
+  2. dashboard service resets HOME to /opt/data before privilege drop
+  3. dashboard does not auto-add ``--insecure`` from a non-loopback bind host
+  4. stage2 hook repairs profiles/ and cron/ ownership on every boot
+"""
+from __future__ import annotations
+
+import subprocess
+
+from tests.docker.conftest import docker_exec, docker_exec_sh, start_container, restart_container
+
+
+def test_main_wrapper_preserves_docker_workdir(
+    built_image: str, container_name: str,
+) -> None:
+    """The main-wrapper MUST save and restore the original working directory
+    so the container starts in the Docker ``-w`` directory, not /opt/data.
+
+    Regression test for #35472. We pass ``-w /tmp`` and a command that
+    prints its cwd; the output must be ``/tmp``, proving the wrapper
+    restored the cwd after its internal ``cd /opt/data``.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", "-w", "/tmp",
+         built_image, "sh", "-c", "pwd"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, f"container failed: {r.stderr[-1000:]}"
+    # The stage2 hook emits boot logs (config migration, skills sync)
+    # to stdout before the CMD runs. The actual pwd output is the LAST
+    # line of stdout.
+    last_line = r.stdout.strip().split("\n")[-1].strip()
+    assert last_line == "/tmp", (
+        f"expected cwd /tmp, got {last_line!r} — "
+        f"main-wrapper did not preserve the Docker -w directory"
+    )
+
+
+def test_dashboard_service_resets_home(
+    built_image: str, container_name: str,
+) -> None:
+    """The dashboard run script must export HOME=/opt/data before dropping
+    privileges, so HOME-anchored state (discord lockfile, XDG dirs) doesn't
+    try to write to /root (the /init context's HOME).
+
+    We check this by inspecting the environment of the dashboard service
+    process if it's running, or by verifying the run script sets HOME
+    before the exec. At runtime, the cleanest check is: start the
+    container with HERMES_DASHBOARD=1 and verify the dashboard process
+    (if it starts) has HOME=/opt/data.
+
+    Since the dashboard requires an auth provider on non-loopback binds,
+    we bind to 127.0.0.1 where the auth gate doesn't engage, and check
+    the process env.
+    """
+    start_container(built_image, container_name, "HERMES_DASHBOARD=1", "HERMES_DASHBOARD_HOST=127.0.0.1")
+
+    # Check if the dashboard process is running and inspect its HOME.
+    r = docker_exec_sh(
+        container_name,
+        # Find the dashboard process (hermes dashboard) and read its HOME
+        # from /proc/<pid>/environ. If not running, verify the run script
+        # itself exports HOME=/opt/data by grepping the script source.
+        'pid=$(pgrep -f "hermes dashboard" | head -1); '
+        'if [ -n "$pid" ]; then '
+        '  tr "\\0" "\\n" < /proc/$pid/environ | grep "^HOME="; '
+        'else '
+        '  grep -q "export HOME=/opt/data" '
+        '    /opt/hermes/docker/s6-rc.d/dashboard/run && '
+        '  echo "HOME=/opt/data"; '
+        'fi',
+        timeout=15,
+    )
+    assert "HOME=/opt/data" in r.stdout, (
+        f"dashboard process or run script does not set HOME=/opt/data: "
+        f"stdout={r.stdout!r} stderr={r.stderr!r}"
+    )
+
+
+def test_dashboard_does_not_auto_insecure_from_host(
+    built_image: str, container_name: str,
+) -> None:
+    """The dashboard MUST NOT auto-add ``--insecure`` based on
+    HERMES_DASHBOARD_HOST. The auth gate is the authority now.
+
+    The auth gate is the authority on whether non-loopback binds are
+    safe; ``--insecure`` must never be auto-derived from the bind host.
+
+    We start the container with a non-loopback bind host and verify
+    the dashboard process does NOT receive ``--insecure`` in its
+    command line. If the dashboard fails to start (because the auth
+    gate correctly blocks an unauthenticated non-loopback bind), that's
+    also acceptable — the point is no auto-insecure.
+    """
+    start_container(built_image, container_name, "HERMES_DASHBOARD=1", "HERMES_DASHBOARD_HOST=0.0.0.0")
+
+    # Check the dashboard process command line for --insecure.
+    r = docker_exec_sh(
+        container_name,
+        'pid=$(pgrep -f "hermes dashboard" | head -1); '
+        'if [ -n "$pid" ]; then '
+        '  tr "\\0" " " < /proc/$pid/cmdline; '
+        'fi',
+        timeout=10,
+    )
+    cmdline = r.stdout.strip()
+    # If the process is running, it must NOT have --insecure.
+    if cmdline:
+        assert "--insecure" not in cmdline, (
+            f"dashboard process has --insecure in cmdline (auto-derived "
+            f"from host): {cmdline!r}"
+        )
+
+
+def test_stage2_repairs_profiles_and_cron_ownership(
+    built_image: str, container_name: str,
+) -> None:
+    """profiles/ and cron/ must both be reclaimed after root-context writes.
+
+    The stage2 hook chowns these dirs to hermes:hermes on every boot.
+    We simulate a root-owned file in each, then restart the container
+    and verify ownership is repaired.
+    """
+    start_container(built_image, container_name)
+
+    # Create root-owned files in profiles/ and cron/ to simulate
+    # docker exec (root) writes.
+    docker_exec(
+        container_name, "mkdir", "-p", "/opt/data/profiles/testprof",
+        user="root", timeout=5,
+    )
+    docker_exec(
+        container_name, "touch", "/opt/data/profiles/testprof/marker",
+        user="root", timeout=5,
+    )
+    docker_exec(
+        container_name, "touch", "/opt/data/cron/root_owned.json",
+        user="root", timeout=5,
+    )
+
+    # Verify they're root-owned before restart.
+    r = docker_exec_sh(
+        container_name,
+        'stat -c "%U" /opt/data/profiles/testprof/marker '
+        '/opt/data/cron/root_owned.json',
+        timeout=5,
+    )
+    assert "root" in r.stdout, (
+        f"expected root-owned files before restart, got: {r.stdout!r}"
+    )
+
+    # Restart — stage2 hook runs again and repairs ownership.
+    restart_container(container_name)
+
+    # Verify files are now owned by hermes.
+    r = docker_exec_sh(
+        container_name,
+        'stat -c "%U" /opt/data/profiles/testprof/marker '
+        '/opt/data/cron/root_owned.json',
+        timeout=5,
+    )
+    assert "hermes" in r.stdout, (
+        f"expected hermes-owned files after restart, got: {r.stdout!r} — "
+        f"stage2 hook did not repair profiles/ and cron/ ownership"
+    )
--- a/tests/docker/test_immutable_install.py
+++ b/tests/docker/test_immutable_install.py
@@ -0,0 +1,140 @@
+"""Runtime smoke tests for Docker immutable install tree and install-method stamp.
+
+Build the real image and verify at runtime:
+
+  1. /opt/hermes is not writable by the hermes user (immutable install tree)
+  2. PYTHONDONTWRITEBYTECODE and HERMES_DISABLE_LAZY_INSTALLS are set
+  3. /opt/hermes/.install_method contains "docker" (code-scoped stamp)
+  4. $HERMES_HOME/.install_method is NOT stamped as "docker" by stage2
+  5. A stale "docker" stamp in $HERMES_HOME is healed (removed) on boot
+"""
+from __future__ import annotations
+
+from tests.docker.conftest import (
+    docker_exec,
+    docker_exec_sh,
+    restart_container,
+    start_container,
+)
+
+
+def test_install_tree_not_writable_by_hermes(
+    built_image: str, container_name: str,
+) -> None:
+    """The hermes user must not be able to modify /opt/hermes.
+
+    The install tree (source, venv, TUI bundle, node_modules) must remain
+    root-owned and non-writable so an agent session cannot self-modify
+    the installation and brick the gateway.
+    """
+    start_container(built_image, container_name)
+
+    r = docker_exec_sh(
+        container_name,
+        # Try to create a file under /opt/hermes as the hermes user
+        "touch /opt/hermes/test_write 2>&1 && "
+        "echo WRITE_SUCCEEDED || echo WRITE_FAILED",
+        timeout=10,
+    )
+    assert "WRITE_FAILED" in r.stdout, (
+        f"hermes user can write to /opt/hermes (install tree not immutable): "
+        f"{r.stdout}"
+    )
+
+    # Also check a key subdirectory
+    r = docker_exec_sh(
+        container_name,
+        "touch /opt/hermes/.venv/test_write 2>&1 && "
+        "echo WRITE_SUCCEEDED || echo WRITE_FAILED",
+        timeout=10,
+    )
+    assert "WRITE_FAILED" in r.stdout, (
+        f"hermes user can write to /opt/hermes/.venv: {r.stdout}"
+    )
+
+
+def test_hermes_disable_lazy_installs_and_dont_write_bytecode(
+    built_image: str, container_name: str,
+) -> None:
+    """The container must set PYTHONDONTWRITEBYTECODE and
+    HERMES_DISABLE_LAZY_INSTALLS=1 so no .pyc files are written to the
+    immutable install tree and no lazy installs attempt to modify it."""
+    start_container(built_image, container_name)
+
+    r = docker_exec_sh(
+        container_name,
+        'test "$PYTHONDONTWRITEBYTECODE" = "1" && '
+        'test "$HERMES_DISABLE_LAZY_INSTALLS" = "1" && '
+        'echo ENV_OK || echo ENV_MISSING',
+        timeout=10,
+    )
+    assert "ENV_OK" in r.stdout, (
+        f"expected PYTHONDONTWRITEBYTECODE=1 and "
+        f"HERMES_DISABLE_LAZY_INSTALLS=1, got: {r.stdout} stderr={r.stderr}"
+    )
+
+
+def test_install_method_stamp_is_code_scoped(
+    built_image: str, container_name: str,
+) -> None:
+    """The 'docker' install-method stamp must be baked at
+    /opt/hermes/.install_method (code-scoped), NOT in $HERMES_HOME."""
+    start_container(built_image, container_name)
+
+    # Code-scoped stamp must exist and say "docker"
+    r = docker_exec_sh(
+        container_name,
+        "cat /opt/hermes/.install_method",
+        timeout=10,
+    )
+    assert r.returncode == 0, (
+        f"/opt/hermes/.install_method not found: {r.stderr}"
+    )
+    assert r.stdout.strip() == "docker", (
+        f"expected 'docker' stamp, got: {r.stdout.strip()!r}"
+    )
+
+    # $HERMES_HOME must NOT have a 'docker' stamp
+    r = docker_exec_sh(
+        container_name,
+        "cat /opt/data/.install_method 2>/dev/null || echo NONE",
+        timeout=10,
+    )
+    assert r.stdout.strip() != "docker", (
+        f"$HERMES_HOME/.install_method is stamped 'docker' - stage2 must "
+        f"not stamp the data volume (shared with host installs)"
+    )
+
+
+def test_stale_docker_stamp_in_home_is_healed_on_boot(
+    built_image: str, container_name: str,
+) -> None:
+    """A stale 'docker' stamp left in $HERMES_HOME by an older image
+    must be removed on boot so shared homes self-heal."""
+    # Start container, write a stale stamp
+    start_container(built_image, container_name)
+
+    # Write a stale 'docker' stamp as root
+    docker_exec(
+        container_name, "sh", "-c",
+        "printf 'docker\\n' > /opt/data/.install_method",
+        user="root", timeout=5,
+    )
+    # Verify it exists
+    r = docker_exec_sh(container_name, "cat /opt/data/.install_method", timeout=5)
+    assert r.stdout.strip() == "docker"
+
+    # Restart - stage2 should heal it
+    restart_container(container_name)
+
+    # The stale stamp must be gone
+    r = docker_exec_sh(
+        container_name,
+        "test -f /opt/data/.install_method && "
+        "cat /opt/data/.install_method || echo HEALED",
+        timeout=10,
+    )
+    assert "HEALED" in r.stdout or r.stdout.strip() != "docker", (
+        f"stale 'docker' stamp in $HERMES_HOME was not healed on boot: "
+        f"{r.stdout}"
+    )
--- a/tests/docker/test_license_file_present.py
+++ b/tests/docker/test_license_file_present.py
@@ -0,0 +1,26 @@
+"""Runtime smoke test for Docker image license-file presence.
+
+Build the real image and verify the LICENSE file is present inside the
+container (PEP 639 license-files metadata must resolve inside the
+Docker image).
+"""
+from __future__ import annotations
+
+import subprocess
+
+
+def test_docker_image_contains_license_file(built_image: str) -> None:
+    """The LICENSE file must be present inside the built Docker image.
+
+    PEP 639 license-files metadata references LICENSE, and the Docker
+    build context must not exclude it.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--entrypoint", "test",
+         built_image, "-f", "/opt/hermes/LICENSE"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, (
+        f"LICENSE file not found at /opt/hermes/LICENSE inside the Docker "
+        f"image: {r.stderr[-500:]}"
+    )
--- a/tests/docker/test_log_dir_seed.py
+++ b/tests/docker/test_log_dir_seed.py
@@ -0,0 +1,47 @@
+"""Runtime smoke test for Docker $HERMES_HOME/logs/gateways seeding.
+
+Build the real image and verify logs/ and logs/gateways/ exist and are
+owned by the hermes user after container boot.
+
+Regression guard for #45258: if the first gateway log service runs in
+root context, logs/gateways/ is created root-owned; every profile
+registered later runs its log service as the dropped hermes user and
+s6-log crash-loops on mkdir: Permission denied.
+"""
+from __future__ import annotations
+
+from tests.docker.conftest import docker_exec_sh, start_container
+
+
+def test_logs_gateways_seeded_and_hermes_owned(
+    built_image: str, container_name: str,
+) -> None:
+    """logs/ and logs/gateways/ must exist and be owned by hermes after boot."""
+    start_container(built_image, container_name)
+
+    # Both directories must exist
+    r = docker_exec_sh(
+        container_name,
+        "test -d /opt/data/logs && "
+        "test -d /opt/data/logs/gateways && "
+        "echo DIRS_OK || echo DIRS_MISSING",
+        timeout=10,
+    )
+    assert "DIRS_OK" in r.stdout, (
+        f"logs/ or logs/gateways/ not seeded: {r.stdout}"
+    )
+
+    # Both must be owned by hermes
+    r = docker_exec_sh(
+        container_name,
+        'logs_owner=$(stat -c "%U" /opt/data/logs); '
+        'gateways_owner=$(stat -c "%U" /opt/data/logs/gateways); '
+        'echo "logs=$logs_owner gateways=$gateways_owner"',
+        timeout=10,
+    )
+    assert "logs=hermes" in r.stdout, (
+        f"logs/ not owned by hermes: {r.stdout}"
+    )
+    assert "gateways=hermes" in r.stdout, (
+        f"logs/gateways/ not owned by hermes: {r.stdout}"
+    )
--- a/tests/docker/test_profile_gateway.py
+++ b/tests/docker/test_profile_gateway.py
@@ -26,7 +26,7 @@ from __future__ import annotations
 import subprocess
 import time

-from tests.docker.conftest import docker_exec_sh
+from tests.docker.conftest import docker_exec_sh, start_container

 PROFILE = "test-harness-profile"

@@ -69,12 +69,7 @@ def _svstat_wants_up(container: str) -> bool:
 def test_profile_create_then_gateway_start(
    built_image: str, container_name: str,
 ) -> None:
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
-    )
-    time.sleep(3)
+    start_container(built_image, container_name, cmd="sleep 120")

    r = _sh(container_name, f"hermes profile create {PROFILE}")
    assert r.returncode == 0, f"profile create failed: {r.stderr}"
@@ -114,12 +109,7 @@ def test_profile_delete_stops_gateway(
 ) -> None:
    """Deleting a profile should stop its gateway and remove the s6
    service slot."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
-    )
-    time.sleep(3)
+    start_container(built_image, container_name, cmd="sleep 120")

    _sh(container_name, f"hermes profile create {PROFILE}")
    _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
@@ -135,4 +125,4 @@ def test_profile_delete_stops_gateway(
    time.sleep(2)
    # Service slot should be gone.
    r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}")
-    assert r.returncode != 0, "s6 service slot still present after profile delete"
+    assert r.returncode != 0, "s6 service slot still present after profile delete"
--- a/tests/docker/test_puid_pgid_remap.py
+++ b/tests/docker/test_puid_pgid_remap.py
@@ -0,0 +1,88 @@
+"""Runtime smoke tests for Docker PUID/PGID and UID/GID remap.
+
+Build the real image and verify the actual runtime behavior:
+
+  1. PUID/PGID env vars remap the hermes user UID/GID at boot
+  2. HERMES_UID/HERMES_GID take precedence over PUID/PGID aliases
+  3. NAS-style low UIDs (99:100) are accepted and remapped
+  4. Invalid UIDs are rejected
+  5. The remapped user can write to the data volume
+"""
+from __future__ import annotations
+
+from tests.docker.conftest import docker_exec_sh, start_container
+
+
+def test_puid_pgid_remaps_hermes_user(
+    built_image: str, container_name: str,
+) -> None:
+    """PUID=1000 PGID=1000 must remap the hermes user to UID 1000."""
+    start_container(built_image, container_name, "PUID=1000", "PGID=1000")
+
+    r = docker_exec_sh(
+        container_name,
+        "id -u hermes",
+        timeout=10,
+    )
+    assert r.stdout.strip() == "1000", (
+        f"expected hermes UID 1000 after PUID remap, got: {r.stdout.strip()}"
+    )
+
+    r = docker_exec_sh(
+        container_name,
+        "id -g hermes",
+        timeout=10,
+    )
+    assert r.stdout.strip() == "1000", (
+        f"expected hermes GID 1000 after PGID remap, got: {r.stdout.strip()}"
+    )
+
+
+def test_hermes_uid_gid_take_precedence_over_aliases(
+    built_image: str, container_name: str,
+) -> None:
+    """HERMES_UID/HERMES_GID must win over PUID/PGID when both are set."""
+    start_container(built_image, container_name, "HERMES_UID=2000", "HERMES_GID=2001", "PUID=1000", "PGID=1000")
+
+    r = docker_exec_sh(container_name, "id -u hermes", timeout=10)
+    assert r.stdout.strip() == "2000", (
+        f"expected hermes UID 2000 (HERMES_UID wins), got: {r.stdout.strip()}"
+    )
+
+    r = docker_exec_sh(container_name, "id -g hermes", timeout=10)
+    assert r.stdout.strip() == "2001", (
+        f"expected hermes GID 2001 (HERMES_GID wins), got: {r.stdout.strip()}"
+    )
+
+
+def test_nas_low_uid_accepted(
+    built_image: str, container_name: str,
+) -> None:
+    """NAS-style low UIDs (99:100, common on Unraid) must be accepted."""
+    start_container(built_image, container_name, "PUID=99", "PGID=100")
+
+    r = docker_exec_sh(container_name, "id -u hermes", timeout=10)
+    assert r.stdout.strip() == "99", (
+        f"expected hermes UID 99, got: {r.stdout.strip()}"
+    )
+
+    r = docker_exec_sh(container_name, "id -g hermes", timeout=10)
+    assert r.stdout.strip() == "100", (
+        f"expected hermes GID 100, got: {r.stdout.strip()}"
+    )
+
+
+def test_remap_enables_data_volume_writes(
+    built_image: str, container_name: str,
+) -> None:
+    """After remap, the hermes user must be able to write to /opt/data."""
+    start_container(built_image, container_name, "PUID=1000", "PGID=1000")
+
+    r = docker_exec_sh(
+        container_name,
+        "touch /opt/data/test_write && echo WRITE_OK || echo WRITE_FAIL",
+        timeout=10,
+    )
+    assert "WRITE_OK" in r.stdout, (
+        f"hermes user cannot write to /opt/data after remap: {r.stdout}"
+    )
--- a/tests/docker/test_s6_profile_gateway_integration.py
+++ b/tests/docker/test_s6_profile_gateway_integration.py
@@ -19,10 +19,7 @@ operations work correctly under UID 10000.
 """
 from __future__ import annotations

-import subprocess
-import time
-
-from tests.docker.conftest import docker_exec
+from tests.docker.conftest import docker_exec, start_container


 _REGISTER_SCRIPT = """
@@ -45,49 +42,39 @@ print("UNREGISTERED")
 """


-def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess:
-    return docker_exec(container, *args, timeout=timeout)
-
-
 def test_s6_register_creates_service_dir_in_live_container(
    built_image: str, container_name: str,
 ) -> None:
    """S6ServiceManager.register_profile_gateway must create
    ``/run/service/gateway-<profile>/`` and trigger s6-svscan rescan
    against the real s6 supervision tree."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
-    )
-    # Give the supervision tree a moment to come up.
-    time.sleep(3)
+    start_container(built_image, container_name, cmd="sleep 120")

-    r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
+    r = docker_exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
    assert "REGISTERED" in r.stdout, (
        f"register failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Service directory exists with the expected structure.
-    r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
+    r = docker_exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
    assert r.returncode == 0, "service directory not created"

-    r = _exec(container_name, "test", "-f", "/run/service/gateway-phase3test/run")
+    r = docker_exec(container_name, "test", "-f", "/run/service/gateway-phase3test/run")
    assert r.returncode == 0, "run script not created"

-    r = _exec(container_name, "test", "-f",
+    r = docker_exec(container_name, "test", "-f",
              "/run/service/gateway-phase3test/log/run")
    assert r.returncode == 0, "log/run script not created"

    # s6-svscan picked it up — s6-svstat works against the dir.
    # `docker exec` doesn't put /command/ on PATH (only the supervision
    # tree does), so call s6-svstat by absolute path.
-    r = _exec(container_name, "/command/s6-svstat",
+    r = docker_exec(container_name, "/command/s6-svstat",
              "/run/service/gateway-phase3test")
    assert r.returncode == 0, f"s6-svstat failed: {r.stderr or r.stdout}"

    # list_profile_gateways picks it up.
-    r = _exec(container_name, "python3", "-c", (
+    r = docker_exec(container_name, "python3", "-c", (
        "from hermes_cli.service_manager import S6ServiceManager;"
        "print(S6ServiceManager().list_profile_gateways())"
    ))
@@ -100,29 +87,24 @@ def test_s6_unregister_removes_service_dir_in_live_container(
    """unregister_profile_gateway must stop the service, remove the
    directory, and trigger s6-svscan rescan so the supervise process
    is dropped."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "120"],
-        check=True, capture_output=True, timeout=30,
-    )
-    time.sleep(3)
+    start_container(built_image, container_name, cmd="sleep 120")

    # First register so we have something to unregister.
-    r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
+    r = docker_exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
    assert "REGISTERED" in r.stdout

    # Then unregister.
-    r = _exec(container_name, "python3", "-c", _UNREGISTER_SCRIPT, timeout=30)
+    r = docker_exec(container_name, "python3", "-c", _UNREGISTER_SCRIPT, timeout=30)
    assert "UNREGISTERED" in r.stdout, (
        f"unregister failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Directory is gone.
-    r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
+    r = docker_exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
    assert r.returncode != 0, "service directory still exists after unregister"

    # list_profile_gateways no longer includes it.
-    r = _exec(container_name, "python3", "-c", (
+    r = docker_exec(container_name, "python3", "-c", (
        "from hermes_cli.service_manager import S6ServiceManager;"
        "print(S6ServiceManager().list_profile_gateways())"
    ))
--- a/tests/docker/test_smoke.py
+++ b/tests/docker/test_smoke.py
@@ -0,0 +1,60 @@
+"""Runtime smoke tests for the Docker image entrypoint and subcommands.
+
+Converted from the former ``.github/actions/hermes-smoke-test`` composite
+action.  These tests exercise the image's real ENTRYPOINT (``/init`` +
+``main-wrapper.sh``) via ``docker run --rm <image> --help`` and
+``docker run --rm <image> dashboard --help`` to catch basic runtime
+regressions before publishing.
+
+The harness expects the ``built_image`` fixture from
+``tests/docker/conftest.py``.  When Docker isn't available every test
+here is skipped at collection time.
+"""
+from __future__ import annotations
+
+import subprocess
+
+
+def test_hermes_help(built_image: str) -> None:
+    """``docker run --rm <image> --help`` must exit 0.
+
+    Uses the image's real ENTRYPOINT (``/init`` + ``main-wrapper.sh``)
+    so this exercises the actual production startup path.  PR #30136
+    review caught that an ``--entrypoint`` override in the old composite
+    action had been silently neutered by the s6-overlay migration —
+    ``stage2-hook`` ignores CMD args passed after an overridden
+    entrypoint, so the smoke test was a no-op.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "--help"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, (
+        f"hermes --help failed (exit {r.returncode}): "
+        f"stdout={r.stdout[-2000:]!r} stderr={r.stderr[-2000:]!r}"
+    )
+    assert "Traceback" not in r.stderr, (
+        f"hermes --help produced a traceback: {r.stderr[-2000:]!r}"
+    )
+
+
+def test_dashboard_subcommand_present(built_image: str) -> None:
+    """``docker run --rm <image> dashboard --help`` must exit 0.
+
+    Regression guard for #9153: the ``dashboard`` subcommand was present
+    in source but missing from the published image.  If this fails,
+    something in the Dockerfile is excluding the dashboard subcommand
+    from the installed package.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "dashboard", "--help"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, (
+        f"hermes dashboard --help failed (exit {r.returncode}): "
+        f"stdout={r.stdout[-2000:]!r} stderr={r.stderr[-2000:]!r}"
+    )
+    combined = (r.stdout + r.stderr).lower()
+    assert "dashboard" in combined or "usage" in combined, (
+        f"dashboard --help output unexpected: {combined[-2000:]!r}"
+    )
--- a/tests/docker/test_stage2_browser_discovery.py
+++ b/tests/docker/test_stage2_browser_discovery.py
@@ -0,0 +1,82 @@
+"""Runtime smoke tests for Docker stage2 browser executable discovery.
+
+Build the real image and verify the chromium binary is actually
+discovered at boot: ``AGENT_BROWSER_EXECUTABLE_PATH`` is set, points to
+a real executable, and is a browser binary (not a shared library picked
+up by a broad ``find | grep``).
+"""
+from __future__ import annotations
+
+from tests.docker.conftest import docker_exec_sh, start_container
+
+
+def test_stage2_discovers_chromium_binary(
+    built_image: str, container_name: str,
+) -> None:
+    """The stage2 hook must discover the Playwright chromium binary and
+    export AGENT_BROWSER_EXECUTABLE_PATH so the browser tool can find it.
+
+    The discovery uses filename matching, not a broad ``find | grep``:
+    shared libraries (libGLESv2.so etc.) inherit the executable bit from
+    Playwright's tarball but must not be picked up. This test verifies the
+    discovered binary is a real browser, not a .so.
+    """
+    start_container(built_image, container_name)
+
+    # AGENT_BROWSER_EXECUTABLE_PATH must be set via s6 container_environment.
+    r = docker_exec_sh(
+        container_name,
+        "cat /run/s6/container_environment/AGENT_BROWSER_EXECUTABLE_PATH",
+        timeout=10,
+    )
+    assert r.returncode == 0, (
+        f"AGENT_BROWSER_EXECUTABLE_PATH not set by stage2 hook: {r.stderr}"
+    )
+    browser_path = r.stdout.strip()
+    assert browser_path, "AGENT_BROWSER_EXECUTABLE_PATH is empty"
+
+    # Must be a real file and executable.
+    r = docker_exec_sh(
+        container_name,
+        f'test -x "{browser_path}"',
+        timeout=5,
+    )
+    assert r.returncode == 0, (
+        f"discovered browser path is not executable: {browser_path}"
+    )
+
+    # Must be a browser binary by basename — NOT a shared library.
+    accepted_names = (
+        "chrome", "chromium", "chrome-headless-shell",
+        "headless_shell", "chromium-browser",
+    )
+    r = docker_exec_sh(
+        container_name,
+        f'basename "{browser_path}"',
+        timeout=5,
+    )
+    basename = r.stdout.strip()
+    assert basename in accepted_names, (
+        f"discovered binary basename {basename!r} is not a recognized "
+        f"browser name (accepted: {accepted_names}) — the discovery may "
+        f"have picked up a shared library (.so) instead of the real browser"
+    )
+
+
+def test_stage2_browser_path_accessible_to_hermes_user(
+    built_image: str, container_name: str,
+) -> None:
+    """The discovered browser binary must be accessible to the
+    unprivileged hermes user (UID 10000), since that's who runs
+    agent-browser subprocesses."""
+    start_container(built_image, container_name)
+
+    r = docker_exec_sh(
+        container_name,
+        'path="$(cat /run/s6/container_environment/AGENT_BROWSER_EXECUTABLE_PATH)" '
+        '&& test -r "$path" && test -x "$path"',
+        timeout=10,
+    )
+    assert r.returncode == 0, (
+        f"browser binary not readable+executable by hermes user: {r.stderr}"
+    )
--- a/tests/docker/test_tini_compat_shim.py
+++ b/tests/docker/test_tini_compat_shim.py
@@ -0,0 +1,54 @@
+"""Runtime smoke test for the Docker tini compatibility shim (#34192).
+
+Build the real image and verify:
+
+  1. /usr/bin/tini exists and is a symlink to /init (the compat shim
+     for orchestration templates that still reference /usr/bin/tini)
+  2. The actual ENTRYPOINT is /init (s6-overlay), not /usr/bin/tini
+"""
+from __future__ import annotations
+
+import subprocess
+
+
+def test_tini_compat_symlink_exists(built_image: str) -> None:
+    """/usr/bin/tini must exist as a symlink to /init.
+
+    Regression for #34192: orchestration templates (e.g. Hostinger's
+    'Hermes WebUI' catalog) still pin /usr/bin/tini as the entrypoint.
+    The shim symlinks it to /init so legacy wrappers exec the right
+    PID-1 reaper without behavior change.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--entrypoint", "sh",
+         built_image, "-c",
+         'test -L /usr/bin/tini && '
+         'test "$(readlink -f /usr/bin/tini)" = "/init"'],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, (
+        f"/usr/bin/tini is not a symlink to /init: {r.stderr[-500:]}"
+    )
+
+
+def test_entrypoint_is_init_not_tini(built_image: str) -> None:
+    """The image's actual ENTRYPOINT must be /init (s6-overlay).
+
+    The tini shim is only for legacy external wrappers; the image's own
+    runtime must continue to use the canonical /init.
+    """
+    r = subprocess.run(
+        ["docker", "inspect", built_image,
+         "--format", "{{json .Config.Entrypoint}}"],
+        capture_output=True, text=True, timeout=30,
+    )
+    assert r.returncode == 0, f"docker inspect failed: {r.stderr}"
+    entrypoint = r.stdout.strip()
+    assert "/init" in entrypoint, (
+        f"ENTRYPOINT is not /init: {entrypoint!r}"
+    )
+    # The entrypoint array should be ["/init", "/opt/hermes/docker/main-wrapper.sh"]
+    # /usr/bin/tini should NOT be in the entrypoint.
+    assert "tini" not in entrypoint.lower(), (
+        f"ENTRYPOINT references tini instead of /init: {entrypoint!r}"
+    )
--- a/tests/docker/test_toplevel_chown.py
+++ b/tests/docker/test_toplevel_chown.py
@@ -0,0 +1,93 @@
+"""Runtime smoke tests for Docker top-level state-file ownership repair.
+
+Build the real image and verify the actual runtime behavior:
+
+  1. Root-owned top-level state files (auth.json, state.db, gateway.lock,
+     gateway_state.json) are chowned to hermes on boot
+  2. Non-allowlisted host-owned files are NOT touched (targeted, not
+     blanket find -user root sweep)
+"""
+from __future__ import annotations
+
+from tests.docker.conftest import (
+    docker_exec,
+    docker_exec_sh,
+    restart_container,
+    start_container,
+)
+
+
+# The files the stage2 hook should repair (mirrors the allowlist in
+# stage2-hook.sh). We test a representative subset.
+ALLOWLISTED_FILES = ("auth.json", "state.db", "gateway.lock", "gateway_state.json")
+
+
+def test_root_owned_state_files_repaired_on_boot(
+    built_image: str, container_name: str,
+) -> None:
+    """Root-owned top-level state files must be chowned to hermes on boot."""
+    start_container(built_image, container_name)
+
+    # Create root-owned state files to simulate docker exec (root) writes
+    for f in ALLOWLISTED_FILES:
+        docker_exec(
+            container_name, "touch", f"/opt/data/{f}",
+            user="root", timeout=5,
+        )
+
+    # Verify they're root-owned
+    r = docker_exec_sh(
+        container_name,
+        " ".join(f'stat -c %U /opt/data/{f}' for f in ALLOWLISTED_FILES),
+        timeout=5,
+    )
+    for line in r.stdout.split():
+        assert line == "root", f"expected root-owned, got: {line}"
+
+    # Restart - stage2 should repair ownership
+    restart_container(container_name)
+
+    # Verify files are now hermes-owned
+    r = docker_exec_sh(
+        container_name,
+        " ".join(f'stat -c %U /opt/data/{f}' for f in ALLOWLISTED_FILES),
+        timeout=5,
+    )
+    for line in r.stdout.split():
+        assert line == "hermes", (
+            f"expected hermes-owned after restart, got: {line}"
+        )
+
+
+def test_non_allowlisted_host_file_not_touched(
+    built_image: str, container_name: str,
+) -> None:
+    """A non-allowlisted host-owned file must NOT be chowned, even if
+    root-owned. Regression guard for #19788 / #19795: a bind-mounted
+    $HERMES_HOME may contain host-owned files Hermes does not manage."""
+    start_container(built_image, container_name)
+
+    # Create a non-allowlisted file as root
+    docker_exec(
+        container_name, "touch", "/opt/data/host_secret.json",
+        user="root", timeout=5,
+    )
+    # Make it root-owned explicitly (it already is, but be sure)
+    docker_exec(
+        container_name, "chown", "root:root", "/opt/data/host_secret.json",
+        user="root", timeout=5,
+    )
+
+    # Restart
+    restart_container(container_name)
+
+    # The file must STILL be root-owned (not touched by stage2)
+    r = docker_exec_sh(
+        container_name,
+        "stat -c %U /opt/data/host_secret.json",
+        timeout=5,
+    )
+    assert r.stdout.strip() == "root", (
+        f"non-allowlisted host file was chowned by stage2 (should be "
+        f"preserved): {r.stdout.strip()}"
+    )
--- a/tests/docker/test_user_flag_guard.py
+++ b/tests/docker/test_user_flag_guard.py
@@ -0,0 +1,66 @@
+"""Runtime smoke tests for Docker --user flag guard.
+
+Build the real image and verify the actual runtime behavior:
+
+  1. docker run --user <arbitrary-uid> is rejected with actionable guidance
+  2. Root start (default) works fine
+  3. --user <hermes-uid> (10000) is allowed (supported non-root start)
+"""
+from __future__ import annotations
+
+import subprocess
+
+
+def test_arbitrary_user_uid_rejected(
+    built_image: str,
+) -> None:
+    """docker run --user 1000 must be rejected with actionable guidance."""
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--user", "1000:1000",
+         built_image, "echo", "should_not_reach"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode != 0, (
+        f"container started with arbitrary --user UID unexpectedly: {r.stdout}"
+    )
+    assert "should_not_reach" not in r.stdout, (
+        f"container ran despite --user rejection: {r.stdout}"
+    )
+    combined = r.stdout + r.stderr
+    assert "not supported" in combined.lower(), (
+        f"rejection message missing 'not supported': {combined[-500:]}"
+    )
+    # Must mention the remediation env vars
+    assert "HERMES_UID" in combined or "PUID" in combined, (
+        f"rejection message missing remediation guidance: {combined[-500:]}"
+    )
+
+
+def test_root_start_works(
+    built_image: str,
+) -> None:
+    """Root start (the default) must work without issues."""
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "sh", "-c", "echo OK"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, f"root start failed: {r.stderr[-500:]}"
+    assert "OK" in r.stdout
+
+
+def test_user_pinned_to_hermes_uid_works(
+    built_image: str,
+) -> None:
+    """docker run --user 10000:10000 (the hermes UID) must be allowed.
+
+    This is the supported non-root start from #34648 / #34837.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", "--user", "10000:10000",
+         built_image, "sh", "-c", "echo OK"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0, (
+        f"--user 10000:10000 (hermes UID) was rejected: {r.stderr[-500:]}"
+    )
+    assert "OK" in r.stdout
--- a/tests/docker/test_zombie_reaping.py
+++ b/tests/docker/test_zombie_reaping.py
@@ -12,22 +12,16 @@ docstring.
 """
 from __future__ import annotations

-import subprocess
 import time

-from tests.docker.conftest import docker_exec, docker_exec_sh
+from tests.docker.conftest import docker_exec, docker_exec_sh, start_container, start_container


 def test_orphan_zombies_reaped(
    built_image: str, container_name: str,
 ) -> None:
    """Spawn an orphan child that exits immediately. PID 1 must reap it."""
-    subprocess.run(
-        ["docker", "run", "-d", "--name", container_name, built_image,
-         "sleep", "60"],
-        check=True, capture_output=True, timeout=30,
-    )
-    time.sleep(2)
+    start_container(built_image, container_name, cmd="sleep 60")

    # `( ( sleep 0.1 & ) & ); sleep 1` creates a grandchild detached from
    # the original docker exec session — it becomes an orphan reparented
@@ -42,4 +36,4 @@ def test_orphan_zombies_reaped(
        line for line in r.stdout.split("\n")
        if line.strip().startswith("Z")
    ]
-    assert not zombies, f"Zombies not reaped by PID 1: {zombies}"
+    assert not zombies, f"Zombies not reaped by PID 1: {zombies}"
--- a/tests/hermes_cli/test_dashboard_auth_401_reauth.py
+++ b/tests/hermes_cli/test_dashboard_auth_401_reauth.py
@@ -27,7 +27,6 @@ import pytest
 # against each other (and against any other file that also touches
 # ``app.state``) — the marker name is shared across all dashboard-auth test
 # files that gate the app.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
 from fastapi import FastAPI
 from fastapi.responses import Response
 from fastapi.testclient import TestClient
--- a/tests/hermes_cli/test_dashboard_auth_gate.py
+++ b/tests/hermes_cli/test_dashboard_auth_gate.py
@@ -10,7 +10,6 @@ import pytest
 # against each other (and against any other file that also touches
 # ``app.state``) — the marker name is shared across all dashboard-auth test
 # files that gate the app.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
 from fastapi.testclient import TestClient

 from hermes_cli import web_server
--- a/tests/hermes_cli/test_dashboard_auth_middleware.py
+++ b/tests/hermes_cli/test_dashboard_auth_middleware.py
@@ -16,12 +16,6 @@ from __future__ import annotations

 import pytest

-# Phase 5 / Phase 6: these tests mutate ``web_server.app.state.auth_required``
-# at module level. Run them in the same xdist worker so they don't race
-# against each other (and against any other file that also touches
-# ``app.state``) — the marker name is shared across all dashboard-auth test
-# files that gate the app.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
 from fastapi.testclient import TestClient

 from hermes_cli import web_server
--- a/tests/hermes_cli/test_dashboard_auth_password_login.py
+++ b/tests/hermes_cli/test_dashboard_auth_password_login.py
@@ -16,11 +16,6 @@ import time

 import pytest

-# These tests mutate ``web_server.app.state.auth_required`` at module level,
-# so they share the dashboard-auth app-state xdist group to avoid racing
-# other gate tests.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
-
 from fastapi.testclient import TestClient

 from hermes_cli import web_server
--- a/tests/hermes_cli/test_dashboard_auth_prefix.py
+++ b/tests/hermes_cli/test_dashboard_auth_prefix.py
@@ -32,10 +32,6 @@ from __future__ import annotations

 import pytest

-# Same xdist group as the other dashboard-auth tests — they all mutate
-# web_server.app.state.auth_required at module level.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
-
 from fastapi.testclient import TestClient

 from hermes_cli import web_server
--- a/tests/hermes_cli/test_dashboard_auth_status_endpoint.py
+++ b/tests/hermes_cli/test_dashboard_auth_status_endpoint.py
@@ -20,10 +20,6 @@ from hermes_cli import web_server
 from hermes_cli.dashboard_auth import clear_providers, register_provider
 from tests.hermes_cli.conftest_dashboard_auth import StubAuthProvider

-# These tests mutate ``web_server.app.state.auth_required`` so they share
-# the same xdist group as the other dashboard-auth gated_app tests.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
-

@pytest.fixture
 def gated_client():
--- a/tests/hermes_cli/test_dashboard_auth_ws_auth.py
+++ b/tests/hermes_cli/test_dashboard_auth_ws_auth.py
@@ -17,12 +17,6 @@ from types import SimpleNamespace

 import pytest

-# Phase 5 / Phase 6: these tests mutate ``web_server.app.state.auth_required``
-# at module level. Run them in the same xdist worker so they don't race
-# against each other (and against any other file that also touches
-# ``app.state``) — the marker name is shared across all dashboard-auth test
-# files that gate the app.
-pytestmark = pytest.mark.xdist_group("dashboard_auth_app_state")
 from fastapi.testclient import TestClient

 from hermes_cli import web_server
--- a/tests/test_docker_home_override_scripts.py
+++ b/tests/test_docker_home_override_scripts.py
@@ -1,91 +0,0 @@
-"""Regression tests for Docker HOME overrides under s6/with-contenv."""
-
-from pathlib import Path
-
-
-REPO_ROOT = Path(__file__).resolve().parent.parent
-DASHBOARD_RUN = REPO_ROOT / "docker" / "s6-rc.d" / "dashboard" / "run"
-MAIN_WRAPPER = REPO_ROOT / "docker" / "main-wrapper.sh"
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-def test_main_wrapper_preserves_docker_workdir() -> None:
-    """The main-wrapper MUST save and restore the original working
-    directory so the container starts in the Docker ``-w`` directory,
-    not /opt/data.  Regression test for #35472.
-    """
-    text = MAIN_WRAPPER.read_text(encoding="utf-8")
-
-    # Must save original cwd before cd /opt/data.
-    assert "_hermes_orig_cwd" in text, (
-        "main-wrapper.sh must save the original cwd before cd /opt/data"
-    )
-    assert 'HERMES_ORIG_CWD:-$PWD' in text, (
-        "main-wrapper.sh must capture PWD as the fallback original cwd"
-    )
-
-    # Must cd to /opt/data for init (existing behaviour preserved).
-    assert "cd /opt/data" in text
-
-    # Must restore original cwd before exec'ing the user command.
-    # The restore cd must appear AFTER venv activation but BEFORE the
-    # first exec / if-block.
-    activate_idx = text.index("/opt/hermes/.venv/bin/activate")
-    restore_idx = text.index('cd "$_hermes_orig_cwd"')
-    exec_idx = text.index("if [ $# -eq 0 ]")
-    assert activate_idx < restore_idx < exec_idx, (
-        "cd $_hermes_orig_cwd must appear after venv activation and "
-        "before the exec routing block"
-    )
-
-
-def test_dashboard_run_resets_home_before_dropping_privileges() -> None:
-    text = DASHBOARD_RUN.read_text(encoding="utf-8")
-
-    assert "#!/command/with-contenv sh" in text
-    assert "export HOME=/opt/data" in text
-    assert "exec s6-setuidgid hermes hermes dashboard" in text
-
-
-def test_dashboard_run_does_not_derive_insecure_from_bind_host() -> None:
-    """The s6 dashboard run script MUST NOT auto-add ``--insecure`` based on
-    ``HERMES_DASHBOARD_HOST``. Doing so disables the OAuth auth gate on
-    every non-loopback bind even when an auth provider is registered —
-    the exact regression that exposed every wildcard-subdomain agent
-    dashboard publicly until early 2026.
-
-    The opt-in is now explicit: ``HERMES_DASHBOARD_INSECURE=1`` (truthy).
-    The auth gate is the authority on whether non-loopback binds are safe.
-    """
-    text = DASHBOARD_RUN.read_text(encoding="utf-8")
-
-    # No legacy host-derived flip.
-    assert '127.0.0.1|localhost' not in text, (
-        "Run script still derives --insecure from the bind host. The gate "
-        "is the authority now — opt in via HERMES_DASHBOARD_INSECURE instead."
-    )
-    assert 'case "$dash_host" in' not in text, (
-        "Legacy host-derived --insecure case-statement is back."
-    )
-
-    # New opt-in env var present.
-    assert "HERMES_DASHBOARD_INSECURE" in text, (
-        "Explicit HERMES_DASHBOARD_INSECURE opt-in is missing."
-    )
-    # Truthy values aligned with the rest of the s6 scripts
-    # (e.g. HERMES_DASHBOARD).
-    for truthy in ("1", "true", "TRUE", "True", "yes", "YES", "Yes"):
-        assert truthy in text, (
-            f"HERMES_DASHBOARD_INSECURE should accept truthy value {truthy!r}"
-        )
-
-
-def test_stage2_hook_repairs_profiles_and_cron_ownership_on_every_boot() -> None:
-    """profiles/ and cron/ must both be reclaimed after root-context writes."""
-    text = STAGE2_HOOK.read_text(encoding="utf-8")
-
-    assert 'if [ -d "$HERMES_HOME/profiles" ]; then' in text
-    assert 'chown -R hermes:hermes "$HERMES_HOME/profiles" 2>/dev/null || true' in text
-
-    assert 'if [ -d "$HERMES_HOME/cron" ]; then' in text
-    assert 'chown -R hermes:hermes "$HERMES_HOME/cron" 2>/dev/null || true' in text
--- a/tests/test_docker_stage2_browser_discovery.py
+++ b/tests/test_docker_stage2_browser_discovery.py
@@ -1,19 +0,0 @@
-"""Regression tests for Docker stage2 browser executable discovery."""
-
-from pathlib import Path
-
-
-def test_stage2_discovers_playwright_arm64_headless_shell() -> None:
-    """Playwright's --only-shell layout may use a headless_shell basename."""
-    script = Path("docker/stage2-hook.sh").read_text()
-
-    assert "-name 'headless_shell'" in script
-
-
-def test_stage2_discovery_stays_filename_matched() -> None:
-    """Avoid broad path grep that can pick executable shared libraries."""
-    script = Path("docker/stage2-hook.sh").read_text()
-
-    discovery_block = script.split("browser_bin=$(", 1)[1].split(")\n    if", 1)[0]
-    assert "find \"$PLAYWRIGHT_BROWSERS_PATH\" -type f -executable" in discovery_block
-    assert "grep" not in discovery_block
--- a/tests/test_dockerfile_tini_compat_shim.py
+++ b/tests/test_dockerfile_tini_compat_shim.py
@@ -1,49 +0,0 @@
-"""Regression test for #34192 — Dockerfile must keep the tini compat shim
-for orchestration templates that still reference /usr/bin/tini.
-
-This is a documentation-as-test guard: removing the shim is a real
-choice, but it should be done deliberately (e.g. once Hostinger's
-'Hermes WebUI' catalog updates to /init) and not by accident.
-"""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-
-def _dockerfile_text() -> str:
-    return (Path(__file__).parent.parent / "Dockerfile").read_text(encoding="utf-8")
-
-
-def test_tini_compat_symlink_present():
-    """The /usr/bin/tini -> /init symlink line must exist for #34192."""
-    df = _dockerfile_text()
-    assert "ln -sf /init /usr/bin/tini" in df, (
-        "Dockerfile must keep the tini compat symlink (#34192). "
-        "Removing it breaks orchestration templates that still pin "
-        "/usr/bin/tini as the entrypoint (Hostinger 'Hermes WebUI' "
-        "catalog as of v0.14.x)."
-    )
-
-
-def test_tini_compat_comment_explains_why():
-    """The symlink line is comment-anchored to #34192 so a future reader
-    knows why it exists. Removing the comment makes it look like dead
-    code worth deleting."""
-    df = _dockerfile_text()
-    assert "#34192" in df, (
-        "The Dockerfile tini compat shim must keep its #34192 anchor "
-        "comment so future maintainers know why the symlink is there."
-    )
-
-
-def test_entrypoint_still_init_not_tini():
-    """Sanity check: the actual ENTRYPOINT is still /init (s6-overlay).
-    The shim is for legacy external wrappers, not for the image's own
-    runtime — that path must continue to use the canonical /init."""
-    df = _dockerfile_text()
-    assert 'ENTRYPOINT [ "/init"' in df, (
-        "Dockerfile ENTRYPOINT must remain /init (s6-overlay). The "
-        "tini shim is only for external wrappers that haven't been "
-        "updated yet."
-    )
--- a/tests/test_docker_webui_install_surface.py
+++ b/tests/test_docker_webui_install_surface.py
@@ -1,5 +1,6 @@
-"""Guards for the multi-container Hermes WebUI install surface."""
-
+"""Test that setup.py uses temporary output directories when the source
+tree is read-only (as it is inside the Docker WebUI install surface).
+"""
 from __future__ import annotations

 from pathlib import Path
@@ -20,18 +21,6 @@ def _is_under(path: str, root: Path) -> bool:
    return True


-def test_docker_context_includes_license_file() -> None:
-    """PEP 639 license-files metadata must resolve inside the Docker image."""
-    dockerignore = (REPO_ROOT / ".dockerignore").read_text(encoding="utf-8")
-    active_lines = [
-        line.strip()
-        for line in dockerignore.splitlines()
-        if line.strip() and not line.lstrip().startswith("#")
-    ]
-
-    assert "LICENSE" not in active_lines
-
-
 def test_setup_uses_temporary_outputs_when_source_tree_is_read_only(
    monkeypatch,
 ) -> None:
--- a/tests/tools/test_dockerfile_immutable_install.py
+++ b/tests/tools/test_dockerfile_immutable_install.py
@@ -12,22 +12,16 @@ def _dockerfile_text() -> str:
    return DOCKERFILE.read_text()


-def test_dockerfile_makes_opt_hermes_root_owned_and_non_writable() -> None:
+def test_dockerfile_makes_opt_hermes_readonly_for_hermes_user() -> None:
    text = _dockerfile_text()

-    assert "COPY --chown=hermes:hermes . ." not in text
-    assert "COPY . ." in text
-    assert "chown -R root:root /opt/hermes" in text
-    assert "chmod -R a+rX /opt/hermes" in text
-    assert "chmod -R a-w /opt/hermes" in text
-
-    immutable_block = re.search(
-        r"RUN mkdir -p /opt/hermes/bin && \\\n"
-        r"(?:.*\\\n)+?"
-        r"\s+chmod -R a-w /opt/hermes",
-        text,
-    )
-    assert immutable_block, "Dockerfile must lock /opt/hermes after installing code/deps"
+    # --chmod on the source COPY bakes read-only perms at copy time instead
+    # of a separate chmod -R pass (which walked ~30k files — #49113).
+    assert "COPY --link --chmod=a+rX,go-w . ." in text
+    # The old tree-walking passes must not be present.
+    assert "chown -R root:root /opt/hermes" not in text
+    assert "chmod -R a+rX /opt/hermes" not in text
+    assert "chmod -R a-w /opt/hermes" not in text


 def test_dockerfile_keeps_mutable_state_under_opt_data() -> None:
@@ -68,22 +62,20 @@ def test_dockerfile_bakes_code_scoped_install_method_stamp() -> None:
    (/opt/hermes/.install_method) first; baking it at build time keeps the
    published image self-identifying as 'docker' WITHOUT writing into the
    shared $HERMES_HOME data volume (which a host install may also use).
-    It must live inside the immutable block so the runtime user can't alter it.
+    The stamp is created by root in the shim-wiring RUN block; the hermes
+    user can't modify it (go-w from the --chmod on the source COPY).
    """
    text = _dockerfile_text()
    assert "printf 'docker\\n' > /opt/hermes/.install_method" in text

-    immutable_block = re.search(
+    # The stamp must be in the RUN block that wires the exec shim.
+    shim_block = re.search(
        r"RUN mkdir -p /opt/hermes/bin && \\\n"
        r"(?:.*\\\n)+?"
-        r"\s+chmod -R a-w /opt/hermes",
+        r"\s+printf 'docker\\n' > /opt/hermes/\.install_method",
        text,
    )
-    assert immutable_block, "immutable block must exist"
-    assert ".install_method" in immutable_block.group(0), (
-        "the code-scoped install-method stamp must be baked inside the "
-        "immutable /opt/hermes block"
-    )
+    assert shim_block, "install-method stamp must be in the shim-wiring RUN block"


 def test_dockerfile_redirects_lazy_installs_to_durable_target() -> None:
--- a/tests/tools/test_stage2_hook_gateway_bootstrap_state.py
+++ b/tests/tools/test_stage2_hook_gateway_bootstrap_state.py
@@ -1,152 +0,0 @@
-"""Contract test: the s6-overlay stage2 hook seeds gateway_state.json from
-HERMES_GATEWAY_BOOTSTRAP_STATE on first boot, so a freshly-provisioned
-container can come up with the gateway already running.
-
-Background. On a blank volume there is no gateway_state.json, so the boot
-reconciler (cont-init.d/02-reconcile-profiles ->
-container_boot.reconcile_profile_gateways) registers the gateway-default s6
-slot but leaves it DOWN — it only auto-starts when the last recorded state was
-"running". A container provisioned on a fresh volume therefore comes up with
-the gateway down until something starts it.
-
-An orchestrator that wants the gateway running from first boot sets
-HERMES_GATEWAY_BOOTSTRAP_STATE=running; stage2-hook.sh (installed as
-/etc/cont-init.d/01-hermes-setup, which runs lexicographically BEFORE
-02-reconcile-profiles) seeds the state file so the reconciler sees
-prior_state=running and brings the slot up on the very first boot.
-
-This mirrors the existing HERMES_AUTH_JSON_BOOTSTRAP env-seed pattern: it seeds
-the SAME gateway_state.json the reconciler already consults, guarded by
-``[ ! -f ]`` so persisted runtime state always wins on subsequent boots (a
-deliberately-stopped gateway must stay stopped across restarts).
-"""
-from __future__ import annotations
-
-import json
-import re
-import shutil
-import subprocess
-import tempfile
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def _seed_block(text: str) -> str:
-    """Extract the ``if [ ! -f "$HERMES_HOME/gateway_state.json" ] && … fi``
-    block that seeds the gateway state file from the bootstrap env var."""
-    m = re.search(
-        r'(if \[ ! -f "\$HERMES_HOME/gateway_state\.json" \] && \\\n'
-        r"(?:.*\n)*?fi)",
-        text,
-    )
-    assert m, (
-        "stage2-hook.sh must contain the gateway_state.json bootstrap-seed block "
-        "guarded on HERMES_GATEWAY_BOOTSTRAP_STATE"
-    )
-    return m.group(1)
-
-
-def test_seed_block_present_and_guarded(stage2_text: str) -> None:
-    block = _seed_block(stage2_text)
-    # Must be a first-boot-only seed (the [ ! -f ] guard) keyed on the env var.
-    assert '[ ! -f "$HERMES_HOME/gateway_state.json" ]' in block, (
-        "seed must be guarded by [ ! -f ] so persisted state wins on restart"
-    )
-    assert "HERMES_GATEWAY_BOOTSTRAP_STATE" in block
-    assert "gateway_state" in block
-
-
-def _run_seed(
-    text: str, *, env_value: str | None, preexisting: str | None
-) -> str | None:
-    """Run the extracted seed block in a sandbox $HERMES_HOME.
-
-    ``env_value`` is the HERMES_GATEWAY_BOOTSTRAP_STATE value (None = unset).
-    ``preexisting`` is the contents of a gateway_state.json placed before the
-    block runs (None = no file). Returns the file's contents afterwards, or
-    None if it doesn't exist. ``chown``/``chmod`` are stubbed so the block
-    runs without real root.
-    """
-    bash = shutil.which("bash")
-    if bash is None:
-        pytest.skip("bash not available")
-    block = _seed_block(text)
-
-    with tempfile.TemporaryDirectory() as d:
-        dpath = Path(d)
-        home = dpath / "home"
-        home.mkdir()
-        state_file = home / "gateway_state.json"
-        if preexisting is not None:
-            state_file.write_text(preexisting)
-
-        env_line = (
-            f'export HERMES_GATEWAY_BOOTSTRAP_STATE="{env_value}"\n'
-            if env_value is not None
-            else "unset HERMES_GATEWAY_BOOTSTRAP_STATE\n"
-        )
-        script = (
-            "set -e\n"
-            f'HERMES_HOME="{home}"\n'
-            # Stub privilege ops — the sandbox isn't root.
-            "chown() { :; }\n"
-            "chmod() { :; }\n"
-            + env_line
-            + block
-        )
-        script_path = dpath / "harness.sh"
-        script_path.write_text(script)
-
-        proc = subprocess.run(
-            [bash, str(script_path)], capture_output=True, text=True
-        )
-        assert proc.returncode == 0, proc.stderr
-
-        if not state_file.exists():
-            return None
-        return state_file.read_text()
-
-
-def test_seeds_running_state_on_blank_volume(stage2_text: str) -> None:
-    """env=running + no pre-existing file -> writes a valid running state."""
-    out = _run_seed(stage2_text, env_value="running", preexisting=None)
-    assert out is not None, "seed must create gateway_state.json"
-    assert json.loads(out).get("gateway_state") == "running"
-
-
-def test_does_not_clobber_existing_state(stage2_text: str) -> None:
-    """The [ ! -f ] guard: an existing state file is never overwritten, even
-    when the bootstrap env var says running. A deliberately-stopped gateway
-    must stay stopped across restarts."""
-    existing = json.dumps({"gateway_state": "stopped", "pid": 123})
-    out = _run_seed(stage2_text, env_value="running", preexisting=existing)
-    assert out == existing, "seed must not clobber a persisted state file"
-
-
-def test_no_seed_when_env_unset(stage2_text: str) -> None:
-    """No env var -> no file written (preserves the default down-on-first-boot
-    behaviour for orchestrators that don't opt in)."""
-    out = _run_seed(stage2_text, env_value=None, preexisting=None)
-    assert out is None, "seed must not run when HERMES_GATEWAY_BOOTSTRAP_STATE is unset"
-
-
-def test_non_running_value_ignored(stage2_text: str) -> None:
-    """Only a literal "running" is honoured; any other value is ignored so a
-    typo can't write a bogus state. (The reconciler's _AUTOSTART_STATES is
-    exactly {"running"}.)"""
-    for bogus in ("stopped", "Running", "1", "true", "starting"):
-        out = _run_seed(stage2_text, env_value=bogus, preexisting=None)
-        assert out is None, (
-            f"only 'running' should seed a state file, not {bogus!r}"
-        )
--- a/tests/tools/test_stage2_hook_immutable_install.py
+++ b/tests/tools/test_stage2_hook_immutable_install.py
@@ -1,48 +0,0 @@
-"""Contract tests for the Docker stage2 immutable install-tree policy.
-
-Hosted/container Hermes keeps user-writable state under HERMES_HOME
-(/opt/data). The installed source, venv, TUI bundle, and node_modules under
-/opt/hermes must remain root-owned/non-writable by the runtime hermes user so
-an agent session cannot self-modify the installation and brick the gateway.
-"""
-from __future__ import annotations
-
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def test_stage2_does_not_chown_install_tree_to_hermes(stage2_text: str) -> None:
-    assert "Fixing ownership of build trees under $INSTALL_DIR" not in stage2_text
-    assert 'chown -R hermes:hermes \\\n        "$INSTALL_DIR/.venv"' not in stage2_text
-
-    assert "venv_owner=$(stat -c %u \"$INSTALL_DIR/.venv\"" not in stage2_text
-    assert "chown of build trees failed" not in stage2_text
-    for install_tree in (
-        '"$INSTALL_DIR/.venv" \\',
-        '"$INSTALL_DIR/ui-tui" \\',
-        '"$INSTALL_DIR/gateway" \\',
-        '"$INSTALL_DIR/node_modules" \\',
-    ):
-        assert install_tree not in stage2_text, (
-            f"stage2 must not chown {install_tree} back to hermes; "
-            "the Dockerfile keeps /opt/hermes immutable and writable state "
-            "belongs under HERMES_HOME"
-        )
-
-
-def test_stage2_documents_immutable_install_contract(stage2_text: str) -> None:
-    assert "Immutable install tree" in stage2_text
-    assert "PYTHONDONTWRITEBYTECODE" in stage2_text
-    assert "HERMES_DISABLE_LAZY_INSTALLS=1" in stage2_text
-    assert "/opt/hermes" in stage2_text
--- a/tests/tools/test_stage2_hook_install_method_stamp.py
+++ b/tests/tools/test_stage2_hook_install_method_stamp.py
@@ -1,61 +0,0 @@
-"""Contract test: the s6-overlay stage2 hook must NOT stamp the install method
-into the shared $HERMES_HOME, and must heal a stale 'docker' stamp left there
-by older images.
-
-Background (shared-$HERMES_HOME bug)
------------------------------------
-$HERMES_HOME (/opt/data) is a DATA volume that users commonly bind-mount from
-the host (``~/.hermes:/opt/data``) and sometimes share with a host-side
-Desktop/CLI install. Older images wrote ``printf 'docker' > $HERMES_HOME/.install_method``
-at boot, which clobbered the host install's own marker — so the host's in-app
-updater read 'docker' and refused to run ``hermes update`` ("doesn't apply
-inside the Docker container").
-
-The fix scopes the stamp to the install tree (baked at
-``/opt/hermes/.install_method`` in the Dockerfile, read first by
-``detect_install_method``). stage2 must therefore:
-
-  * NOT write the 'docker' stamp into $HERMES_HOME any more, and
-  * proactively remove a stale 'docker' stamp from $HERMES_HOME so homes
-    already poisoned by an older image self-heal on the next boot.
-"""
-from __future__ import annotations
-
-import re
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def test_stage2_does_not_write_install_method_into_home(stage2_text: str) -> None:
-    # No write/tee of the home-scoped install-method stamp anywhere.
-    assert not re.search(
-        r"(tee|>)\s*\"?\$HERMES_HOME/\.install_method", stage2_text
-    ), (
-        "stage2 must not stamp $HERMES_HOME/.install_method — that data dir "
-        "may be shared with a host install whose marker would be clobbered"
-    )
-
-
-def test_stage2_heals_stale_docker_home_stamp(stage2_text: str) -> None:
-    # It must remove a stale 'docker' stamp from $HERMES_HOME so already
-    # poisoned shared homes recover.
-    assert 'rm -f "$HERMES_HOME/.install_method"' in stage2_text, (
-        "stage2 must remove a stale 'docker' stamp from $HERMES_HOME to heal "
-        "homes poisoned by older images"
-    )
-    # The removal must be guarded on the value being 'docker' so we never
-    # delete a legitimately-different stamp a user/host install put there.
-    assert re.search(r'\[\s*"\$stamped"\s*=\s*"docker"\s*\]', stage2_text), (
-        "the stale-stamp removal must be guarded on the value == 'docker'"
-    )
--- a/tests/tools/test_stage2_hook_log_dir_seed.py
+++ b/tests/tools/test_stage2_hook_log_dir_seed.py
@@ -1,60 +0,0 @@
-"""Contract test: the s6-overlay stage2 hook seeds $HERMES_HOME/logs/gateways
-as the hermes user.
-
-Regression guard for #45258: the per-profile gateway log service
-(`gateway-<profile>/log/run`) creates `logs/gateways/` via `mkdir -p` but only
-chowns the leaf `logs/gateways/<profile>`. If the first log service to boot
-runs in root context, the `gateways/` parent is created root-owned and stays
-that way; every profile registered later runs its log service as the dropped
-hermes user and s6-log crash-loops on `mkdir: Permission denied`.
-
-Seeding `logs/gateways` in stage2 (cont-init runs before any service starts)
-guarantees the parent already exists hermes-owned by the time the first
-log/run executes its `mkdir -p`.
-"""
-from __future__ import annotations
-
-import re
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def _seed_mkdir_block(text: str) -> str:
-    """Extract the `as_hermes mkdir -p \\ ...` seed block."""
-    m = re.search(r"as_hermes mkdir -p \\\n(?:[^\n]*\\\n)*[^\n]*\n", text)
-    assert m, "stage2-hook.sh must contain the as_hermes mkdir -p seed block"
-    return m.group(0)
-
-
-def test_logs_gateways_is_seeded(stage2_text: str) -> None:
-    block = _seed_mkdir_block(stage2_text)
-    assert '"$HERMES_HOME/logs/gateways"' in block, (
-        "logs/gateways must be seeded hermes-owned in stage2 so profiles "
-        "added after first boot can create their log dirs (#45258)"
-    )
-    # The parent must also be seeded so mkdir -p inside the block never
-    # creates logs/ implicitly with surprising ownership.
-    assert '"$HERMES_HOME/logs"' in block
-
-
-def test_logs_subtree_is_healed_when_chown_needed(stage2_text: str) -> None:
-    """The needs_chown repair loop must cover the logs subtree recursively —
-    that is what makes the seed entry above sufficient (no separate
-    logs/gateways loop entry needed)."""
-    m = re.search(r"for sub in ([^;]*); do", stage2_text)
-    assert m, "stage2-hook.sh must contain the needs_chown subdir repair loop"
-    assert "logs" in m.group(1).split(), (
-        "the needs_chown loop must recursively chown logs/ — it covers "
-        "logs/gateways, so the seed list does not need a loop twin"
-    )
--- a/tests/tools/test_stage2_hook_puid_pgid.py
+++ b/tests/tools/test_stage2_hook_puid_pgid.py
@@ -1,110 +0,0 @@
-"""Contract test: the s6-overlay stage2 hook accepts PUID/PGID as aliases for
-HERMES_UID/HERMES_GID.
-
-Regression guard for #15290.  NAS platforms (UGOS, Synology, unRAID) bind-mount
-/opt/data from a host directory owned by the user's own UID and expect the
-LinuxServer.io PUID/PGID convention.  Without the alias those vars are silently
-ignored, the s6-setuidgid drop lands on UID 10000, and the runtime cannot read
-the volume.  HERMES_UID/HERMES_GID must still take precedence when both are
-set.
-
-The s6-overlay rework moved bootstrap from docker/entrypoint.sh (now a shim)
-to docker/stage2-hook.sh, which is installed as /etc/cont-init.d/01-hermes-setup
-by the Dockerfile.  This test targets the post-rework location.
-"""
-from __future__ import annotations
-
-import os
-import shutil
-import subprocess
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def _alias_lines(text: str) -> list[str]:
-    """The stage2 hook lines that resolve HERMES_UID/HERMES_GID from aliases."""
-    return [
-        line.strip()
-        for line in text.splitlines()
-        if line.strip().startswith(("HERMES_UID=", "HERMES_GID="))
-    ]
-
-
-def test_stage2_hook_resolves_puid_pgid_aliases(stage2_text: str) -> None:
-    alias_lines = _alias_lines(stage2_text)
-    assert any("PUID" in line for line in alias_lines), (
-        "docker/stage2-hook.sh must resolve HERMES_UID from a PUID alias; see #15290"
-    )
-    assert any("PGID" in line for line in alias_lines), (
-        "docker/stage2-hook.sh must resolve HERMES_GID from a PGID alias; see #15290"
-    )
-
-
-def _resolve(stage2_text: str, env: dict[str, str]) -> str:
-    """Run the stage2 hook's alias-resolution lines in isolation and report the
-    resolved ``HERMES_UID:HERMES_GID`` pair."""
-    bash = shutil.which("bash")
-    if bash is None:
-        pytest.skip("bash not available")
-    script = "\n".join(_alias_lines(stage2_text))
-    script += '\necho "${HERMES_UID:-}:${HERMES_GID:-}"\n'
-    proc = subprocess.run(
-        [bash, "-ec", script],
-        env={"PATH": os.environ.get("PATH", "")} | env,
-        capture_output=True,
-        text=True,
-    )
-    assert proc.returncode == 0, proc.stderr
-    return proc.stdout.strip()
-
-
-def test_puid_pgid_populate_hermes_uid_gid(stage2_text: str) -> None:
-    assert _resolve(stage2_text, {"PUID": "1000", "PGID": "10"}) == "1000:10"
-
-
-def test_hermes_uid_gid_take_precedence_over_aliases(stage2_text: str) -> None:
-    resolved = _resolve(
-        stage2_text,
-        {"HERMES_UID": "2000", "HERMES_GID": "2001", "PUID": "1000", "PGID": "10"},
-    )
-    assert resolved == "2000:2001"
-
-
-def test_no_uid_vars_leaves_values_empty(stage2_text: str) -> None:
-    # An empty resolution means the stage2 hook keeps the default hermes user.
-    assert _resolve(stage2_text, {}) == ":"
-
-
-def test_stage2_hook_creates_s6_envdir_before_writing_browser_path(stage2_text: str) -> None:
-    """Regression guard for browser-path export on runtimes where the
-    s6 container_environment directory is absent when the cont-init hook runs.
-    """
-    mkdir_line = "mkdir -p /run/s6/container_environment"
-    write_line = (
-        "printf '%s' \"$browser_bin\" > "
-        "/run/s6/container_environment/AGENT_BROWSER_EXECUTABLE_PATH"
-    )
-
-    assert mkdir_line in stage2_text
-    assert write_line in stage2_text
-    assert stage2_text.index(mkdir_line) < stage2_text.index(write_line)
-
-
-def test_stage2_hook_runs_config_migration_as_hermes(stage2_text: str) -> None:
-    assert "scripts/docker_config_migrate.py" in stage2_text
-    assert 's6-setuidgid hermes "$INSTALL_DIR/.venv/bin/python"' in stage2_text
-
-
-def test_stage2_hook_documents_config_migration_opt_out(stage2_text: str) -> None:
-    assert "HERMES_SKIP_CONFIG_MIGRATION" in stage2_text
--- a/tests/tools/test_stage2_hook_toplevel_chown.py
+++ b/tests/tools/test_stage2_hook_toplevel_chown.py
@@ -1,138 +0,0 @@
-"""Contract test: the s6-overlay stage2 hook resets ownership of hermes-owned
-top-level state files in $HERMES_HOME — but only those, never arbitrary
-host-owned files.
-
-Regression guard for the gateway restart loop reported in #35098: files such
-as gateway.lock / state.db / auth.json live directly under $HERMES_HOME (not in
-a subdir), so the targeted subdir chown misses them. When created or rewritten
-by `docker exec <container> hermes …` (root unless `-u` is passed) they land
-root-owned and the unprivileged hermes runtime then hits PermissionError on next
-startup.
-
-The fix uses an explicit allowlist rather than a blanket `find -user root`
-sweep, preserving the targeted-ownership contract from #19788 / PR #19795: a
-bind-mounted $HERMES_HOME may contain host-owned files Hermes does not manage,
-and those must never be chowned.
-
-The s6-overlay rework moved bootstrap from docker/entrypoint.sh (now a shim) to
-docker/stage2-hook.sh, installed as /etc/cont-init.d/01-hermes-setup. This test
-targets that location.
-"""
-from __future__ import annotations
-
-import os
-import re
-import shutil
-import subprocess
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def _toplevel_chown_loop(text: str) -> str:
-    """Extract the `for f in … chown hermes:hermes "$HERMES_HOME/$f" … done`
-    block that repairs top-level state-file ownership."""
-    m = re.search(
-        r"(for f in \\\n(?:.*\\\n)*?.*; do\n(?:.*\n)*?done)",
-        text,
-    )
-    assert m, "stage2-hook.sh must contain the top-level-file chown for-loop (#35098)"
-    block = m.group(1)
-    assert 'chown hermes:hermes "$HERMES_HOME/$f"' in block, (
-        "the top-level-file loop must chown each allowlisted file to hermes"
-    )
-    return block
-
-
-def test_toplevel_chown_loop_present(stage2_text: str) -> None:
-    block = _toplevel_chown_loop(stage2_text)
-    # The reported-broken files must be covered.
-    for required in ("auth.json", "state.db", "gateway.lock", "gateway_state.json"):
-        assert required in block, (
-            f"top-level chown allowlist must include {required!r} (#35098)"
-        )
-
-
-def test_no_blanket_find_user_root_sweep(stage2_text: str) -> None:
-    """The fix must NOT reintroduce a blanket `find … -user root` chown of
-    $HERMES_HOME contents — that would clobber host-owned files in a bind mount
-    (#19788 / PR #19795)."""
-    assert not re.search(r"find\s+\"?\$\{?HERMES_HOME\}?\"?[^\n]*-user\s+root", stage2_text), (
-        "stage2-hook.sh must not blanket-chown root-owned files under "
-        "$HERMES_HOME via `find -user root`; use the targeted allowlist instead "
-        "so host-owned bind-mounted files are preserved (#19788, #19795)."
-    )
-
-
-def _run_loop(text: str, present_files: list[str]) -> list[str]:
-    """Run the extracted chown loop in a sandbox $HERMES_HOME, with `chown`
-    stubbed to record which paths it was asked to touch. Returns the basenames
-    the loop attempted to chown."""
-    bash = shutil.which("bash")
-    if bash is None:
-        pytest.skip("bash not available")
-    block = _toplevel_chown_loop(text)
-
-    import tempfile
-
-    with tempfile.TemporaryDirectory() as d:
-        dpath = Path(d)
-        home = dpath / "home"
-        home.mkdir()
-        for f in present_files:
-            (home / f).touch()
-        # A non-allowlisted, "host-owned" file that must never be chowned.
-        (home / "host_secret.json").touch()
-
-        # Stub chown to record the basename of its last argument (the path),
-        # so we observe exactly which files the allowlist loop selected
-        # without needing real root privileges.
-        script = (
-            "set -e\n"
-            f'HERMES_HOME="{home}"\n'
-            f'chown() {{ for a in "$@"; do :; done; echo "${{a##*/}}" >> "{dpath}/chown.log"; }}\n'
-            + block
-        )
-        script_path = dpath / "harness.sh"
-        script_path.write_text(script)
-
-        proc = subprocess.run([bash, str(script_path)], capture_output=True, text=True)
-        assert proc.returncode == 0, proc.stderr
-
-        log = dpath / "chown.log"
-        if not log.exists():
-            return []
-        return [ln for ln in log.read_text().splitlines() if ln]
-
-
-def test_loop_chowns_present_allowlisted_files(stage2_text: str) -> None:
-    touched = _run_loop(stage2_text, ["auth.json", "state.db", "gateway.lock"])
-    assert "auth.json" in touched
-    assert "state.db" in touched
-    assert "gateway.lock" in touched
-
-
-def test_loop_skips_nonallowlisted_host_file(stage2_text: str) -> None:
-    """A file NOT on the allowlist (e.g. a host-owned file in a bind mount) must
-    never be chowned, even if present."""
-    touched = _run_loop(stage2_text, ["auth.json"])
-    assert "host_secret.json" not in touched, (
-        "the allowlist loop must not touch non-allowlisted files (#19788)"
-    )
-
-
-def test_loop_skips_absent_files(stage2_text: str) -> None:
-    """Allowlisted files that don't exist are skipped (no spurious chown)."""
-    touched = _run_loop(stage2_text, ["auth.json"])
-    # state.db wasn't created, so it must not appear.
-    assert "state.db" not in touched
--- a/tests/tools/test_stage2_hook_unraid_uid.py
+++ b/tests/tools/test_stage2_hook_unraid_uid.py
@@ -1,86 +0,0 @@
-"""Regression tests for Docker stage2 UID/GID handling on NAS hosts.
-
-Unraid commonly runs appdata as nobody:users (99:100). The stage2 hook must
-accept those non-root numeric IDs and keep legacy/new pairing stores writable
-after targeted ownership reconciliation.
-"""
-from __future__ import annotations
-
-import os
-import re
-import shutil
-import subprocess
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-
-
-@pytest.fixture(scope="module")
-def stage2_text() -> str:
-    if not STAGE2_HOOK.exists():
-        pytest.skip("docker/stage2-hook.sh not present in this checkout")
-    return STAGE2_HOOK.read_text()
-
-
-def _uid_gid_validator(text: str) -> str:
-    marker = "# --- UID/GID remap ---"
-    before_marker = text.split(marker, 1)[0]
-    start = before_marker.index("validate_uid_gid()")
-    return before_marker[start:]
-
-
-def _validate_uid_gid(text: str, value: str) -> bool:
-    bash = shutil.which("bash")
-    if bash is None:
-        pytest.skip("bash not available")
-    script = _uid_gid_validator(text) + '\nvalidate_uid_gid "$CANDIDATE"\n'
-    proc = subprocess.run(
-        [bash, "-c", script],
-        env={"PATH": os.environ.get("PATH", ""), "CANDIDATE": value},
-        capture_output=True,
-        text=True,
-    )
-    return proc.returncode == 0
-
-
-@pytest.mark.parametrize("value", ["1", "99", "100", "1000", "65534"])
-def test_uid_gid_validator_accepts_non_root_nas_ids(stage2_text: str, value: str) -> None:
-    assert _validate_uid_gid(stage2_text, value), (
-        f"stage2 hook must accept NAS UID/GID {value}; Unraid uses 99:100 (#38070)"
-    )
-
-
-@pytest.mark.parametrize("value", ["", "0", "abc", "99x", "65535"])
-def test_uid_gid_validator_rejects_root_invalid_and_out_of_range(
-    stage2_text: str,
-    value: str,
-) -> None:
-    assert not _validate_uid_gid(stage2_text, value)
-
-
-def _targeted_chown_subdirs(text: str) -> list[str]:
-    m = re.search(
-        r"for sub in (?P<items>.*?); do\n\s*if \[ -e \"\$HERMES_HOME/\$sub\" \]",
-        text,
-        re.DOTALL,
-    )
-    assert m, "stage2-hook.sh must contain the targeted subdir chown loop"
-    return m.group("items").split()
-
-
-def test_targeted_chown_covers_legacy_and_new_pairing_dirs(stage2_text: str) -> None:
-    subdirs = _targeted_chown_subdirs(stage2_text)
-    assert "pairing" in subdirs
-    assert "platforms/pairing" in subdirs
-
-
-def test_seeded_directory_list_covers_legacy_and_new_pairing_dirs(stage2_text: str) -> None:
-    seed_block = stage2_text.split("as_hermes mkdir -p \\", 1)[1].split(
-        "# --- Install-method stamp",
-        1,
-    )[0]
-    assert '"$HERMES_HOME/pairing"' in seed_block
-    assert '"$HERMES_HOME/platforms/pairing"' in seed_block
--- a/tests/tools/test_stage2_hook_user_flag_guard.py
+++ b/tests/tools/test_stage2_hook_user_flag_guard.py
@@ -1,119 +0,0 @@
-"""Contract test: the s6-overlay stage2 hook and main-wrapper reject an
-unsupported `docker run --user <arbitrary-uid>:<gid>` start with actionable
-guidance, while still allowing:
-
-  - root start (id -u == 0)
-  - `--user <hermes-uid>` (the supported non-root start, #34648 / #34837)
-
-Background: in the tini era `docker run --user $(id -u):$(id -g)` was used to
-make container-written files match the host user. Under s6-overlay this can't
-work — the bootstrap (UID remap, volume/build-tree chown, config seeding) needs
-root, and the baked image dirs are owned by the hermes build UID, so an
-arbitrary pinned UID can't write them (EACCES on a bind mount, hard crash on a
-named volume). The supported path is root start + HERMES_UID/HERMES_GID (or the
-PUID/PGID aliases), which remaps the hermes user and chowns the volume.
-
-The guard fires only when the current UID is neither root NOR the hermes UID,
-so the #34648 `--user 10000:10000` case (pinning to the hermes UID itself) is
-unaffected.
-
-Extraction + stubbed-shell-run mirrors
-tests/tools/test_stage2_hook_toplevel_chown.py.
-"""
-from __future__ import annotations
-
-import re
-import shutil
-import subprocess
-import tempfile
-from pathlib import Path
-
-import pytest
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-STAGE2_HOOK = REPO_ROOT / "docker" / "stage2-hook.sh"
-MAIN_WRAPPER = REPO_ROOT / "docker" / "main-wrapper.sh"
-
-
-def _read(p: Path) -> str:
-    if not p.exists():
-        pytest.skip(f"{p} not present in this checkout")
-    return p.read_text()
-
-
-def _guard_block(text: str) -> str:
-    """Extract the `cur_uid=...; if [ ... ]; then ... exit 1; fi` guard."""
-    m = re.search(
-        r"(cur_uid=\"\$\(id -u\)\"\nif \[ \"\$cur_uid\" != 0 \](?:.*\n)*?fi)",
-        text,
-    )
-    assert m, "expected the --user guard block (cur_uid + non-root/non-hermes check)"
-    return m.group(1)
-
-
-@pytest.mark.parametrize("path", [STAGE2_HOOK, MAIN_WRAPPER])
-def test_guard_present_and_mentions_remediation(path: Path) -> None:
-    text = _read(path)
-    block = _guard_block(text)
-    # Must check non-root AND non-hermes-uid (so --user 10000:10000 is allowed).
-    assert '"$cur_uid" != 0' in block
-    assert '"$cur_uid" != "$(id -u hermes)"' in block
-    assert "exit 1" in block
-    # Must point users at the supported env vars.
-    assert "HERMES_UID" in block and "HERMES_GID" in block
-    assert "PUID" in block and "PGID" in block
-
-
-def _run_guard(text: str, *, cur_uid: int, hermes_uid: int = 10000) -> subprocess.CompletedProcess:
-    """Run the extracted guard with `id` stubbed. Returns the completed process
-    (rc 1 + stderr message when rejected, rc 0 when allowed through)."""
-    bash = shutil.which("bash")
-    if bash is None:
-        pytest.skip("bash not available")
-    block = _guard_block(text)
-    with tempfile.TemporaryDirectory() as d:
-        script = (
-            "set -e\n"
-            # Stub `id`: `id -u` -> cur_uid; `id -u hermes` -> hermes_uid.
-            f'id() {{ if [ "$2" = hermes ]; then echo {hermes_uid}; else echo {cur_uid}; fi; }}\n'
-            + block
-            + "\necho GUARD_PASSED\n"  # only reached when the guard allows through
-        )
-        sp = Path(d) / "h.sh"
-        sp.write_text(script)
-        return subprocess.run([bash, str(sp)], capture_output=True, text=True)
-
-
-def test_arbitrary_user_uid_is_rejected() -> None:
-    """An arbitrary host UID (1000), neither root nor hermes, is rejected."""
-    for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)):
-        proc = _run_guard(text, cur_uid=1000, hermes_uid=10000)
-        assert proc.returncode == 1, f"expected rejection, got rc={proc.returncode}"
-        assert "not supported" in proc.stderr
-        assert "GUARD_PASSED" not in proc.stdout
-
-
-def test_root_start_passes() -> None:
-    """Root start (uid 0) is never blocked."""
-    for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)):
-        proc = _run_guard(text, cur_uid=0, hermes_uid=10000)
-        assert proc.returncode == 0, proc.stderr
-        assert "GUARD_PASSED" in proc.stdout
-
-
-def test_user_pinned_to_hermes_uid_passes() -> None:
-    """`--user 10000:10000` (the hermes UID itself) is the supported non-root
-    start from #34648 / #34837 and must NOT be blocked."""
-    for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)):
-        proc = _run_guard(text, cur_uid=10000, hermes_uid=10000)
-        assert proc.returncode == 0, proc.stderr
-        assert "GUARD_PASSED" in proc.stdout
-
-
-def test_user_pinned_to_remapped_hermes_uid_passes() -> None:
-    """After a HERMES_UID remap the hermes UID is e.g. 4242; a container pinned
-    to that same UID must still pass (cur_uid == hermes_uid)."""
-    for text in (_read(STAGE2_HOOK), _read(MAIN_WRAPPER)):
-        proc = _run_guard(text, cur_uid=4242, hermes_uid=4242)
-        assert proc.returncode == 0, proc.stderr
-        assert "GUARD_PASSED" in proc.stdout
Author	SHA1	Message	Date
ethernet	ffe043998f	change(ci): remove lint PR comment it's already in the job summary. having it as a comment just makes people ignore it. don't waste sapce.	2026-06-25 19:51:48 -04:00
ethernet	5cca2b1c2d	feat(ci): add CI timing report	2026-06-25 19:51:48 -04:00
ethernet	725ca2ab20	fix(ci): rip out some xdist legacy stuff... how did these ever work??	2026-06-25 19:47:43 -04:00
ethernet	a7e32ca9c2	change(ci): upload-artifact from v4 -> v7	2026-06-25 19:15:00 -04:00
ethernet	901f107976	try pytest alone..	2026-06-25 19:12:49 -04:00
ethernet	c73adbd91b	wip ignore	2026-06-25 19:12:49 -04:00
ethernet	01a7dfc339	change(ci): update all UV installs	2026-06-25 18:46:28 -04:00
ethernet	db03c207aa	change(ci): migrate docker smoketests to real tests	2026-06-25 18:46:28 -04:00
ethernet	e74f230462	change(ci): pretty names	2026-06-25 18:46:28 -04:00
ethernet	eb114af7f1	change(tests): don't pass pytest args when counting tests	2026-06-25 18:46:28 -04:00
ethernet	2118bc5ab3	change(nix): simpler dev setup	2026-06-25 18:46:28 -04:00
ethernet	9f51ec0280	change(nix): ship fat hermes agent by default	2026-06-25 18:46:28 -04:00
ethernet	03046b9b9a	change(ci): docker-publish.yml -> docker.yml	2026-06-25 18:46:28 -04:00
ethernet	5e50b121ab	change(ci): docker runs again on PRs	2026-06-25 18:46:28 -04:00
ethernet	5a20177fc3	refactor(ci): more test slices	2026-06-25 18:46:28 -04:00
ethernet	3668c2c482	refactor(ci): run tests thru run_tests.sh	2026-06-25 18:23:09 -04:00
ethernet	2fcc3ad9cb	refactor(ci): rewrite docker tests to check built container	2026-06-25 16:30:11 -04:00
ethernet	a6d54c9bbe	refactor(ci): faster docker builds via --link and chmod removal	2026-06-25 12:25:07 -04:00