diff --git a/.agent/.gitkeep b/.agent/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/.github/scripts/nightly-triage.sh b/.github/scripts/nightly-triage.sh
new file mode 100644
index 0000000..0a3964b
--- /dev/null
+++ b/.github/scripts/nightly-triage.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+# nightly-triage.sh — classify a nightly stress run's results and file/append a
+# single labelled GitHub issue per (date, class), idempotently.
+#
+# Invoked by the `triage` job in .github/workflows/nightly.yml AFTER it has
+# downloaded every matrix cell's `test-output/` artifact (each into a directory
+# named `nightly-logs-<cell-id>/`, each carrying that cell's own
+# `cell-conclusion.txt`). It reads only files on disk + `gh`; it makes no test
+# decisions of its own beyond parsing the preserved logs.
+#
+# CLASSIFICATION:
+#   correctness  — any `^FAIL:` line in a suite log (a genuine assertion failure).
+#                  Files/append a `nightly-correctness` issue. The one class that
+#                  demands investigation. (A job that concluded `failure`/timed out
+#                  WITHOUT a `^FAIL:` line is infra, not correctness — see below.)
+#   envelope     — no FAIL anywhere, but at least one `WARN[env-relaxed]` line in a
+#                  log of a cell that *succeeded*. Tracked (`nightly-envelope`); the
+#                  three wall-clock envelope assertions stretched under load — by
+#                  design under GCL_ENVELOPE_TIER=relax — so NO investigation action.
+#   infra        — a cell's artifact is missing, the cell job neither succeeded nor
+#                  cleanly failed-on-an-assertion (timeout / cancelled / checkout
+#                  failure / errored before any suite ran), OR — the EMPTY-ROUND
+#                  GUARD — *no* cell produced any log at all. Filed `nightly-infra`.
+#                  Crucially, "0 FAIL across 0 logs" is NEVER read as green: with no
+#                  evidence we classify infra, not success.
+#
+# Idempotency: one open issue per (run-date, class). We search open issues by a
+# stable title prefix + label; if one exists we append a comment, else we create.
+# Re-running triage for the same date therefore appends rather than spamming.
+#
+# All-green (every cell success, no FAIL, no env warn, every artifact present) ⇒
+# NO issue of any kind is filed.
+#
+# Inputs (environment):
+#   ARTIFACTS_DIR   dir holding the downloaded per-cell artifact directories
+#                   (default: ./artifacts). Each cell dir is `nightly-logs-<id>/`.
+#   (Per-cell job conclusions are read from FILES, not env: each stress cell writes
+#                   its own `job.status` — success|failure|cancelled — to
+#                   `<cell-dir>/cell-conclusion.txt` under always(), and the script
+#                   reads that file directly (absent file ⇒ `unknown`). Ground truth
+#                   PER CELL, never a matrix aggregate.)
+#   EXPECTED_CELLS  space-separated list of cell ids that were supposed to run
+#                   (default: the six N1..N6 ids). Lets the empty-round / missing-
+#                   artifact guard know what to expect.
+#   RUN_DATE        UTC date stamp for the issue title (default: today, UTC).
+#   GITHUB_REPOSITORY / GH_TOKEN(GITHUB_TOKEN)  the usual `gh` env.
+#   DRY_RUN=1       print the `gh` actions instead of running them (for local tests).
+set -uo pipefail
+
+ARTIFACTS_DIR="${ARTIFACTS_DIR:-./artifacts}"
+EXPECTED_CELLS="${EXPECTED_CELLS:-N1 N2 N3 N4 N5 N6}"
+RUN_DATE="${RUN_DATE:-$(date -u +%Y-%m-%d)}"
+DRY_RUN="${DRY_RUN:-0}"
+
+log() { printf '%s\n' "$*" >&2; }
+
+# A cell's log directory and its suite logs (may be absent ⇒ infra).
+cell_logdir() { printf '%s/nightly-logs-%s' "$ARTIFACTS_DIR" "$1"; }
+
+# ── Read a cell's OWN recorded conclusion from its artifact (ground truth: each
+#    stress cell writes job.status to cell-conclusion.txt under always()). Absent
+#    file ⇒ `unknown` (handled like a missing artifact). ──────────────────────────
+cell_conclusion() {
+  local status_file recorded=""
+  status_file="$(cell_logdir "$1")/cell-conclusion.txt"
+  # Strip every whitespace byte so a stray newline/CR cannot break later comparisons.
+  [ -f "$status_file" ] && recorded="$(tr -d '[:space:]' < "$status_file" 2>/dev/null)"
+  # Missing or blank file ⇒ `unknown`.
+  printf '%s' "${recorded:-unknown}"
+}
+
+# ── Classify each expected cell. Accumulate evidence lines per class. ───────────
+correctness_evidence=""
+envelope_evidence=""
+infra_evidence=""
+
+any_log_seen=0          # for the empty-round guard
+
+for cell in $EXPECTED_CELLS; do
+  dir="$(cell_logdir "$cell")"
+  concl="$(cell_conclusion "$cell")"
+
+  # Gather this cell's *.log files under the artifact, sorted for stable evidence order.
+  logs=()
+  if [ -d "$dir" ]; then
+    while IFS= read -r f; do logs+=("$f"); done \
+      < <(find "$dir" -type f -name '*.log' 2>/dev/null | LC_ALL=C sort)
+  fi
+
+  if [ "${#logs[@]}" -eq 0 ]; then
+    # No artifact / no logs for an expected cell. Distinguish: a clean job that
+    # somehow uploaded nothing is still suspect ⇒ infra (we cannot prove it green).
+    infra_evidence+="- ${cell}: no logs found (artifact missing or empty; job conclusion='${concl}')"$'\n'
+    log "[$cell] INFRA: no logs (conclusion=$concl)"
+    continue
+  fi
+  any_log_seen=1
+
+  # Scan the logs.
+  cell_fail=0
+  cell_envwarn=0
+  fail_lines=""
+  for f in "${logs[@]}"; do
+    # One grep per log: keep up to 5 numbered FAIL lines as evidence.
+    hits="$(grep -nE '^FAIL:' "$f" 2>/dev/null | head -5)"
+    if [ -n "$hits" ]; then
+      cell_fail=1
+      # Prefix in pure bash so an odd log filename cannot corrupt a sed expression.
+      while IFS= read -r hit; do fail_lines+="    ${f##*/}: ${hit}"$'\n'; done <<< "$hits"
+    fi
+    grep -qE 'WARN\[env-relaxed\]' "$f" 2>/dev/null && cell_envwarn=1
+  done
+
+  if [ "$cell_fail" -eq 1 ]; then
+    # A real `^FAIL:` assertion line ⇒ correctness, regardless of job conclusion.
+    correctness_evidence+="- ${cell}: job='${concl}', FAIL lines present:"$'\n'"${fail_lines}"
+    log "[$cell] CORRECTNESS (cell_fail=$cell_fail conclusion=$concl)"
+  elif [ "$concl" != "success" ]; then
+    # Logs exist but the job did not cleanly succeed and there is no assertion FAIL:
+    # failure-without-^FAIL / timeout / cancelled / errored late ⇒ infra, not
+    # correctness and not green (a failure WITHOUT a FAIL line is a step
+    # timeout/late error, which is infra).
+    infra_evidence+="- ${cell}: logs present but job conclusion='${concl}' (failure/timeout/cancel without ^FAIL: line)"$'\n'
+    log "[$cell] INFRA (conclusion=$concl, no FAIL)"
+  elif [ "$cell_envwarn" -eq 1 ]; then
+    envelope_evidence+="- ${cell}: succeeded with WARN[env-relaxed] (envelope assertion(s) stretched under load — expected)"$'\n'
+    log "[$cell] ENVELOPE (success + env-relaxed warn)"
+  else
+    log "[$cell] OK (success, no FAIL, no env warn)"
+  fi
+done
+
+# ── EMPTY-ROUND GUARD: if not a single expected cell produced any log, the run
+#    errored before any suite ran (checkout failure, total infra collapse). That is
+#    INFRA, never green — do not let "0 FAIL across 0 logs" pass as success. ──────
+if [ "$any_log_seen" -eq 0 ]; then
+  empty_msg="EMPTY ROUND: none of the expected cells (${EXPECTED_CELLS}) produced any suite log. The workflow errored before any suite ran (checkout failure / total infra collapse) — this is NOT a passing nightly."
+  infra_evidence="${empty_msg}"$'\n'"${infra_evidence}"
+  log "EMPTY-ROUND GUARD fired: no logs from any cell."
+fi
+
+# ── File/append issues, idempotently, one per (date, class). ────────────────────
+# Title prefix is stable per class+date so search-then-append is reliable.
+file_issue() {  # $1=class-label  $2=title  $3=body
+  local label="$1" title="$2" body="$3" existing=""
+  if [ "$DRY_RUN" = 1 ]; then
+    log "DRY_RUN: would search open issues label=$label title~='$title'"
+    log "DRY_RUN: title='$title'"
+    log "DRY_RUN: body:"; printf '%s\n' "$body" >&2
+    return 0
+  fi
+
+  # Idempotency key: an OPEN issue with this label whose title matches exactly (the
+  # GitHub search is fuzzy, so the JSON is re-filtered by exact title). A failed
+  # search is reported, not hidden: creating anyway may duplicate an existing issue.
+  existing="$(gh issue list --state open --label "$label" \
+                --search "$title in:title" --json number,title \
+                --jq ".[] | select(.title == \"$title\") | .number" | head -1)" \
+    || log "WARN: open-issue search failed ($label); cannot verify idempotency — may create a duplicate"
+
+  if [ -n "$existing" ]; then
+    log "Appending to existing issue #$existing ($label)"
+    if gh issue comment "$existing" --body "$body" >/dev/null; then
+      log "Appended comment to #$existing"
+    else
+      log "WARN: failed to append to #$existing"
+    fi
+  else
+    log "Creating new issue ($label): $title"
+    if gh issue create --title "$title" --label "$label" --body "$body" >/dev/null; then
+      log "Created issue ($label)"
+    else
+      log "WARN: failed to create issue ($label)"
+    fi
+  fi
+}
+
+run_url="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-}/actions/runs/${GITHUB_RUN_ID:-}"
+filed=0
+
+if [ -n "$correctness_evidence" ]; then
+  body="Nightly stress run on **${RUN_DATE}** has CORRECTNESS failures (a \`^FAIL:\` assertion line in a suite log). **Investigate.**
+
+$correctness_evidence
+Run: ${run_url}
+
+(Auto-filed by nightly-triage.sh; idempotent per (date, class) — re-runs append.)"
+  file_issue "nightly-correctness" "Nightly correctness failure — ${RUN_DATE}" "$body"
+  filed=1
+fi
+
+if [ -n "$infra_evidence" ]; then
+  body="Nightly stress run on **${RUN_DATE}** had INFRA issues (missing artifact / timeout / cancel / a cell job that failed or errored WITHOUT any \`^FAIL:\` line). Not a product failure, but the run did not produce trustworthy results — re-dispatch or investigate the runner.
+
+$infra_evidence
+Run: ${run_url}
+
+(Auto-filed by nightly-triage.sh; idempotent per (date, class).)"
+  file_issue "nightly-infra" "Nightly infra issue — ${RUN_DATE}" "$body"
+  filed=1
+fi
+
+# Envelope is filed ONLY when there is no correctness failure (a correctness issue
+# subsumes it — under a red run the env warns are noise). Tracked, no action.
+if [ -z "$correctness_evidence" ] && [ -n "$envelope_evidence" ]; then
+  body="Nightly stress run on **${RUN_DATE}**: no correctness failures, but envelope (wall-clock) assertions were relaxed under load (\`WARN[env-relaxed]\`). This is EXPECTED under GCL_ENVELOPE_TIER=relax — tracked, **no investigation needed** unless it becomes persistent at low load.
+
+$envelope_evidence
+Run: ${run_url}
+
+(Auto-filed by nightly-triage.sh; idempotent per (date, class).)"
+  file_issue "nightly-envelope" "Nightly envelope warning — ${RUN_DATE}" "$body"
+  filed=1
+fi
+
+if [ "$filed" -eq 0 ]; then
+  log "ALL GREEN: every expected cell succeeded, no FAIL, no env warn, all artifacts present. No issue filed."
+fi
+
+# Triage itself succeeds whenever it ran to completion — it must not red the
+# workflow for finding failures (those are surfaced as issues). It only fails if it
+# aborts before this line: with no `-e`, that is an unset variable (`set -u`) or a syntax error.
+exit 0
diff --git a/.github/workflows/deep-sweep.yml b/.github/workflows/deep-sweep.yml
new file mode 100644
index 0000000..2877b52
--- /dev/null
+++ b/.github/workflows/deep-sweep.yml
@@ -0,0 +1,214 @@
+# deep-sweep — Tier D of the load-testing strategy (see docs/load-testing-strategy.md).
+#
+# ON-DEMAND ONLY. This workflow is `workflow_dispatch`-only: it NEVER runs on push
+# or pull_request, and it NEVER gates anything (it is not a required check — this is
+# a single-dev project with no branch protection). It exists purely as a deep
+# flake-hunting tool — the
+# "50-clean hunt" instrument from the load-testing strategy: dispatch it (often many
+# times in parallel), pick a stress kind/magnitude, and repeat the full suite N
+# times per job to surface intermittent, scheduling-sensitive flakes that a single
+# zero-load per-PR run would never reproduce.
+#
+# Deep + loaded runs are SLOW (heavy CPU/disk oversubscription stretches every
+# wall-clock-derived step), so timeouts here are deliberately generous and the
+# envelope tier defaults to `relax` (an oversubscribed runner must not turn a
+# latency miss into a red — only a real correctness FAIL should).
+#
+# The job names are intentionally distinct (`deep-*`). With no branch protection
+# there is no required `tests-passed` context to avoid publishing, so this is now
+# only cosmetic / for clarity — but kept so a deep run is never confused with the
+# per-PR `tests` matrix in the checks UI.
+
+name: deep-sweep
+
+on:
+  workflow_dispatch:
+    inputs:
+      stress_kind:
+        description: 'Background load kind to apply via tests/with-load.sh'
+        type: choice
+        options: [none, cpu, disk, both]
+        default: both
+      stress_load:
+        description: 'Raw per-kind hog count override (GCL_STRESS_LOAD). Blank = use the ratio.'
+        type: string
+        default: ''
+      repeat:
+        description: 'How many times to repeat the suite run within each job (intermittent-flake hunt).'
+        type: string
+        default: '1'
+      envelope_tier:
+        description: 'GCL_ENVELOPE_TIER — relax (default) warns on latency misses; strict fails them.'
+        type: string
+        default: relax
+
+# Per-run-unique group so MANY parallel dispatches each get their own group and run
+# concurrently (a fresh dispatch never cancels or is cancelled by an in-flight one).
+# run_id is unique per run and is reused only by a re-run of that same run, which
+# cannot overlap its own earlier attempt — so cancel-in-progress:false (queue, never
+# cancel) is just a safe default. The deep sweeps fan out freely and accept queue waves.
+concurrency:
+  group: deep-${{ github.run_id }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  deep:
+    name: deep-${{ matrix.os }}${{ matrix.leg != 'all' && format(' ({0})', matrix.leg) || '' }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false               # every cell's verdict is a useful deep signal; let the rest finish
+      matrix:
+        # Mirrors tests.yml's per-OS legs (ubuntu all / macos all / windows unit /
+        # windows interop+integration); the canary runs as a step within the unit/all
+        # legs here, not a separate cell. Windows stays split because the bash-only
+        # unit suite is the wall-clock bottleneck there and the suites must not run
+        # concurrently inside one timing-sensitive 2-core runner. Generous deep
+        # timeouts: deep + loaded + repeated is far slower than the per-PR gate.
+        include:
+          - { os: ubuntu-24.04, leg: all, job_timeout: 180 }
+          - { os: macos-15, leg: all, job_timeout: 180 }
+          - { os: windows-2025, leg: unit, job_timeout: 120 }
+          - { os: windows-2025, leg: interop-integration, job_timeout: 120 }
+    timeout-minutes: ${{ matrix.job_timeout }}   # backstop only: repeat * (loaded suite budgets) + upload headroom
+    defaults:
+      run:
+        shell: bash                  # on windows-2025 this is Git Bash (MINGW) — what the interop suite requires
+    env:
+      GCL_TEST_FULL: 1               # full fan-out — CI runners are dedicated
+      GCL_TEST_SWEEP: 1              # deep runs exercise the Axis-A waiter-count sweep too
+      GCL_ENVELOPE_TIER: ${{ inputs.envelope_tier }}
+      GCL_STRESS_KIND: ${{ inputs.stress_kind }}
+      GCL_STRESS_LOAD: ${{ inputs.stress_load }}   # blank => with-load.sh falls back to the ratio
+      DEEP_REPEAT: ${{ inputs.repeat }}            # read via env; never pasted into run: script text
+    steps:
+      - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0   # v6.0.3, SHA-pinned
+        with:
+          persist-credentials: false   # no job uses the token after fetch
+
+      - name: Toolchain versions (for reconstructing failures)
+        run: |
+          uname -a
+          bash --version | head -1
+          git --version
+          if command -v pwsh >/dev/null; then
+            pwsh -NoProfile -Command '"pwsh " + $PSVersionTable.PSVersion.ToString()'
+          else
+            echo "pwsh: NOT FOUND (interop suite will skip; integration runs bash-only)"
+          fi
+          if command -v powershell >/dev/null; then
+            powershell -NoProfile -Command '"powershell " + $PSVersionTable.PSVersion.ToString()'
+          else
+            echo "powershell (Windows PowerShell 5.1): NOT FOUND (interop Test 17 skips; expected on POSIX legs)"
+          fi
+          stat --version 2>/dev/null | head -1 || echo "stat: BSD variant"
+          command -v stress-ng >/dev/null && stress-ng --version | head -1 || echo "stress-ng: NOT FOUND (with-load.sh uses the portable spinner)"
+          echo "dispatch inputs: kind=${GCL_STRESS_KIND} load='${GCL_STRESS_LOAD}' repeat=${DEEP_REPEAT} envelope=${GCL_ENVELOPE_TIER}"
+
+      # Each suite is repeated `repeat` times under load. The loop fails fast: the
+      # first failing iteration `exit 1`s the step (so the step — and job — go red on
+      # the earliest flake), and every iteration names its index in the log. Because
+      # `shell: bash` runs with -eo pipefail, the suite pipeline is guarded with
+      # `|| rc=$?` so errexit cannot abort before the FAILED line names the repeat.
+      - name: Unit suite (deep, looped x repeat, under load)
+        if: ${{ matrix.leg == 'all' || matrix.leg == 'unit' }}
+        timeout-minutes: ${{ matrix.os == 'windows-2025' && 100 || 90 }}
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/unit
+        run: |
+          mkdir -p test-output
+          n="${DEEP_REPEAT:-}"
+          case "$n" in ''|*[!0-9]*) n=1 ;; esac
+          [ "$n" -lt 1 ] && n=1
+          echo "== unit: repeating $n time(s) under load =="
+          for i in $(seq 1 "$n"); do
+            echo "== unit iteration $i/$n =="
+            rc=0
+            bash tests/with-load.sh bash tests/git-commit-lock.test.sh 2>&1 \
+              | tee "test-output/unit-suite.iter$i.log" || rc=$?
+            if [ "$rc" -ne 0 ]; then
+              echo "== unit iteration $i/$n FAILED (rc=$rc) — stopping deep sweep =="
+              exit 1
+            fi
+          done
+
+      - name: Canary suite (deep, looped x repeat, under load)
+        # The concurrency canary moved into its own file; the deep flake hunt should
+        # exercise it (a concurrency canary is exactly what a deep+loaded+repeated hunt
+        # is for). Same legs as the unit suite, same loop/fail-fast wrapping.
+        if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'unit') }}
+        timeout-minutes: ${{ matrix.os == 'windows-2025' && 100 || 90 }}
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/canary
+        run: |
+          mkdir -p test-output
+          n="${DEEP_REPEAT:-}"
+          case "$n" in ''|*[!0-9]*) n=1 ;; esac
+          [ "$n" -lt 1 ] && n=1
+          echo "== canary: repeating $n time(s) under load =="
+          for i in $(seq 1 "$n"); do
+            echo "== canary iteration $i/$n =="
+            rc=0
+            bash tests/with-load.sh bash tests/git-commit-lock.canary.test.sh 2>&1 \
+              | tee "test-output/canary-suite.iter$i.log" || rc=$?
+            if [ "$rc" -ne 0 ]; then
+              echo "== canary iteration $i/$n FAILED (rc=$rc) — stopping deep sweep =="
+              exit 1
+            fi
+          done
+
+      - name: Interop suite (deep, looped x repeat, under load)
+        if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }}   # run even if an earlier suite failed — every signal is useful
+        timeout-minutes: 90
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/interop
+        run: |
+          mkdir -p test-output
+          n="${DEEP_REPEAT:-}"
+          case "$n" in ''|*[!0-9]*) n=1 ;; esac
+          [ "$n" -lt 1 ] && n=1
+          echo "== interop: repeating $n time(s) under load =="
+          for i in $(seq 1 "$n"); do
+            echo "== interop iteration $i/$n =="
+            rc=0
+            bash tests/with-load.sh bash tests/git-commit-lock.interop.test.sh 2>&1 \
+              | tee "test-output/interop-suite.iter$i.log" || rc=$?
+            if [ "$rc" -ne 0 ]; then
+              echo "== interop iteration $i/$n FAILED (rc=$rc) — stopping deep sweep =="
+              exit 1
+            fi
+          done
+
+      - name: Integration suite (deep, looped x repeat, under load)
+        if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }}
+        timeout-minutes: 60           # its internal AGENT_LOCK_MAX_WAIT cap is 240s; x repeat under load
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/integration
+        run: |
+          mkdir -p test-output
+          n="${DEEP_REPEAT:-}"
+          case "$n" in ''|*[!0-9]*) n=1 ;; esac
+          [ "$n" -lt 1 ] && n=1
+          echo "== integration: repeating $n time(s) under load =="
+          for i in $(seq 1 "$n"); do
+            echo "== integration iteration $i/$n =="
+            rc=0
+            bash tests/with-load.sh bash tests/git-commit-lock.integration.test.sh 2>&1 \
+              | tee "test-output/integration-suite.iter$i.log" || rc=$?
+            if [ "$rc" -ne 0 ]; then
+              echo "== integration iteration $i/$n FAILED (rc=$rc) — stopping deep sweep =="
+              exit 1
+            fi
+          done
+
+      - name: Upload deep-sweep artifacts (logs + load manifests, on success too)
+        if: ${{ always() }}   # deep runs want the negatives to read the positives; upload even when green or cancelled
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a   # v7.0.1, SHA-pinned
+        with:
+          name: deep-logs-${{ matrix.os }}-${{ matrix.leg }}-${{ inputs.stress_kind }}   # unique per (os, leg, kind)
+          path: test-output/
+          include-hidden-files: true   # lock logs + the load-manifest live under the scratch .git/ and test-output/; suite-generated, no secrets
+          if-no-files-found: warn
+          retention-days: 14
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
new file mode 100644
index 0000000..238d234
--- /dev/null
+++ b/.github/workflows/nightly.yml
@@ -0,0 +1,316 @@
+name: nightly
+
+# Scheduled stress run: the test suites under calibrated background load (the
+# `tests/with-load.sh` wrapper) at one oversubscription level R≈2, plus a kcov
+# line-coverage gate and auto-triage of the results into labelled issues.
+#
+# This is NON-BLOCKING: there is no branch protection on this single-dev project
+# (decision 2026-06-18), so nightly never gates a PR. Its job is to catch
+# load-sensitive flakes and coverage regressions that the per-PR `tests.yml`
+# (no-load, strict) cannot.
+#
+# NOTE for a future maintainer: GitHub auto-DISABLES a `schedule` trigger after
+# ~60 days of repo inactivity. If the nightly history is empty, that may mean the
+# schedule was disabled (not that every run passed) — re-enable / revive it with a
+# manual `workflow_dispatch` run from the Actions tab. Rely on `workflow_dispatch`
+# as the always-available manual trigger.
+
+on:
+  schedule:
+    - cron: '23 8 * * *'   # 08:23 UTC daily — off-peak (low GitHub-hosted-runner contention)
+  workflow_dispatch:
+
+# One nightly at a time; a newer run supersedes an in-flight one.
+concurrency:
+  group: nightly
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+env:
+  # The suites run at full fan-out, with the envelope (wall-clock) assertions
+  # RELAXED so an oversubscribed runner cannot turn a latency stretch into a red
+  # (only correctness assertions can fail the suite under load), and with the
+  # Axis-A waiter-count sweep {4,12,24} enabled.
+  GCL_TEST_FULL: 1
+  GCL_ENVELOPE_TIER: relax
+  GCL_TEST_SWEEP: 1
+  # One oversubscription level R≈2 (stressors ≈ 2 * nproc per kind, total capped at
+  # GCL_STRESS_RATIO_MAX * nproc by with-load.sh).
+  GCL_STRESS_RATIO: 2
+
+jobs:
+  # ── The 6 stress cells. Each runs the relevant suite(s) wrapped in with-load.sh
+  #    under one GCL_STRESS_KIND. `leg` selects which suites run (the all/unit/
+  #    interop-integration legs as in tests.yml; the canary runs as a step here, not a
+  #    leg): ubuntu/macos run the full set; windows splits unit vs interop-integration. ──
+  stress:
+    name: ${{ matrix.id }} ${{ matrix.os }} (${{ matrix.kind }}${{ matrix.leg != 'all' && format(', {0}', matrix.leg) || '' }})
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false   # every cell's verdict is signal — and triage needs them all
+      matrix:
+        include:
+          - { id: N1, os: ubuntu-24.04, leg: all,                  kind: cpu,  job_timeout: 70 }
+          - { id: N2, os: ubuntu-24.04, leg: all,                  kind: disk, job_timeout: 70 }
+          - { id: N3, os: ubuntu-24.04, leg: all,                  kind: both, job_timeout: 70 }
+          - { id: N4, os: macos-15,     leg: all,                  kind: disk, job_timeout: 70 }
+          - { id: N5, os: windows-2025, leg: interop-integration,  kind: disk, job_timeout: 55 }
+          - { id: N6, os: windows-2025, leg: unit,                 kind: both, job_timeout: 60 }
+    timeout-minutes: ${{ matrix.job_timeout }}   # generous: load slows everything; backstop only
+    defaults:
+      run:
+        shell: bash                  # on windows-2025 this is Git Bash (MINGW) — what the interop suite requires
+    env:
+      GCL_STRESS_KIND: ${{ matrix.kind }}
+    steps:
+      - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0   # v6.0.3, SHA-pinned
+        with:
+          persist-credentials: false
+
+      - name: Toolchain versions (for reconstructing failures)
+        run: |
+          uname -a
+          bash --version | head -1
+          git --version
+          command -v stress-ng >/dev/null && stress-ng --version | head -1 || echo "stress-ng: NOT FOUND (with-load.sh uses the portable bash spinner)"
+          if command -v pwsh >/dev/null; then
+            pwsh -NoProfile -Command '"pwsh " + $PSVersionTable.PSVersion.ToString()'
+          else
+            echo "pwsh: NOT FOUND (interop suite will skip; integration runs bash-only)"
+          fi
+          if command -v powershell >/dev/null; then
+            powershell -NoProfile -Command '"powershell " + $PSVersionTable.PSVersion.ToString()'
+          else
+            echo "powershell (Windows PowerShell 5.1): NOT FOUND (interop Test 17 skips; expected on POSIX legs)"
+          fi
+          stat --version 2>/dev/null | head -1 || echo "stat: BSD variant"
+
+      - name: Unit suite (under load)
+        if: ${{ matrix.leg == 'all' || matrix.leg == 'unit' }}
+        timeout-minutes: ${{ matrix.os == 'windows-2025' && 40 || 25 }}   # raised: load + the N=24 sweep stretch wall-clock; a step timeout FAILS the step so the upload still runs
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/unit
+        run: |
+          mkdir -p test-output
+          bash tests/with-load.sh bash tests/git-commit-lock.test.sh 2>&1 | tee test-output/unit-suite.log
+
+      - name: Canary suite (under load)
+        # The concurrency canary moved out of the unit suite into its own file; still
+        # exercise it under oversubscription here (concurrency + load is the highest-value
+        # canary scenario). Runs in the same legs the unit suite does (sequentially after
+        # it — nightly is non-blocking, so no separate parallel cell is warranted).
+        if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'unit') }}
+        timeout-minutes: ${{ matrix.os == 'windows-2025' && 20 || 12 }}   # load stretches the full-width canary; a step timeout FAILS the step so the upload still runs
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/canary
+        run: |
+          mkdir -p test-output
+          bash tests/with-load.sh bash tests/git-commit-lock.canary.test.sh 2>&1 | tee test-output/canary-suite.log
+
+      - name: Interop suite (under load; bash + pwsh)
+        if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }}   # run even if an earlier suite failed — every signal is useful
+        timeout-minutes: 30
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/interop
+        run: |
+          mkdir -p test-output
+          bash tests/with-load.sh bash tests/git-commit-lock.interop.test.sh 2>&1 | tee test-output/interop-suite.log
+
+      - name: Integration suite (under load; real concurrent commits)
+        if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }}
+        timeout-minutes: 20           # its internal AGENT_LOCK_MAX_WAIT cap is 240s; load + sweep stretch it
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/integration
+        run: |
+          mkdir -p test-output
+          bash tests/with-load.sh bash tests/git-commit-lock.integration.test.sh 2>&1 | tee test-output/integration-suite.log
+
+      - name: Record this cell's conclusion (ground truth for triage)
+        if: ${{ always() }}   # capture the cell's own status — even on timeout/cancel — into its artifact
+        run: |
+          mkdir -p test-output
+          # job.status here reflects THIS cell's run so far: success | failure | cancelled.
+          # A step timeout fails the step, which makes the job status `failure` by the time
+          # this always() step runs — so a no-FAIL timeout is recorded as `failure`, and the
+          # triage script (seeing logs present but conclusion!=success and no ^FAIL:) classes
+          # it infra. The per-cell status file is the authoritative signal triage reads.
+          printf '%s' "${{ job.status }}" > test-output/cell-conclusion.txt
+          echo "cell ${{ matrix.id }} conclusion: $(cat test-output/cell-conclusion.txt)"
+
+      - name: Upload cell logs + load-manifest (on success too — we read the positives by the negatives)
+        if: ${{ always() }}   # upload whether the cell passed, failed, or timed out — triage needs every cell's evidence
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a   # v7.0.1, SHA-pinned
+        with:
+          name: nightly-logs-${{ matrix.id }}   # unique per cell; the triage job downloads these by name
+          path: test-output/
+          include-hidden-files: true   # lock logs live under the scratch repo's .git/ (hidden); suite-generated, no secrets
+          if-no-files-found: warn
+          retention-days: 14
+
+  # ── kcov line-coverage gate. Linux-only, no load, strict, unit + canary at FULL.
+  #    Build kcov v43 from source (no apt package / prebuilt). Gate at 0.80. ──────
+  kcov:
+    name: kcov coverage (Linux, no load, strict)
+    runs-on: ubuntu-24.04
+    timeout-minutes: 30
+    env:
+      COVERAGE_FLOOR: '0.80'   # tracks achieved (~83%) — RATCHET UP toward ~0.90 as Tier-A tests land; do not let it lead coverage
+    steps:
+      - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0   # v6.0.3, SHA-pinned
+        with:
+          persist-credentials: false
+
+      - name: Install kcov build dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            cmake g++ make pkg-config \
+            libdw-dev libelf-dev binutils-dev libcurl4-openssl-dev zlib1g-dev libiberty-dev
+
+      - name: Build kcov v43 from source
+        run: |
+          set -euo pipefail
+          cd /tmp
+          curl -fsSL https://github.com/SimonKagstrom/kcov/archive/refs/tags/v43.tar.gz | tar xz   # NOTE(review): tag tarball is fetched without a checksum, unlike the SHA-pinned actions in this workflow — consider verifying a sha256
+          mkdir kcov-build && cd kcov-build   # out-of-tree build dir; later steps invoke /tmp/kcov-build/src/kcov
+          cmake ../kcov-43   # source tree extracted by the tar above
+          make -j"$(nproc)"
+          ./src/kcov --version   # smoke check: under set -e the step fails here if the binary did not build or cannot run
+
+      - name: Run unit + canary suites under kcov (FULL, strict, no load)
+        env:
+          GCL_TEST_FULL: 1
+          # Set strict EXPLICITLY here to override the workflow-level GCL_ENVELOPE_TIER: relax
+          # (which this step would otherwise inherit) — we want a true, clean coverage run with
+          # the wall-clock envelope assertions enforced, no load applied.
+          GCL_ENVELOPE_TIER: strict
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/kcov-unit
+        run: |
+          set -euo pipefail
+          mkdir -p test-output coverage
+          # The concurrency canary now lives in its own file; run BOTH the unit suite
+          # and the canary under kcov into the SAME output dir. kcov ACCUMULATES
+          # coverage across multiple runs that share one output dir, writing the union
+          # to coverage/kcov-out/kcov-merged/cobertura.xml (NOT a top-level
+          # cobertura.xml — the enforcement step below reads the merged union by
+          # selecting the highest lines-covered), so the canary's coverage of
+          # git-commit-lock.sh is preserved and the 0.80 floor cannot regress from the split.
+          /tmp/kcov-build/src/kcov --include-path="$(pwd)/git-commit-lock.sh" \
+            coverage/kcov-out tests/git-commit-lock.test.sh 2>&1 | tee test-output/kcov-unit-suite.log
+          /tmp/kcov-build/src/kcov --include-path="$(pwd)/git-commit-lock.sh" \
+            coverage/kcov-out tests/git-commit-lock.canary.test.sh 2>&1 | tee test-output/kcov-canary-suite.log
+
+      - name: Enforce coverage floor (parse cobertura line-rate)
+        run: |
+          set -euo pipefail
+          # kcov does NOT write a top-level coverage/kcov-out/cobertura.xml. The two runs
+          # (unit + canary) into one outdir produce per-binary reports under
+          # coverage/kcov-out/<binary>.<hash>/cobertura.xml and a merged union at
+          # coverage/kcov-out/kcov-merged/cobertura.xml. All cover the same source, so they
+          # share an identical lines-valid; pick the highest lines-COVERED instead — the
+          # merged union has the most covered lines (a single run has just one report).
+          # `|| true` on the extraction: a no-match grep exits 1, which under `set -e`
+          # would abort the step silently before the `${c:-0}` default could apply.
+          cob=""
+          best_covered=-1
+          while IFS= read -r f; do
+            c="$(grep -oE 'lines-covered="[0-9]+"' "$f" 2>/dev/null | head -1 | grep -oE '[0-9]+' || true)"
+            c="${c:-0}"
+            if [ "$c" -gt "$best_covered" ]; then best_covered="$c"; cob="$f"; fi
+          done < <(find coverage/kcov-out -name cobertura.xml 2>/dev/null)
+          if [ -z "$cob" ] || [ ! -f "$cob" ]; then
+            echo "::error::no cobertura.xml found under coverage/kcov-out — kcov produced no report"
+            find coverage/kcov-out -maxdepth 3 -type f 2>/dev/null | sed 's/^/  /'
+            exit 1
+          fi
+          echo "Parsing coverage from: $cob (lines-covered=$best_covered)"
+          # Prefer the exact lines-covered/lines-valid ratio; fall back to the rounded line-rate
+          # (all on the top-level <coverage> tag). `|| true`: a missing attribute must not trip set -e.
+          covered="$(grep -oE 'lines-covered="[0-9]+"' "$cob" | head -1 | grep -oE '[0-9]+' || true)"
+          valid="$(grep -oE 'lines-valid="[0-9]+"' "$cob" | head -1 | grep -oE '[0-9]+' || true)"
+          rate="$(grep -oE 'line-rate="[0-9.]+"' "$cob" | head -1 | grep -oE '[0-9.]+' || true)"
+          if [ -n "$covered" ] && [ -n "$valid" ] && [ "$valid" -gt 0 ]; then
+            # ratio to 4 dp via awk floating-point division (no bc/python dependency)
+            rate="$(awk -v c="$covered" -v v="$valid" 'BEGIN { printf "%.4f", c / v }')"
+            echo "Line coverage: $covered / $valid = $rate"
+          else
+            echo "Line coverage (from line-rate attribute): $rate (lines-covered/valid unavailable)"
+          fi
+          floor="$COVERAGE_FLOOR"
+          # Compare rate >= floor with awk (float-safe); an empty rate coerces to 0 and fails closed.
+          if awk -v r="$rate" -v f="$floor" 'BEGIN { exit !(r + 0 >= f + 0) }'; then
+            echo "PASS: line coverage $rate >= floor $floor"
+            echo "NOTE: the floor ($floor) tracks the achieved coverage (~0.83); ratchet it up toward ~0.90 as more Linux-coverable tests land. The Linux ceiling is ~0.94 (~30 lines are platform-gated)."
+          else
+            echo "::error::line coverage $rate is BELOW the floor $floor — coverage regressed"
+            echo "The floor tracks achieved coverage (~0.83) and should only ratchet UP as tests land. A drop means a test stopped exercising lines it used to. Investigate before lowering the floor."
+            exit 1
+          fi
+
+      - name: Upload coverage report (HTML + cobertura)
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a   # v7.0.1, SHA-pinned
+        with:
+          name: kcov-coverage
+          path: |
+            coverage/kcov-out/
+            test-output/kcov-unit-suite.log
+            test-output/kcov-canary-suite.log
+          include-hidden-files: true
+          if-no-files-found: warn
+          retention-days: 30
+
+  # ── Auto-triage. Downloads every cell's artifact, classifies (correctness /
+  #    envelope / infra), and files/appends ONE labelled issue per (date, class).
+  #    Runs always() so a failed/cancelled cell is still triaged; the empty-round
+  #    guard prevents "0 FAIL across 0 logs" being read as green. ─────────────────
+  triage:
+    name: Triage nightly results
+    needs: [stress, kcov]
+    if: ${{ always() }}
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    permissions:
+      issues: write
+      contents: read
+    steps:
+      - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0   # v6.0.3, SHA-pinned
+        with:
+          persist-credentials: false
+
+      - name: Download all cell artifacts
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53   # v6.0.0, SHA-pinned
+        with:
+          path: artifacts
+          # pattern restricts to the per-cell logs (not kcov-coverage); merge-multiple off
+          # so each lands in its own `nightly-logs-<id>/` dir, as the triage script expects.
+          pattern: nightly-logs-*
+        continue-on-error: true   # a totally-missing artifact set must reach the empty-round guard, not error the job
+
+      - name: Ensure triage labels exist (idempotent)
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -uo pipefail
+          gh label create nightly-correctness -c '#d73a4a' -d 'Nightly stress: a correctness assertion failed — investigate' --force || true
+          gh label create nightly-envelope    -c '#fbca04' -d 'Nightly stress: wall-clock envelope relaxed under load — expected, tracked' --force || true
+          gh label create nightly-infra        -c '#0e8a16' -d 'Nightly stress: infra issue (missing artifact / timeout / errored) — not a product failure' --force || true
+
+      - name: Classify results and file/append issues
+        env:
+          GH_TOKEN: ${{ github.token }}
+          ARTIFACTS_DIR: artifacts
+          EXPECTED_CELLS: 'N1 N2 N3 N4 N5 N6'
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+        run: |
+          set -uo pipefail
+          # Each cell's status is ground truth from its OWN artifact
+          # (nightly-logs-<id>/cell-conclusion.txt, written by the stress job under
+          # always()), so the script never relies on the misleading matrix-aggregate
+          # `needs.stress.result`. The empty-round guard fires if NO cell artifact exists.
+          echo "Artifacts present:"; ls -la "$ARTIFACTS_DIR" 2>/dev/null || echo "  (none)"   # list the dir exported as ARTIFACTS_DIR, so the listing cannot drift from it
+          bash .github/scripts/nightly-triage.sh
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3d1424c..1b579e2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -32,16 +32,23 @@ jobs:
     strategy:
       fail-fast: false               # an OS-specific failure is the signal we want; let the others finish
       matrix:
-        # Windows splits into two parallel jobs — the bash-only unit suite is the
-        # wall-clock bottleneck there (~309s vs interop 100s + integration 28s;
-        # process-spawn overhead, not the PowerShell engines). Suites must NOT run
-        # concurrently inside one runner: they're timing-sensitive on 2-core
-        # runners. POSIX legs are fast enough to stay single-job.
+        # The concurrency CANARY (Test 1, full-width 25x8) is its OWN suite file and
+        # runs as a separate parallel `canary` cell on EVERY arch — it is ~half the
+        # Windows unit wall-clock (process-spawn overhead, not the PowerShell engines)
+        # and cheap on POSIX, so parallelising it is the per-PR wall-clock win.
+        # Windows otherwise splits unit vs interop-integration. Suites must NOT run
+        # concurrently inside one runner: they're timing-sensitive on 2-core runners.
+        # `leg: all` runs unit+interop+integration but NOT canary (the canary step
+        # gates on `leg == 'canary'` only). The job-name + artifact-name templates
+        # already key on matrix.leg, so the `canary` leg is named/uploaded uniquely.
         include:
           - { os: ubuntu-24.04, leg: all, job_timeout: 35 }
+          - { os: ubuntu-24.04, leg: canary, job_timeout: 15 }
           - { os: macos-15, leg: all, job_timeout: 35 }
+          - { os: macos-15, leg: canary, job_timeout: 15 }
           - { os: windows-2025, leg: unit, job_timeout: 20 }
           - { os: windows-2025, leg: interop-integration, job_timeout: 22 }
+          - { os: windows-2025, leg: canary, job_timeout: 15 }
     timeout-minutes: ${{ matrix.job_timeout }}   # backstop only: sum of the leg's step budgets + upload headroom
     defaults:
       run:
@@ -70,6 +77,15 @@ jobs:
           fi
           stat --version 2>/dev/null | head -1 || echo "stat: BSD variant"
 
+      - name: Canary suite (full-width concurrency canary)
+        if: ${{ matrix.leg == 'canary' }}
+        timeout-minutes: ${{ matrix.os == 'windows-2025' && 7 || 6 }}   # ~151s on Windows + headroom; a step timeout FAILS the step (not the job) so the upload still runs
+        env:
+          GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/canary
+        run: |
+          mkdir -p test-output
+          bash tests/git-commit-lock.canary.test.sh 2>&1 | tee test-output/canary-suite.log
+
       - name: Unit suite
         if: ${{ matrix.leg == 'all' || matrix.leg == 'unit' }}
         timeout-minutes: ${{ matrix.os == 'windows-2025' && 15 || 10 }}   # a step timeout FAILS the step (not the job), so the upload step reliably runs; sized from run 27325978197 + one internal MAX_WAIT hang
@@ -129,9 +145,13 @@ jobs:
           /tmp/shellcheck-v0.11.0/shellcheck --version
           /tmp/shellcheck-v0.11.0/shellcheck -S style \
             git-commit-lock.sh \
+            tests/_harness.sh \
             tests/git-commit-lock.test.sh \
+            tests/git-commit-lock.canary.test.sh \
             tests/git-commit-lock.interop.test.sh \
             tests/git-commit-lock.integration.test.sh \
+            tests/with-load.sh \
+            .github/scripts/nightly-triage.sh \
             install.sh
 
       - name: PSScriptAnalyzer (gate at warning severity)
diff --git a/.gitignore b/.gitignore
index be293f3..6ab470c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,8 +5,7 @@
 # OS / editor cruft
 .DS_Store
 Thumbs.db
-/.agent/review-queue
-/.agent/review-queue.lock
-/.agent/review-queue.lock.*
-/.agent/last-opened
-/.agent/.tmp.*
+*.stackdump
+
+# Test/CI artifact output (manifests, suite logs); created at runtime, never committed.
+test-output/
diff --git a/README.md b/README.md
index 5bebc3a..2027cd0 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,11 @@ atomic create-or-fail open (`O_CREAT|O_EXCL` / `FileMode.CreateNew`) — atomic
 on local POSIX filesystems and NTFS alike, with no dependency on `flock` —
 whose content is the holder's unique token. Every worktree has its own git
 dir, so independent worktrees get independent locks, while all agents sharing
-one checkout contend on the same lock. The lock is deliberately a stealable
+one checkout contend on the same lock. The protocol's correctness rests on these
+operations being atomic, which holds on local filesystems (ext4, APFS, NTFS, and
+kin) but **not** on network or sync-backed storage — NFS, SMB shares,
+Dropbox/OneDrive-synced directories — where exclusion may silently fail. Keep the
+repo (and so its `.git/`) on a local disk. The lock is deliberately a stealable
 **lease**, not a kernel lock: in unattended agent fleets a hung-but-alive
 holder is at least as common as a crashed one, and a lock that can't be taken
 from a stuck holder halts the whole run — while a rare collision costs little
@@ -94,6 +98,12 @@ against each other on all three OSes — not as platform support, but because
 two independent implementations hammering one lock is cheap adversarial
 verification of the protocol.
 
+**Upgrade both implementations together.** Older releases stole with an
+unserialized move-aside instead of the claim protocol, so the
+no-displacement-during-recovery guarantee holds only when every party in a tree
+runs a current version; a mixed-version tree degrades that prevention to
+detection (exit 98) and can leave `.dead.*` files that current versions don't clean up.
+
 ## Suggested agent instructions
 
 Agents only benefit from the lock if their instructions tell them to use it.
@@ -246,9 +256,10 @@ knobs and how staleness and stealing work.
 
 ## Tests
 
-Three suites — bash unit, bash + PowerShell interop, and an end-to-end
-integration run of concurrent real commits — cover the tool, and CI runs
-them on Linux, macOS, and Windows. How to run them and what each covers:
+Four suites — bash unit, a bash concurrency canary, bash + PowerShell
+interop, and an end-to-end integration run of concurrent real commits —
+cover the tool, and CI runs them on Linux, macOS, and Windows. How to run
+them and what each covers:
 [`docs/git-commit-lock.md#tests`](docs/git-commit-lock.md#tests).
 
 ## Licence
diff --git a/docs/failure-modes.md b/docs/failure-modes.md
new file mode 100644
index 0000000..e82810c
--- /dev/null
+++ b/docs/failure-modes.md
@@ -0,0 +1,862 @@
+# git-commit-lock: failure-mode map and scope decisions
+
+**Status:** decision-support document. For each failure mode it states the
+tool's *current* behavior (grounded in the product code and tests), classifies
+it into one of three robustness tiers, and recommends whether it should be an
+in-scope guarantee. The owner uses this to deliberately decide, per mode, "yes,
+we guarantee this" or "no, out of scope."
+
+**Sources of truth, in order:** the product code
+(`git-commit-lock.sh`, `git-commit-lock.ps1`) and the test suites
+(`tests/git-commit-lock.test.sh`, `tests/git-commit-lock.canary.test.sh`,
+`tests/git-commit-lock.interop.test.sh`,
+`tests/git-commit-lock.integration.test.sh`). Every claim below cites
+`file:line`. The narrative docs (`README.md`, `docs/git-commit-lock.md`) and
+the implementation header comments are corroborating, not authoritative — where
+this document relies on a header comment it has verified the comment against the
+code. (Cited line numbers are against the tree at commit `c762899`; treat them
+as anchors, not exact addresses, if the files move.)
+
+A note on epistemics: the bash file's header (`git-commit-lock.sh:1-426`) is
+itself an exhaustive design narrative and the ps1 header
+(`git-commit-lock.ps1:41-177`) mirrors it. They are unusually trustworthy as
+documentation *because* the tests pin the behaviors they describe. This document
+does not re-derive the protocol; it re-classifies it for a scope decision and
+flags the boundaries the headers state but a reader might skip.
+
+---
+
+## 1. The core guarantee (what must hold under ANY conditions)
+
+**No silent lost update — given cooperative wrapper unwind.** The absolute safety
+property is that the tool never reports a *serialized* critical section that
+wasn't: a holder whose lease was taken from it learns so — `lock_release` returns
+**98** and logs a loud WARNING — rather than exiting success
+(`git-commit-lock.sh:1607-1688`; `git-commit-lock.ps1:1717-1837`). The two
+reserved failure codes mean the wrapped command was provably *not* run (96 usage,
+97 timeout) or provably *not serialized* (98) (`git-commit-lock.sh:392-415`).
+
+Two honest qualifications make this a precise property rather than a slogan, and
+both matter for the scope decision:
+
+- **It is a lease, not a kernel lock** (`docs/git-commit-lock.md:60-126` explains
+  why no OS primitive spans bash-on-MINGW and PowerShell/.NET). **Strict mutual
+  exclusion holds only *within* the staleness window** (default 300s): a hold that
+  overruns it *can* be stolen mid-work — "fail-open" — so two processes can
+  briefly *both* believe they hold the lock. That overlap is accepted by design
+  and made *detectable* (the displaced holder's 98 at release), not prevented
+  (`git-commit-lock.sh:213-227`). At most one process is ever the *legitimate*
+  holder; a displaced believer finds out at release. So "mutual exclusion" is a
+  Tier-1 guarantee **within the envelope (commits faster than STALE)**, not an
+  unconditional one.
+- **Detection requires the wrapper to actually reach release.** The 98 path fires
+  on normal return and on trapped signals. It does **not** fire if the held process
+  is terminated or *replaced* without unwinding — an external SIGKILL, a bash
+  `exec` in the wrapped command (which replaces the holding shell, so neither
+  `lock_release` nor the EXIT trap runs), or PowerShell `[Environment]::Exit()`
+  (bypasses `Lock-Release`, the `finally`, and the `PowerShell.Exiting` backstop,
+  `git-commit-lock.ps1:221-245`). A *plain* `exit` is safe — it unwinds. A
+  non-unwinding exit returning 0 *while displaced* can report success without the
+  98 (see **§H4**). The *next* holder still recovers via staleness, but the
+  abruptly-exiting one is not warned. Hence the precise statement: **no silent lost
+  update, provided the wrapper unwinds cooperatively.**
+
+Liveness (eventual recovery) and bounded stalls are best-effort within an
+operating envelope (Tier 2), not absolute — and "recovery" means lock-shaped
+orphans get reclaimed, **not** that every bad state self-heals (a foreign object
+at the path is deliberately never auto-removed; see the tier split).
+
+The integration suite is the end-to-end witness for this guarantee on the real
+use case: many workers committing into one repo, audited for "every commit
+lands, history linear, no sweep-up, no `index.lock` races, no stolen leases,
+clean tree" (`tests/git-commit-lock.integration.test.sh:10-12, 226-283`).
+
+### The three tiers used throughout
+
+1. **Correctness guarantee** — must hold under *any* conditions (load, slow FS,
+   adversarial scheduling). Two kinds, and the distinction matters:
+   - **Safety (unconditional):** no corruption, and **no silent lost update** —
+     the displaced holder detects the loss (98) *provided its wrapper reaches
+     release* (§1's hard-kill/`Exit()` caveat). Strict **mutual exclusion holds
+     within the staleness window**; beyond it the lease is
+     fail-open-but-detectable.
+   - **Recovery (for lock-shaped stale state, under the supported FS/clock/tooling
+     envelope):** a crashed holder's stale lock, an orphaned claim, and an empty
+     crash-orphan are eventually reclaimed. This does **not** extend to *foreign*
+     objects at the path — a directory, a real user file, or non-`tok.` junk
+     content are deliberately *never* auto-removed; they wait at 97 for an
+     operator. "Eventual recovery" means lock-shaped orphans self-clear, not that
+     every bad state self-heals.
+   If a *safety* property can break, it is a bug; a *recovery* property failing
+   outside its envelope (e.g. a foreign object, an unreadable clock) is a
+   classified Tier-2/3 degradation, not a Tier-1 violation.
+2. **Best-effort within a stated envelope** — holds under normal/expected
+   conditions, degrades gracefully (and *detectably*) under pathological ones.
+   Everything wall-clock-bounded lives here, because wall-clock bounds depend on
+   scheduling: timeouts, recovery latency, the diagnostic warnings that depend
+   on timing. Correctness is preserved; only liveness/latency degrades.
+3. **Out of scope** — explicitly not handled; the operating envelope excludes
+   it. Damage, if any, is bounded and documented.
+
+---
+
+## 2. Summary table
+
+Legend — **Tier:** 1 correctness / 2 best-effort-in-envelope / 3 out-of-scope.
+**Tested:** ✓ deterministic test · ~ load/timing-sensitive or partial · ○
+robust-by-code-but-unverified · ✗ untested · S static/grep check · (plat) platform-gated.
+
+| # | Failure mode | Current behavior | Tier | Tested | Recommendation |
+|---|---|---|---|---|---|
+| A1 | Clean high contention (N workers, no crashes) | Serialized; no lost update | 1 | ✓ C:81-111 (canary), I:227-261/341-386, integ | **In scope.** Keep. |
+| A2 | Thundering herd recovering one dead lock | Claim serializes; exactly one steal, zero displacement | 1 | ✓ U:212-346, I:884-1015 | **In scope.** Keep. |
+| A3 | Many concurrent stealers on one ghost | One O_EXCL claim winner | 1 | ✓ U:1095-1128, I:1017-1088 | **In scope.** Keep. |
+| B1 | Holder dies (crash/SIGKILL/power) mid-hold | Lease ages out; stolen after STALE | 1 (recovery) / 2 (latency) | ✓ U:197-210/348-361 | **In scope** (recovery). Latency = Tier 2. |
+| B2 | Holder dies mid-CLAIM (trappable: INT/TERM) | Trap deletes claim, token-checked; discovery read | 1 | ✓ U:1857-1928, I:1151-1244 | **In scope.** Keep. |
+| B3 | Holder dies mid-claim (untrappable: SIGKILL) | Claim ages out ≤ CLAIM_STALE; rival rename can install unowned lock, recovered ≤ STALE | 2 | ✓ U:1648-1677 (forensics) | **Accept** (residual 5). Bounded, no false success. |
+| B4 | Slow but UNCONTENDED holder overruns STALE | Keeps its lock (nothing moved it) | 1 | ✓ U:419-429, I:494-499 | **In scope.** Keep. |
+| B5 | Slow CONTENDED holder overruns STALE | Stolen; robbed holder detects at release → 98 | 1 (detection) | ✓ U:387-417, I:460-492 | **In scope.** This *is* fail-open-but-detectable. |
+| C1 | Orphaned/stale lock | mtime-stale → stolen via claim | 1 | ✓ U:197-210 | **In scope.** Keep. |
+| C2 | Empty lock (crash between create+write) | Empty + stale → stealable | 1 | ✓ U:348-361 | **In scope.** Keep. |
+| C3 | Crashed-claimant / empty claim orphan | Ages out ≤ CLAIM_STALE; cleared | 1 (recovery) / 2 (latency) | ✓ U:1130-1154 | **In scope.** Keep. |
+| C4 | Leaked claim (unverifiable unlink) | Leaked-token memory keeps ownership discoverable | 1 | ✓ U:1549-1758, U:2013-2164 | **In scope.** Keep. |
+| D1 | Atomic rename-over (steal install) | `mv -T` / `File.Move(...,true)` / 5.1 unlink+move | 1 (local FS) | ✓ U:212-346, I:16d S:1141 | **In scope on local FS.** Boundary = D-axis. |
+| D2 | O_EXCL atomic create | `set -C` redirect / `FileMode.CreateNew` | 1 (local FS) | ✓ throughout | **In scope on local FS.** |
+| D3 | Wrong-type at path (dir/symlink/FIFO/dev/socket) | Never stolen/deleted; loud warn; waiters → 97 | 1 (bash + ps1-on-Win) / 2 (ps1-on-POSIX) | ✓ U:818-892/1156-1262/Test 37 (rename-refused mid-steal)/Test 44 (socket+device), ~(plat) | **In scope.** ps1-on-POSIX residual = accept. |
+| D4 | Non-lock CONTENT at path (user file) | Never stolen (content guard); warn | 1 | ✓ U:1034-1076 | **In scope.** Two accepted residuals (§D4). |
+| D5 | Case-insensitive FS path collision | Not handled explicitly | 3 | ✗ | **Likely non-issue;** see §D5. Decide. |
+| E1 | Network/shared FS (NFS/SMB/9p/Dropbox) | Outside design guarantees (stated) | 3 | ✗ | **Out of scope** (stated). See §E — decide whether to *enforce*. |
+| E2 | Multi-host clock skew / NTP jump | Single-clock assumption; documented (local jump → detected-98, safe) | 3 | ✗ | **Out of scope**; single-clock assumption documented. See §E2. |
+| E3 | mtime probe unreadable (staleness clock broken) | Warns loudly once; treats as not-stale → safe, recovery disabled → 97 | 2 | ✓ U:Test 42 | **Accept** — fails safe + announced. See §E3. |
+| F1 | Disk full (ENOSPC) during create/write | Create fails → wait; torn write ages out | 2/3 | ✓ U:Test 50 (Linux+sudo tmpfs; (plat) skip elsewhere) | **Tested** (§4 item 5) + document. See §F1. |
+| F2 | ENOSPC during LOG write | Swallowed (`|| true`); silent log loss | 2 | ✓ U:Test 49 (portable failing-log path) | **Tested** (§4 item 5); logging best-effort, lock unaffected. |
+| F3 | Inode / FD exhaustion | Create fails → wait → 97 | 2 | ○ (document-only) | **Document-only**: no deterministic portable injection. See §F3. |
+| F4 | Read-only / unwritable lock dir or parent | `mkdir -p` best-effort; create fails → wait → 97 | 2 | ✓ U:Test 48 (POSIX `chmod 0555`; (plat) skip on Windows) | **Tested** (§4 item 5, highest-value). See §F4. |
+| G1 | Lock path = a directory / `$HOME` typo | Never stolen/deleted; loud warn; → 97 | 1 | ✓ U:818-840 | **In scope.** Keep. |
+| G2 | Garbage numeric config | Falls back to default + stderr note | 1 | ✓ U:695-703, I:554-608 | **In scope.** Keep. |
+| G3 | `run` outside a git repo, no `AGENT_LOCK_PATH` | Refuses (96) | 1 | ✓ U:705-712 | **In scope.** Keep. |
+| G4 | `MAX_WAIT ≤ STALE + CLAIM_STALE` (default MAX_WAIT) | Startup warning | 2 | ✓ U:497-522 | **In scope.** Keep. |
+| H1 | SIGINT/SIGTERM mid-hold | Release + re-raise (143); traps restored | 1 | ✓ U:577-600/1989-2011 | **In scope.** Keep (bash). ps1 = §H. |
+| H2 | EXIT-while-holding | Release + chain caller's EXIT trap | 1 | ✓ U:633-648 | **In scope.** Keep. |
+| H3 | ps1 process death under `-File` | `PowerShell.Exiting` does NOT fire; relies on stale window | 2 | ○ (limit documented) | **Accept;** `run` path is covered. See §H. |
+| H4 | Non-unwinding exit while held (SIGKILL / bash `exec` / `[Environment]::Exit()`) | Skips release → a displaced holder is unwarned (no 98); plain `exit` is safe | 2 | ~ (I:308-334 indirect) | **Document** the no-silent-loss boundary. See §H4. |
+| I1 | bash⇄pwsh wire/format compatibility | Shared format; token grammar tightened to match | 1 | ✓ I:* throughout | **In scope.** Keep. |
+| I2 | Mixed-VERSION tree (old unserialized steal) | Prevention degrades to detection (98); `.dead.*` litter | 3 | ✗ | **Out of scope:** "upgrade both together." Residual 4. |
+| J1 | Logging subsystem failure | All log writes `|| true`; 1 MB self-truncate | 2 | ✓ U:Test 49 (via F2) | **Tested** (§4 item 5, via F2); logging never blocks the lock. |
+| K1 | Extreme load / CPU oversubscription / slow FS | Correctness holds; wall-clock bounds stretch | 2 | ~ (CI stress) | **Envelope defined** (design doc + envelope tier). See §K — the key analytical section. |
+| K2 | Internal time budgets (poll, MAX_WAIT, read ladder) | Fixed schedules; tunable | 2 | ✓/~ | **In scope** as Tier-2 envelope. See §K. |
+
+U = `tests/git-commit-lock.test.sh`, I = `tests/git-commit-lock.interop.test.sh`,
+C = `tests/git-commit-lock.canary.test.sh` (the concurrency canary),
+integ = `tests/git-commit-lock.integration.test.sh`.
+
+---
+
+## 3. Per-mode detail
+
+### A. High contention / thundering herd
+
+**A1 — Clean contention, no crashes.** N processes race to acquire a free or
+held-then-released lock. The acquire loop is one O_EXCL create attempt per poll;
+exactly one creator wins, the rest poll and take turns
+(`git-commit-lock.sh:1312-1361`). After winning, the acquirer re-reads its own
+token (read-back verification, `git-commit-lock.sh:1352-1361`) before claiming
+the hold — so even a create that "won" but whose file was concurrently
+clobbered does not produce a false hold.
+*Tier 1.* Tested heavily: the concurrency canary — mutual exclusion under many
+concurrent workers, 8 rounds × 25 at FULL (`tests/git-commit-lock.canary.test.sh`
+Test 1, `C:81-111`) — interop Test 1/Test 6 mixed bash+pwsh (`I:227-261`, the
+strict deterministic counter `I:341-386`), and the integration suite's real-commit
+swarm. (Crash-recovery / claim-contention witnesses stay in the unit suite: A2's
+Test 2b, A3's Test 20.) **Recommend: in scope, keep.** This is the tool's whole reason to exist.
+
+**A2 — Thundering herd recovering one dead lock.** After a holder dies, *every*
+waiter judges the same lock stale off the same mtime in the same poll window —
+the worst case for displacement. The **claim protocol** is the answer: to steal,
+a waiter must first win an O_EXCL claim file `<lock>.next`, re-verify staleness
+under the claim, then install by one atomic rename-over
+(`git-commit-lock.sh:1070-1218`, the steps narrated at `:82-115`). This
+*prevents* the straggler-robs-recovery-winner race rather than detecting and
+repairing it. *Tier 1.* Tested: unit Test 2b asserts zero spurious 98s, exactly
+one `STOLE-BY-CLAIM` per round, and — via a background sampler — that **no
+move-aside `.dead.*` file ever exists** (`U:212-346`); interop Test 16 proves
+the same across mixed impls (`I:884-1015`). The header records the unserialized
+baseline was probed to displace 5/5 with 4 waiters (`git-commit-lock.sh:233-234`).
+**Recommend: in scope, keep — this is a load-bearing correctness property.**
+
+**A3 — Many concurrent stealers.** Distilled A2: N stealers, one O_EXCL claim
+winner, the rest wait and acquire in sequence. *Tier 1.* Tested: unit Test 20
+(`U:1095-1128`), interop Test 16b (one bash claimant vs one ps1 claimant on one
+ghost, cross-parsing each other's claim files, `I:1017-1088`).
+**Recommend: in scope, keep.**
+
+> **Load caveat on A2/A3 (see §K):** *correctness* is load-independent (it rests
+> on O_EXCL + atomic rename, not timing). What stretches under load is the
+> *latency* to recover, and the *test harness's* ability to set up the race
+> deterministically — Test 2b/16 carry heavy sync scaffolding and bounded
+> discard-and-retry precisely because a fast waiter can complete an entire steal
+> before the harness finishes backdating the ghost (`U:70-104, 285-336`). That
+> is a test-harness envelope concern, not a protocol gap.
+
+### B. Holder death
+
+**B1 — Crash/SIGKILL/power loss mid-hold.** The lease ages out: once the lock
+file's mtime is older than `STALE_SECS`, a waiter steals it. *Recovery is Tier
+1; recovery latency is Tier 2* (bounded by STALE + poll cadence under normal
+load). Tested via the stale-lock and empty-orphan steals (`U:197-210, 348-361`).
+**Recommend: in scope (recovery); latency bound documented (§K).**
+
+**B2 — Trappable death mid-claim (INT/TERM).** The EXIT/INT/TERM handlers are
+armed at acquire *start*, not at hold, in "claim-window mode"
+(`git-commit-lock.sh:1299-1310, 987-997`). A trappable exit while a claim is in
+flight runs the token-checked claim deletion (one bounded retry) and a final
+discovery read; it never runs lock-release (98) semantics on a *mere claim*.
+*Tier 1.* Tested: unit Test 33 — TERM mid-claim deletes our claim, leaves a
+*foreign* claim intact, no 98, no ageout penalty (`U:1857-1928`); the matching
+ps1 lane is interop Test 16e (`I:1151-1244`). **Recommend: in scope, keep.**
+
+**B3 — Untrappable death mid-claim (SIGKILL between claim and rename).**
+Deliberately **accepted, not prevented** (residual 5,
+`git-commit-lock.sh:266-282`). The orphaned claim normally just ages out at
+CLAIM_STALE; the rare bad case is a suspended rival's rename installing it as an
+*unowned* lock that stalls waiters ≤ STALE before the lease recovers it. Crucial
+property: **no false success anywhere** — nobody believes they hold; the only
+cost is a bounded stall, same class as B1 at far lower probability. The preventive
+alternative (a two-rename compare-and-swap) was evaluated and rejected because it
+reintroduces crash litter (`git-commit-lock.sh:276-282`). *Tier 2.* Tested for
+forensics/recovery via the crashed-leaver leg of Test 31 (`U:1648-1677`).
+**Recommend: accept as a documented bounded residual. Do not build the
+two-rename CAS** — the cure is worse than the disease and the failure is already
+false-success-free.
+
+**B4 — Slow but uncontended holder.** With no waiter, nothing moves the file;
+the token still matches at release; success. *Tier 1.* Tested: unit Test 4c,
+interop Test 9 (`U:419-429`, `I:494-499`). **Recommend: in scope, keep** — this
+is what stops the lock punishing every slow-but-safe hold.
+
+**B5 — Slow CONTENDED holder (the fail-open ceiling).** A hold past STALE *with*
+a contender gets stolen; the robbed holder detects it at release (file gone, or
+a foreign token — both definitive because acquire's read-back proved our token
+was at the path) and returns exactly **98** plus a WARNING
+(`git-commit-lock.sh:1620-1688`). *Tier 1 for detection.* Tested: unit Test 4b,
+interop Test 8 both directions (`U:387-417`, `I:460-492`). **Recommend: in
+scope, keep.** This is the deliberate fail-open-but-detectable contract; the
+mitigation is operational — "commits must be fast" (the golden rule,
+`docs/git-commit-lock.md:433-458`), and raise STALE for a genuinely slow hold.
+
+### C. Orphaned / stale locks and claims
+
+**C1/C2 — Stale or empty lock.** Staleness is judged by the lock file's own
+mtime; a lock older than STALE and *lock-shaped* (empty, or line 1 starts
+`tok.`) is stealable (`git-commit-lock.sh:1408-1446`). The empty case is the
+crash-between-create-and-write orphan and is explicitly stealable. *Tier 1.*
+Tested: Test 2 (stale), Test 3 (empty orphan regression) (`U:197-210, 348-361`).
+**Recommend: in scope, keep.**
+
+**C3 — Crashed-claimant / empty-claim orphan.** A claim older than CLAIM_STALE
+(default 60s; claims are normally held for ms) is cleared by any waiter, which
+re-races the claim create (`git-commit-lock.sh:1228-1267`). A crashed claimant
+therefore delays only *steals*, only by ≤ the claim window; a free lock path is
+never blocked by a claim. *Recovery Tier 1, latency Tier 2.* Tested: Test 21
+(aged foreign claim and empty claim both age out and recovery completes,
+`U:1130-1154`). **Recommend: in scope, keep.**
+
+> **Test 21's `≤20s` latency assertion is Tier 2, not Tier 1.** `U:1144` asserts
+> wall-clock recovery `≤20s` with STALE=1, CLAIM_STALE=2, MAX_WAIT=30. The
+> *protocol* recovers correctly regardless; the 20s number is a generous
+> envelope bound that a sufficiently oversubscribed runner (e.g. 8 CPU hogs on a
+> 2-core box under the stress wrapper) can blow without any protocol defect.
+> This is exactly the kind of bound §K says to treat as a test-harness envelope:
+> if it flakes under extreme artificial load, **relax the test's bound or scope
+> the stress level — do not harden the code.**
+
+**C4 — Leaked claim.** A few exits must leave a claim behind without a verifiable
+unlink (an unreadable claim; an unlink blocked by a foreign handle — exactly
+three feeders, `git-commit-lock.sh:138-157`). These append the attempt token to
+an in-process **leaked-token memory**. While non-empty, every poll (and a pass
+at release/timeout) also reads the lock's line 1: a listed token there means a
+rival's rename installed *our* leaked claim as the lock → adopt the hold, or, at
+release, recognise our real hold was displaced, clean the leaked file
+best-effort, and report 98. The result is structural: **no process inside an
+acquire/hold/release arc can leave an *unowned* lock** (per-attempt tokens make
+the discovery read conclusive). One scope nuance worth stating, because the
+memory is **process-local**: only the leaking process can *adopt* its own
+installed claim. If that process exits the arc first — times out (97), releases
+cleanly, or dies — *before* adopting, the installed claim becomes an unowned lock
+recovered by the ordinary staleness lane, never adopted by another process (this
+is exactly residual 5 / §B3). Per-attempt-token uniqueness still guarantees that
+lock can never be *mistaken* for owned by anyone, so there is **no false
+success** — the only cost is a bounded stall. *Tier 1.* Tested extensively: Test 31 (the four
+leaked lanes, including a real Windows no-delete-share feeder), Test 35
+(release-time cleanup of a leak installed over a held hold → 98), Test 36
+(inconclusive-read keeps the entry) (`U:1549-1758, 2013-2164`); ps1 parity in
+interop Test 16e. **Recommend: in scope, keep.** This is the most intricate
+machinery in the tool and the most thoroughly tested.
+
+### D. Filesystem semantics the protocol depends on
+
+These are the **load-bearing FS assumptions**. Where one does not hold, that is a
+real robustness boundary, not a bug to fix.
+
+**D1 — Steal install: atomic overwrite vs. the 5.1 fallback.** The steal installs
+its lock at the path by replacing whatever is there. There are two engine classes
+and they differ in atomicity — so this row is *not* uniformly "atomic rename":
+- **Atomic overwrite (the guaranteed lane):** one `rename(2)`-class replace with
+  no path-absent window. bash uses GNU `mv -T` where available, probed once, with
+  a guarded `[ -d ]` + bare-`mv` fallback on BSD/macOS
+  (`git-commit-lock.sh:954-979`); pwsh 7 uses the 3-arg `File.Move(src,dst,true)`
+  (`git-commit-lock.ps1:941-982`). Atomic replace is guaranteed on local POSIX FS
+  and NTFS (probe R1: 400 replaces, zero absent reads,
+  `git-commit-lock.sh:380-382`); *not* guaranteed on some network FS (§E).
+- **Windows PowerShell 5.1 fallback (NOT atomic, but claim-guarded):** 5.1 has no
+  3-arg overload, so it unlinks then does a 2-arg `Move` (`git-commit-lock.ps1:941-982`).
+  This lane has a real path-absent window in which a rival's *create* can win the
+  recovered path — a **fairness loss, never a clobber** (claim serialization still
+  admits one stealer; the loser re-polls), documented at
+  `docs/git-commit-lock.md:471-476`.
+`File.Replace` is *deliberately never used* (throws on read-only dest;
+partial-failure states) — pinned by a static grep in interop Test 16d
+(`I:1141-1149`). *The atomic lane is Tier 1 on local FS; the 5.1 fallback is Tier
+1 for safety (no clobber) but gives up rename atomicity (fairness only).*
+**Recommend: in scope on local FS; the network-FS boundary is §E.**
+
+**D2 — O_EXCL atomic create.** `set -C` noclobber redirect (bash) /
+`FileMode.CreateNew` with `FileShare.ReadWrite|Delete` (ps1,
+`git-commit-lock.ps1:650-670`). Atomic create-or-fail on local POSIX and NTFS;
+exactly one creator wins. *Tier 1 on local FS.* **Recommend: in scope on local
+FS.** Boundary: O_EXCL is the classic NFS weak spot (§E).
+
+**D3 — Wrong-type object at the lock or claim path.** A directory, symlink, FIFO,
+socket, or device at the path is **never stolen or deleted**. bash has a
+pre-create type guard (`[ -f ] && ! [ -L ]`) plus a per-poll wrong-type
+classifier with two-consecutive-poll confirmation to survive Windows
+delete-pending ghosts (`git-commit-lock.sh:1322-1327, 1518-1570`); the same
+guards apply to the claim path with independent per-path warn-once state
+(`:1458-1487`). The FIFO case is *why the pre-create guard is mandatory*: a
+noclobber `>` onto a FIFO blocks in `open(2)` before any timeout logic — a hang,
+not a warning. *Tier 1 on bash, and on ps1-on-Windows.* Tested: Test 17
+(dir/symlink/FIFO at lock path), Test 22 (claim path), Test 17d (churn must not
+false-warn), and Test 44 (the socket & device-node arms of the same classifier,
+bash; POSIX CI legs) (`U:818-892, 1156-1262, 894-1032`).
+
+> **The one real D3 boundary — ps1 on POSIX (Tier 2, accepted).** The .NET API
+> exposes no portable type bit for FIFO/device/socket on Unix; they stat as size
+> 0 and take the **empty-orphan steal lane** (lock path) or empty-claim clear
+> lane (`git-commit-lock.ps1:62-78, 520-525`; `docs/git-commit-lock.md:215-222`).
+> Damage is capped at the one misconfigured inode (consumed by the rename). This
+> is an **unsupported configuration** (ps1 is Windows-only; POSIX runs it solely
+> as cross-impl protocol verification, `README.md:91-95`). **Recommend: accept,
+> as documented.** Closing it would need a `stat(2)` shell-out the port avoids;
+> not worth it for an unsupported config.
+
+**D4 — Non-lock CONTENT at the path.** An age-gated content guard steals only
+empty or `tok.`-prefixed line-1 content; a real user file at a typo'd path
+survives forever (`git-commit-lock.sh:1411-1444`). *Tier 1.* Tested: Test 18
+(user file untouched; sub-prefix torn write `to` never stolen; `tok.`-prefixed
+torn write *is* stolen) (`U:1034-1076`). **Two accepted residuals** make the
+guarantee precise (`git-commit-lock.sh:298-311`): (a) a stale **empty** user
+file is indistinguishable from the crash orphan and *is* stolen; (b) a stale
+user file whose line 1 happens to start `tok.` passes the wire test and *is*
+stolen. Both are deliberate (a fuller shape check would buy near-zero protection
+at the cost of a more rigidly bound wire format). **Recommend: in scope, keep, with the two residuals
+documented** (already are).
+
+**D5 — Case-insensitive filesystem.** Not handled explicitly. The lock and claim
+paths differ only by the `.next` suffix (`<lock>` vs `<lock>.next`), which never
+collide under case folding, and the token content is case-exact regardless of FS
+case sensitivity. The only theoretical exposure is two *different* configured
+`AGENT_LOCK_PATH` values that differ only in case resolving to one file on
+NTFS/APFS — but that would be a single shared lock, which is *correct* behavior
+(they'd serialize), not a break. *Tier 3 (non-issue).* **Recommend: out of
+scope as a non-issue; no action.** (Cheap to add one sentence to the design doc
+if desired.)
+
+### E. Network / shared filesystems and clocks
+
+**E1 — Network/shared FS (NFS, SMB/CIFS, 9p, Dropbox/OneDrive sync).** The design
+doc states this plainly: the repo must live on a **local FS with atomic
+create/rename and sane mtimes**; "repos on network or sync-backed storage … are
+outside the design's guarantees" (`docs/git-commit-lock.md:122-126`). This is the
+honest boundary, because the protocol's *correctness* rests on D1 (atomic
+rename-over) and D2 (O_EXCL create), and both are exactly the operations network
+filesystems weaken:
+- **NFS:** `O_EXCL` create is famously unreliable on older NFS (the client can't
+  guarantee exclusive create across the network); `rename` atomicity and mtime
+  granularity vary by version/server. On such a mount, **D2 can let two creators
+  both "win"** → two live holders, and the read-back verification
+  (`:1352-1361`) is the only backstop (it would catch *some* but not all
+  interleavings).
+- **SMB/CIFS:** delete/rename semantics and the no-delete-share handle behavior
+  differ from both POSIX and local NTFS; mtime resolution and clock source may be
+  the *server's*, not the client's.
+- **Sync folders (Dropbox/OneDrive):** asynchronous replication means the lock
+  file's existence and content are *not* globally consistent — two machines can
+  both create "the" lock locally before sync reconciles. Fundamentally broken;
+  not a tunable.
+
+*Tier 3 (out of scope, stated).* Untested (CI runs local FS only). **Recommend:
+keep out of scope — but consider making it harder to *fall into* accidentally.**
+The current failure mode on a bad FS is *silent* (the tool runs, exclusion may
+just not hold). Options, in increasing cost: (i) leave as-is, documented — the
+default lock lives in `.git`, which is almost always local, so accidental
+network use is rare; (ii) a one-line caveat in `README.md` (since done —
+`README.md:60-64`; previously only in the deeper design doc); (iii) an optional
+best-effort startup probe of the lock dir's
+FS type with a stderr warning on a known-network type (cheap on Linux via
+`stat -f`, awkward cross-platform, and inherently incomplete). **My
+recommendation: (ii)** (since done — it surfaces the boundary in the README, where
+an operator actually looks), and treat (iii) as optional polish — do *not* try to
+*support* network FS.
+
+**E2 — Multi-host clock skew / NTP jumps / timezone.** *This is the one place
+the documentation is genuinely thin, and it deserves a deliberate decision.*
+Staleness is mtime-vs-`now` arithmetic (`git-commit-lock.sh:928, 1409`). The
+lock file records `host=<hostname>` (`:519`), which *suggests* cross-host use —
+but the staleness math implicitly assumes **the mtime and the comparing
+process's clock come from the same time source.** Reasoning from first
+principles about what can go wrong:
+- On a **single host** (the actual supported case — all contenders share one
+  checkout, hence one machine), mtime and `now` are the same clock; skew is a
+  non-issue, and the **mtime floor** (946684800 / 2000-01-01,
+  `git-commit-lock.sh:925`) already absorbs the only real local clock glitch:
+  the Windows FILETIME-zero (1601) transient on fresh files
+  (`docs/git-commit-lock.md:283-293`, probed at 0.04–0.5% of readings).
+- A **large local clock correction** on the one host splits by sign, because
+  staleness is `age = now - mtime` (`git-commit-lock.sh:928, 1409`): a **forward**
+  jump (now leaps ahead) inflates the computed age, so a *live* lock can look
+  stale → premature steal; a **backward** jump (NTP steps back) shrinks the age,
+  so a genuinely *stale* lock can look fresh → delayed recovery. The
+  forward/premature-steal case is the only worrying one — and it degrades into the
+  *already handled* B5 lane: a premature steal of a still-live hold is detected at
+  release as 98 (given cooperative unwind), never a silent double-commit. So even
+  a local clock jump is **correctness-safe, liveness-degraded** — Tier 2.
+- **Cross-host** use over a shared FS (already E1-out-of-scope) is where skew
+  would actually bite: host A's mtime compared against host B's `now` with
+  minutes of skew could steal live locks wholesale. But this only arises *on a
+  network FS*, which is already excluded.
+- **Timezone** is a non-factor: all arithmetic is in epoch seconds
+  (`git-commit-lock.sh:439-449`, `git-commit-lock.ps1:448-451`), never local
+  time.
+
+*Tier 3 for cross-host (rides on E1); Tier 2 for a local NTP jump.* Untested — and
+no code change is warranted (see below). **Documented:** the design doc now states
+explicitly that the tool assumes a single time source — single-host use (the common
+case) or a shared FS with a single server clock — and that this is *why*
+network/multi-host is out of scope (`docs/git-commit-lock.md`, "One time source"). It
+also records the reassuring part: a *local* clock jump is correctness-safe — a
+forward jump can prematurely steal a still-live lock, but that degrades to the
+detected exit-98 lane, never a silent double-commit. A doc matter, not a code gap.
+
+**E3 — mtime probe fails entirely (the staleness clock is unreadable).** Distinct
+from a *wrong* clock (E2): here the lock file's mtime cannot be read at all. Both
+ports retry three times on a *present* file, then warn loudly once per process —
+bash via `stat -c %Y` / `stat -f %m` / `date -r` (`git-commit-lock.sh:629-645`),
+pwsh via `Get-Item.LastWriteTimeUtc` (`git-commit-lock.ps1:531-560`): *"Staleness
+detection is BROKEN: stale locks will never be stolen, so a crashed holder wedges
+waiters until MAX_WAIT."* The stale check then treats an unreadable mtime as **not
+stale** — the floor guard `[ "$mt" -gt 946684800 ]` fails closed to "fresh"
+(`git-commit-lock.sh:925-927`). **Safety is preserved**: the tool never steals a
+lock whose age it cannot establish, so no premature steal and no corruption — but
+**recovery of a genuinely crashed holder is disabled**, and waiters block to
+MAX_WAIT (97). *Tier 2 (safety held, recovery lost — and loudly announced).*
+Tested: unit Test 42 shadows the inner mtime probe to return empty on a present,
+stale ghost and asserts the fail-safe lane — the "Staleness detection is BROKEN"
+warn-once fires, the ghost is NOT stolen (left in place), and the waiter blocks to
+MAX_WAIT → 97. **Recommend: accept; documented (§E3, `guarantees.md` BE-3)** — it is a
+host/FS-health failure the tool already detects and announces, and it fails *safe*
+(no false steal); the loud warning is the right behavior. This is also the clean
+reason recovery is a *Tier-1-within-envelope* property, not unconditional (see the
+tier split under §1): it presumes a readable clock.
+
+### F. Resource exhaustion
+
+**F1 — Disk full (ENOSPC) during a claim/lock create or write.** The create is
+one open+write+close in a subshell; if the write fails (ENOSPC), the subshell
+fails and the acquirer falls through to wait (`git-commit-lock.sh:1336-1361`,
+comment at `:1341-1343`). A created-but-write-failed file is an empty orphan that
+ages into the steal lane. A torn write *shorter than `tok.`* (e.g. `to`) is the
+accepted residual at `:299-304`: non-empty, non-prefixed → never stolen, loud,
+fixed by one manual `rm`. *Tier 2 (degrades to wait/97) / Tier 3 (the torn-write
+manual-fix residual).* **Tested** (per §4 item 5): unit Test 50 mounts a small 64k
+tmpfs, fills it to ENOSPC, and asserts the waiter times out at 97 with the wrapped
+command never running — no corruption, no false hold. ENOSPC injection needs a
+genuinely full FS, which requires root to mount a tmpfs (`ulimit -f` is no
+substitute: it raises SIGXFSZ — the wrong lane), so the test runs
+on **Linux with passwordless sudo** (the Linux CI leg) and skips-with-note elsewhere.
+ENOSPC is a host-health failure; the tool degrades safely (no corruption, no false
+hold) and the one sharp edge (sub-`tok.` torn write needing manual `rm`) is already
+documented.
+
+**F2 — ENOSPC during a LOG write.** All log writes end in `|| true`
+(`git-commit-lock.sh:561`); a failed log write is silently lost. *Tier 2.*
+**Tested** (per §4 item 5): unit Test 49 points `AGENT_LOCK_LOG` at a path *under a
+regular file*, so every open/append fails ENOTDIR, and asserts the lock still
+acquires + releases cleanly (rc 0), the wrapped command runs, the lock is cleaned
+up, and no log file appears — i.e. the failing log write is swallowed and the lock
+is unaffected. This is a portable injection (no chmod/perms), and it **also covers
+J1**. Logging is best-effort by explicit design (it must never block or fail the
+lock); the only downside is reduced post-mortem signal under disk pressure.
+
+**F3 — Inode / FD exhaustion.** Same shape as F1: a create that can't get an
+inode fails → wait → eventually 97. The tool holds at most a couple of FDs
+briefly. *Tier 2.* **Document-only — no deterministic portable injection.** A
+`ulimit -n` FD cap can't be driven deterministically here: the create needs only
+~1 FD, so an FD-exhaustion test would have to pin the process at *exactly* the
+limit across a poll loop without starving the harness itself — not portable or
+stable. Inode exhaustion needs a full FS the way F1 does (and F1/Test 50 already
+exercises the create-fails-→-wait-→-97 lane that F3 shares). So F3 is recorded as
+a reasoned-but-untested boundary rather than given a flaky test; the safe-degrade
+behaviour is the same as F1, which is tested.
+
+**F4 — Read-only / unwritable lock dir or parent.** `lock_acquire` does a
+best-effort `mkdir -p "$(dirname …)"` (`git-commit-lock.sh:1278`); if the dir is
+unwritable the create fails every poll and the waiter times out at 97. No
+corruption, no false hold. A *release* unlink blocked by an unwritable parent
+routes to the LEFTOVER lane (`:1699-1711`). *Tier 2.* **Tested** (per §4 item 5 — the
+highest-value one): unit Test 48 `chmod 0555`s the lock-dir parent and asserts the
+waiter times out at 97, the wrapped command never runs, no lock file is created,
+and the WAITING/TIMEOUT lines are logged — no corruption, no false hold. POSIX-only
+(`chmod 0555` is a no-op for writes on Git-Bash/NTFS, so it skips-with-note on
+Windows; the Linux/macOS CI legs exercise it). A correct, if blunt, outcome (97); an
+*earlier, clearer* error would be nicer but is optional polish, low priority.
+
+**F5 — Memory exhaustion.** The scripts allocate trivially (a few shell vars; the
+leaked-token list is "almost always empty"). Not a meaningful failure surface.
+*Tier 3 / non-issue.* **Recommend: no action.**
+
+### G. Misconfiguration
+
+**G1 — Lock path is a directory / `$HOME` / a real file.** Covered by D3/D4:
+never stolen or deleted, loud one-time warning, waiters reach 97
+(`U:818-840`). *Tier 1.* The security note (`docs/git-commit-lock.md:530-541`)
+bounds the worst case even for a *hostile* repo redirecting the git dir: the tool
+only ever creates its own small set of files at its own names and never deletes
+recursively. **Recommend: in scope, keep.**
+
+**G2 — Garbage numeric config.** Each knob is validated at source time; invalid
+values fall back to default with a stderr note (`git-commit-lock.sh:481-500`).
+The ps1 port *tightens* .NET's permissive parser to bash's grammar so the same
+env var configures the same value on both impls — e.g. rejecting `"1e3"`,
+trailing newlines, whitespace (`git-commit-lock.ps1:327-359`). *Tier 1.* Tested:
+unit Test 13, interop Test 12 (cross-impl parity, including `1e3`/`+2`/`'   '`/
+trailing-newline) (`U:695-703`, `I:554-608`). **Recommend: in scope, keep.**
+
+**G3 — `run` outside a git repo, no `AGENT_LOCK_PATH`.** Refused with 96 — a
+CWD-scoped lock would serialize against nobody (`git-commit-lock.sh:1768-1773`).
+Sourcing keeps a CWD fallback with a stderr warning and creates no files
+(`:570-572`; unit Test 14/14b). *Tier 1.* **Recommend: in scope, keep.**
+
+**G4 — `MAX_WAIT ≤ STALE + CLAIM_STALE`.** A startup warning, gated on MAX_WAIT
+being left at its default (a caller who set it chose the relationship). The
+relation is the stacked worst-case recovery: a crashed holder *plus* a crashed
+claimant (`git-commit-lock.sh:502-514`). *Tier 2 (advisory).* Tested: Test 8
+exercises the gate and the stacking (`U:497-522`). **Recommend: in scope,
+keep.**
+
+### H. Signals, interrupts, cleanup-on-exit
+
+**H1/H2 — bash INT/TERM/EXIT.** Handlers armed at acquire start; on a held lock
+they release and re-raise the signal (wrapper dies 143, what a watchdog needs);
+they restore the caller's pre-acquire traps exactly
+(`git-commit-lock.sh:1037-1054, 1002-1023, 780-784`). *Tier 1.* Tested: Test 11 (TERM mid-hold → 143,
+released), Test 12c (exit-while-holding chains the caller's EXIT trap), Test 12d/e
+(trap restoration), Test 34 (TERM on a *steal*-acquired hold behaves identically
+— all acquisition paths funnel through one hold helper) (`U:577-600, 633-693,
+1989-2011`). One documented caveat: a SIGINT delivered to the `run` wrapper alone
+while its foreground child survives is discarded by bash before any trap
+(`git-commit-lock.sh:1030-1036`) — a real Ctrl+C hits the whole group and does
+take the path. **Recommend: in scope, keep.**
+
+**H3 — ps1 process death.** PowerShell has no `trap SIGTERM`. The port substitutes
+(a) `try/finally` inside `Lock-Acquire`, which runs on Ctrl+C/pipeline-stop/
+terminating errors and does the claim-window cleanup + discovery read
+(`git-commit-lock.ps1:1378, 1672-1683, 1240-1295`); and (b) a `PowerShell.Exiting`
+engine-event backstop for a *held* lock (`:704, 1303-1324`). **Documented limit:**
+`PowerShell.Exiting` fires under `-Command` and interactively but **NOT under
+`-File`**, and not on hard kill / `[Environment]::Exit()`
+(`git-commit-lock.ps1:241-245, 1298-1302`). So a held lock abandoned by a
+forgetful dot-source `-File` caller relies on the stale window, not the backstop.
+The **`run` contract path is unaffected** — it pairs Acquire/Release in
+try/finally (`:1928-1979`). *Tier 2 (for the dot-source `-File` gap).* The happy
+path and trap-time claim cleanup are tested (interop Test 16e); the `-File`
+non-firing is documented, not test-pinned. **Recommend: accept the `-File`
+backstop gap as documented** — the stale window recovers it, and the supported
+`run`/try-finally paths are covered. If you want to close it, the documented
+option is handle-based ops (`git-commit-lock.ps1:146-151`), a larger change not
+worth it for a forgetful-caller edge.
+
+**H4 — Process termination/replacement *without wrapper unwind* (the no-silent-loss
+boundary).** §1's safety guarantee — a displaced holder reports 98 rather than a
+false success — relies on the wrapper *reaching its release path*. The bypass class
+is any termination or replacement of the holding process that skips that unwind;
+crucially it is **not** triggered by a normal `exit`. The instances:
+- **External SIGKILL** — untrappable; no handler runs in either port.
+- **bash `exec` that replaces the lock-holding shell** — `run` executes `"$@"`
+  *in the wrapper shell itself* (`git-commit-lock.sh:1733`), so the bypass needs the
+  exec to run in *that* shell: the wrapped command *is* an exec (`run -- exec …`),
+  or a **sourced** caller does `lock_acquire; exec …` in its own shell. Then the
+  exec replaces that shell's process image and *neither* the trailing `lock_release`
+  *nor* the `EXIT` trap (`git-commit-lock.sh:1002-1013`, armed at `:1308`) runs. An
+  exec **nested in a child** — the ordinary `run -- bash -c 'exec …'` — does **not**
+  bypass (the child is replaced; the wrapper waits and releases normally). *Verified
+  empirically 2026-06-17.*
+- **PowerShell `[Environment]::Exit(n)`** — a CLR hard-exit that bypasses
+  `Lock-Release`, the `finally`, *and* the `PowerShell.Exiting` backstop
+  (`git-commit-lock.ps1:221-245`).
+
+The useful contrast: a **plain `exit` is safe** — bash `exit` fires the EXIT trap
+(which releases), and a plain `exit` inside the pwsh `run` body unwinds its
+`finally` (`git-commit-lock.ps1:1928-1979`). Only *non-unwinding* termination or
+replacement escapes. If such a process was *already displaced* (its lease stolen
+past STALE) and exits **0**, its caller sees success with no 98 — the one
+interleaving that defeats "no silent lost update." What keeps it narrow: an external
+SIGKILL yields a non-zero wait status (`128+9`), so a caller checking exit codes does
+*not* see success; the leak needs a command that *deliberately* replaces or
+hard-exits the process **and** returns 0 **while displaced**. The *next* holder
+still recovers via staleness; only the abruptly-exiting one is unwarned. *Tier 2 —
+the residual edge of the fail-open lease.* Exercised indirectly: interop Test 5
+*uses* `[Environment]::Exit()` to fabricate a no-release orphan, confirming the
+bypass (`I:308-334`). **Recommend: accept; documented as the explicit boundary of the
+no-silent-loss guarantee** (`guarantees.md` OOS-5 / G-S1), alongside the "commits must be fast" golden rule — a
+command that replaces/hard-exits the process mid-critical-section *after being
+displaced* is exactly the fail-open case the STALE budget exists to make rare. No
+code change closes it without the handle-based ops the design rejected (§H3).
+
+### I. Cross-implementation
+
+**I1 — Wire/format compatibility.** One on-disk format (token line 1, owner line
+2, `tok.` prefix as wire contract), one read-retry schedule (8 attempts,
+20/40/80/160/320/320/320 ms — verified byte-identical between
+`git-commit-lock.sh:670` and `git-commit-lock.ps1:597-629`), one set of release
+verdicts, one config grammar. *Tier 1.* The interop suite is built to break this:
+mixed bash+pwsh exclusion (T1/T6), each side steals the other's genuine stale
+lock (T4/T5), robbed-holder 98 both directions (T8), release-classification
+agreement (T11), cross-impl claim staleness clearing (T16c), and a Windows
+PowerShell 5.1 smoke lane (T17). **Recommend: in scope, keep — and keep the
+interop suite as the guard.** Two independent implementations hammering one lock
+is "cheap adversarial verification of the protocol" (`README.md:94`).
+
+**I2 — Mixed-version tree.** Prevention (the claim protocol) holds only when
+*all* parties run it; older releases stole with an unserialized move-aside, so a
+mixed tree degrades prevention to detection (98) and can leave `.dead.*` litter
+current versions don't clean (residual 4, `git-commit-lock.sh:261-265`). *Tier
+3.* Untested (would require shipping an old version into the suite). **Recommend:
+out of scope; keep the "upgrade both implementations together" deployment note**
+— in the design doc (`docs/git-commit-lock.md:251-255`) and now also surfaced in
+`README.md:101-106`, where operators actually look. Acceptable
+because the degraded mode is still *detected* (98), never silent.
+
+### J. Logging subsystem failure
+
+**J1.** Every log write is `|| true`; the log self-truncates past ~1 MB rather
+than rotating (`git-commit-lock.sh:554-562`). A broken log never blocks or fails
+the lock. Under a redirected git dir, log *content* (the owner line) is
+attacker-influenceable — one-line text spoofing, no execution; the tool itself
+writes only its token, owner line, and protocol events, never secrets
+(`docs/git-commit-lock.md:543-551`). *Tier 2.* **Tested — covered by the F2
+log-failure test (per §4 item 5): unit Test 49** proves a failing log path leaves the
+lock fully working. Logging is best-effort by design, which is the right call for a
+lock that must keep working when the disk is full or the log path is bad. The
+follow-on (unchanged): don't build automation that *trusts* log text from an
+untrusted repo (already documented).
+
+### K. Behavior under extreme load / scheduling pressure, and internal time budgets
+
+**This is the most important analytical section** — it separates "must hold under
+any load" from "holds within an envelope," and tells the owner which apparent
+flakes are real gaps vs harness concerns.
+
+**The clean split: correctness is load-independent; liveness/latency is not.**
+
+- **Load-independent (Tier 1 *safety*, must always hold):** no silent lost update
+  (given cooperative unwind, §1/§H4), no corruption, and strict mutual exclusion
+  *within the staleness window*. These rest on O_EXCL create + atomic rename +
+  per-attempt-token discovery — *structural* properties that do not reference the
+  clock for their *correctness*. (Recovery of lock-shaped orphans is also
+  load-independent in *correctness* — only its latency degrades — but it presumes
+  a readable clock, §E3, and does not extend to foreign objects, per the tier
+  split under §1.) The mtime floor (`:925`) and the read-retry ladder
+  (`:668-684`) exist precisely so that the
+  one timing-sensitive input (mtime, and transient empty reads) cannot corrupt a
+  correctness decision: a sub-floor or unsettled reading is treated as "wait,"
+  never "steal." A 25-worker round can go 3s → 41s under load
+  and *still* lose no update.
+
+- **Load-dependent (Tier 2, best-effort in an envelope):** every wall-clock bound.
+  - **Recovery latency** ≈ STALE (+ CLAIM_STALE if a claimant also crashed) +
+    poll cadence. Under CPU oversubscription or a slow FS, polls stretch, so
+    recovery takes longer — but still completes.
+  - **`MAX_WAIT` timeout (97):** a waiter on a genuinely squatted/blocked lock
+    gives up at MAX_WAIT. Under load the *real* time to MAX_WAIT stretches with
+    poll cadence; the guarantee is "bounded by MAX_WAIT polls," not "exactly
+    MAX_WAIT seconds." Interop Test 14b explicitly checks that a blocked steal
+    **never busy-spins past MAX_WAIT** and logs in a damped, bounded way
+    (`I:746-817`) — a real correctness-adjacent property (no busy-spin), with a
+    timing-dependent upper bound on the STALE-line count (`[1,8]`).
+  - **The read-retry ladder (~1.26s budget):** sized to ride out a sub-second
+    transient (AV scanner handle, probe-F create→write gap). Under pathological
+    load a transient *longer* than ~1.26s would surface as the unverifiable-2 /
+    run-1 verdict (a detected, non-corrupting outcome), not a wrong hold. Test
+    16c pins that a 0.4s transient is ridden out (`U:784-817`).
+
+**Internal time budgets, enumerated** (all tunable via `AGENT_LOCK_*`):
+
+| Budget | Default | Role | Load sensitivity |
+|---|---|---|---|
+| `STALE_SECS` | 300s | steal threshold (the lease length) | the fail-open ceiling; raise for slow holds |
+| `CLAIM_STALE_SECS` | 60s | crashed-claimant ageout | delays only steals |
+| `POLL_SECS` | 2s | poll interval | cadence stretches under load |
+| `MAX_WAIT` | 420s | total wait cap → 97 | real wall-clock stretches with cadence |
+| read-retry ladder | ~1.26s | ride out transient empty reads | a longer transient → detected-2, not wrong hold |
+| mtime floor | 2000-01-01 | reject FILETIME-zero | static, not load-sensitive |
+
+**Judgments on the load-sensitive behaviors — gap, degradation, or harness
+concern:**
+
+1. **Protocol correctness under load — (c) non-issue / already guaranteed.**
+   The stress branch wraps every suite in artificial CPU+disk load
+   (`tests/with-load.sh`) specifically to widen timing windows and surface
+   *latency/race flakes*, and the protocol assertions (exclusion, one-steal,
+   zero-98) are written to hold regardless. **Recommend: nothing to harden.**
+
+2. **Wall-clock test *bounds* under extreme load — (b) acceptable degradation;
+   fix the TEST, not the code.** Three examples surfaced by the prior stress
+   effort (which I verified independently against the code, rather than simply
+   adopting them):
+   - *Test 21's `≤20s` recovery-latency assertion* (`U:1144`);
+   - *Test 22(a)'s claim-path warning* — the warning relies on the
+     two-consecutive-poll confirmation (the mechanism Test 17d pins for the lock
+     path) having poll *headroom* before MAX_WAIT, which an oversubscribed runner
+     can starve (`U:1156-1172`); the test asserts the warning fires, not a specific
+     poll count; and
+   - *Test 29's `≥2 CLAIM lines` discriminator* (explicitly given `MAX_WAIT=6`
+     headroom, `U:1514-1518`).
+
+   Each asserts a wall-clock or poll-count bound that an oversubscribed runner
+   (e.g. 8 hogs on 2 cores) can blow *without any protocol defect* — the
+   protocol still recovers/warns correctly, just slower. **Recommend: where these
+   flake only under extreme artificial load, relax the bound or scope the stress
+   level for that test; do NOT change product code.** The correctness assertions
+   in the same tests must stay strict.
+
+3. **Test-*harness* race setup under load — (c) harness concern, already
+   mitigated.** Tests 2b/16/16b carry heavy sync scaffolding (`sync_waiting_fresh`,
+   token-guarded `backdate_ghost`, bounded discard-and-retry, `U:70-151`) because
+   a fast waiter can complete an entire steal before the harness finishes setting
+   up the race. This is purely about *constructing* the scenario deterministically;
+   the protocol is fine. **Recommend: keep the scaffolding; it is the right fix.**
+
+4. **No-busy-spin under a permanently blocked lock — (a) a real property, and
+   it's guarded.** A failed-steal lane that `continue`d past the timeout+sleep
+   would busy-spin and never reach 97 — a genuine bug class. Interop Test 14b is
+   the regression guard (`I:746-817`). **Recommend: keep that test; treat any
+   regression here as Tier 1.**
+
+**Net K — the envelope, now adopted.** The explicit envelope — *"correctness holds
+under any load; wall-clock recovery/timeout latency scales with poll cadence and
+scheduling, bounded by the configured knobs"* — is stated in the design doc
+(`git-commit-lock.md`, "operating envelope") and in `load-testing-strategy.md` §1.
+The suite's wall-clock assertions are scoped to a load level via the envelope tier
+(`GCL_ENVELOPE_TIER` strict/relax, `ok_envelope`/`bad_envelope`): an oversubscribed
+runner's latency miss warns rather than reds, while the correctness asserts stay
+strict. So the stress branch's extreme `both/8-hog` mode is a flake-hunting tool,
+not a contract the product must meet on a 2-core runner — which structurally ends
+the chasing of "flakes" that are really a test asserting a Tier-1 bound on a
+Tier-2 quantity.
+
+---
+
+## 4. Open questions / recommended scope decisions
+
+Ordered by how much they need an explicit owner decision.
+
+**Status (Ben, 2026-06-17): reviewed and accepted — with two changes marked below.**
+Item 3 (network FS) is **document-only**: do not build the FS-type probe. Item 5 is
+**overridden** — the untested-but-robust lanes *will* get test coverage (actually-tested
+edge cases make the tool more maintainable and give future users confidence), rather than
+"accept untested". Every other recommendation is accepted as written.
+
+1. **The load/timing envelope (§K) — highest value.**
+   *Recommendation:* state in `docs/git-commit-lock.md` that correctness
+   (exclusion, no silent loss, eventual recovery) is load-independent, while all
+   wall-clock bounds (recovery latency, MAX_WAIT, the read ladder) are
+   best-effort and scale with scheduling. Then **scope the suite's wall-clock
+   assertions to a defined load level** so extreme-stress flakes (Test 21's 20s,
+   Test 22a's warning timing, Test 29's poll count) are recognised as Tier-2
+   envelope misses, not product regressions. *This resolves the recurring
+   "flake" question structurally.* Cost: doc + a test-bound audit; no product
+   change.
+   *Status (done):* the envelope is stated in `docs/git-commit-lock.md` ("operating
+   envelope" — correctness load-independent, wall-clock bounds best-effort) and
+   `docs/load-testing-strategy.md` §1, and the suite's wall-clock assertions are
+   scoped to a load level via the envelope tier (`GCL_ENVELOPE_TIER`).
+
+2. **Multi-host / clock-skew assumption (§E2) — a doc matter, not a
+   code gap.** The tool implicitly assumes a single time source; a *local* NTP
+   jump is correctness-safe (degrades to the detected-98 lane), and cross-host
+   skew only bites on a network FS that's already out of scope. *Recommendation:*
+   add one explicit sentence — "assumes a single clock, i.e. single-host (the
+   common case) or a shared FS with one server clock" — and the reassurance that
+   a local clock jump cannot cause a silent double-commit. No code change.
+   *Status (done):* the single-clock sentence + local-jump reassurance are in
+   `docs/git-commit-lock.md` ("One time source").
+
+3. **Network/shared FS is out of scope but fails *silently* if entered (§E1).**
+   The boundary is correctly stated in the design doc but only there.
+   *Decision (Ben — document-only):* surface the boundary in `README.md` (where
+   operators look), since the failure on a bad FS is silent loss of exclusion. Do
+   **not** attempt to *support* network FS, and **do not build** the optional
+   FS-type startup probe — just document. (It would be cross-platform-awkward and
+   incomplete anyway; Ben: "don't do the polish, just document.")
+   *Status (done):* the network/sync-FS boundary is stated in `README.md` (the
+   "local filesystems only" note); the FS-type probe was deliberately not built.
+
+4. **ps1-on-POSIX FIFO/device residual (§D3) and ps1 `-File` exit backstop gap
+   (§H3) — accept as documented.** Both are real but confined to an unsupported
+   config (ps1-on-POSIX) or a forgetful-caller edge that the stale window
+   recovers. *Recommendation:* no code change; confirm they stay documented.
+   Reconsider only if PowerShell-on-POSIX ever becomes supported (it isn't,
+   `README.md:91-95`).
+
+5. **Untested-but-robust-by-code lanes (resource exhaustion F1/F3/F4, log-write
+   failure F2/J1).** These degrade safely (wait/97, or silent best-effort log
+   loss) but had **no fault-injection tests** — they were reasoned-correct, not
+   verified. *Decision (Ben — overrides the prior "accept untested"):* **add test
+   coverage** for these lanes. Rationale: actually-tested edge cases make the
+   project easier to maintain and give future users confidence, versus
+   "reasoned-correct but untested." Add deterministic fault-injection tests where
+   feasible — **unwritable lock dir → clean 97** (F4, cheapest/highest-value and
+   the most likely real-world misconfig); an **unwritable log path → the lock
+   still works, the log write is swallowed** (F2/J1); and the **ENOSPC / inode /
+   FD-exhaustion** lanes (F1/F3) where they can be injected deterministically and
+   portably (e.g. a small dedicated tmpfs or quota for ENOSPC, `ulimit -n` for
+   FDs). Flag in the plan any lane that proves genuinely impractical to fault-inject
+   portably, rather than forcing a flaky test.
+
+   *Status (done):* coverage added — **F4** unit Test 48 (POSIX `chmod 0555`,
+   skip-with-note on Windows), **F2/J1** unit Test 49 (portable failing-log path via
+   ENOTDIR), **F1** unit Test 50 (Linux + passwordless-sudo 64k tmpfs filled to
+   ENOSPC; skip-with-note elsewhere). **F3** (inode/FD exhaustion) proved impractical
+   to fault-inject deterministically and portably — the create needs only ~1 FD, so a
+   `ulimit -n` cap can't be driven deterministically across a poll loop without
+   starving the harness, and inode exhaustion needs a full FS the way F1 does (F1/Test
+   50 already exercises the shared create-fails-→-wait-→-97 lane). Per the "flag any
+   impractical lane" instruction above, F3 stays **document-only**, not a flaky test.
+
+6. **Mixed-version tree (§I2) and case-insensitive FS (§D5) — out of scope,
+   confirm.** The first degrades to detection (98), never silent, and is covered
+   by the "upgrade both together" note. The second is a non-issue. *Recommendation:*
+   leave both out of scope; optionally one sentence each in the design doc.
+
+### Things explicitly NOT to do (the design already considered and rejected them)
+
+- **A background heartbeat** to refresh the lease — would make the tool more than
+  a single synchronous script; the fail-open-but-detectable lease is the
+  deliberate alternative (`git-commit-lock.sh:217-218`).
+- **A two-rename compare-and-swap** to prevent residual 5 (B3) — reintroduces
+  crash litter + a sweep, for a failure that is already bounded and
+  false-success-free (`git-commit-lock.sh:276-282`).
+- **`File.Replace` in the ps1 port** — pinned out by interop Test 16d for good
+  reasons (read-only-dest throw, partial-failure states).
+- **Trying to support network/shared filesystems** — the protocol's correctness
+  rests on local-FS atomic create/rename; this is a boundary to *document*, not
+  to engineer around.
diff --git a/docs/git-commit-lock.md b/docs/git-commit-lock.md
index 828cfc4..c8dc29b 100644
--- a/docs/git-commit-lock.md
+++ b/docs/git-commit-lock.md
@@ -292,6 +292,22 @@ settles in milliseconds. The
 same floor governs the claim file's ageout: a sub-floor claim mtime reads as
 "just created", never "ancient — clear".
 
+**The operating envelope — correctness is load-independent; latency is not.**
+Exclusion, no-silent-loss, and eventual recovery rest on atomic create/rename
+plus per-attempt tokens, and hold under any load. The wall-clock bounds —
+recovery latency (≈ `STALE_SECS` + poll cadence), the `MAX_WAIT` timeout, and the
+~1.26 s read-retry ladder — are best-effort and scale with scheduling: under CPU
+oversubscription or a slow filesystem they stretch, but the protocol still
+recovers and never loses an update. (For the precise guarantee/scope split, see
+[`guarantees.md`](guarantees.md).)
+
+**One time source.** The tool assumes a single clock — single-host use (the
+common case: all contenders share one checkout, hence one machine and one clock),
+or a shared filesystem with one server clock. A local clock jump is
+correctness-safe: a forward jump can make a live lock look stale and be
+prematurely stolen, but that degrades to the detected exit-98 lane (the robbed
+holder's release fails loudly), never a silent double-commit.
+
 ## The PowerShell port (`git-commit-lock.ps1`)
 
 Some agents (Codex on Windows, for example) run their commands in
@@ -561,6 +577,7 @@ unavailable):
 | `git-commit-lock.sh`                  | the mutex (bash; the authoritative implementation): source for `lock_acquire/lock_release/lock_run`, or `git-commit-lock.sh run -- <cmd>` |
 | `git-commit-lock.ps1`                 | wire-compatible PowerShell port (see [The PowerShell port](#the-powershell-port-git-commit-lockps1) above): `git-commit-lock.ps1 run "<pwsh cmd>"`, or dot-source for `Lock-Acquire`/`Lock-Release` |
 | `tests/git-commit-lock.test.sh`             | self-contained bash tests (throwaway temp dirs); exit 0 == all pass |
+| `tests/git-commit-lock.canary.test.sh`      | bash concurrency canary: mutual exclusion under many concurrent workers over repeated rounds — the statistical full-fan-out scenario (throwaway temp dirs) |
 | `tests/git-commit-lock.interop.test.sh`     | cross-impl tests: pwsh + bash workers share one lock and serialise; run from MINGW/Git-Bash |
 | `tests/git-commit-lock.integration.test.sh` | end-to-end: many concurrent workers make real commits into one shared repo; the history is audited for the tool's guarantees |
 
@@ -571,20 +588,25 @@ Run the suites from a clone of this repository (they are not installed to
 
 ```sh
 bash tests/git-commit-lock.test.sh             # bash implementation
+bash tests/git-commit-lock.canary.test.sh      # bash concurrency canary (mutual exclusion under many concurrent workers)
 bash tests/git-commit-lock.interop.test.sh     # bash + PowerShell interop (skips if pwsh is absent)
 bash tests/git-commit-lock.integration.test.sh # end-to-end: concurrent real commits into one repo (pwsh half skips if absent)
 ```
 
 Each suite prints a result summary line and exits 0 when everything passes.
-All three use throwaway temp dirs and never touch the repo you launch them
+All four use throwaway temp dirs and never touch the repo you launch them
 from. The heavy fan-out tests run at a REDUCED width by default, so a routine
 run doesn't lag a shared development machine; each suite prints a
 `fan-out mode:` line at the start and tags its result line with the mode, so
 check those say `FULL` when you ran `GCL_TEST_FULL=1` for the full-strength
 canary (CI does).
 
-`tests/git-commit-lock.test.sh` covers the bash implementation: mutual exclusion
-under many concurrent workers (clean acquire/release path), stale-lock theft,
+`tests/git-commit-lock.canary.test.sh` is the concurrency canary: mutual
+exclusion under many concurrent workers (clean acquire/release path) over
+repeated rounds — the statistical scenario that needs the full 8×25 fan-out
+(`GCL_TEST_FULL=1`, which CI runs) to be trusted to surface a rare exclusion race.
+
+`tests/git-commit-lock.test.sh` covers the bash implementation: stale-lock theft,
 crash recovery under contention (several waiters racing one dead lock —
 claim-serialized: exactly one steal, zero displacements, zero spurious 98s,
 and no move-aside file ever created), claim contention (many concurrent
@@ -648,7 +670,7 @@ is audited for the guarantees this document claims — every commit lands,
 history stays linear, no commit sweeps up another worker's file, no
 `index.lock` races, no stolen leases, and a clean tree at the end.
 
-The same three suites run in CI on Linux, macOS, and Windows
+The same four suites run in CI on Linux, macOS, and Windows
 (`.github/workflows/tests.yml`), at full fan-out strength, alongside a
 shellcheck + PSScriptAnalyzer lint job. The POSIX legs exercise the
 PowerShell implementation purely as cross-implementation protocol
@@ -664,9 +686,10 @@ heavy process fan-out is environmental, not a lock failure — but only the
 interop suite's exclusion test tolerates it (scoring by violations/steals,
 with a minimum-acquired floor so a collapsed fan-out cannot pass vacuously);
 the integration suite is deliberately strict per worker (every worker must
-launch and commit), and the unit suite's counts are exact.
+launch and commit), and the unit and canary suites' counts are exact (the
+canary requires every worker to acquire and release in each round).
 
-For debugging, all three suites copy their logs and work dirs to
+For debugging, all four suites copy their logs and work dirs to
 `$GCL_TEST_PRESERVE_DIR` when it is set, and keep the work dir on disk on any
 failure.
 
diff --git a/docs/guarantees.md b/docs/guarantees.md
new file mode 100644
index 0000000..d27aab0
--- /dev/null
+++ b/docs/guarantees.md
@@ -0,0 +1,423 @@
+# git-commit-lock: guarantees and scope (the normative contract)
+
+**Status: normative.** This document states *what the tool guarantees*, *under
+what conditions* (the operating envelope), and *what is explicitly out of
+scope*. It is the contract a user or a CI gate can point at: a behavior listed
+under [Guarantees](#2-guarantees) is a property the code must uphold and the
+tests defend; a behavior under [Out of scope](#5-out-of-scope-not-guaranteed) is
+one the tool deliberately does not promise.
+
+**How this relates to the other two docs.** This is the *contract*;
+[`failure-modes.md`](failure-modes.md) is the *analysis* behind it (per-mode
+current behavior, tier classification, and the scope decisions that produced
+this contract); [`git-commit-lock.md`](git-commit-lock.md) is the *design
+reference* (why the protocol is shaped this way and how it works). Where they
+appear to disagree, the **code and tests are authoritative**, then this contract,
+then the analysis, then the design narrative. Each guarantee below cites its
+witnessing test(s) and the failure-modes section that justifies it; the
+[Verification map](#7-verification-map) collects those pointers. (Test and
+`file:line` citations are **anchors, not exact addresses**: find a test by its
+name/number — the line numbers reflect the tree when written and drift as files
+move.)
+
+This contract makes **no new claims** about behavior — it is a re-statement of
+the decisions recorded in `failure-modes.md` §4 as commitments. It does not
+re-derive the protocol (see the design doc) or re-argue the tiers (see the
+analysis).
+
+---
+
+## 1. The operating envelope
+
+Every guarantee in §2 holds **within this envelope**. Outside it, the tool
+degrades as described in §4 (best-effort) or §5 (out of scope) — in most cases
+*detectably and without corruption*, but the strict guarantees are not promised.
+The envelope is not a disclaimer bolted on; it is the precise set of assumptions
+the filesystem-lease design rests on.
+
+**E1 — Single host, single time source.** All contenders share one working tree,
+hence one machine, hence one clock. Staleness is `age = now − mtime` arithmetic
+(`git-commit-lock.sh:928,1409`); it assumes the mtime and the comparing process's
+`now` come from the *same* clock. Single-host use satisfies this. A *local* clock
+jump remains correctness-safe (it degrades to the detected-98 lane, never a
+silent double-commit; see G-S1 and `failure-modes.md` §E2). Multi-host use over a
+shared FS does not satisfy it and is out of scope (§5, OOS-2).
+
+**E2 — Local filesystem with atomic create/rename and sane mtimes.** The protocol
+is built from three filesystem operations — atomic create-or-fail (`O_EXCL` /
+`FileMode.CreateNew`), atomic rename-over, and unlink — each atomic on local
+POSIX filesystems and NTFS (ext4, APFS, NTFS, and kin). (The one exception is the
+Windows PowerShell 5.1 steal, which lacks the atomic 3-arg move and uses a
+claim-guarded unlink-then-move — a fairness loss, never a clobber; see BE-5.)
+Network and sync-backed storage (NFS, SMB/CIFS, 9p, Dropbox/OneDrive) weaken
+exactly these operations and are out of scope (§5, OOS-1;
+`git-commit-lock.md:122-126`).
+
+**E3 — Cooperative wrapper unwind.** The theft-detection guarantee (G-S1) fires
+when the lock-holding shell *reaches its release path* — on normal return, on a
+handled INT/TERM, or on a plain `exit` (all of which unwind). It is **not**
+triggered by a termination or replacement that skips the unwind: an external
+SIGKILL, an `exec` that replaces the lock-holding shell itself, or PowerShell
+`[Environment]::Exit()`. (An `exec` nested in a child — the ordinary
+`run -- bash -c 'exec …'` — does *not* skip release.) See §5, OOS-5 for the
+precise boundary.
+
+**E4 — Commits fast relative to the staleness window (for *strict* exclusion).**
+The lease is fail-open: a hold older than `AGENT_LOCK_STALE_SECS` (default 300s)
+can be stolen mid-work. *Strict* mutual exclusion (G-S3) is therefore guaranteed
+only for holds that complete within the staleness window. A hold that overruns it
+is still *safe* — a displaced holder is detected (G-S1) — but two processes can
+briefly both believe they hold the lock. Keep commits well inside the window, or
+raise `AGENT_LOCK_STALE_SECS` for a deliberately slow hold (the golden rule,
+`git-commit-lock.md:433-458`).
+
+**E5 — Matching protocol version on all parties.** Prevention of the
+crash-recovery-under-contention race (G-S3's no-displacement property) holds only
+when every contender runs the claim protocol. A mixed-version tree degrades
+prevention to detection and is out of scope (§5, OOS-3).
+
+**E6 — Supported platforms.** `git-commit-lock.sh` (bash) is supported on Linux,
+macOS, and Windows under Git-for-Windows' MINGW bash. `git-commit-lock.ps1`
+(PowerShell) is supported on **Windows only**. Running the `.ps1` port on POSIX is
+a CI-only cross-implementation protocol check, not a supported configuration (§5,
+OOS-4; `README.md:91-95`).
+
+**E7 — Cooperating, non-hostile agents.** The lock is advisory: it serializes
+*cooperating* agents. It detects interference where it can (token checks; exit 98)
+but cannot prevent a process running as the same user from deleting or
+overwriting the lock file. The threat model is honest agents racing each other,
+not an actively hostile local process (§5, OOS-6;
+`git-commit-lock.md:520-528`).
+
+---
+
+## 2. Guarantees
+
+Each guarantee holds **within the envelope (§1)**. The defaults named are knobs
+(`AGENT_LOCK_*`); the guarantee is in terms of the configured value, not a fixed
+number of seconds.
+
+### 2A. Safety (unconditional within the envelope)
+
+These are correctness properties. If one can break inside the envelope, that is a
+bug.
+
+- **G-S1 — No silent lost update.** A holder whose lease is taken from it never
+  reports a serialized critical section that wasn't. On release, a **definitive**
+  theft (the lock file is gone, or carries a foreign token) returns **98** with a
+  loud WARNING rather than success (`git-commit-lock.sh:1607-1688`;
+  `git-commit-lock.ps1:1717-1837`); a state the release cannot disambiguate (the
+  file is present but reads **empty** after the retry ladder — possibly a successor
+  mid-create after a boundary steal) returns the distinct **unverifiable** code
+  (`lock_release` 2; `run` maps it to 1 when the command itself succeeded, else
+  keeps the command's code) — still **never** a silent success. *Condition:* the
+  wrapper unwinds cooperatively (E3). *Witness:* unit Test 4b (98 + WARNING), Test
+  16 (unverifiable lane), interop Test 8 (98 both directions) (`U:387-417`,
+  `I:460-492`). *Basis:* `failure-modes.md` §1, §B5.
+
+- **G-S2 — No corruption and no false hold.** An acquirer that cannot prove its
+  own token is at the lock path (after the read-back retry ladder) treats the lock
+  as **not** acquired and logs loudly; it never "repairs" a failed read-back by
+  rewriting the path (`git-commit-lock.sh:1352-1361`). Every path that cannot
+  establish a fact fails toward "wait", never toward "steal" or "hold". This
+  extends to resource-exhaustion lanes: a create that fails (ENOSPC, FD/inode
+  exhaustion, an unwritable lock dir) **never produces a false hold or corruption**
+  — it falls through to wait/97 (an empty orphan ages into the recovery lane). The
+  guarantee is *no false hold*, not a uniformly clean 97: a torn write shorter than
+  `tok.` is a non-lock-shaped residual, never stolen, that needs manual removal
+  (`failure-modes.md` §F1 — an accepted residual). *Witness:* the read-back-failure lanes —
+  create-path Test 32, steal-path Test 32b (`U:1760-1855`); resource lanes —
+  unwritable lock dir Test 48 (F4), ENOSPC Test 50 (F1, Linux+sudo; skip-with-note
+  elsewhere) (`failure-modes.md` §4 item 5); FD/inode exhaustion (F3) is document-only
+  (no portable injection). *Basis:* §1, §A1, §F.
+
+- **G-S3 — Strict mutual exclusion within the staleness window, with no
+  displacement during crash recovery.** Within `AGENT_LOCK_STALE_SECS` no steal
+  occurs at all, so at most one process holds the lock. When a holder dies and a
+  herd of waiters recovers the one stale lock, the **claim protocol** admits
+  exactly one stealer and the recovering waiter keeps the lock it recovered — a
+  straggler whose stale judgement predates the recovery cannot displace it
+  (`git-commit-lock.sh:1070-1218`). At most one process is ever the *legitimate*
+  holder. (On the supported Windows PowerShell 5.1 unlink-then-move lane the
+  recovering waiter can *lose* the recovered path to a rival's create in the
+  transient absent window — a fairness loss, never a clobber; see BE-5.)
+  *Condition:* holds complete within the window (E4); a stable clock (E1) — a local
+  clock jump preserves *no silent loss* (G-S1) but can break *strict exclusion* by
+  making a live lock look stale (a premature, but detected, steal); and matching
+  version (E5). *Witness:* the concurrency canary (mutual exclusion under many
+  concurrent workers, 8 rounds × 25 at FULL, `C:81-111`), unit Tests 2b/20
+  (claim-recovery and many-stealers), interop Tests 1/6/16/16b, integration suite
+  (`U:212-346,1095-1128`; `I:227-261,341-386,884-1088`). *Basis:*
+  §A1/§A2/§A3.
+
+- **G-S4 — Never destroys a non-lock-shaped object.** A directory, symlink, FIFO,
+  device, socket, or a regular file whose line 1 is neither empty nor `tok.`-
+  prefixed is **never** stolen or deleted, at either the lock path or the claim
+  path (`git-commit-lock.sh:1322-1327,1411-1444,1458-1487,1518-1570`). The
+  never-steal *safety* is unconditional; the *warning* is best-effort — it normally
+  fires once and names the object, but an **actively-rewritten** user file may never
+  age into the content guard and then times out at 97 *without* the warning
+  (`git-commit-lock.sh:308`). Deletion is
+  never recursive; the tool only ever removes its own named lock-protocol files.
+  *Two accepted residuals* bound this and are documented, not bugs: a stale
+  *empty* user file, and a stale file whose line 1 happens to start `tok.`, are
+  stolen (`git-commit-lock.sh:298-311`). *Witness:* unit Tests 17/17d/18/22
+  (dir/symlink/FIFO/content) and Test 44 (socket & device-node, bash; POSIX CI)
+  (`U:818-892,894-1032,1034-1076,1156-1262`). *Basis:* §D3/§D4/§G1. *Scoped
+  exception:* ps1-on-POSIX has no .NET type probe for FIFO/device/socket (§5,
+  OOS-4).
+
+- **G-S5 — Truthful exit codes.** The three reserved high codes from `run` are
+  exact: **96** = usage error (command **not** run), **97** = acquisition timed
+  out (command **not** run), **98** = lock stolen mid-hold (command **ran but was
+  not serialized** — redo it) (`git-commit-lock.sh:392-415`). A `run` exit of the
+  command's own code (including 0) means the command was serialized — *subject to
+  the one carve-out in OOS-5* (a non-unwinding exit returning 0 while displaced).
+  *Two stated assumptions* keep the high-code contract exact: the wrapped command
+  must not itself exit 96/97/98 (such an exit is indistinguishable from a tool
+  verdict, `git-commit-lock.sh:392`), and an **unverifiable** release maps a
+  *successful* command to **1** (G-S1), so 0 is never reported over an unverifiable
+  hold. *Witness:* Test 7 (96), Test 8 (97), Test 4b (98), Test 5 (propagation),
+  Test 16 (unverifiable→1), interop `run` verdict tests. *Basis:* §1, §H4.
+
+### 2B. Recovery (within the FS/clock/tooling envelope)
+
+These hold given a readable clock (E1) and lock-shaped state; latency is
+best-effort (§4).
+
+- **G-R1 — Lock-shaped orphans are reclaimed.** A crashed holder's stale lock, an
+  orphaned or empty claim, and an empty crash-orphan (a crash between create and
+  content write) all eventually become stealable and are recovered, bounded by
+  `STALE` (+ `CLAIM_STALE` if a claimant also crashed) plus poll cadence
+  (`git-commit-lock.sh:1408-1446,1228-1267`). This does **not** extend to *foreign*
+  objects (G-S4) — those wait for an operator. *Witness:* unit Tests 2/3/21
+  (`U:197-210,348-361,1130-1154`). *Basis:* §B1/§C1/§C2/§C3.
+
+- **G-R2 — One stuck agent cannot wedge the fleet.** Because the lock is a lease
+  and the claim is itself leased, a hung-but-alive holder or claimant is recovered
+  within its window; the fleet does not deadlock behind it. *Witness:* the stale-
+  steal and crashed-claimant lanes above. *Basis:* §1, `git-commit-lock.md:60-82`
+  (the explicit reason for a lease over a kernel lock).
+
+- **G-R3 — No busy-spin; bounded wait.** A waiter on a genuinely squatted or
+  delete-blocked lock gives up at `MAX_WAIT` and never busy-spins past it; the
+  failed-steal lane logs in a damped, bounded way (`I:746-817`). *Witness:* interop
+  Test 14b. *Basis:* §K(4).
+
+- **G-R4 — No process leaves an *unowned* lock behind.** Per-attempt tokens make
+  the ownership-discovery read conclusive, so no process inside an
+  acquire/hold/release arc can install a lock nobody owns and walk away: it either
+  discovers it holds, or the lock is recovered by staleness, and in no case is a
+  steal-installed lock mistaken for owned by the wrong process
+  (`git-commit-lock.sh:138-157` + the leaked-token memory). The one bounded
+  residual — an untrappably-killed claimant's claim installed as an unowned lock —
+  stalls waiters ≤ one stale window with **no false success** (accepted; §B3).
+  *Witness:* unit Tests 31/35/36 (`U:1549-1758,2013-2164`). *Basis:* §C4.
+
+### 2C. Interoperation
+
+- **G-I1 — bash and PowerShell take the same lock.** One on-disk wire format
+  (`tok.`-prefixed line 1, owner line 2), one read-retry ladder
+  (8 attempts, 20/40/80/160/320/320/320 ms — byte-identical between ports), one
+  set of release verdicts, one config grammar. A `.sh` holder and a `.ps1` holder
+  in one tree serialize against each other and steal each other's genuinely stale
+  locks. *Condition:* Windows for the supported ps1 config (E6). *Witness:* the
+  interop suite throughout (`I:*`). *Basis:* §I1.
+
+---
+
+## 3. Failure semantics (the shape of every degradation)
+
+When the tool cannot uphold a property it fails in one of these bounded,
+documented ways — **never** silently:
+
+- **Detect, don't pretend** — a displaced holder returns 98 + WARNING (G-S1).
+- **Wait, don't guess** — an unprovable state routes to poll/wait → 97, never to
+  a steal or a hold (G-S2).
+- **Refuse, don't destroy** — a non-lock-shaped object is left in place (and
+  normally warned about — the warning is best-effort, see G-S4); waiters reach 97.
+- **Announce, don't hide** — a broken staleness clock (unreadable mtime) warns
+  loudly once and disables stealing (fails safe; §4, BE-3).
+
+**Within the operating envelope**, the only place a *correctness* degradation can
+be silent — a non-unwinding exit returning 0 while displaced — is carved out
+explicitly in OOS-5. Two silences fall *outside* that scope and are disclosed
+separately: a degradation **outside** the envelope (a network/sync FS silently
+losing exclusion, OOS-1), and a **non-correctness** loss (a swallowed log write,
+BE-4). Logging is best-effort by design; correctness is not.
+
+---
+
+## 4. Best-effort (within the envelope, not a hard guarantee)
+
+These hold under normal conditions and degrade *gracefully and detectably* under
+pathological scheduling or host-health failures. **Correctness (§2) is preserved
+throughout; only liveness/latency degrades.** This tier is what the suite's
+wall-clock test assertions are scoped against (the strict/envelope test split; see
+`failure-modes.md` §K and §4 item 1).
+
+- **BE-1 — Wall-clock latency bounds are in poll-count, not seconds.** Recovery
+  latency (≈ `STALE` + poll cadence), the `MAX_WAIT` timeout, and the ~1.26s
+  read-retry ladder all *stretch* under CPU oversubscription or a slow FS while
+  still completing. The guarantee is "bounded by the configured knobs in
+  poll-count," not "exactly N seconds." Tests asserting a specific wall-clock or
+  poll-count number (Test 21's ≤20s, Test 22a's warning timing, Test 29's ≥2-CLAIM
+  count) assert an *envelope* bound, not a correctness bound, and may be relaxed or
+  gated to a defined load level (`GCL_ENVELOPE_TIER=relax`) without any product
+  change. *Basis:* `failure-modes.md` §K and §4 item 1.
+
+- **BE-2 — Diagnostic warnings are best-effort.** The wrong-type config warning
+  and the claim-path warning rely on poll headroom that an oversubscribed runner
+  can starve; the guarantee is that the *condition is handled safely*, not that a
+  specific warning fires within a specific time. *Basis:* §K(2), §D3.
+
+- **BE-3 — Recovery presumes a readable clock; an unreadable mtime fails safe.**
+  If the lock's mtime cannot be read at all, both ports retry three times, then
+  warn loudly once per process and treat the lock as **not** stale (the mtime floor
+  fails closed to "fresh"): no premature steal, no corruption — but recovery of a
+  genuinely crashed holder is *disabled* and waiters block to `MAX_WAIT` (97).
+  Safety is preserved; recovery is lost and announced. *Witness:* unit Test 42
+  (shadows the mtime probe to return empty on a present stale ghost; the
+  "Staleness detection is BROKEN" warn-once fires, the ghost is left in place,
+  the waiter blocks to 97). *Basis:* §E3.
+
+- **BE-4 — Logging is best-effort and never blocks the lock.** Every log write
+  ends `|| true`; a failed or unwritable log write is swallowed and the lock works
+  unaffected (the log self-truncates past ~1 MB). *Witness:* unit Test 49 (points
+  `AGENT_LOCK_LOG` under a regular file so every append fails ENOTDIR; the lock
+  still acquires + releases cleanly with the log write swallowed — also covers
+  J1). *Basis:* §F2/§J1.
+
+- **BE-5 — The PowerShell 5.1 steal is claim-guarded, not atomic.** Windows
+  PowerShell 5.1 lacks the 3-arg `File.Move` overload, so its steal is
+  unlink-then-move with a transient absent window. Under the claim this is a
+  *fairness loss* (a rival's create can win the recovered path; the claimant backs
+  off cleanly), **never a clobber**. *Basis:* §D1, `git-commit-lock.md:471-476`.
+
+---
+
+## 5. Out of scope (not guaranteed)
+
+The tool deliberately does not promise the following. Where it can, it still fails
+*safely and detectably*; the point of listing them is that the strict guarantees
+of §2 are **not** claimed here.
+
+- **OOS-1 — Network / shared / sync-backed filesystems.** NFS, SMB/CIFS, 9p,
+  Dropbox/OneDrive. These weaken the atomic create/rename the protocol rests on, so
+  exclusion may silently not hold. Documented boundary only — surfaced in the
+  README; **no** FS-type probe is built (decision: `failure-modes.md` §4 item 3).
+  *Basis:* §E1.
+
+- **OOS-2 — Multi-host use / clock skew across hosts.** Rides on OOS-1 (only arises
+  on a shared FS). A *local* clock jump on the single host is **in scope and
+  correctness-safe** (degrades to the detected-98 lane). *Basis:* §E2.
+
+- **OOS-3 — Mixed-version trees.** If contenders run different protocol versions,
+  the no-displacement prevention (G-S3) degrades to detection (98), and old-style
+  stealers can leave `.dead.*` litter. Never silent, but the prevention property is
+  not guaranteed. Deployment rule: **upgrade both implementations together**
+  (`git-commit-lock.md:251-256`; also surfaced in `README.md:101-106`).
+  *Basis:* §I2.
+
+- **OOS-4 — PowerShell port on POSIX.** Supported on Windows only; on POSIX it runs
+  solely as a cross-implementation protocol check. Its one residual there (a FIFO,
+  device, or socket stats as empty and takes the empty-orphan lane, capping damage
+  at the one misconfigured inode) is accepted and documented. *Basis:* §D3.
+
+- **OOS-5 — A non-unwinding exit returning 0 while displaced (the no-silent-loss
+  boundary).** G-S1's detection requires the *lock-holding shell* to reach release
+  (E3). If a *displaced* holder is terminated or replaced **without unwinding** —
+  external SIGKILL, an `exec` that replaces the **lock-holding shell itself**, or
+  PowerShell `[Environment]::Exit()` — *and* the resulting process exits **0**, the
+  caller can see success with no 98. The `exec` case is **narrower than it looks**
+  (verified empirically): `lock_run` runs the wrapped command vector in the wrapper
+  shell (`git-commit-lock.sh:1733`), so the bypass needs the exec to run in *that*
+  shell — a **sourced** caller doing `lock_acquire; exec …` in its own shell, or
+  the contrived `run -- exec …` where the wrapped command *is* an exec. An exec
+  **nested in a child** — the normal `run -- bash -c 'exec …'` — does **not**
+  bypass: the child is replaced, the wrapper waits and releases normally. A **plain
+  `exit` is safe** (it unwinds). What keeps the whole class narrow: an external
+  SIGKILL yields a non-zero wait status (POSIX `128+9`), so a caller checking exit
+  codes does not see success; the hole needs a process that *deliberately* replaces
+  or hard-exits the lock-holding shell **and** returns 0 **while displaced**. The
+  *next* holder still recovers via staleness; only the abruptly-exiting one is
+  unwarned. No code change closes this without the handle-based ops the design
+  rejected. *Witness:* the §H4 non-unwinding-exit boundary is pinned by interop
+  Test 5 (`I:308-334`, ps1 `[Environment]::Exit()`) and unit Test 40 (bash `exec`
+  in the lock-holding shell, OOS-5). *Basis:* §H4.
+
+- **OOS-6 — Adversarial / hostile local processes.** The lock is advisory. Against
+  a process actively trying to break it (deleting/overwriting the lock file, or a
+  hostile repo redirecting the git dir), the tool *detects* interference where it
+  can but does not prevent it; damage from a redirected git dir is bounded to the
+  tool's own named files with non-recursive deletion. *Basis:*
+  `git-commit-lock.md:520-551`.
+
+- **OOS-7 — Non-issues, explicitly.** A case-insensitive FS path collision (the
+  lock and claim paths never collide under case folding; two case-differing
+  configured paths resolving to one file is *correct* shared-lock behavior) and
+  memory exhaustion (the scripts allocate trivially). No action. *Basis:* §D5/§F5.
+
+### Things deliberately NOT built (and why)
+
+The design considered and rejected each of these; they are not roadmap items
+(`failure-modes.md` §4 "Things explicitly NOT to do"):
+
+- A **background heartbeat** to refresh the lease — would make the tool more than a
+  single synchronous script; the fail-open-but-detectable lease is the deliberate
+  alternative.
+- A **two-rename compare-and-swap** to prevent the B3 residual — reintroduces crash
+  litter and a sweep, for a failure that is already bounded and false-success-free.
+- **`File.Replace`** in the ps1 port — throws on a read-only destination and has
+  partial-failure states (pinned out by interop Test 16d).
+- **Supporting network/shared filesystems** — correctness rests on local-FS atomic
+  create/rename; this is a boundary to document, not to engineer around.
+
+---
+
+## 6. Staying inside the envelope (operating rules)
+
+- **Hold the lock only to commit.** Decide what to stage, build any patch, and
+  resolve failures *outside* the lock; a normal stage+commit holds it for seconds
+  (the golden rule, `git-commit-lock.md:433-458`). This keeps holds inside the
+  staleness window (E4) so G-S3 applies.
+- **For a deliberately slow hold, raise `AGENT_LOCK_STALE_SECS`** for that
+  invocation rather than risking a fail-open steal.
+- **Keep the lock on a local filesystem** (the default `<gitdir>/commit.lock`
+  almost always is) so E2 holds.
+- **Upgrade both implementations together** (E5) so G-S3's prevention holds.
+- **Never `git stash` in a shared checkout** — it rewrites the working tree and
+  clobbers other agents' edits (orthogonal to the lock, but part of operating in a
+  shared tree).
+
+---
+
+## 7. Verification map
+
+Each guarantee → its witnessing test(s) and the failure-modes section. `U` =
+`tests/git-commit-lock.test.sh`, `I` = `tests/git-commit-lock.interop.test.sh`,
+`C` = `tests/git-commit-lock.canary.test.sh` (the concurrency canary), `integ` =
+`tests/git-commit-lock.integration.test.sh`. The former resource-exhaustion and
+diagnostic-clock coverage gaps are now closed by the fault-injection tests
+(`failure-modes.md` §4 item 5): F4 (Test 48), F2/J1 (Test 49), F1 (Test 50), and the
+unreadable-mtime fail-safe (Test 42). The one remaining document-only lane is F3
+(FD/inode exhaustion), which has no portable deterministic injection.
+
+| Guarantee | Witness | failure-modes § |
+|---|---|---|
+| G-S1 no silent lost update | U Test 4b + Test 16 (unverifiable lane); I Test 8 (both dirs) | §1, §B5 |
+| G-S2 no corruption / no false hold | U Tests 32/32b (read-back failure); **resource lanes: Test 48 (F4), Test 50 (F1); F3 document-only** | §1, §A1, §F |
+| G-S3 strict exclusion in window + no displacement | C Test 1 (8×25 canary); U Tests 2b/20; I Tests 1/6/16/16b; integ | §A1/§A2/§A3 |
+| G-S4 never destroys non-lock-shaped | U Tests 17/17d/18/22 (dir/symlink/FIFO) + Test 44 (socket/device) | §D3/§D4/§G1 |
+| G-S5 truthful exit codes | U Tests 7/8/4b/5/16; I run-verdict tests | §1, §H4 |
+| G-R1 lock-shaped orphans reclaimed | U Tests 2/3/21 | §B1/§C1/§C2/§C3 |
+| G-R2 one stuck agent can't wedge | stale-steal + crashed-claimant lanes | §1 |
+| G-R3 no busy-spin; bounded wait | I Test 14b | §K(4) |
+| G-R4 no unowned lock left behind | U Tests 31/35/36 | §C4 |
+| G-I1 bash⇄pwsh same lock | I suite throughout | §I1 |
+| BE-3 unreadable mtime fails safe | U Test 42 | §E3 |
+| BE-4 logging best-effort | U Test 49 (F2/J1) | §F2/§J1 |
+
+The resource-exhaustion and diagnostic-clock lanes (Tests 42/48/49/50) are the
+fault-injection coverage added per `failure-modes.md` §4 item 5; F3 (FD/inode
+exhaustion) stays document-only for want of a portable deterministic injection.
diff --git a/docs/load-testing-strategy.md b/docs/load-testing-strategy.md
new file mode 100644
index 0000000..22ea393
--- /dev/null
+++ b/docs/load-testing-strategy.md
@@ -0,0 +1,236 @@
+# git-commit-lock: CI & load-testing strategy
+
+This is the rationale for *why the CI is shaped the way it is* — the principles
+behind the three workflows (`tests.yml`, `nightly.yml`, `deep-sweep.yml`), the load
+wrapper (`tests/with-load.sh`), and the two test-level levers (the Axis-A sweep and
+the envelope tier). It describes the system as it stands; for the correctness
+guarantees the suites assert against, see `docs/guarantees.md` and
+`docs/failure-modes.md`.
+
+---
+
+## 1. The principle: correctness is load-independent
+
+This is not a throughput-bound system whose correctness degrades under load. Safety
+and exclusion rest on structural primitives — `O_EXCL` create, atomic `rename(2)`,
+per-attempt token discovery — that never consult the clock for a *correctness*
+decision (`guarantees.md` §2A, BE-1; `failure-modes.md` §K). No amount of CPU or IO
+pressure makes a rename non-atomic or lets two `O_EXCL` creates both win on a local
+filesystem.
+
+So load does not *change what is correct* — it only *surfaces races*. Its sole job
+is to widen the timing windows in the protocol's multi-syscall sequences (which are
+not individually atomic) so that the inter-process interleavings the code claims to
+handle are actually exercised. The right question to ask of a load regime is "does
+this raise the probability that process A is suspended between syscall N and N+1
+while process B advances?" — not "does it consume the box?". Past roughly 2× CPU
+oversubscription, more load finds no new correctness bugs; it only stretches
+wall-clock latency and starts tripping the suite's best-effort timing assertions.
+
+Two consequences shape the whole design:
+
+- **The per-PR gate runs no load** (strict, fast). A red required check is then
+  always actionable — a real correctness bug or genuine infra drift, never a
+  stress-manufactured wall-clock flake.
+- **Load lives in non-blocking tiers** (nightly, deep-sweep), where the
+  load-sensitive timing assertions are relaxed to warnings so an oversubscribed
+  runner cannot turn a latency stretch into a red.
+
+## 2. Deterministic steering is the primary race-coverage lever
+
+The protocol's genuinely dangerous windows — create → read-back verify; the claim
+recheck → touch → re-verify → rename residual; rename-over → read-back on a steal;
+the release boundary — are ones where a *wrong interleaving could actually corrupt
+state*. External load can only reach those windows *probabilistically*: it raises
+the background chance of hitting an interleaving nobody scripted.
+
+The suite reaches them *deterministically* instead, by in-process function
+interposition. `clone_fn` (`tests/_harness.sh`) clones a library internal (or
+shadows a command like `mv`/`rm`/`touch` with a shell function) so a steering test
+can land "the rival's rename" at an exact protocol position, then call the original
+through the clone (the Test 23–36 steered scenarios in
+`tests/git-commit-lock.test.sh`). This hits the exact protocol window every run,
+attributably — which is why it, not external load, is the primary race-coverage
+investment.
+
+External load is the secondary, broad-net lever. It earns its place mainly on the
+one window it can genuinely move: the mtime-staleness / fail-open boundary, where
+CPU/IO pressure stretches a contended holder past the STALE threshold and exercises
+the detected-98 lane. A corollary for triage: because external load *cannot* break
+correctness, a load run that produces a *correctness* failure is surfacing either a
+real logic bug in a steering-reachable window (high value) or a test-harness setup
+race (a harness fix, not a code fix).
+
+## 3. The three tiers
+
+### Tier R — required, per-PR (`tests.yml`)
+
+The blocking gate. It runs every suite (unit, interop, integration, and the
+full-width concurrency canary as its own parallel cell) at full fan-out
+(`GCL_TEST_FULL=1`) with **no load** and the **strict** envelope tier (the default —
+the workflow sets no `GCL_ENVELOPE_TIER`, so every timing assertion is hard). The
+matrix is:
+
+| Cell | OS | Engines / leg | Buys |
+|---|---|---|---|
+| ubuntu-24.04 `all` + `canary` | Linux | bash + pwsh7 | Linux correctness + interop baseline |
+| macos-15 `all` + `canary` | macOS | bash + pwsh7 | BSD `stat`/`mv` lanes |
+| windows-2025 `unit` | Windows | bash (MINGW) | delete-pending ghosts, FILETIME floor |
+| windows-2025 `interop-integration` | Windows | bash + pwsh7 + **PowerShell 5.1** | the 5.1 non-atomic-fallback path + real NTFS commit swarm |
+| windows-2025 `canary` | Windows | bash (MINGW) | full-width concurrency under process-spawn overhead |
+
+The canary runs as a separate parallel cell on every OS because it is about half
+the Windows unit wall-clock; suites must *not* run concurrently inside one runner
+(they are timing-sensitive on 2-core runners). Triggers: `pull_request` and
+`push: main` (both `paths-ignore` docs/`.plans`/license), a weekly `schedule` to
+catch runner-image and tool drift, and `workflow_dispatch`. The concurrency group is
+`${{ github.workflow }}-${{ github.ref }}` with `cancel-in-progress: true`, so rapid
+pushes coalesce. A separate `lint` job gates shellcheck (pinned v0.11.0, `-S style`)
+and PSScriptAnalyzer (warning severity).
+
+### Tier N — nightly, scheduled (`nightly.yml`)
+
+A non-blocking scheduled stress run (08:23 UTC daily, plus `workflow_dispatch`).
+This project has **no branch protection** (single-dev, decision 2026-06-18), so
+nightly never gates a PR; its job is to catch the load-sensitive flakes and coverage
+regressions the no-load per-PR gate cannot.
+
+Six `stress` cells run the suites wrapped in `tests/with-load.sh` at one
+oversubscription level (`GCL_STRESS_RATIO=2`, R≈2), one `GCL_STRESS_KIND` each:
+ubuntu×{cpu, disk, both}, macos×disk, windows interop-integration×disk, windows
+unit×both. macOS gets a single cell (it is the scarce, slow pool); ubuntu absorbs
+the extra kinds (cheapest). The whole workflow runs with two test-level levers
+turned on (§4): `GCL_ENVELOPE_TIER=relax` (the three load-sensitive timing
+assertions warn instead of failing; correctness assertions stay hard) and
+`GCL_TEST_SWEEP=1` (the Axis-A waiter-count sweep). Each cell writes its own
+`cell-conclusion.txt` (ground truth, captured under `always()`) and uploads its logs
+plus the load-manifest on success too — the negatives are needed to read the
+positives.
+
+A separate `kcov` job runs the unit + canary suites under kcov v43 (built from
+source) on Linux, **no load, strict envelope, full fan-out**, and gates line
+coverage of `git-commit-lock.sh` at a 0.80 floor (tracks ~0.83 achieved; ratchets up
+as tests land). It explicitly overrides the workflow-level `relax` back to `strict`
+so coverage is measured on a clean run.
+
+A `triage` job (`always()`) downloads every cell's artifact, classifies the results,
+and files (or appends to) one labelled issue per (date, class): `nightly-correctness`
+(a correctness assertion failed — investigate), `nightly-envelope` (a relaxed timing
+miss — expected, tracked), or `nightly-infra` (missing artifact / timeout / errored —
+not a product failure). An empty-round guard prevents "0 FAIL across 0 logs" being
+misread as green when an artifact set is entirely missing.
+
+### Tier D — on-demand deep sweep (`deep-sweep.yml`)
+
+`workflow_dispatch`-only; it never runs on push/PR and never gates anything. This is
+the deep flake-hunting instrument — the "50-clean hunt". A dispatch picks a
+`stress_kind`, an optional raw `stress_load` override, a `repeat` count, and an
+`envelope_tier` (defaults `relax`). Each suite is run `repeat` times under load in a
+fail-fast loop that names the failing iteration. The concurrency group is per-run
+(`deep-${{ github.run_id }}`) so many parallel dispatches fan out freely and accept
+queue waves rather than cancelling each other. Timeouts are deliberately generous
+(deep + loaded + repeated is far slower than the gate).
+
+## 4. The two test-level levers
+
+These let the existing tests yield more under load without touching the per-PR
+gate's behaviour.
+
+**The Axis-A waiter-count sweep** (`GCL_TEST_SWEEP`, `T_AXIS_A` in
+`tests/_harness.sh`). By default `T_AXIS_A="4"`, so per-PR and plain dev runs are
+byte-identical to the historical behaviour. Under `GCL_TEST_SWEEP=1` (nightly and
+deep only) it becomes `"4 12 24"`, and the fan-out/contention tests iterate over it —
+unit Test 2b, unit Test 20 (which composes its own list from its mode-driven floor
+plus the sweep's higher counts), and interop Test 16 — each naming N in every
+assertion message so a sweep failure says which N broke. This widens the
+thundering-herd / claim-serialization and displacement windows that re-running N=4
+never will. Correctness assertions are kept config-independent (e.g. hold ≫ STALE so
+"zero-98 / one-steal" stays a pure correctness statement) and MAX_WAIT scales with N,
+so a large-N run doesn't time out and *look* like a product failure.
+
+**The envelope tier** (`GCL_ENVELOPE_TIER`, default `strict`, in
+`tests/git-commit-lock.test.sh`). A wall-clock or poll-count bound is a best-effort
+liveness property (`guarantees.md` BE-1), not a correctness one. The `ok_envelope` /
+`bad_envelope` assertion helpers behave exactly like the hard `ok`/`bad` under
+`strict`; under `relax` a `bad_envelope` becomes a `WARN` that does not increment
+FAIL. Three assertions are tiered this way — recovery latency ≤20s (Test 21), the
+claim-path config warning firing (Test 22a), and the failed-steal's claim being
+re-created rather than left to age out (Test 29). Nightly and deep set `relax`;
+per-PR and the kcov job never do. So an oversubscribed runner can stretch wall-clock
+to a warning without reddening correctness, while correctness assertions stay hard in
+both tiers.
+
+## 5. How load is calibrated (`tests/with-load.sh`)
+
+The wrapper runs a command under a calibrated, reproducible background load, then
+tears it down by *exact spawned PIDs* (never by name — safe on a shared box and on an
+ephemeral runner) and propagates the wrapped command's exit code.
+
+- **Load is an oversubscription ratio**, not an absolute hog count:
+  `GCL_STRESS_RATIO` (R, default 1) gives stressors-per-kind = `round(R × nproc)`,
+  floored at 1 for a selected kind. "R=2" means the same pressure on a 2-core and a
+  32-core runner, where a raw hog count would not.
+- **The total ratio is capped** by `GCL_STRESS_RATIO_MAX` (default 2). `both` runs
+  cpu + disk, so its total would be 2R; the cap scales each kind down proportionally
+  so the runner is never wedged. The deep-sweep flake hunt can raise it deliberately.
+- **`GCL_STRESS_KIND`** selects `none` (clean pass-through, zero added load),
+  `cpu`, `disk`, or `both`. **`GCL_STRESS_LOAD`** is a back-compat raw per-kind
+  count override (kept so the deep-sweep `stress_load` input keeps working); empty
+  ⇒ use the ratio.
+- **CPU stressor:** `stress-ng --cpu` when available (calibrated, measurable), else a
+  portable bash spin loop. **Disk stressor:** a tight create / write+fsync / delete
+  loop over a small file on the test scratch volume — real metadata + write-back
+  pressure that contends with the lock-file create/delete the suite itself does
+  (always the portable shell hog; cross-platform, low-fidelity but real).
+- **A per-run `load-manifest` JSON** is written next to the suite logs (on success
+  too): `{kind, R, ratio_max, raw-load override, nproc, cpu/disk/total stressor
+  counts, capped?, cpu mechanism, cgroup probe, baseline/loaded ms, achieved
+  slowdown, tool versions, os/arch, git sha, command}`, so any flake is reproducible.
+  A cheap fixed bash micro-benchmark, timed unloaded then mid-load, records a coarse
+  achieved-slowdown figure (only when load is actually applied).
+
+### Platform asymmetry (current operating facts)
+
+The platforms diverge too much for a uniform calibrated injection layer, so the
+wrapper is honest about which regime ran:
+
+- Deterministic steering is portable (bash everywhere; pwsh equivalent) — the real
+  race-coverage tool, on every leg.
+- Calibrated CPU throttling via a cgroup v2 quota is **Linux-only and probe-gated**:
+  `GCL_STRESS_CGROUP=1` makes the wrapper *probe* for a writable cgroup v2 cpu
+  controller and record the result in the manifest (`writable` /
+  `present-not-delegated` / `no-cpu-controller` / `no-cgroup-v2`); it does not create
+  scopes here (that needs a usable systemd manager). IO cgroup throttling is
+  experimental and intentionally not attempted.
+- Everywhere else (macOS, Windows) load is blunt CPU/disk oversubscription —
+  uncalibrated but real pressure.
+
+## 6. GitHub Actions operating facts
+
+- **Minutes are free on public repos; concurrency is the real ceiling.** Free-plan
+  accounts cap concurrent jobs (~20 total, with a smaller macOS sub-limit). A matrix
+  past that *queues* into waves, it doesn't fail. The required gate stays small
+  enough to run in one wave; the deep sweep intentionally exceeds it and accepts
+  waves. macOS is the slowest and scarcest pool, so it is kept sparse across all
+  tiers; ubuntu (cheapest) is used liberally.
+- **`fail-fast: false`** on every matrix — an OS-specific failure is exactly the
+  signal we want, so the other legs finish.
+- **`paths-ignore` and required checks:** `tests.yml` filters docs/`.plans`/license
+  paths. A workflow whose jobs are *required* checks would leave those checks
+  Pending (blocking merge) when skipped by a path filter — but this project has no
+  branch protection, so the filter just saves runner minutes on doc-only pushes
+  without that hazard.
+- **Artifacts** are uploaded with `include-hidden-files: true` (the integration
+  suite's key diagnostics — lock log, repo state — live under the scratch repo's
+  `.git/`) and named uniquely per cell so parallel uploads never collide.
+- All actions are SHA-pinned.
+
+## 7. The discipline: required = always-meaningful-red
+
+The invariant that ties it together: **required is always-meaningful-red; nightly is
+triaged-amber-tolerant; deep is noise-by-design.** Keeping artificial load off the
+required gate is what makes a red gate trustworthy; putting all load in non-blocking
+tiers with the envelope assertions relaxed is what stops load from manufacturing
+flakes that erode that trust. The required tier is never retry-masked — a retry that
+hid a 1-in-20 real race would reintroduce the very silent-loss class this tool
+exists to prevent.
diff --git a/tests/_harness.sh b/tests/_harness.sh
new file mode 100644
index 0000000..7529cca
--- /dev/null
+++ b/tests/_harness.sh
@@ -0,0 +1,193 @@
+# shellcheck shell=bash
+# tests/_harness.sh — shared test harness for the git-commit-lock suites.
+#
+# Sourced by all four suites (git-commit-lock.test.sh, .canary.test.sh,
+# .interop.test.sh, .integration.test.sh) to share the bits they all
+# copy-pasted: the PASS/FAIL/TAP counters, the GCL_TAP / GCL_TEST_ONLY reads,
+# ok()/bad(), section(), the end-of-suite DONE sentinel (finish), and the
+# per-test selector verdict helper.
+# Pure deduplication — ZERO behaviour change vs the inline copies it replaces.
+#
+# Contract for sourcing suites:
+#   * Source this EARLY (before any use of the inits/helpers below), CWD-
+#     independently — resolve it from the sourcing script's own location:
+#       _HARNESS_DIR="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+#       # shellcheck source=tests/_harness.sh
+#       . "$_HARNESS_DIR/_harness.sh"
+#   * Each suite still defines its OWN cleanup() (it closes over the suite's
+#     $WORK and the bodies genuinely differ); finish() below calls whatever
+#     cleanup() is in scope when the EXIT trap fires.
+#   * Each suite installs the trap itself: `trap finish EXIT`.
+#   * The suite reaching its end sets DONE=1 before its verdict line.
+#
+# The whole project runs its suites under `set -uo pipefail` (NOT set -e); these
+# helpers are written for that (they assert on values, never on implicit exit
+# propagation), and the disables below cover the idioms that pervade the suites.
+#
+# shellcheck disable=SC2015  # The pervasive `<assert> && ok ... || bad ...`
+# idiom is deliberate throughout: ok/bad are echo+counter helpers that cannot
+# fail, so the classic A && B || C pitfall (C running after B fails) is moot.
+# shellcheck disable=SC2310,SC2312  # info-level, deliberate: helper functions
+# and command substitutions run inside conditions all over a test suite; the
+# suites run WITHOUT errexit (set -uo only) and assert on values, not on
+# implicit exit propagation.
+
+PASS=0; FAIL=0; TAPN=0; DONE=0; SECTIONS_RUN=0  # pass/fail tallies, TAP assertion number, end-of-suite sentinel, matched-section count
+GCL_TAP="${GCL_TAP:-0}"           # CI sets GCL_TAP=1 for machine-readable TAP13 output
+GCL_TEST_ONLY="${GCL_TEST_ONLY:-}"  # if set, run ONLY test blocks whose label REGEX-matches (single-test selector)
+
+# Axis-A waiter-count sweep (see load-testing-strategy.md). GCL_TEST_SWEEP=1 (nightly/deep CI) widens
+# the fan-out/contention tests over several waiter counts to wring more coverage
+# from the existing tests; unset/0 (per-PR default + plain dev) keeps the floor so
+# default runs are byte-identical to today. T_AXIS_A is the shared waiter-count
+# list the contention tests (unit Test 2b, interop Test 16) iterate N over; each
+# names N in every assertion message so a sweep failure says which N broke. The
+# floor is 4 — the count those two tests hardcode today, so the single-element
+# default reproduces today's behaviour exactly. (Test 20's floor is mode-driven
+# `$T20_N` (5 REDUCED / 10 FULL), not 4, so it composes its own list from $T20_N +
+# the sweep's higher counts rather than from T_AXIS_A — see that test.)
+GCL_TEST_SWEEP="${GCL_TEST_SWEEP:-0}"  # unset or empty => 0 (no sweep)
+# shellcheck disable=SC2034  # T_AXIS_A is consumed by the sourcing suites (unit
+# Test 2b, interop Test 16), not within this harness file.
+if [ "$GCL_TEST_SWEEP" = 1 ]; then T_AXIS_A="4 12 24"; else T_AXIS_A="4"; fi  # space-separated waiter counts; only an exact "1" enables the sweep
+
+# ok/bad are TAP-aware (gated by GCL_TAP so plain dev runs are byte-unchanged) and
+# bump the running assertion number TAPN. The trailing `1..$TAPN` plan line (emitted
+# by each suite just before its verdict) lets a TAP consumer fail on a short count;
+# together with the DONE sentinel below this closes the silent-undercount gap.
+# `return 0` preserves the "ok/bad cannot fail" property the
+# `<assert> && ok ... || bad ...` idiom relies on.
+ok()  { PASS=$((PASS+1)); TAPN=$((TAPN+1)); echo "PASS: $*"  # human-readable line, printed in every mode
+        [ "$GCL_TAP" = 1 ] && echo "ok $TAPN - $*"; return 0; }  # TAP line only under GCL_TAP=1; return 0 hides the false test's status
+bad() { FAIL=$((FAIL+1)); TAPN=$((TAPN+1)); echo "FAIL: $*"  # NOTE(review): nightly triage classifies on ^FAIL: log lines, presumably these; keep the prefix stable
+        [ "$GCL_TAP" = 1 ] && echo "not ok $TAPN - $*"; return 0; }  # TAP line only under GCL_TAP=1; return 0 hides the false test's status
+
+# Per-test gate: echoes the block header (so a normal run is byte-unchanged) and
+# returns success iff GCL_TEST_ONLY is unset/empty OR its regex matches the label.
+# Each top-level `== Test N: <desc> ==` block is wrapped `if section "..."; then ... fi`.
+# Bumps SECTIONS_RUN on a match so the verdict's zero-match guard (selector_report)
+# can catch a selector regex that matched nothing.
+section() {  # $1 = block label; status 0 => run the block, 1 => skip it
+  echo "== $1 =="  # header is printed before the selector check, so skipped blocks still show it
+  if [ -z "${GCL_TEST_ONLY:-}" ] || [[ "$1" =~ $GCL_TEST_ONLY ]]; then  # RHS unquoted on purpose: matched as a regex, not a literal
+    SECTIONS_RUN=$((SECTIONS_RUN + 1)); return 0
+  fi
+  return 1  # a selector is set and this label did not match it
+}
+
+# Sentinel: the suite reaching its end sets DONE=1. If the EXIT trap fires with
+# DONE!=1, the suite died early (a stray exit/crash) and the assertion count is
+# unreliable — fail loudly even if the pre-trap code was 0. A bare trap `return`
+# is IGNORED (the script keeps its pre-trap code), so the guard must `exit 1`.
+# Calls the suite-local cleanup() (each suite defines its own, closing over its
+# own $WORK); whatever cleanup is in scope when the trap fires is used.
+finish() {  # EXIT-trap handler; takes no arguments
+  cleanup  # suite-defined; runs first, whether or not the suite reached its end
+  if [ "${DONE:-0}" != 1 ]; then  # sentinel never set => the suite stopped before its verdict
+    echo "Bail out! suite terminated early before the plan line; ran ${TAPN:-0} assertion(s), count unreliable" >&2
+    exit 1  # exit (not return) so the script's final status becomes non-zero
+  fi
+}
+
+# Selector verdict helper, called by the section-using suites just before their
+# verdict line. Two parts, both gated on GCL_TEST_ONLY being non-empty so a
+# default run stays byte-identical:
+#   1. Zero-match guard: a set-but-non-matching GCL_TEST_ONLY ran NO test block,
+#      so the (vacuously green) verdict would lie — a typo'd selector regex must
+#      FAIL, not pass with zero assertions. Bail loudly. (The finish EXIT trap
+#      also fires here since DONE is still 0; this exit is non-zero regardless.)
+#   2. Report how many blocks the selector matched.
+# Integration does NOT call this — it is one indivisible scenario that does not
+# use section(), so it note-and-ignores GCL_TEST_ONLY at its top instead.
+selector_report() {  # no arguments; reads GCL_TEST_ONLY and SECTIONS_RUN
+  if [ -n "${GCL_TEST_ONLY:-}" ] && [ "$SECTIONS_RUN" = 0 ]; then  # selector set but no section() call matched it
+    echo "Bail out! GCL_TEST_ONLY=\"$GCL_TEST_ONLY\" matched no test" >&2
+    exit 1
+  fi
+  [ -n "${GCL_TEST_ONLY:-}" ] && echo "selector GCL_TEST_ONLY=\"$GCL_TEST_ONLY\" ran $SECTIONS_RUN test block(s)"
+  return 0  # the && line above is non-zero when no selector is set; do not pass that on to the caller
+}
+
+# --- Shared timing/lock helpers (unit + interop; integration uses none) -------
+# Backdate a path's mtime by $2 seconds — how a test fakes a stale lock (the
+# lock's staleness clock is the lock FILE's own mtime, stamped by the creating
+# write). Portable: BSD/macOS touch has no `-d @epoch`, so convert the target
+# epoch to a `touch -t` stamp via GNU `date -d @` with BSD `date -r` as
+# fallback.
+epoch_to_stamp() {  # $1 = epoch seconds; prints a YYYYMMDDhhmm.SS stamp for `touch -t`
+  date -d "@$1" +%Y%m%d%H%M.%S 2>/dev/null || date -r "$1" +%Y%m%d%H%M.%S 2>/dev/null  # GNU form first, BSD form as fallback; stderr silenced, so a double failure prints nothing and returns non-zero
+}
+backdate() { touch -t "$(epoch_to_stamp "$(( $(date +%s) - $2 ))")" "$1"; }
+
+# Token-guarded backdate for the contended-recovery rounds (unit T2b /
+# interop T16/T16b). Why: under load a fast waiter can complete its ENTIRE steal
+# (claim -> rename-over -> ACQUIRED) before the harness's `touch` executes, so a
+# blind backdate lands on the WINNER'S freshly installed lock, making it
+# instantly stale for every rival — a legitimate re-steal then fails the round's
+# "zero 98s / exactly one STOLE-BY-CLAIM" assertions although the protocol
+# behaved exactly as designed (observed 2026-06-12 on a loaded box). Verdicts:
+#   * pre-read not the ghost: a waiter stole the ghost BEFORE the touch (it
+#     aged stale naturally during a stalled sync); no touch is performed and
+#     the round premise is gone — invalid, the caller retries the round.
+#   * post-read the ghost: conclusive — nothing ever rewrites the ghost
+#     token at the path, so the touch verifiably hit the ghost; any steal
+#     after the post-read steals an ALREADY-ancient ghost, exactly the
+#     scenario the round wants. Valid.
+#   * post-read anything else: a steal raced the touch->re-read window —
+#     COMMON under load (waiters poll every 0.05s; the post-read costs
+#     subprocess spawns), so it must not blindly invalidate. The lock's
+#     MTIME arbitrates which file the touch hit: a winner's installed lock
+#     is FRESH (the rename carries the claim file's just-created mtime), so
+#     fresh => the touch hit the GHOST and a legitimate steal followed —
+#     valid; ancient => the touch landed on the WINNER'S live lock and
+#     corrupted the round — invalid, retry. Vanished => cannot arbitrate —
+#     invalid, retry.
+backdate_ghost() {  # $1=lock $2=ghost token $3=age-secs -> 0 iff the round premise is intact
+  local pre post now mt
+  pre="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')"   # token line as it reads BEFORE the touch (CR stripped)
+  [ "$pre" = "$2" ] || return 1                          # not the ghost any more: no touch is performed
+  backdate "$1" "$3" 2>/dev/null || return 1             # the touch itself failed: invalid
+  post="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')"  # token line as it reads AFTER the touch
+  [ "$post" = "$2" ] && return 0                         # still the ghost: the touch hit the ghost, valid
+  [ -e "$1" ] || return 1                                # vanished: nothing left to arbitrate with
+  now="$(date +%s)"
+  mt="$(stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null)" || return 1   # GNU stat form first, BSD form as fallback
+  [ $(( now - mt )) -lt $(( $3 / 2 )) ]                  # mtime younger than half the backdate age => fresh => valid
+}
+
+# Wait for every waiter's WAITING line while keeping the ghost lock FRESH
+# (touch -c to now, no-create so a released path is never resurrected): a
+# fresh ghost cannot be judged stale, so no waiter can steal it before the
+# guarded backdate — without this, a sync stalled past STALE (slow worker
+# cold starts on a loaded box) lets the ghost age stale naturally and a
+# waiter steals it mid-sync. Freshening is race-safe: if a steal slipped in
+# anyway, touching the winner's (already fresh) live lock to "now" is a
+# harmless no-op, and backdate_ghost's pre-read catches the broken premise.
+sync_waiting_fresh() {  # $1=lock $2=timeout-secs $3..=waiter logs -> 0 iff all logged WAITING
+  local ghost="$1" cutoff wlog all_seen=1
+  cutoff=$(( $(date +%s) + $2 )); shift 2
+  for wlog in "$@"; do
+    while ! grep -q "WAITING for lock" "$wlog" 2>/dev/null; do
+      touch -c "$ghost" 2>/dev/null   # refresh the mtime; -c never creates the path
+      [ "$(date +%s)" -lt "$cutoff" ] || { all_seen=0; break; }
+      sleep 0.2
+    done
+  done
+  [ "$all_seen" = 1 ]
+}
+
+# Fabricate a lock file the way a real (foreign) holder would have written it:
+# token line + owner line. The token MUST be "tok."-prefixed (wire format) or
+# the steal's content guard will — correctly — refuse to steal it.
+fabricate_lock() {  # $1=path $2=token $3=owner
+  printf '%s\n' "$2" "$3" > "$1"   # one line per argument: token first, owner second
+}
+
+# Wait (up to $3 seconds, default 15) for a pattern to appear in a file. Used to
+# gate on the WAITING log line: proof the waiter actually contended, without a
+# fixed-length hold.
+wait_for_grep() {  # $1=pattern $2=file $3=max seconds (default 15) -> 0 iff the pattern is present
+  local needle="$1" haystack="$2" left=$(( ${3:-15} * 20 ))
+  until grep -q "$needle" "$haystack" 2>/dev/null || [ "$left" -le 0 ]; do sleep 0.05; left=$((left-1)); done
+  grep -q "$needle" "$haystack" 2>/dev/null
+}
diff --git a/tests/git-commit-lock.canary.test.sh b/tests/git-commit-lock.canary.test.sh
new file mode 100755
index 0000000..0f30461
--- /dev/null
+++ b/tests/git-commit-lock.canary.test.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# git-commit-lock.canary.test.sh — the concurrency CANARY, extracted from the
+# unit suite (git-commit-lock.test.sh) into its own file so it runs as a
+# naturally-parallel CI job.
+#
+# Runs entirely against throwaway temp dirs, so it never touches the repo you
+# launch it from. Exit 0 == pass.
+#   bash tests/git-commit-lock.canary.test.sh
+#
+# This is a STATISTICAL concurrency canary — N workers race the lock over
+# repeated rounds; repetition at width is its coverage. It is cheap on
+# Linux/macOS (fast process spawn) but pathological on Windows (~half the
+# Windows unit wall-clock), which is exactly why it lives in its own cell.
+#
+# Fan-out: defaults to REDUCED width so routine dev runs don't lag a live shared
+# machine; set GCL_TEST_FULL=1 (CI does) for the full-strength 8x25 canary. The
+# file prints which mode ran — a reduced pass must never masquerade as the full one.
+#
+# On failure the work dir is PRESERVED (path printed) for post-mortem; set
+# GCL_TEST_PRESERVE_DIR=<dir> to additionally copy all logs/outputs there
+# regardless of outcome (used by CI).
+#
+# shellcheck disable=SC2015  # The pervasive `<assert> && ok ... || bad ...`
+# idiom is deliberate throughout: ok/bad are echo+counter helpers that cannot
+# fail, so the classic A && B || C pitfall (C running after B fails) is moot.
+# shellcheck disable=SC2310,SC2312  # info-level, deliberate: helper functions
+# and command substitutions run inside conditions all over a test suite; the
+# suite runs WITHOUT errexit (set -uo only) and asserts on values, not on
+# implicit exit propagation.
+# shellcheck disable=SC2016  # $INCR is single-quoted on purpose: it expands
+# inside the worker's `bash -c`, not here.
+set -uo pipefail
+
+# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad,
+# section, the finish EXIT-trap sentinel (calls our cleanup below). Resolved from
+# THIS script's own dir so it sources regardless of CWD; sourced EARLY (before any
+# use of the inits/helpers below).
+_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=tests/_harness.sh
+. "$_HARNESS_DIR/_harness.sh"
+
+DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"   # CDPATH='' keeps cd silent, so only pwd's output is captured
+ROOT="$(cd "$DIR/.." && pwd)"   # the implementations live at the repo root
+LIB="$ROOT/git-commit-lock.sh"
+
+# Width selection: reduced unless GCL_TEST_FULL is exactly 1.
+case "${GCL_TEST_FULL:-0}" in
+  1) GCL_MODE="FULL";    T1_ROUNDS=8; T1_N=25 ;;
+  *) GCL_MODE="REDUCED"; T1_ROUNDS=3; T1_N=8 ;;
+esac
+echo "fan-out mode: $GCL_MODE (T1 ${T1_ROUNDS} rounds x ${T1_N} workers)"
+[ "$GCL_MODE" != REDUCED ] || echo "  (set GCL_TEST_FULL=1 for the full-strength 8x25 canary — CI runs it)"
+
+WORK="$(mktemp -d 2>/dev/null || echo "${TMPDIR:-/tmp}/git-commit-lock-test.$$")"
+mkdir -p "$WORK"
+cleanup() {
+  local keep="${GCL_TEST_PRESERVE_DIR:-}"
+  if [ -n "$keep" ]; then   # best-effort copy for CI, regardless of outcome
+    mkdir -p "$keep" 2>/dev/null || true
+    cp -R "$WORK"/. "$keep"/ 2>/dev/null || true
+    echo "note: copied test artifacts to $keep"
+  fi
+  if ! [ "${FAIL:-0}" -gt 0 ]; then
+    rm -rf "$WORK" 2>/dev/null || true; return 0
+  fi
+  echo "note: failures detected — work dir preserved for post-mortem: $WORK"
+}
+# cleanup() above is not trapped directly: the finish sentinel from _harness.sh
+# runs it on EXIT and then fails loudly if the suite died before setting DONE=1.
+trap finish EXIT
+
+# The RESULT line below expands $ENV_WARN, which in the unit suite is maintained
+# by the envelope-tier assertions (ok_envelope/bad_envelope). The canary uses
+# only plain ok/bad (no envelope assertions), so define it to 0 here so the
+# standard RESULT line works unchanged under set -u.
+ENV_WARN=0
+
+# Critical section that loses updates without a mutex: read, gap, write+1.
+INCR='n="$(cat "$1")"; sleep 0.03; echo $((n+1)) > "$1"'   # $1 = counter file; expands inside each worker's `bash -c`
+
+if section "Test 1: concurrent workers, mutual exclusion (repeated rounds, $GCL_MODE width)"; then
+# A single pass is too weak to trust a rare exclusion race (the release-steal
+# bug found 2026-05-30 lost ~1 update per 25 only intermittently). Repeat
+# several rounds; ANY lost update across ALL rounds fails the test.
+# MAX_WAIT caps a regression at 180s per worker instead of the 420s default;
+# STALE stays comfortably above any realistic hold so nothing is ever stolen.
+N=$T1_N; ROUNDS=$T1_ROUNDS; t1_fail=0; T1ERR="$WORK/excl.err"; : > "$T1ERR"   # T1ERR accumulates every worker's stderr across all rounds
+for r in $(seq 1 "$ROUNDS"); do
+  COUNTER="$WORK/counter.$r"; echo 0 > "$COUNTER"   # fresh per-round counter file, starting at 0
+  LOCK="$WORK/excl.$r.lock"; LOG="$WORK/excl.$r.log"; : > "$LOG"; pids=()   # per-round lock path + emptied log; pids collects this round's workers
+  for _ in $(seq 1 "$N"); do
+    AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=120 \
+      AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=180 \
+      bash "$LIB" run -- bash -c "$INCR" _ "$COUNTER" 2>> "$T1ERR" &
+    pids+=($!)
+  done
+  for p in "${pids[@]}"; do wait "$p"; done   # join all N workers before judging the round
+  c="$(cat "$COUNTER")"; a="$(grep -c ACQUIRED "$LOG")"; rl="$(grep -c RELEASED "$LOG")"   # final counter; ACQUIRED / RELEASED line counts in the log
+  if [ "$c" != "$N" ] || [ "$a" != "$N" ] || [ "$rl" != "$N" ] || [ -e "$LOCK" ]; then   # each must equal N, and the lock file must be gone
+    t1_fail=1; echo "  round $r: counter=$c acquired=$a released=$rl leftover=$([ -e "$LOCK" ] && echo yes || echo no)"
+  fi
+done
+[ "$t1_fail" = 0 ] && ok "$ROUNDS rounds x $N workers ($GCL_MODE): no lost updates, balanced acquire/release, no leftover lock" \
+                    || bad "mutual-exclusion failure in at least one round (see above)"
+# Regression: under contention the lock file routinely vanishes mid-mtime-probe;
+# that must NOT be misdiagnosed as "staleness detection broken" (false WARNING
+# observed 2026-06-10 before the probe got its retry loop).
+grep -q "Staleness detection is BROKEN" "$T1ERR" \
+  && bad "spurious mtime-probe WARNING under contention (see $T1ERR)" \
+  || ok "no spurious mtime-probe warnings under contention"
+fi
+
+# Zero-match guard + selector-report line (shared helper in _harness.sh): a
+# set-but-non-matching GCL_TEST_ONLY ran NO test block, which without the guard
+# would fall through to a vacuous PASS=0 FAIL=0 "green". Near-pointless in a
+# one-test file, but zero-cost and keeps the finish/zero-match scaffolding
+# uniform with the other suites.
+selector_report
+
+DONE=1   # verdict reached: the finish EXIT trap only bails out while DONE is not 1
+echo
+echo "==== RESULT: $PASS passed, $FAIL failed, $ENV_WARN envelope warning(s) (fan-out: $GCL_MODE) ===="
+[ "$GCL_TAP" = 1 ] && echo "1..$TAPN"   # TAP plan line, printed only when GCL_TAP is 1
+[ "$FAIL" = 0 ]   # last command: its status is the script's exit code (0 only with zero failures)
diff --git a/tests/git-commit-lock.integration.test.sh b/tests/git-commit-lock.integration.test.sh
index a142bba..49badf8 100644
--- a/tests/git-commit-lock.integration.test.sh
+++ b/tests/git-commit-lock.integration.test.sh
@@ -36,6 +36,13 @@
 # they expand inside a worker's `bash -c` invocation, not here.
 set -uo pipefail
 
+# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad,
+# section, the finish EXIT-trap sentinel (calls our cleanup below). Resolved from
+# THIS script's own dir so it sources regardless of CWD.
+_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=tests/_harness.sh
+. "$_HARNESS_DIR/_harness.sh"
+
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ROOT="$(cd "$DIR/.." && pwd)"   # the implementations live at the repo root
 LIB="$ROOT/git-commit-lock.sh"
@@ -59,11 +66,9 @@ cleanup() {
     rm -rf "$WORK" 2>/dev/null || true
   fi
 }
-trap cleanup EXIT
-
-PASS=0; FAIL=0
-ok()  { echo "PASS: $*"; PASS=$((PASS+1)); }
-bad() { echo "FAIL: $*"; FAIL=$((FAIL+1)); }
+# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup()
+# above and fails loudly if the suite died before setting DONE=1.
+trap finish EXIT
 
 # --- sizing ------------------------------------------------------------------
 # Commits serialise (that's the whole point), so wall time ≈ workers x commit
@@ -95,6 +100,15 @@ echo "fan-out mode: $GCL_MODE (bash swarm ${BROUNDS}x${BN}, mixed swarm ${MSH}+$
 # bounded max wait so a wedge fails the suite instead of hanging it.
 LK_ENV=(AGENT_LOCK_STALE_SECS=300 AGENT_LOCK_POLL_SECS=0.2 AGENT_LOCK_MAX_WAIT=240)
 
+# Note-and-ignore the per-test selector the unit/interop suites honour: this
+# suite is ONE indivisible scenario (Tests 1-3 share a single repo + the ALL_IDS
+# accumulator, and Test 3 audits Tests 1+2's output), so a per-block selector
+# can't apply. If GCL_TEST_ONLY is set (read by _harness.sh), say so loudly on
+# stderr and run the whole scenario as normal.
+if [ -n "$GCL_TEST_ONLY" ]; then
+    echo "NOTE: integration suite ignores GCL_TEST_ONLY=\"$GCL_TEST_ONLY\" — Tests 1-3 are one indivisible scenario (shared repo + ALL_IDS audit); running the whole suite." >&2
+fi
+
 # --- scratch repo ------------------------------------------------------------
 REPO="$WORK/repo"; OUTD="$WORK/out"; NOHOOKS="$WORK/nohooks"
 mkdir -p "$REPO" "$OUTD" "$NOHOOKS"
@@ -301,5 +315,7 @@ done
                   || bad "$n_next leftover claim file(s) beside the lock"
 
 echo
+DONE=1
 echo "==== INTEGRATION RESULT: $PASS passed, $FAIL failed (fan-out: $GCL_MODE) ===="
+[ "$GCL_TAP" = 1 ] && echo "1..$TAPN"
 [ "$FAIL" = 0 ]
diff --git a/tests/git-commit-lock.interop.test.sh b/tests/git-commit-lock.interop.test.sh
index 06fe746..bfb0e44 100644
--- a/tests/git-commit-lock.interop.test.sh
+++ b/tests/git-commit-lock.interop.test.sh
@@ -40,6 +40,16 @@
 # they expand inside a worker's `bash -c` or pwsh invocation, not here.
 set -uo pipefail
 
+# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad,
+# section, the finish EXIT-trap sentinel (calls our cleanup below), and the
+# shared timing/lock helpers (epoch_to_stamp, backdate, backdate_ghost,
+# sync_waiting_fresh, fabricate_lock, wait_for_grep). Resolved from THIS
+# script's own dir so it sources regardless of CWD; sourced EARLY (before any
+# use of the inits/helpers below).
+_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=tests/_harness.sh
+. "$_HARNESS_DIR/_harness.sh"
+
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ROOT="$(cd "$DIR/.." && pwd)"   # the implementations live at the repo root
 SH="$ROOT/git-commit-lock.sh"
@@ -67,9 +77,11 @@ WORK="$(pwsh -NoProfile -Command '[IO.Path]::Combine([IO.Path]::GetTempPath(), "
 WORK="${WORK//\\//}"
 mkdir -p "$WORK"
 
-PASS=0; FAIL=0
-ok()  { echo "PASS: $*"; PASS=$((PASS+1)); }
-bad() { echo "FAIL: $*"; FAIL=$((FAIL+1)); }
+# The PASS/FAIL/TAP/SECTIONS_RUN inits, the GCL_TAP/GCL_TEST_ONLY reads, ok/bad,
+# and section() all come from _harness.sh (sourced above). GCL_TEST_ONLY is the
+# single-test selector: a <regex> that runs only the `== Test N: <desc> ==`
+# blocks whose label matches (BASH =~); unset/empty runs every block; a typo'd
+# regex that matches nothing bails out loudly at the verdict (selector_report).
 
 # Failure post-mortems need the logs: keep $WORK when anything failed, and
 # honour GCL_TEST_PRESERVE_DIR (the CI preserve-logs knob) by copying
@@ -86,7 +98,9 @@ cleanup() {
   fi
   rm -rf "$WORK" 2>/dev/null || true
 }
-trap cleanup EXIT
+# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup()
+# above and fails loudly if the suite died before setting DONE=1.
+trap finish EXIT
 
 # Poll for a marker file: ready-markers replace fixed head-start sleeps so a
 # slow pwsh cold-start (1-3s+ under load) can't fake an ordering failure.
@@ -95,88 +109,11 @@ wait_for() {  # $1=file $2=max iterations of 50ms (default 200 = 10s)
   return 1
 }
 
-# Wait (up to $3 seconds, default 15) for a pattern to appear in a file —
-# used to gate on the WAITING log line (proof a waiter actually contended)
-# without a fixed-length hold. Same helper as the unit suite.
-wait_for_grep() {
-  local pat="$1" f="$2" tries=$(( ${3:-15} * 20 ))
-  while ! grep -q "$pat" "$f" 2>/dev/null && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done
-  grep -q "$pat" "$f" 2>/dev/null
-}
-
-# Backdate a path's mtime by $2 seconds — how a test fakes a stale lock (the
-# staleness clock is the lock FILE's own mtime, stamped by the creating
-# write). Portable: BSD/macOS touch has no `-d @epoch`, so convert the target
-# epoch to a `touch -t` stamp via GNU `date -d @` with BSD `date -r` as
-# fallback (same helper as the unit suite).
-epoch_to_stamp() {
-  date -d "@$1" +%Y%m%d%H%M.%S 2>/dev/null || date -r "$1" +%Y%m%d%H%M.%S 2>/dev/null
-}
-backdate() { touch -t "$(epoch_to_stamp "$(( $(date +%s) - $2 ))")" "$1"; }
-
-# Token-guarded backdate for the contended-recovery tests (T16/T16b; same
-# guard as the unit suite's T2b — full rationale there). Why: under load a
-# fast waiter can complete its ENTIRE steal (claim -> rename-over ->
-# ACQUIRED) before the harness's `touch` executes, so a blind backdate lands
-# on the WINNER'S freshly installed lock, making it instantly stale for
-# every rival — a legitimate re-steal then fails the test's "zero 98s /
-# exactly one STOLE-BY-CLAIM" assertions although the protocol behaved
-# exactly as designed (observed 2026-06-12 on a loaded box: a fast pwsh
-# waiter judged the FRESH ghost at age==STALE, stole and ACQUIRED before the
-# touch, which then aged its live lock to 10000s and a rival re-stole it).
-# Verdicts:
-#   * pre-read not the ghost: stolen BEFORE the touch (no touch performed) —
-#     invalid, the caller retries the run.
-#   * post-read the ghost: conclusive — the touch hit the ghost. Valid.
-#   * post-read anything else: a steal raced the touch->re-read window —
-#     COMMON under load (waiters poll every 0.05s; the post-read costs
-#     subprocess spawns), so it must not blindly invalidate. The lock's
-#     MTIME arbitrates which file the touch hit: a winner's installed lock
-#     is FRESH (the rename carries the claim file's just-created mtime), so
-#     fresh => the touch hit the GHOST and a legitimate steal followed —
-#     valid; ancient => the touch landed on the WINNER'S live lock and
-#     corrupted the run — invalid, retry. Vanished => cannot arbitrate —
-#     invalid, retry.
-backdate_ghost() {  # $1=lock $2=ghost token $3=age-secs -> 0 iff the run premise is intact
-  local pre post now mt
-  pre="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')"
-  [ "$pre" = "$2" ] || return 1
-  backdate "$1" "$3" 2>/dev/null || return 1
-  post="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')"
-  [ "$post" = "$2" ] && return 0
-  [ -e "$1" ] || return 1
-  now="$(date +%s)"
-  mt="$(stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null)" || return 1
-  [ $(( now - mt )) -lt $(( $3 / 2 )) ]
-}
-
-# Wait for every waiter's WAITING line while keeping the ghost lock FRESH
-# (touch -c to now, no-create so a released path is never resurrected): a
-# fresh ghost cannot be judged stale, so no waiter can steal it before the
-# guarded backdate — without this, a sync stalled past STALE (slow pwsh cold
-# starts on a loaded box) lets the ghost age stale naturally and a waiter
-# steals it mid-sync. Freshening is race-safe: if a steal slipped in anyway,
-# touching the winner's (already fresh) live lock to "now" is a harmless
-# no-op, and backdate_ghost's pre-read catches the broken premise.
-sync_waiting_fresh() {  # $1=lock $2=timeout-secs $3..=waiter logs -> 0 iff all logged WAITING
-  local lock="$1" deadline f ok=1
-  deadline=$(( $(date +%s) + $2 )); shift 2
-  for f in "$@"; do
-    until grep -q "WAITING for lock" "$f" 2>/dev/null; do
-      touch -c "$lock" 2>/dev/null
-      if [ "$(date +%s)" -ge "$deadline" ]; then ok=0; break; fi
-      sleep 0.2
-    done
-  done
-  [ "$ok" = 1 ]
-}
-
-# Fabricate a lock file the way a real (foreign) holder would have written it:
-# token line + owner line. The token MUST be "tok."-prefixed (wire format) or
-# the steal's content guard will — correctly — refuse to steal it.
-fabricate_lock() {  # $1=path $2=token $3=owner
-  printf '%s\n%s\n' "$2" "$3" > "$1"
-}
+# wait_for_grep, epoch_to_stamp, backdate, backdate_ghost, sync_waiting_fresh,
+# and fabricate_lock now live in _harness.sh (sourced above) — shared
+# byte-for-byte with the unit suite. (wait_for above is interop-only: its arg-2
+# is a count of 50ms iterations, distinct from the unit suite's wait_for_file
+# whole-seconds semantics, so the two poll helpers stay separate.)
 
 # A pwsh process that holds the lock FILE open with FileShare.Read — the
 # no-delete-share handle class that blocks unlink AND rename alike (probe
@@ -224,7 +161,7 @@ ps_worker() {  # $1=lock $2=log $3=holder $4=violations $5=id
     pwsh -NoProfile -File "$PS1WIN" run "$body"
 }
 
-echo "== Test 1: mixed pwsh+bash workers, mutual exclusion across implementations ($GCL_MODE width) =="
+if section "Test 1: mixed pwsh+bash workers, mutual exclusion across implementations ($GCL_MODE width)"; then
 NSH=$T1_NSH; NPS=$T1_NPS; TOT=$((NSH+NPS))
 LOCK="$WORK/excl.lock"
 HOLDER="$WORK/holder"; : > "$HOLDER"; VIOL="$WORK/violations"; : > "$VIOL"
@@ -259,8 +196,9 @@ else
   [ "$st" != 0 ] && { echo "  STALE/STEAL log lines:"; grep -E "STALE|STOLE" "$WORK/excl-all.log" | sed 's/^/    /'; }
   bad "cross-impl exclusion/balance: violations=$nv steals=$st acquired=$a (floor $((TOT/2))) released=$rl leftover=$([ -e "$LOCK" ] && echo yes || echo no)"
 fi
+fi
 
-echo "== Test 2: a bash holder blocks a pwsh waiter (no concurrent hold, no wrongful steal) =="
+if section "Test 2: a bash holder blocks a pwsh waiter (no concurrent hold, no wrongful steal)"; then
 LOCK="$WORK/b2.lock"; LOG="$WORK/b2.log"; : > "$LOG"; ORDER="$WORK/b2.order"; : > "$ORDER"
 READY="$WORK/b2.ready"; rm -f "$READY"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \
@@ -276,8 +214,9 @@ wait "$holder"
 got="$(tr '\n' ',' < "$ORDER")"
 [ "$got" = "sh-start,sh-end,ps-ran," ] && ok "bash-holds / pwsh-waits ordering correct" || bad "ordering wrong: $got"
 grep -q STOLE "$LOG" && bad "pwsh wrongly STOLE a live bash lock" || ok "pwsh did not steal the live bash lock"
+fi
 
-echo "== Test 3: a pwsh holder blocks a bash waiter =="
+if section "Test 3: a pwsh holder blocks a bash waiter"; then
 LOCK="$WORK/b3.lock"; LOG="$WORK/b3.log"; : > "$LOG"; ORDER="$WORK/b3.order"; : > "$ORDER"
 READY="$WORK/b3.ready"; rm -f "$READY"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \
@@ -290,8 +229,9 @@ wait "$holder"
 got="$(tr '\n' ',' < "$ORDER")"
 [ "$got" = "ps-start,ps-end,sh-ran," ] && ok "pwsh-holds / bash-waits ordering correct" || bad "ordering wrong: $got"
 grep -q STOLE "$LOG" && bad "bash wrongly STOLE a live pwsh lock" || ok "bash did not steal the live pwsh lock"
+fi
 
-echo "== Test 4: pwsh steals a STALE lock fabricated as bash's (old file mtime) =="
+if section "Test 4: pwsh steals a STALE lock fabricated as bash's (old file mtime)"; then
 # AGENT_LOCK_MAX_WAIT caps the run so a steal regression fails in ~20s, not 420s.
 LOCK="$WORK/b4.lock"; LOG="$WORK/b4.log"; : > "$LOG"; MARK="$WORK/b4.mark"; printf '%s' before > "$MARK"
 fabricate_lock "$LOCK" "tok.sh.ghost.1" "pid=99999 host=ghost"
@@ -304,22 +244,28 @@ grep -q STOLE "$LOG" && ok "log records the cross-impl steal" || bad "no STOLE e
 grep -q "holder=pid=99999 host=ghost" "$LOG" \
   && ok "STALE log line carries the holder parsed from line 2 (cross-impl wire format)" \
   || bad "holder from line 2 missing in pwsh's STALE log line"
+fi
 
-echo "== Test 5: bash steals a STALE lock GENUINELY created by pwsh (holder killed mid-hold) =="
-# The stale lock really is pwsh's: a pwsh process dot-sources the lock, acquires,
-# signals ready, then is hard-killed by PID mid-hold (TerminateProcess — no
-# release, no exit event), leaving its live lock FILE (token line 1) behind.
+if section "Test 5: bash steals a STALE lock GENUINELY created by pwsh (holder killed mid-hold)"; then
+# The stale lock really is pwsh's: a pwsh process dot-sources the lock, acquires (writing
+# its tok.ps.* token to line 1 and flushing+closing the file), signals ready, then
+# SELF-EXITS via [Environment]::Exit(0) — the port's documented hard-exit that bypasses
+# BOTH Lock-Release AND the PowerShell.Exiting backstop — leaving its live token'd lock
+# FILE behind with no release. This is DETERMINISTIC: the same on-disk state as a holder
+# killed mid-hold, but without an external kill. (An MSYS `kill -9 "$!"` does NOT reliably
+# terminate the native pwsh.exe under load — it survived, ran to completion, and its
+# graceful-exit backstop DELETED the lock, leaving an empty file to steal; observed under
+# CPU load, run 27621668323. See the Test 5 de-flake plan.)
 LOCK="$WORK/b5.lock"; LOG="$WORK/b5.log"; : > "$LOG"; MARK="$WORK/b5.mark"; printf '%s' before > "$MARK"
 READY="$WORK/b5.ready"; rm -f "$READY"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 \
-  pwsh -NoProfile -Command ". '$PS1WIN'; Lock-Acquire | Out-Null; [IO.File]::WriteAllText('$READY','r'); Start-Sleep 60" &
+  pwsh -NoProfile -Command ". '$PS1WIN'; if (-not (Lock-Acquire)) { [Environment]::Exit(3) }; [IO.File]::WriteAllText('$READY','r'); [Environment]::Exit(0)" &
 hpid=$!
 if wait_for "$READY"; then
-  kill -9 "$hpid" 2>/dev/null; wait "$hpid" 2>/dev/null
-  sleep 0.3
+  wait "$hpid" 2>/dev/null                          # holder self-exited via [Environment]::Exit (no release); reap it
   tok="$(head -n 1 "$LOCK" 2>/dev/null | tr -d '\r\n')"
   case "$tok" in
-    tok.ps.*) ok "dead pwsh holder left its own lock file behind (token $tok)" ;;
+    tok.ps.*) ok "self-exited pwsh holder left its own token'd lock behind (token $tok)" ;;
     *)        bad "expected a tok.ps.* token on line 1 of the orphan lock, got '$tok'" ;;
   esac
   backdate "$LOCK" 9999                           # age the orphan past any stale window
@@ -332,8 +278,9 @@ else
   kill -9 "$hpid" 2>/dev/null; wait "$hpid" 2>/dev/null
   bad "T5 pwsh holder never acquired/signalled ready"
 fi
+fi
 
-echo "== Test 6: deterministic lost-update counter, mixed bash+pwsh increments ($GCL_MODE width) =="
+if section "Test 6: deterministic lost-update counter, mixed bash+pwsh increments ($GCL_MODE width)"; then
 # The deterministic complement to Test 1's exclusion probe (which has a blind
 # window and tolerates launch flakiness): every worker MUST launch (strict rc
 # checks) and the final counter MUST equal the total increments — any lost
@@ -379,8 +326,9 @@ cat "$WORK"/cnt-*.log > "$WORK/cnt-all.log" 2>/dev/null || : > "$WORK/cnt-all.lo
 a="$(grep -c ACQUIRED "$WORK/cnt-all.log")"; rl="$(grep -c RELEASED "$WORK/cnt-all.log")"
 [ "$a" = "$CTOT" ] && [ "$rl" = "$CTOT" ] && ok "lock logs balanced ($a acquired / $rl released)" || bad "lock logs unbalanced: acquired=$a released=$rl want=$CTOT"
 [ -e "$LOCK" ] && bad "leftover counter lock" || ok "no leftover lock"
+fi
 
-echo "== Test 7: pwsh run propagates the command's exit code (two contending runs in parallel) =="
+if section "Test 7: pwsh run propagates the command's exit code (two contending runs in parallel)"; then
 LOCK="$WORK/rc.lock"; LOG="$WORK/rc.log"; : > "$LOG"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_MAX_WAIT=60 \
   pwsh -NoProfile -File "$PS1WIN" run "exit 0" & p0=$!
@@ -391,8 +339,9 @@ wait "$p7"; rc7=$?
 [ "$rc0" = 0 ] && ok "pwsh exit 0 propagated" || bad "pwsh exit 0 not propagated (rc=$rc0)"
 [ "$rc7" = 7 ] && ok "pwsh exit 7 propagated" || bad "pwsh exit code not propagated ($rc7)"
 [ -e "$LOCK" ] && bad "lock left held after pwsh run" || ok "lock released after pwsh run (success and failure)"
+fi
 
-echo "== Test 7b: ps1 run verdicts for PowerShell-NATIVE failure (a failing cmdlet must not exit 0) =="
+if section "Test 7b: ps1 run verdicts for PowerShell-NATIVE failure (a failing cmdlet must not exit 0)"; then
 # A cmdlet's non-terminating error never sets LASTEXITCODE, so a runner
 # consulting only LASTEXITCODE would return 0 for a failed command. The
 # runner must consult the staged script's FINAL '$?' when no nonzero native
@@ -430,8 +379,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_MAX_WAIT=20 \
 [ "$rc" = 0 ] && ok "mid-command cmdlet failure + succeeding final statement -> 0 (the documented final-statement limitation)" \
               || bad "limitation pin: rc=$rc (want 0 — has the final-statement contract changed?)"
 [ -e "$LOCK" ] && bad "lock left held after the failing-cmdlet verdict runs" || ok "no leftover lock after the failing-cmdlet verdict runs"
+fi
 
-echo "== Test 7c: ps1 CLI help/usage convention — explicit help -> stdout + exit 0; usage errors -> stderr + 96 =="
+if section "Test 7c: ps1 CLI help/usage convention — explicit help -> stdout + exit 0; usage errors -> stderr + 96"; then
 # (bash's side of the same convention is pinned in the unit suite, Test 7.)
 for h in --help -h; do
   pwsh -NoProfile -File "$PS1WIN" "$h" > "$WORK/t7c.out" 2> "$WORK/t7c.err"; rc=$?
@@ -451,8 +401,9 @@ pwsh -NoProfile -File "$PS1WIN" > "$WORK/t7c-noargs.out" 2> "$WORK/t7c-noargs.er
   || bad "ps1 no-args rc=$rc (want 96) stderr-usage=$(grep -c '^usage:' "$WORK/t7c-noargs.err")"
 pwsh -NoProfile -File "$PS1WIN" frobnicate >/dev/null 2>&1; rc=$?
 [ "$rc" = 96 ] && ok "ps1 unknown subcommand -> 96" || bad "ps1 unknown subcommand rc=$rc (want 96)"
+fi
 
-echo "== Test 8: a ROBBED holder exits 98 — pwsh victim/bash thief, then bash victim/pwsh thief =="
+if section "Test 8: a ROBBED holder exits 98 — pwsh victim/bash thief, then bash victim/pwsh thief"; then
 # Fail-open ceiling, cross-impl: the victim holds past its 1s stale window
 # UNTIL THE THIEF IS DONE (marker, not a fixed sleep — a fixed hold once let a
 # slow-starting thief arrive after the victim had already released), the other
@@ -485,15 +436,17 @@ touch "$TDONE"
 wait "$vic"; vic_rc=$?
 [ "$vic_rc" = 98 ] && ok "robbed bash holder exited 98" || bad "robbed bash holder exited $vic_rc (want 98)"
 [ "$thief_rc" = 0 ] && ok "pwsh thief exited 0" || bad "pwsh thief exited $thief_rc"
+fi
 
-echo "== Test 9: a slow but UNCONTENDED pwsh holder keeps its lock (slowness != failure) =="
+if section "Test 9: a slow but UNCONTENDED pwsh holder keeps its lock (slowness != failure)"; then
 LOCK="$WORK/slow.lock"; LOG="$WORK/slow.log"; : > "$LOG"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \
   pwsh -NoProfile -File "$PS1WIN" run "Start-Sleep 2"; rc=$?
 [ "$rc" = 0 ] && ok "uncontended slow pwsh holder exited 0" || bad "uncontended slow pwsh holder exited $rc"
 grep -q "WARNING" "$LOG" && bad "spurious theft WARNING with no contender" || ok "no spurious WARNING when uncontended"
+fi
 
-echo "== Test 10: default lock location is <gitdir>/commit.lock for BOTH impls (regression: item 1) =="
+if section "Test 10: default lock location is <gitdir>/commit.lock for BOTH impls (regression: item 1)"; then
 # The BLOCKER this guards against: the .ps1 silently fell back to a CWD lock at
 # default config, so the two impls never contended. Run BOTH impls from a
 # SUBDIRECTORY of a scratch repo with AGENT_LOCK_PATH/LOG unset; each command
@@ -515,8 +468,9 @@ nps="$(grep -c "ACQUIRED.*tok=tok\.ps\." "$DLOG" 2>/dev/null)"
   && ok "shared <gitdir> log shows 1 bash + 1 pwsh acquisition" \
   || bad "default-log evidence wrong: ACQUIRED=$na (want 2), pwsh tokens=$nps (want 1) in $DLOG"
 [ -e "$GITDIR2/commit.lock" ] && bad "leftover default lock" || ok "no leftover default lock"
+fi
 
-echo "== Test 11: release-time classification agrees across impls — truncated => unverifiable (1); deleted => theft (98) =="
+if section "Test 11: release-time classification agrees across impls — truncated => unverifiable (1); deleted => theft (98)"; then
 # (i) TRUNCATED at release: the file still exists but reads EMPTY after the
 # retry ladder. NOT provable theft (it is the probe-F create->write window of
 # a successor after a boundary steal, or external truncation), so BOTH impls
@@ -545,8 +499,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_MAX_WAIT=20 \
   pwsh -NoProfile -File "$PS1WIN" run "Remove-Item -LiteralPath '$LOCK' -Force" 2>/dev/null; rc_ps=$?
 [ "$rc_sh" = 98 ] && ok "bash: lock GONE at release -> exit 98 (theft)" || bad "bash gone-at-release rc=$rc_sh (want 98)"
 [ "$rc_ps" = 98 ] && ok "pwsh: lock GONE at release -> exit 98 (theft)" || bad "pwsh gone-at-release rc=$rc_ps (want 98)"
+fi
 
-echo "== Test 12: fractional STALE/MAX_WAIT rejected identically by both impls (note + default) =="
+if section "Test 12: fractional STALE/MAX_WAIT rejected identically by both impls (note + default)"; then
 # These two knobs are integers in both impls; a fractional value silently
 # rounded by one side but rejected by the other would give the two impls
 # DIFFERENT steal thresholds for the same env. Both must note + use defaults.
@@ -601,10 +556,11 @@ n_ps="$(grep -c 'ignoring invalid' "$WORK/poll-ps.err")"
 [ "$rc_sh" = 0 ] && [ "$n_sh" = 0 ] && [ "$rc_ps" = 0 ] && [ "$n_ps" = 0 ] \
   && ok "POLL_SECS='' (empty): silent default in BOTH impls (no note)" \
   || bad "POLL_SECS='' parity: sh rc=$rc_sh notes=$n_sh; pwsh rc=$rc_ps notes=$n_ps (want rc 0 + 0 notes each)"
+fi
 
 if [ "$GCL_WINDOWS" = 1 ]; then
 
-echo "== Test 13: blocked release (no-delete-share handle) — deterministic LEFTOVER, run keeps the command's code, then recovery =="
+if section "Test 13: blocked release (no-delete-share handle) — deterministic LEFTOVER, run keeps the command's code, then recovery"; then
 # Probe D1 made this lane deterministically testable (TODO #30): a pwsh
 # FileShare.Read handle on the lock file blocks the release unlink (and any
 # steal rename) until it closes. (a) sourced bash: lock_release returns 1 and
@@ -708,8 +664,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 AGENT_LOCK
 [ "$rc" = 0 ] && ok "leftover reclaimed once the handle closed + stale window elapsed (TODO #30 lane)" \
               || bad "leftover recovery rc=$rc (want 0)"
 grep -q STOLE "$LOG" && ok "recovery steal logged" || bad "no STOLE entry during leftover recovery"
+fi
 
-echo "== Test 14: blocked steal — a no-delete-share handle on a STALE lock defers the steal until it closes =="
+if section "Test 14: blocked steal — a no-delete-share handle on a STALE lock defers the steal until it closes"; then
 # Same handle class against a stale lock: the stealer's rename keeps failing
 # while the handle is open (probe D1), so it re-polls — and acquires promptly
 # once the handle closes. Run with the ps1 stealer: this exercises its
@@ -737,8 +694,9 @@ else
   touch "$BGO"; wait "$blk14" 2>/dev/null
   bad "T14 blocker never signalled its handle open"
 fi
+fi
 
-echo "== Test 14b: blocked steal NEVER bypasses MAX_WAIT — squatted stale lock => 97 with bounded logging (regression: busy-spin) =="
+if section "Test 14b: blocked steal NEVER bypasses MAX_WAIT — squatted stale lock => 97 with bounded logging (regression: busy-spin)"; then
   # Discriminator: when the steal rename keeps
 # failing with the lock file still present (a no-delete-share handle squatting
   # it), a failed-steal lane that `continue`s past the timeout check AND the
@@ -810,13 +768,14 @@ else
   bad "T14b squatter never signalled its handle open"
 fi
 rm -f "$LOCK"
+fi
 
 else
   echo "== Tests 13/14/14b SKIPPED (POSIX): open handles never block unlink/rename here =="
   echo "note: the LEFTOVER and blocked-steal lanes are Windows-only by construction (.NET's Unix FileShare gates no namespace operation); the Windows CI leg covers them"
 fi
 
-echo "== Test 15: ps1-side never-steal guards — dir, dangling symlink, non-lock content (parity with the bash guards) =="
+if section "Test 15: ps1-side never-steal guards — dir, dangling symlink, non-lock content (parity with the bash guards)"; then
 # The ps1 guards use different APIs than bash (PSIsContainer, reparse
 # attributes, the catch-all CreateNew exception), so bash coverage proves
 # nothing about them. The wrong-type warning needs the SAME concrete type on
@@ -875,21 +834,22 @@ grep -q "is not a lock file" "$WORK/psuser.err" && ok "ps1: config warning names
                                                 || bad "ps1: no config warning for non-lock content"
 grep -q STOLE "$LOG" && bad "ps1 STOLE the user file" || ok "ps1: no steal of the user file"
 rm -f "$LOCK"
+fi
 
-echo "== Test 16: crash recovery under CONTENTION, mixed impls — claim-serialized: zero displacement, zero 98s =="
+if section "Test 16: crash recovery under CONTENTION, mixed impls — claim-serialized: zero displacement, zero 98s"; then
 # Cross-impl variant of the unit suite's Test 2b (which carries the full
-# rationale): 2 bash + 2 pwsh waiters race ONE crashed lock. Under the claim
-# protocol the straggler-robs-recovery-winner race is PREVENTED (the claim
-# serializes stealers across the wire format), not detected-and-repaired, so
-# the assertions are strict: every waiter exits 0 (zero spurious 98s — an
-# unserialized implementation displaces the recovery winner near-certainly),
-# exactly ONE STOLE-BY-CLAIM, NO move-aside file ever exists (an
-# implementation that staged the steal through an intermediate .dead.* file
-# would re-open the displacement race; a background sampler proves no such
-# file ever appears — and the unserialized "STOLE stale lock" line shape and
-# any STEAL-DISPLACED repair line must never appear), and the final state
-# is clean (no lock, no claim). Sync: waiters launch against a FRESH
-# fabricated lock and only once all four have logged WAITING is it
+# rationale): N waiters, split half bash / half pwsh, race ONE crashed lock.
+# Under the claim protocol the straggler-robs-recovery-winner race is
+# PREVENTED (the claim serializes stealers across the wire format), not
+# detected-and-repaired, so the assertions are strict: every waiter exits 0
+# (zero spurious 98s — an unserialized implementation displaces the recovery
+# winner near-certainly), exactly ONE STOLE-BY-CLAIM, NO move-aside file ever
+# exists (an implementation that staged the steal through an intermediate
+# .dead.* file would re-open the displacement race; a background sampler proves
+# no such file ever appears — and the unserialized "STOLE stale lock" line
+# shape and any STEAL-DISPLACED repair line must never appear), and the final
+# state is clean (no lock, no claim). Sync: waiters launch against a FRESH
+# fabricated lock and only once all have logged WAITING is it
 # backdated, so all judge stale within one poll window despite pwsh's slow
 # cold start; the sync keeps the ghost fresh while it waits
 # (sync_waiting_fresh) so a stalled sync can't let the ghost age stale on
@@ -901,13 +861,34 @@ echo "== Test 16: crash recovery under CONTENTION, mixed impls — claim-seriali
 # the run's premise is broken (the touch may have aged the WINNER'S live
 # lock), so the run is discarded and retried (bounded) instead of failing
 # assertions the protocol never violated.
+#
+# Waiter count is swept over $T_AXIS_A (see load-testing-strategy.md): one iteration at N=4 by
+# default (2 bash + 2 pwsh — byte-identical to today) and at N=4,12,24 under
+# GCL_TEST_SWEEP=1. N is split into a bash half (N/2) and a pwsh half (the
+# remainder); at N=4 that is 2+2 exactly. The correctness invariants stay strict
+# at EVERY N — but that needs STALE >> the winner's EFFECTIVE hold, which grows
+# with N under load (the winner is one of N concurrent processes), so when
+# sweeping, STALE is raised to max(8, N) (t16_stale); the default run keeps
+# today's 8. MAX_WAIT scales too (30*N => 120 at N=4) so a wide, pwsh-cold-start-
+# heavy sweep has time to drain. The per-N tag on the non-count-naming
+# assertions is suppressed in the default run so the messages stay byte-identical.
 LOCK="$WORK/recov.lock"
 T16_TRIES=3
 T16_GRAVESEEN="$WORK/recov.graveseen"; T16_SAMPSTOP="$WORK/recov.sampstop"
+for T16_N in $T_AXIS_A; do
+t16_nsh=$(( T16_N / 2 )); t16_nps=$(( T16_N - t16_nsh ))   # bash half + pwsh half (2+2 at N=4)
+t16_maxwait=$(( 30 * T16_N ))
+# STALE budget: today's 8 in the default (non-sweep) run for byte-identical
+# behaviour; when sweeping, raise it to N once N > 8 so a wide fan-out's load-
+# stretched winner hold can never make its own live lock look stale (a legitimate
+# but unwanted second steal), keeping "exactly one steal" strict at every N.
+if [ "$GCL_TEST_SWEEP" = 1 ] && [ "$T16_N" -gt 8 ]; then t16_stale="$T16_N"; else t16_stale=8; fi
+if [ "$GCL_TEST_SWEEP" = 1 ]; then t16_ntag=" at N=$T16_N"; else t16_ntag=""; fi
 t16_valid=0; t16_sync=1; t16_fail=0; n98=0
 for t16_try in $(seq 1 "$T16_TRIES"); do
-  T16_GHOST="tok.ghost.recov.$t16_try"
-  rm -f "$WORK"/recov.ran.* "$T16_GRAVESEEN" "$T16_SAMPSTOP" "$LOCK" "$LOCK.next" 2>/dev/null
+  T16_GHOST="tok.ghost.recov.$T16_N.$t16_try"
+  rm -f "$WORK"/recov.ran.* "$WORK"/recov-sh*.log "$WORK"/recov-ps*.log \
+        "$T16_GRAVESEEN" "$T16_SAMPSTOP" "$LOCK" "$LOCK.next" 2>/dev/null
   fabricate_lock "$LOCK" "$T16_GHOST" "pid=999 host=ghost"   # fresh mtime: not yet stale
   (
     while [ ! -e "$T16_SAMPSTOP" ]; do
@@ -918,41 +899,45 @@ for t16_try in $(seq 1 "$T16_TRIES"); do
     done
   ) &
   t16_sampler=$!
-  pids=()
-  for i in 1 2; do
+  pids=(); t16_logs=()
+  for i in $(seq 1 "$t16_nsh"); do
     : > "$WORK/recov-sh$i.log"   # per-waiter logs: concurrent appends to one log drop lines
-    AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-sh$i.log" AGENT_LOCK_STALE_SECS=8 \
-      AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \
+    t16_logs+=("$WORK/recov-sh$i.log")
+    AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-sh$i.log" AGENT_LOCK_STALE_SECS="$t16_stale" \
+      AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t16_maxwait" \
       bash "$SH" run -- bash -c 'touch "$1"; sleep 0.1' _ "$WORK/recov.ran.sh$i" 2>/dev/null &
     pids+=($!)
   done
-  for i in 1 2; do
+  for i in $(seq 1 "$t16_nps"); do
     : > "$WORK/recov-ps$i.log"
-    AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-ps$i.log" AGENT_LOCK_STALE_SECS=8 \
-      AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \
+    t16_logs+=("$WORK/recov-ps$i.log")
+    AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-ps$i.log" AGENT_LOCK_STALE_SECS="$t16_stale" \
+      AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t16_maxwait" \
       pwsh -NoProfile -File "$PS1WIN" run "[IO.File]::WriteAllText('$WORK/recov.ran.ps$i', 'x'); Start-Sleep -Milliseconds 100" 2>/dev/null &
     pids+=($!)
   done
   t16_sync=1
-  if ! sync_waiting_fresh "$LOCK" 90 "$WORK/recov-sh1.log" "$WORK/recov-sh2.log" \
-                          "$WORK/recov-ps1.log" "$WORK/recov-ps2.log"; then
+  if ! sync_waiting_fresh "$LOCK" 90 "${t16_logs[@]}"; then
     t16_sync=0
-    for f in "$WORK/recov-sh1.log" "$WORK/recov-sh2.log" "$WORK/recov-ps1.log" "$WORK/recov-ps2.log"; do
-      grep -q "WAITING for lock" "$f" 2>/dev/null || echo "  T16 waiter never contended (no WAITING in ${f##*/})"
+    for f in "${t16_logs[@]}"; do
+      grep -q "WAITING for lock" "$f" 2>/dev/null || echo "  T16 N=$T16_N waiter never contended (no WAITING in ${f##*/})"
     done
   fi
-  backdate_ghost "$LOCK" "$T16_GHOST" 9999; t16_bd=$?   # all four now judge the ghost stale together
+  backdate_ghost "$LOCK" "$T16_GHOST" 9999; t16_bd=$?   # all waiters now judge the ghost stale together
   t16_fail=0; n98=0
   for p in "${pids[@]}"; do
     wait "$p"; rc=$?
     case "$rc" in
       0)  ;;
-      98) n98=$((n98+1)); echo "  T16 waiter rc=98 — displacement under the claim protocol" ;;
-      *)  t16_fail=1; echo "  T16 waiter rc=$rc (want 0)" ;;
+      98) n98=$((n98+1)); echo "  T16 N=$T16_N waiter rc=98 — displacement under the claim protocol" ;;
+      *)  t16_fail=1; echo "  T16 N=$T16_N waiter rc=$rc (want 0)" ;;
     esac
   done
   touch "$T16_SAMPSTOP"; wait "$t16_sampler" 2>/dev/null
-  cat "$WORK"/recov-*.log > "$WORK/recov-all.log" 2>/dev/null || : > "$WORK/recov-all.log"
+  # Aggregate from the explicit per-waiter log list, NOT a recov-*.log glob: the
+  # glob would also match recov-all.log itself, which now persists across the
+  # per-N sweep iterations, so it could fold a stale aggregate into the counts.
+  cat "${t16_logs[@]}" > "$WORK/recov-all.log" 2>/dev/null || : > "$WORK/recov-all.log"
   if [ "$t16_bd" != 0 ]; then
     # The backdate was NOT conclusively clean (see backdate_ghost; under
     # load the whole steal+release cycle often completes before the
@@ -969,7 +954,7 @@ for t16_try in $(seq 1 "$T16_TRIES"); do
     [ "$(grep -c "lock LOST" "$WORK/recov-all.log")" = 0 ] || t16_dirty=1
     { [ -e "$LOCK" ] || [ -e "$LOCK.next" ]; } && t16_dirty=1
     if [ "$t16_dirty" = 1 ]; then
-      echo "  T16 try $t16_try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying"
+      echo "  T16 N=$T16_N try $t16_try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying"
       rm -f "$LOCK" "$LOCK.next" 2>/dev/null
       continue
     fi
@@ -984,32 +969,34 @@ if [ "$t16_valid" = 1 ]; then
   nold="$(grep -c "STOLE stale lock" "$WORK/recov-all.log")"
   ndisp="$(grep -c "STEAL-DISPLACED" "$WORK/recov-all.log")"
   [ "$t16_fail" = 0 ] && [ "$t16_sync" = 1 ] \
-    && ok "2 bash + 2 pwsh waiters on one crashed lock: every waiter exited 0" \
-    || bad "mixed crash-recovery exits wrong (see above)"
-  [ "$n98" = 0 ] && ok "zero spurious 98s — the claim serialized recovery across implementations" \
-                 || bad "$n98 waiter(s) exited 98 — displacement happened under the claim protocol"
-  [ "$nran" = 4 ] && ok "all 4 waiter commands ran" || bad "only $nran/4 waiter commands ran"
-  [ "$nstole" = 1 ] && ok "exactly ONE STOLE-BY-CLAIM (the claim serialized the cross-impl recovery)" \
-                    || bad "STOLE-BY-CLAIM x$nstole (want exactly 1)"
+    && ok "$t16_nsh bash + $t16_nps pwsh waiters on one crashed lock: every waiter exited 0" \
+    || bad "mixed crash-recovery exits wrong$t16_ntag (see above)"
+  [ "$n98" = 0 ] && ok "zero spurious 98s$t16_ntag — the claim serialized recovery across implementations" \
+                 || bad "$n98 waiter(s) exited 98$t16_ntag — displacement happened under the claim protocol"
+  [ "$nran" = "$T16_N" ] && ok "all $T16_N waiter commands ran" || bad "only $nran/$T16_N waiter commands ran"
+  [ "$nstole" = 1 ] && ok "exactly ONE STOLE-BY-CLAIM$t16_ntag (the claim serialized the cross-impl recovery)" \
+                    || bad "STOLE-BY-CLAIM x$nstole$t16_ntag (want exactly 1)"
   grep -q "STOLE-BY-CLAIM.*ghost=pid=999 host=ghost" "$WORK/recov-all.log" \
-    && ok "the steal line attributes the crashed ghost cross-impl (wire-format line 2 parsed)" \
-    || bad "STOLE-BY-CLAIM does not carry the ghost's line-2 attribution"
+    && ok "the steal line attributes the crashed ghost cross-impl (wire-format line 2 parsed)$t16_ntag" \
+    || bad "STOLE-BY-CLAIM does not carry the ghost's line-2 attribution$t16_ntag"
   grep -q "CLAIM .*tok=tok\." "$WORK/recov-all.log" \
-    && ok "claim create logged with its per-attempt token (CLAIM ... tok=)" \
-    || bad "no CLAIM line with a token in the recovery logs"
-  [ "$nold" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged" \
-    || bad "'STOLE stale lock' shape appeared x$nold — an unserialized steal lane is present"
-  [ "$ndisp" = 0 ] && ok "zero STEAL-DISPLACED lines (prevention, not detect-and-repair)" \
-    || bad "STEAL-DISPLACED fired x$ndisp — displacement-repair machinery present?"
-  [ -e "$T16_GRAVESEEN" ] && bad "a move-aside file (.dead.*) existed during recovery — the steal is staged through an intermediate file!" \
-    || ok "no move-aside file (.dead.*) ever existed during recovery (sampler)"
-  [ -e "$LOCK" ] && bad "leftover crash-recovery lock" || ok "no leftover lock"
-  [ -e "$LOCK.next" ] && bad "leftover claim after recovery" || ok "no leftover claim"
+    && ok "claim create logged with its per-attempt token (CLAIM ... tok=)$t16_ntag" \
+    || bad "no CLAIM line with a token in the recovery logs$t16_ntag"
+  [ "$nold" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged$t16_ntag" \
+    || bad "'STOLE stale lock' shape appeared x$nold$t16_ntag — an unserialized steal lane is present"
+  [ "$ndisp" = 0 ] && ok "zero STEAL-DISPLACED lines (prevention, not detect-and-repair)$t16_ntag" \
+    || bad "STEAL-DISPLACED fired x$ndisp$t16_ntag — displacement-repair machinery present?"
+  [ -e "$T16_GRAVESEEN" ] && bad "a move-aside file (.dead.*) existed during recovery$t16_ntag — the steal is staged through an intermediate file!" \
+    || ok "no move-aside file (.dead.*) ever existed during recovery (sampler)$t16_ntag"
+  [ -e "$LOCK" ] && bad "leftover crash-recovery lock$t16_ntag" || ok "no leftover lock$t16_ntag"
+  [ -e "$LOCK.next" ] && bad "leftover claim after recovery$t16_ntag" || ok "no leftover claim$t16_ntag"
 else
-  bad "T16: no clean run under a conclusive backdate in $T16_TRIES attempts (see above)"
+  bad "T16: no clean run under a conclusive backdate in $T16_TRIES attempts$t16_ntag (see above)"
+fi
+done
 fi
 
-echo "== Test 16b: bash claimant vs ps1 claimant racing ONE ghost — one claim winner, cross-impl wire parity =="
+if section "Test 16b: bash claimant vs ps1 claimant racing ONE ghost — one claim winner, cross-impl wire parity"; then
 # The 1+1 distilled version of Test 16: one bash and one pwsh waiter race the
 # same ancient ghost. Exactly one wins the O_EXCL claim and steals
 # (STOLE-BY-CLAIM x1); the loser either loses the claim create (a young
@@ -1081,8 +1068,9 @@ if [ "$t16b_valid" = 1 ]; then
 else
   bad "T16b: no clean run under a conclusive backdate in $T16B_TRIES attempts (see above)"
 fi
+fi
 
-echo "== Test 16c: cross-impl claim staleness — each side clears the OTHER side's aged claim; young foreign claims are respected =="
+if section "Test 16c: cross-impl claim staleness — each side clears the OTHER side's aged claim; young foreign claims are respected"; then
 # (a) bash clears an aged ps1-tokened claim, then completes the steal.
 LOCK="$WORK/cstale.lock"; LOG="$WORK/cstale.log"; : > "$LOG"
 fabricate_lock "$LOCK" "tok.ghost.cstale" "pid=9 host=ghost"; backdate "$LOCK" 9999
@@ -1132,8 +1120,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
   && ok "ps1 respected a young bash claim (97, claim intact, no clear/steal)" \
   || bad "ps1 young-bash-claim handling: rc=$rc intact=$([ -f "$LOCK.next" ] && echo yes || echo no)"
 rm -f "$LOCK" "$LOCK.next"
+fi
 
-echo "== Test 16d: static checks — no File.Replace in the ps1 port =="
+if section "Test 16d: static checks — no File.Replace in the ps1 port"; then
 # File.Replace is deliberately never used: it throws on a
 # read-only destination and has partial-failure states when called without a
 # backup file. The 5.1 lane must stay unlink + fail-if-exists Move.
@@ -1142,8 +1131,9 @@ if grep -qE 'File\]?::Replace' "$ROOT/git-commit-lock.ps1"; then
 else
   ok "git-commit-lock.ps1 contains no File.Replace call"
 fi
+fi
 
-echo "== Test 16e: ps1 arc-end pass keeps INCONCLUSIVE entries; trap-time discovery-HOLD releases per normal release semantics =="
+if section "Test 16e: ps1 arc-end pass keeps INCONCLUSIVE entries; trap-time discovery-HOLD releases per normal release semantics"; then
 # Driven directly via a dot-sourcing pwsh driver — the ps1 side's
 # unit-equivalent steering mechanism (the lib skips its CLI when
 # dot-sourced). Part 1: the arc-end resolution pass's entry-drop is gated
@@ -1237,8 +1227,9 @@ PSEOF
 else
   echo "note: the blocked trap-time release leg is Windows-only by construction (POSIX open handles never block unlink); the happy-path leg above pins the honest-log contract"
 fi
+fi
 
-echo "== Test 16f: ps1 claim-gone-at-touch — the SetLastWriteTimeUtc FileNotFound gone signal fires; no resurrection =="
+if section "Test 16f: ps1 claim-gone-at-touch — the SetLastWriteTimeUtc FileNotFound gone signal fires; no resurrection"; then
 # The unit suite's discovery-position matrix (T25) covers bash's
 # touch-gone lane; this is the ps1 counterpart: the claim passes the
 # step-3.1 recheck, vanishes before the step-3.2 touch (steered via the
@@ -1297,9 +1288,10 @@ PSEOF
 else
   echo "== Test 16f SKIPPED: claim-gone-at-touch steering uses Windows pwsh (POSIX legs cover the protocol via the bash matrix; the ps1 gone-catch is probed Q1) =="
 fi
+fi
 
 if command -v powershell >/dev/null 2>&1; then
-echo "== Test 17: Windows PowerShell 5.1 smoke lane — the ps1 must run, not just parse, on the in-box engine =="
+if section "Test 17: Windows PowerShell 5.1 smoke lane — the ps1 must run, not just parse, on the in-box engine"; then
 # Everything above runs the port under pwsh (7+). 5.1 ships in every Windows
 # 10/11 box and stays supported, so its claim is tested, not asserted: the
   # run lane's exit-code contract (0 / exit 7 / the failing-cmdlet -> 1) and
@@ -1369,11 +1361,20 @@ AGENT_LOCK_PATH="$LOCK51" AGENT_LOCK_LOG="$LOG51" AGENT_LOCK_STALE_SECS=2 \
 grep -q "CLAIM .*tok=tok\.ps\." "$LOG51" && ok "5.1: claim create logged with its per-attempt token" || bad "5.1: no CLAIM line with a tok.ps.* token"
 [ -e "$LOCK51" ] && bad "5.1: leftover lock after the steal ladder" || ok "5.1: no leftover lock"
 [ -e "$LOCK51.next" ] && bad "5.1: leftover claim after the steal ladder" || ok "5.1: no leftover claim"
+fi
 else
   echo "== Test 17 SKIPPED: Windows PowerShell 5.1 (powershell) not on PATH — POSIX leg; the Windows CI leg covers it =="
   echo "note: the 5.1 unlink+Move steal-ladder leg is part of this lane and is covered by the Windows CI leg"
 fi
 
 echo
+# Zero-match guard + selector-report line (shared helper in _harness.sh): a
+# set-but-non-matching GCL_TEST_ONLY ran no test block, so the (vacuously green)
+# verdict below would lie — bail loudly; a typo'd selector regex must FAIL, not
+# pass with zero assertions. When the selector matched, report how many blocks
+# ran. Both gated on GCL_TEST_ONLY non-empty so the default run stays unchanged.
+selector_report
+DONE=1
 echo "==== INTEROP RESULT: $PASS passed, $FAIL failed (fan-out: $GCL_MODE) ===="
+[ "$GCL_TAP" = 1 ] && echo "1..$TAPN"
 [ "$FAIL" = 0 ]
diff --git a/tests/git-commit-lock.test.sh b/tests/git-commit-lock.test.sh
index 021ea22..3a41419 100755
--- a/tests/git-commit-lock.test.sh
+++ b/tests/git-commit-lock.test.sh
@@ -7,7 +7,7 @@
 #
 # Fan-out: heavy concurrency tests default to REDUCED width so routine dev
 # runs don't lag a live shared machine; set GCL_TEST_FULL=1 (CI does) for the
-# full-strength canary. The suite prints which mode ran — a reduced pass must
+# full-strength fan-out. The suite prints which mode ran — a reduced pass must
 # never masquerade as the full one.
 #
 # On failure the work dir is PRESERVED (path printed) for post-mortem; set
@@ -21,21 +21,36 @@
 # and command substitutions run inside conditions all over a test suite; the
 # suite runs WITHOUT errexit (set -uo only) and asserts on values, not on
 # implicit exit propagation.
-# shellcheck disable=SC2016  # $INCR is single-quoted on purpose: it expands
-# inside the worker's `bash -c`, not here.
+# shellcheck disable=SC2016  # Single-quoted strings carrying `$…` on purpose —
+# steering-shell bodies (the T*_INNER `bash -c` programs) and grep patterns that
+# match literal `$_LOCK_*` text in the library — expand in their own context, not
+# here.
 set -uo pipefail
 
+# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad,
+# section, the finish EXIT-trap sentinel (calls our cleanup below), and the
+# shared timing/lock helpers (epoch_to_stamp, backdate, backdate_ghost,
+# sync_waiting_fresh, fabricate_lock, wait_for_grep). Resolved from THIS
+# script's own dir so it sources regardless of CWD; sourced EARLY (before any
+# use of the inits/helpers below).
+_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=tests/_harness.sh
+. "$_HARNESS_DIR/_harness.sh"
+
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ROOT="$(cd "$DIR/.." && pwd)"   # the implementations live at the repo root
 LIB="$ROOT/git-commit-lock.sh"
 
 if [ "${GCL_TEST_FULL:-0}" = 1 ]; then
-  GCL_MODE="FULL"; T1_ROUNDS=8; T1_N=25; T2B_ROUNDS=4; T20_N=10
+  GCL_MODE="FULL"; T2B_ROUNDS=4; T20_N=10
 else
-  GCL_MODE="REDUCED"; T1_ROUNDS=3; T1_N=8; T2B_ROUNDS=2; T20_N=5
+  GCL_MODE="REDUCED"; T2B_ROUNDS=2; T20_N=5
 fi
-echo "fan-out mode: $GCL_MODE (T1 ${T1_ROUNDS} rounds x ${T1_N} workers)"
-[ "$GCL_MODE" = REDUCED ] && echo "  (set GCL_TEST_FULL=1 for the full-strength 8x25 canary — CI runs it)"
+# (The full-width concurrency canary, formerly Test 1, now lives in its own file
+# tests/git-commit-lock.canary.test.sh; this suite's heavy fan-out is Test 2b /
+# Test 20.)
+echo "fan-out mode: $GCL_MODE (Test 2b ${T2B_ROUNDS} rounds, Test 20 ${T20_N} concurrent workers)"
+[ "$GCL_MODE" = REDUCED ] && echo "  (set GCL_TEST_FULL=1 for full-strength fan-out — CI runs it)"
 
 WORK="$(mktemp -d 2>/dev/null || echo "${TMPDIR:-/tmp}/git-commit-lock-test.$$")"
 mkdir -p "$WORK"
@@ -51,78 +66,32 @@ cleanup() {
     rm -rf "$WORK" 2>/dev/null || true
   fi
 }
-trap cleanup EXIT
-
-PASS=0; FAIL=0
-ok()  { echo "PASS: $*"; PASS=$((PASS+1)); }
-bad() { echo "FAIL: $*"; FAIL=$((FAIL+1)); }
-
-# Backdate a path's mtime by $2 seconds — the lock's staleness clock is the
-# lock FILE's own mtime (stamped by the creating write), so this is how a
-# test fakes a stale lock. Portable: BSD touch has no `-d @epoch`, so convert
-# the target epoch to a `touch -t` stamp via GNU `date -d @` with BSD
-# `date -r` as fallback.
-epoch_to_stamp() {
-  date -d "@$1" +%Y%m%d%H%M.%S 2>/dev/null || date -r "$1" +%Y%m%d%H%M.%S 2>/dev/null
-}
-backdate() { touch -t "$(epoch_to_stamp "$(( $(date +%s) - $2 ))")" "$1"; }
-
-# Token-guarded backdate for the contended-recovery rounds (T2b). Why: under
-# load a fast waiter can complete its ENTIRE steal (claim -> rename-over ->
-# ACQUIRED) before the harness's `touch` executes, so a blind backdate lands
-# on the WINNER'S freshly installed lock, making it instantly stale for every
-# rival — a legitimate re-steal then fails the round's "zero 98s / exactly
-# one STOLE-BY-CLAIM" assertions although the protocol behaved exactly as
-# designed (observed 2026-06-12 on a loaded box). Verdicts:
-#   * pre-read not the ghost: a waiter stole the ghost BEFORE the touch (it
-#     aged stale naturally during a stalled sync); no touch is performed and
-#     the round premise is gone — invalid, the caller retries the round.
-#   * post-read the ghost: conclusive — nothing ever rewrites the ghost
-#     token at the path, so the touch verifiably hit the ghost; any steal
-#     after the post-read steals an ALREADY-ancient ghost, exactly the
-#     scenario the round wants. Valid.
-#   * post-read anything else: a steal raced the touch->re-read window —
-#     COMMON under load (waiters poll every 0.05s; the post-read costs
-#     subprocess spawns), so it must not blindly invalidate. The lock's
-#     MTIME arbitrates which file the touch hit: a winner's installed lock
-#     is FRESH (the rename carries the claim file's just-created mtime), so
-#     fresh => the touch hit the GHOST and a legitimate steal followed —
-#     valid; ancient => the touch landed on the WINNER'S live lock and
-#     corrupted the round — invalid, retry. Vanished => cannot arbitrate —
-#     invalid, retry.
-backdate_ghost() {  # $1=lock $2=ghost token $3=age-secs -> 0 iff the round premise is intact
-  local pre post now mt
-  pre="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')"
-  [ "$pre" = "$2" ] || return 1
-  backdate "$1" "$3" 2>/dev/null || return 1
-  post="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')"
-  [ "$post" = "$2" ] && return 0
-  [ -e "$1" ] || return 1
-  now="$(date +%s)"
-  mt="$(stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null)" || return 1
-  [ $(( now - mt )) -lt $(( $3 / 2 )) ]
-}
+# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup()
+# above and fails loudly if the suite died before setting DONE=1.
+trap finish EXIT
 
-# Wait for every waiter's WAITING line while keeping the ghost lock FRESH
-# (touch -c to now, no-create so a released path is never resurrected): a
-# fresh ghost cannot be judged stale, so no waiter can steal it before the
-# guarded backdate — without this, a sync stalled past STALE (slow worker
-# cold starts on a loaded box) lets the ghost age stale naturally and a
-# waiter steals it mid-sync. Freshening is race-safe: if a steal slipped in
-# anyway, touching the winner's (already fresh) live lock to "now" is a
-# harmless no-op, and backdate_ghost's pre-read catches the broken premise.
-sync_waiting_fresh() {  # $1=lock $2=timeout-secs $3..=waiter logs -> 0 iff all logged WAITING
-  local lock="$1" deadline f ok=1
-  deadline=$(( $(date +%s) + $2 )); shift 2
-  for f in "$@"; do
-    until grep -q "WAITING for lock" "$f" 2>/dev/null; do
-      touch -c "$lock" 2>/dev/null
-      if [ "$(date +%s)" -ge "$deadline" ]; then ok=0; break; fi
-      sleep 0.2
-    done
-  done
-  [ "$ok" = 1 ]
-}
+# Envelope-tier assertions (see failure-modes.md §K / §4 item 1). A wall-clock or poll-count
+# bound is a Tier-2 (best-effort latency) property, NOT a correctness one (see
+# guarantees.md BE-1). In the default 'strict' tier these behave exactly like
+# ok/bad. Under GCL_ENVELOPE_TIER=relax (nightly/deep stress runs) an envelope FAIL
+# becomes a WARN that does NOT increment FAIL — so an oversubscribed runner can't
+# turn a latency miss into a red — while every CORRECTNESS assertion keeps ok/bad
+# and stays hard in both tiers. TAP-aware so envelope assertions still count toward 1..N.
+ENVELOPE_TIER="${GCL_ENVELOPE_TIER:-strict}"   # only the value 'relax' softens envelope misses; anything else takes the strict branch
+ENV_WARN=0   # envelope misses downgraded to WARN (incremented only in the relax branch)
+ok_envelope()  { PASS=$((PASS+1)); TAPN=$((TAPN+1)); echo "PASS[env]: $*"   # $* = assertion message; bumps PASS and the TAP point counter
+                 [ "$GCL_TAP" = 1 ] && echo "ok $TAPN - $*"; return 0; }   # TAP line only when GCL_TAP=1; always returns 0
+bad_envelope() {   # $* = assertion message; outcome depends on ENVELOPE_TIER
+  if [ "$ENVELOPE_TIER" = relax ]; then
+    ENV_WARN=$((ENV_WARN+1)); TAPN=$((TAPN+1)); echo "WARN[env-relaxed]: $*"   # relax: warn only, FAIL is left untouched
+    [ "$GCL_TAP" = 1 ] && echo "ok $TAPN - $* # env-relaxed"   # emitted to TAP as ok, tagged env-relaxed
+  else
+    FAIL=$((FAIL+1)); TAPN=$((TAPN+1)); echo "FAIL: $*"   # strict: a hard failure, increments FAIL
+    [ "$GCL_TAP" = 1 ] && echo "not ok $TAPN - $*"
+  fi; return 0; }   # always returns 0, whichever branch ran
+
+# epoch_to_stamp, backdate, backdate_ghost, and sync_waiting_fresh now live in
+# _harness.sh (sourced above) — shared byte-for-byte with the interop suite.
 
 # Clone a shell function under a new name — the steering tests' interposition
 # mechanism: a sourced test shell wraps a library internal (or a command like
@@ -135,66 +104,25 @@ clone_fn() {  # $1=existing function $2=new name
 }
 export -f clone_fn epoch_to_stamp backdate
 
-# Fabricate a lock file the way a real (foreign) holder would have written it:
-# token line + owner line. The token MUST be "tok."-prefixed (wire format) or
-# the steal's content guard will — correctly — refuse to steal it.
-fabricate_lock() {  # $1=path $2=token $3=owner
-  printf '%s\n%s\n' "$2" "$3" > "$1"
-}
+# fabricate_lock and wait_for_grep now live in _harness.sh (sourced above) —
+# shared byte-for-byte with the interop suite.
 
 # Wait (up to $2 seconds, default 15) for a marker file to appear. Holders
 # touch a ready-marker as their first act INSIDE the lock; tests gate on that
-# instead of sleep-margin head starts, which flaked under load.
+# instead of sleep-margin head starts, which flaked under load. Unit-only: the
+# interop suite has its own poll helper (wait_for, 50ms-iteration semantics).
 wait_for_file() {
   local f="$1" tries=$(( ${2:-15} * 20 ))
   while [ ! -e "$f" ] && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done
   [ -e "$f" ]
 }
 
-# Wait (up to $3 seconds, default 15) for a pattern to appear in a file.
-# Used to gate on the WAITING log line: proof the waiter actually contended,
-# without a fixed-length hold.
-wait_for_grep() {
-  local pat="$1" f="$2" tries=$(( ${3:-15} * 20 ))
-  while ! grep -q "$pat" "$f" 2>/dev/null && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done
-  grep -q "$pat" "$f" 2>/dev/null
-}
+# NB: Test 1 (the full-width concurrency CANARY) now lives in its own suite file,
+# tests/git-commit-lock.canary.test.sh, so it runs as a naturally-parallel CI job
+# (it is ~half the Windows unit wall-clock). The $INCR critical-section string it
+# used moved out with it (no other unit test uses it).
 
-# Critical section that loses updates without a mutex: read, gap, write+1.
-INCR='n="$(cat "$1")"; sleep 0.03; echo $((n+1)) > "$1"'
-
-echo "== Test 1: concurrent workers, mutual exclusion (repeated rounds, $GCL_MODE width) =="
-# A single pass is too weak to trust a rare exclusion race (the release-steal
-# bug found 2026-05-30 lost ~1 update per 25 only intermittently). Repeat
-# several rounds; ANY lost update across ALL rounds fails the test.
-# MAX_WAIT caps a regression at 180s per worker instead of the 420s default;
-# STALE stays comfortably above any realistic hold so nothing is ever stolen.
-N=$T1_N; ROUNDS=$T1_ROUNDS; t1_fail=0; T1ERR="$WORK/excl.err"; : > "$T1ERR"
-for r in $(seq 1 "$ROUNDS"); do
-  COUNTER="$WORK/counter.$r"; echo 0 > "$COUNTER"
-  LOCK="$WORK/excl.$r.lock"; LOG="$WORK/excl.$r.log"; : > "$LOG"; pids=()
-  for _ in $(seq 1 "$N"); do
-    AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=120 \
-      AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=180 \
-      bash "$LIB" run -- bash -c "$INCR" _ "$COUNTER" 2>> "$T1ERR" &
-    pids+=($!)
-  done
-  for p in "${pids[@]}"; do wait "$p"; done
-  c="$(cat "$COUNTER")"; a="$(grep -c ACQUIRED "$LOG")"; rl="$(grep -c RELEASED "$LOG")"
-  if [ "$c" != "$N" ] || [ "$a" != "$N" ] || [ "$rl" != "$N" ] || [ -e "$LOCK" ]; then
-    t1_fail=1; echo "  round $r: counter=$c acquired=$a released=$rl leftover=$([ -e "$LOCK" ] && echo yes || echo no)"
-  fi
-done
-[ "$t1_fail" = 0 ] && ok "$ROUNDS rounds x $N workers ($GCL_MODE): no lost updates, balanced acquire/release, no leftover lock" \
-                    || bad "mutual-exclusion failure in at least one round (see above)"
-# Regression: under contention the lock file routinely vanishes mid-mtime-probe;
-# that must NOT be misdiagnosed as "staleness detection broken" (false WARNING
-# observed 2026-06-10 before the probe got its retry loop).
-grep -q "Staleness detection is BROKEN" "$T1ERR" \
-  && bad "spurious mtime-probe WARNING under contention (see $T1ERR)" \
-  || ok "no spurious mtime-probe warnings under contention"
-
-echo "== Test 2: stale lock (old file mtime) is stolen; holder comes from line 2 =="
+if section "Test 2: stale lock (old file mtime) is stolen; holder comes from line 2"; then
 LOCK="$WORK/steal.lock"; LOG="$WORK/steal.log"; : > "$LOG"; MARKER="$WORK/steal-marker"
 fabricate_lock "$LOCK" "tok.fake.99999.1" "pid=99999 host=ghost"
 backdate "$LOCK" 9999                       # make the FILE mtime ancient -> stale
@@ -208,8 +136,9 @@ grep -q STOLE "$LOG" && ok "log records a steal" || bad "no STOLE entry"
 grep -q "holder=pid=99999 host=ghost" "$LOG" \
   && ok "STALE log line carries the holder parsed from line 2" \
   || bad "holder from line 2 missing in the STALE log line"
+fi
 
-echo "== Test 2b: crash recovery under CONTENTION — claim-serialized: zero displacement, zero 98s ($GCL_MODE: $T2B_ROUNDS rounds) =="
+if section "Test 2b: crash recovery under CONTENTION — claim-serialized: zero displacement, zero 98s ($GCL_MODE: $T2B_ROUNDS rounds)"; then
 # The claim SERIALIZES stealers, so the straggler-robs-recovery-winner race
 # is PREVENTED, not detected-and-repaired. Scenario: one crashed lock, N
 # waiters judging stale in the same poll window (the launch/backdate sync
@@ -230,16 +159,49 @@ echo "== Test 2b: crash recovery under CONTENTION — claim-serialized: zero dis
 # WINNER'S live lock), the attempt is kept only if its outcome is clean and
 # otherwise discarded and retried (bounded), instead of failing assertions
 # the protocol never violated.
-T2B_N=4
+#
+# Waiter count is swept over $T_AXIS_A (see load-testing-strategy.md): one iteration at N=4 by
+# default (byte-identical to today) and at N=4,12,24 under GCL_TEST_SWEEP=1.
+# Every sweep iteration's assertions carry an " at N=<count>" tag so a sweep
+# failure says which N broke; that tag is SUPPRESSED in the default (non-sweep)
+# run (t2b_ntag empty) so the messages are byte-identical to today — the first
+# assertion already names the count via "$T2B_N waiters". The correctness
+# invariants asserted here (zero 98, exactly one steal, no move-aside, clean
+# final state) stay ok/bad strict (not envelope) at all N — but that requires
+# STALE >> the winner's EFFECTIVE hold, which grows with N under load (the
+# winner is one of N concurrent processes; oversubscription stretches the wall
+# time between its create and release), so STALE is raised to max(8, N) when
+# sweeping (t2b_stale) — for N <= 8 it is the same 8 as today. The per-waiter
+# wall-clock budget scales too: MAX_WAIT = 30*N (=> 120 at N=4, today's value)
+# so a wide sweep, where the losing waiters acquire in sequence after the winner
+# releases, has time to drain instead of timing out and looking like a product
+# failure.
 T2B_TRIES=3   # per-round attempts; see the backdate_ghost note
+for T2B_N in $T_AXIS_A; do
+# MAX_WAIT and STALE: today's exact values (120 / 8) in the default (non-sweep)
+# run so the env passed to the library is byte-identical; only the sweep's wider
+# N raises them. MAX_WAIT scales 30*N (=> 120 at N=4 anyway). STALE is max(8, N) so
+# a wide fan-out's load-stretched winner hold (the winner is one of N concurrent
+# processes) can never make its own live lock look stale and trigger a
+# legitimate-but-unwanted second steal.
+if [ "$GCL_TEST_SWEEP" = 1 ]; then
+  t2b_maxwait=$(( 30 * T2B_N ))
+  [ "$T2B_N" -gt 8 ] && t2b_stale="$T2B_N" || t2b_stale=8
+  t2b_ntag=" at N=$T2B_N"
+else
+  t2b_maxwait=120; t2b_stale=8; t2b_ntag=""
+fi
 t2b_fail=0; t2b_stole=0; t2b_old_shape=0; t2b_disp=0; t2b_98=0; t2b_retried=0
 for r in $(seq 1 "$T2B_ROUNDS"); do
   t2b_valid=0
   for try in $(seq 1 "$T2B_TRIES"); do
-    GHOST="tok.ghost.t2b.$r.$try"
+    # Ghost token carries an N segment only when sweeping (distinct per N); the
+    # default keeps today's exact "tok.ghost.t2b.$r.$try" so the lock CONTENT
+    # the library sees is byte-identical.
+    if [ "$GCL_TEST_SWEEP" = 1 ]; then GHOST="tok.ghost.t2b.$T2B_N.$r.$try"; else GHOST="tok.ghost.t2b.$r.$try"; fi
     LOCK="$WORK/recov.$r.lock"; RAN="$WORK/recov.$r.ran"; : > "$RAN"
     GRAVESEEN="$WORK/recov.$r.graveseen"; SAMPSTOP="$WORK/recov.$r.sampstop"
-    rm -f "$GRAVESEEN" "$SAMPSTOP" "$LOCK" "$LOCK.next"
+    rm -f "$GRAVESEEN" "$SAMPSTOP" "$LOCK" "$LOCK.next" "$WORK/recov.$r".*.log
     fabricate_lock "$LOCK" "$GHOST" "pid=999 host=ghost" # fresh mtime: not yet stale
     # Move-aside sampler: ANY .dead.* sighting at ANY moment during the round
     # means the implementation stages the steal through an intermediate file
@@ -253,21 +215,21 @@ for r in $(seq 1 "$T2B_ROUNDS"); do
       done
     ) &
     sampler=$!
-    pids=()
+    pids=(); waiter_logs=()
     for i in $(seq 1 "$T2B_N"); do
       : > "$WORK/recov.$r.$i.log"   # per-waiter logs: concurrent appends to one log drop lines
-      AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov.$r.$i.log" AGENT_LOCK_STALE_SECS=8 \
-        AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \
+      waiter_logs+=("$WORK/recov.$r.$i.log")
+      AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov.$r.$i.log" AGENT_LOCK_STALE_SECS="$t2b_stale" \
+        AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t2b_maxwait" \
         bash "$LIB" run -- bash -c 'echo ran >> "$1"; sleep 0.1' _ "$RAN" 2>/dev/null &
       pids+=($!)
     done
     t2b_sync=1
-    if ! sync_waiting_fresh "$LOCK" 60 "$WORK/recov.$r.1.log" "$WORK/recov.$r.2.log" \
-                            "$WORK/recov.$r.3.log" "$WORK/recov.$r.4.log"; then
+    if ! sync_waiting_fresh "$LOCK" 60 "${waiter_logs[@]}"; then
       t2b_sync=0
       for i in $(seq 1 "$T2B_N"); do
         grep -q "WAITING for lock" "$WORK/recov.$r.$i.log" 2>/dev/null \
-          || echo "  round $r: waiter $i never logged WAITING"
+          || echo "  N=$T2B_N round $r: waiter $i never logged WAITING"
       done
     fi
     backdate_ghost "$LOCK" "$GHOST" 9999; bd=$?   # all waiters now judge the ghost stale together
@@ -276,8 +238,8 @@ for r in $(seq 1 "$T2B_ROUNDS"); do
       wait "${pids[$((i-1))]}"; rc=$?
       case "$rc" in
         0)  ;;
-        98) round_98=$((round_98+1)); echo "  round $r: waiter $i rc=98 — displacement under the claim protocol" ;;
-        *)  round_badrc=$((round_badrc+1)); echo "  round $r: waiter $i rc=$rc (want 0)" ;;
+        98) round_98=$((round_98+1)); echo "  N=$T2B_N round $r: waiter $i rc=98 — displacement under the claim protocol" ;;
+        *)  round_badrc=$((round_badrc+1)); echo "  N=$T2B_N round $r: waiter $i rc=$rc (want 0)" ;;
       esac
     done
     touch "$SAMPSTOP"; wait "$sampler" 2>/dev/null
@@ -300,7 +262,7 @@ for r in $(seq 1 "$T2B_ROUNDS"); do
       { [ -e "$LOCK" ] || [ -e "$LOCK.next" ]; } && round_dirty=1
       if [ "$round_dirty" = 1 ]; then
         t2b_retried=$((t2b_retried+1))
-        echo "  round $r try $try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying"
+        echo "  N=$T2B_N round $r try $try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying"
         rm -f "$LOCK" "$LOCK.next" "$RAN" "$GRAVESEEN" "$SAMPSTOP"
         continue
       fi
@@ -312,40 +274,42 @@ for r in $(seq 1 "$T2B_ROUNDS"); do
     nran="$(grep -c ran "$RAN")"
     [ "$nran" = "$T2B_N" ] || {
       t2b_fail=1
-      echo "  round $r: only $nran/$T2B_N commands ran"
+      echo "  N=$T2B_N round $r: only $nran/$T2B_N commands ran"
     }
     [ -e "$LOCK" ] && {
       t2b_fail=1
-      echo "  round $r: leftover lock"
+      echo "  N=$T2B_N round $r: leftover lock"
     }
     [ -e "$LOCK.next" ] && {
       t2b_fail=1
-      echo "  round $r: leftover claim"
+      echo "  N=$T2B_N round $r: leftover claim"
     }
     [ -e "$GRAVESEEN" ] && {
       t2b_fail=1
-      echo "  round $r: a move-aside file (.dead.*) existed during recovery — the steal is staged through an intermediate file!"
+      echo "  N=$T2B_N round $r: a move-aside file (.dead.*) existed during recovery — the steal is staged through an intermediate file!"
     }
     t2b_stole=$((t2b_stole + $(grep -c "STOLE-BY-CLAIM" "$WORK/recov.$r.all.log")))
     t2b_old_shape=$((t2b_old_shape + $(grep -c "STOLE stale lock" "$WORK/recov.$r.all.log")))
     t2b_disp=$((t2b_disp + $(grep -c "STEAL-DISPLACED" "$WORK/recov.$r.all.log")))
     break
   done
-  [ "$t2b_valid" = 1 ] || { t2b_fail=1; echo "  round $r: no clean round under a conclusive backdate in $T2B_TRIES attempts"; }
+  [ "$t2b_valid" = 1 ] || { t2b_fail=1; echo "  N=$T2B_N round $r: no clean round under a conclusive backdate in $T2B_TRIES attempts"; }
 done
-[ "$t2b_retried" = 0 ] || echo "  note: $t2b_retried discarded attempt(s) — harness backdate race, not a protocol verdict"
+[ "$t2b_retried" = 0 ] || echo "  note: $t2b_retried discarded attempt(s) at N=$T2B_N — harness backdate race, not a protocol verdict"
 [ "$t2b_fail" = 0 ] && ok "$T2B_ROUNDS rounds x $T2B_N waiters on one crashed lock: all ran, clean final state, no move-aside file ever existed" \
-  || bad "crash-recovery contention failure (see above)"
-[ "$t2b_98" = 0 ] && ok "zero spurious 98s — the claim serialized recovery (unserialized: near-certain displacement)" \
-  || bad "$t2b_98 waiter(s) exited 98 — displacement happened under the claim protocol"
-[ "$t2b_stole" = "$T2B_ROUNDS" ] && ok "exactly one STOLE-BY-CLAIM per recovery (x$t2b_stole/$T2B_ROUNDS rounds)" \
-  || bad "STOLE-BY-CLAIM count $t2b_stole != $T2B_ROUNDS rounds (want exactly one steal per recovery)"
-[ "$t2b_old_shape" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged" \
-  || bad "'STOLE stale lock' line appeared x$t2b_old_shape — an unserialized steal lane is present"
-[ "$t2b_disp" = 0 ] && ok "zero STEAL-DISPLACED lines (prevention, not detect-and-repair)" \
-  || bad "STEAL-DISPLACED fired x$t2b_disp — displacement-repair machinery present?"
-
-echo "== Test 3: REGRESSION — EMPTY lock file (crash between create and write) is still stolen =="
+  || bad "crash-recovery contention failure$t2b_ntag (see above)"
+[ "$t2b_98" = 0 ] && ok "zero spurious 98s$t2b_ntag — the claim serialized recovery (unserialized: near-certain displacement)" \
+  || bad "$t2b_98 waiter(s) exited 98$t2b_ntag — displacement happened under the claim protocol"
+[ "$t2b_stole" = "$T2B_ROUNDS" ] && ok "exactly one STOLE-BY-CLAIM per recovery$t2b_ntag (x$t2b_stole/$T2B_ROUNDS rounds)" \
+  || bad "STOLE-BY-CLAIM count $t2b_stole != $T2B_ROUNDS rounds$t2b_ntag (want exactly one steal per recovery)"
+[ "$t2b_old_shape" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged$t2b_ntag" \
+  || bad "'STOLE stale lock' line appeared x$t2b_old_shape$t2b_ntag — an unserialized steal lane is present"
+[ "$t2b_disp" = 0 ] && ok "zero STEAL-DISPLACED lines$t2b_ntag (prevention, not detect-and-repair)" \
+  || bad "STEAL-DISPLACED fired x$t2b_disp$t2b_ntag — displacement-repair machinery present?"
+done
+fi
+
+if section "Test 3: REGRESSION — EMPTY lock file (crash between create and write) is still stolen"; then
 # The file-protocol descendant of the 2026-05-30 orphan bug: an acquirer that
 # died after the open but before (or mid-) content write leaves an empty file.
 # Staleness MUST come from the file mtime and the content guard MUST class an
@@ -359,8 +323,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 \
   bash "$LIB" run -- bash -c 'echo after > "$1"' _ "$MARKER"; rc=$?
 [ "$rc" = 0 ] && ok "empty-file orphan stolen (no hang)" || bad "orphan NOT stolen (rc=$rc) — regression!"
 [ "$(cat "$MARKER")" = after ] && ok "command ran after stealing orphan" || bad "command did not run"
+fi
 
-echo "== Test 4: a LIVE lock is NOT stolen (waiter logs WAITING, blocks, then proceeds) =="
+if section "Test 4: a LIVE lock is NOT stolen (waiter logs WAITING, blocks, then proceeds)"; then
 LOCK="$WORK/live.lock"; LOG="$WORK/live.log"; : > "$LOG"; ORDER="$WORK/order"; echo none > "$ORDER"
 READY="$WORK/t4.ready"; GO4="$WORK/t4.go"
 # Holder keeps the lock until the test has SEEN the waiter contend (the
@@ -383,8 +348,9 @@ wait "$waiter"; wait "$holder"
 [ "$(tr '\n' ',' < "$ORDER")" = "none,holder-start,holder-end,waiter-ran," ] \
   && ok "ordering correct" || bad "ordering wrong: $(tr '\n' ',' < "$ORDER")"
 grep -q STOLE "$LOG" && bad "waiter wrongly STOLE a live lock" || ok "no wrongful steal of live lock"
+fi
 
-echo "== Test 4b: a ROBBED slow holder detects the theft and FAILS with 98 on release =="
+if section "Test 4b: a ROBBED slow holder detects the theft and FAILS with 98 on release"; then
 # The fail-open ceiling: a hold longer than the stale window CAN be stolen by a
 # contender. The robbed holder must DETECT this at release (the lock file is
 # gone, or carries the thief's token) and exit EXACTLY 98 (the reserved
@@ -415,8 +381,9 @@ wait "$vpid"; victim_rc=$?
 grep -q "WARNING: lock LOST" "$LOG" && ok "robbed holder logged a loud theft WARNING" || bad "no theft WARNING logged"
 [ "$thief_rc" = 0 ] && ok "thief (its own fresh hold) released cleanly (rc 0)" || bad "thief rc=$thief_rc (should be 0)"
 grep -q thief-work "$OUT" && ok "thief did its work" || bad "thief work missing"
+fi
 
-echo "== Test 4c: a slow but UNCONTENDED holder keeps its lock (slowness != failure) =="
+if section "Test 4c: a slow but UNCONTENDED holder keeps its lock (slowness != failure)"; then
 # Documents the boundary: exceeding the stale window is only dangerous when a
 # contender actually steals. With no waiter, the file is never moved, the token
 # still matches, and release succeeds. (If this failed, the lock would punish
@@ -427,16 +394,18 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 AGENT_LOCK
 [ "$solo_rc" = 0 ] && ok "uncontended slow holder released cleanly (rc 0)" || bad "uncontended slow holder rc=$solo_rc (should be 0)"
 grep -q "WARNING: lock LOST" "$LOG" && bad "spurious theft WARNING with no contender" || ok "no spurious WARNING when uncontended"
 grep -q solo-done "$OUT" && ok "uncontended slow holder did its work" || bad "work missing"
+fi
 
-echo "== Test 5: run propagates the command's exit code, releases either way =="
+if section "Test 5: run propagates the command's exit code, releases either way"; then
 LOCK="$WORK/rc.lock"; LOG="$WORK/rc.log"; : > "$LOG"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'exit 0'; rc=$?
 [ "$rc" = 0 ] && ok "exit 0 propagated" || bad "exit 0 not propagated (rc=$rc)"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'exit 7'; rc=$?
 [ "$rc" = 7 ] && ok "exit 7 propagated" || bad "exit code not propagated (rc=$rc)"
 [ -e "$LOCK" ] && bad "lock left held after run" || ok "lock released after run (success and failure)"
+fi
 
-echo "== Test 6: default lock FILE and log live in the git dir =="
+if section "Test 6: default lock FILE and log live in the git dir"; then
 SCRATCH="$WORK/scratch"; mkdir -p "$SCRATCH"
 git -C "$SCRATCH" init -q; git -C "$SCRATCH" config user.email t@t; git -C "$SCRATCH" config user.name t
 GITDIR="$(git -C "$SCRATCH" rev-parse --absolute-git-dir)"
@@ -455,8 +424,9 @@ touch "$GO6"
 wait "$h6"
 [ -e "$GITDIR/commit.lock" ] && bad "default lock file left behind after release" || ok "default lock file removed on release"
 [ -f "$GITDIR/git-commit-lock.log" ] && ok "lock log created in git dir ($GITDIR)" || bad "no log in git dir"
+fi
 
-echo "== Test 7: CLI usage errors exit 96 (stderr); explicit --help/-h exits 0 (stdout) =="
+if section "Test 7: CLI usage errors exit 96 (stderr); explicit --help/-h exits 0 (stdout)"; then
 bash "$LIB" >/dev/null 2>&1;            [ "$?" = 96 ] && ok "no args -> 96" || bad "no args rc=$? (want 96)"
 bash "$LIB" frobnicate > "$WORK/t7.err.out" 2> "$WORK/t7.err.err"
 [ "$?" = 96 ] && ok "unknown subcommand -> 96" || bad "unknown subcommand rc=$? (want 96)"
@@ -475,8 +445,9 @@ for h in --help -h; do
     && ok "$h -> usage on stdout, exit 0, stderr empty" \
     || bad "$h rc=$rc (want 0) stdout-usage=$(grep -c '^usage:' "$WORK/t7.help.out") stderr=$(head -c 60 "$WORK/t7.help.err")"
 done
+fi
 
-echo "== Test 8: acquire timeout exits 97 and the command NEVER runs =="
+if section "Test 8: acquire timeout exits 97 and the command NEVER runs"; then
 LOCK="$WORK/tmo.lock"; LOG="$WORK/tmo.log"; : > "$LOG"; READY="$WORK/t8.ready"; DONE8="$WORK/t8.done"
 # Holder keeps the lock until the test says so (marker, not a fixed sleep —
 # under heavy load a slow-starting waiter once arrived AFTER a 4s holder had
@@ -522,8 +493,9 @@ grep -q "raise AGENT_LOCK_MAX_WAIT" "$WORK/t8.warn3.err" \
   || ok "explicit MAX_WAIT silences the knob-relation warning (left-default gate kept)"
 wait "$h8"; rc=$?
 [ "$rc" = 0 ] && ok "holder unaffected by the timed-out waiter" || bad "holder rc=$rc (want 0)"
+fi
 
-echo "== Test 9: sub-floor (pre-2000) file mtime is NOT treated as stale =="
+if section "Test 9: sub-floor (pre-2000) file mtime is NOT treated as stale"; then
 # The FILETIME-zero guard: a freshly created file can transiently report a 1601
 # mtime to an observer on Windows (probes C/C1b);
 # anything before 2000-01-01 must be classed unsettled — the waiter WAITS (and
@@ -539,8 +511,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
 grep -q STOLE "$LOG" && bad "sub-floor lock was wrongly STOLEN" || ok "no steal of sub-floor lock"
 [ -f "$LOCK" ] && ok "sub-floor lock file untouched" || bad "sub-floor lock file was removed"
 rm -f "$LOCK"
+fi
 
-echo "== Test 10: every worktree gets its OWN lock (git-dir scoping) =="
+if section "Test 10: every worktree gets its OWN lock (git-dir scoping)"; then
 WTREPO="$WORK/wtrepo"; mkdir -p "$WTREPO"
 git -C "$WTREPO" init -q; git -C "$WTREPO" config user.email t@t; git -C "$WTREPO" config user.name t
 git -C "$WTREPO" commit -q --allow-empty -m init
@@ -573,8 +546,9 @@ wait "$h10"
 [ -e "$WTGD/commit.lock" ] && bad "worktree lock left behind" || ok "worktree lock released"
 [ -f "$WTGD/git-commit-lock.log" ] && ok "worktree log lives in its worktree git dir" || bad "no log at $WTGD"
 [ -e "$MAINGD/commit.lock" ] && bad "main-repo lock left behind" || ok "main-repo lock released"
+fi
 
-echo "== Test 11: TERM mid-hold — lock released, wrapper dies with 128+15 =="
+if section "Test 11: TERM mid-hold — lock released, wrapper dies with 128+15"; then
 # Two discriminators: (a) the EXIT/TERM trap must actually
 # release the lock when the `run` wrapper is killed; (b) the wrapper must NOT
 # swallow the signal (a swallowing wrapper releases, keeps going, and exits 0
@@ -598,8 +572,9 @@ wait "$w11"; rc=$?
                 || bad "TERM'd run wrapper rc=$rc (want 143)"
 [ -e "$LOCK" ] && bad "lock left held after TERM" || ok "lock released on TERM"
 grep -q RELEASED "$LOG" && ok "release logged on TERM path" || bad "no RELEASED entry on TERM path"
+fi
 
-echo "== Test 12: sourced API — acquire/release, traps, strict-mode hygiene =="
+if section "Test 12: sourced API — acquire/release, traps, strict-mode hygiene"; then
 # 12a: sourcing must not impose errexit/nounset/pipefail; acquire/release work
 # across separate commands; reentrant acquire is refused (rc 1, lock kept);
 # release is idempotent. Distinct failure codes pinpoint the broken step.
@@ -691,8 +666,9 @@ done
 wait "$p12"; rc=$?
 [ "$rc" = 143 ] && ok "post-release shell dies on TERM (143) — signal disposition restored" \
                 || bad "post-release shell rc=$rc on TERM (want 143; signal-immune shell?)"
+fi
 
-echo "== Test 13: garbage AGENT_LOCK_* numerics fall back to defaults with a note =="
+if section "Test 13: garbage AGENT_LOCK_* numerics fall back to defaults with a note"; then
 LOCK="$WORK/num.lock"; LOG="$WORK/num.log"; : > "$LOG"
 AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \
   AGENT_LOCK_STALE_SECS=banana AGENT_LOCK_POLL_SECS=-1 AGENT_LOCK_MAX_WAIT=0 \
@@ -701,8 +677,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \
 [ "$rc" = 0 ] && ok "run succeeds despite garbage numeric config" || bad "rc=$rc with garbage numerics"
 n="$(grep -c "ignoring invalid" "$WORK/t13.err")"
 [ "$n" = 4 ] && ok "all 4 garbage values noted on stderr, incl. CLAIM_STALE_SECS (got $n)" || bad "expected 4 'ignoring invalid' notes, got $n"
+fi
 
-echo "== Test 14: run outside any git repo hard-fails 96 unless AGENT_LOCK_PATH is set =="
+if section "Test 14: run outside any git repo hard-fails 96 unless AGENT_LOCK_PATH is set"; then
 NR="$WORK/norepo"; mkdir -p "$NR"
 ( cd "$NR" && env GIT_CEILING_DIRECTORIES="$WORK" bash "$LIB" run -- bash -c 'true' ) 2> "$WORK/t14.err"; rc=$?
 [ "$rc" = 96 ] && ok "run outside a repo refused with 96" || bad "run outside a repo rc=$rc (want 96)"
@@ -710,8 +687,9 @@ grep -q "AGENT_LOCK_PATH" "$WORK/t14.err" && ok "refusal message mentions AGENT_
 ( cd "$NR" && env GIT_CEILING_DIRECTORIES="$WORK" AGENT_LOCK_PATH="$NR/x.lock" AGENT_LOCK_LOG="$NR/x.log" \
     bash "$LIB" run -- bash -c 'true' ) 2>/dev/null; rc=$?
 [ "$rc" = 0 ] && ok "explicit AGENT_LOCK_PATH works outside a repo" || bad "explicit AGENT_LOCK_PATH outside repo rc=$rc"
+fi
 
-echo "== Test 14b: SOURCING outside a repo warns on stderr and creates NO files =="
+if section "Test 14b: SOURCING outside a repo warns on stderr and creates NO files"; then
 # Sourcing keeps the CWD fallback (it must never explode), but the warning
 # goes to STDERR — warning via the lock log instead would, as a side
 # effect, CREATE ./git-commit-lock.log in whatever random directory the
@@ -731,8 +709,9 @@ leftovers="$(ls -A "$NRS" 2>/dev/null)"
 # (There is deliberately no Test 15: the steal installs by rename-over and
 # never creates a move-aside (.dead.*) file, so there is no sweep to test.
 # An implementation must never create one; Test 2b's sampler enforces that.)
+fi
 
-echo "== Test 16: EMPTY lock file at release — unverifiable lane (2 / run:1), NOT a theft verdict =="
+if section "Test 16: EMPTY lock file at release — unverifiable lane (2 / run:1), NOT a theft verdict"; then
 # Truncation stands in for the probe-F window: a file that reads empty after
 # the retry ladder is a successor mid-create after a boundary steal, or
 # external truncation — it canNOT be our own failed write (acquire's
@@ -760,8 +739,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \
   bash "$LIB" run -- bash -c ': > "$AGENT_LOCK_PATH"; exit 7' 2>/dev/null; rc=$?
 [ "$rc" = 7 ] && ok "run keeps a failing command's own code (7) over the unverifiable 1" || bad "run empty-file+exit-7 rc=$rc (want 7)"
 rm -f "$LOCK"
+fi
 
-echo "== Test 16b: lock file GONE at release — definitive theft, exactly 98 =="
+if section "Test 16b: lock file GONE at release — definitive theft, exactly 98"; then
 # Acquire's read-back proved our
 # token was AT the path, so a missing file at release can only mean someone
 # renamed/removed it (a steal, or external interference) — report 98, loudly.
@@ -780,8 +760,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \
   bash "$LIB" run -- bash -c 'rm -f "$AGENT_LOCK_PATH"' 2>/dev/null; rc=$?
 [ "$rc" = 98 ] && ok "run reports 98 (overrides a successful command) when the lock file is gone" \
                || bad "run gone-at-release rc=$rc (want 98)"
+fi
 
-echo "== Test 16c: release rides out a TRANSIENT empty read (escalating retry ladder — ps1 parity) =="
+if section "Test 16c: release rides out a TRANSIENT empty read (escalating retry ladder — ps1 parity)"; then
 # A sub-second window in which the lock file reads EMPTY (stand-in for an AV
 # scanner's blocking handle, or a probe-F create->write gap that resolves)
 # must NOT produce the unverifiable verdict: the read-retry ladder (shared
@@ -814,8 +795,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash -c '
 grep -q "EMPTY/unreadable at release" "$WORK/t16c.err" \
   && bad "spurious unverifiable warning despite the token reappearing" \
   || ok "no unverifiable warning for the ridden-out transient"
+fi
 
-echo "== Test 17: NON-FILE at the lock path — never stolen, loud one-time config warning, waiters reach 97 =="
+if section "Test 17: NON-FILE at the lock path — never stolen, loud one-time config warning, waiters reach 97"; then
 # (a) a directory (a config typo like AGENT_LOCK_PATH=\$HOME, or a directory
 # lock left by an older release). The per-poll type guard fires regardless of
 # age — but only after the SAME concrete type is seen on two consecutive
@@ -890,8 +872,9 @@ else
   rm -f "$LOCK" 2>/dev/null
   echo "note: mkfifo unavailable/unusable here — FIFO guard not exercised (CI POSIX legs cover it)"
 fi
+fi
 
-echo "== Test 17d: REGRESSION — create/delete churn at the lock path must NOT fire the non-lock warning =="
+if section "Test 17d: REGRESSION — create/delete churn at the lock path must NOT fire the non-lock warning"; then
 # The per-poll guard's existence (-e/-L) and classification (-f && ! -L)
 # checks are SEPARATE stats. A rival's release/steal unlink landing between
 # them — or a Windows delete-pending ghost (the unlink queues behind a rival
@@ -962,26 +945,58 @@ if [ -n "$churn_pid" ]; then
   # never churned, so bash sees it reliably. Budget 60s: pwsh cold start on
   # a loaded box can take >15s.
   if wait_for_file "$START" 60; then
-    warn17d=0; got97=0
+    # Per-waiter lock logs (single-writer => drop-free): a SHARED log drops lines
+    # under concurrent appends (cf. the per-waiter logs at Test 2b), which would make
+    # the WAITING anti-vacuity count below unreliable. Rebuilt into $LOG after the runs.
+    warn17d=0; n0=0; n1=0; n97=0; n98=0; nother=0; rc_bad=""
     for r in 1 2 3; do
       pids=()
       for i in 1 2 3 4; do
-        AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 \
+        AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/t17d.$r.$i.log" AGENT_LOCK_STALE_SECS=300 \
           AGENT_LOCK_POLL_SECS=0.02 AGENT_LOCK_MAX_WAIT=2 \
           bash "$LIB" run -- bash -c 'true' 2> "$WORK/t17d.$r.$i.err" &
         pids+=($!)
       done
       for i in 1 2 3 4; do
         wait "${pids[$((i-1))]}"; rc=$?
-        [ "$rc" = 97 ] && got97=$((got97+1))
+        # A CLEAN command ('true') under this churn has exactly FOUR correct terminal
+        # codes — do NOT tighten this set: rc 1 is the real catch that made the old
+        # got97>=1 assertion flaky (see the Test 17d de-flake plan).
+        #   0  acquired in an absent window, clean release
+        #   1  acquired, but release read the held lock EMPTY (the churner's
+        #      create->write window) -> release rc 2 -> lock_run demotes the clean
+        #      command to 1 (ownership unverifiable; correct, not a defect)
+        #   97 never won an absent window within MAX_WAIT -> timed out
+        #   98 churner overwrote the hold before release -> designed theft detection
+        case "$rc" in
+          0)  n0=$((n0+1)) ;;
+          1)  n1=$((n1+1)) ;;
+          97) n97=$((n97+1)) ;;
+          98) n98=$((n98+1)) ;;
+          *)  nother=$((nother+1)); rc_bad="$rc_bad $r.$i=$rc" ;;
+        esac
         n="$(grep -c 'is not a lock file' "$WORK/t17d.$r.$i.err")"
         warn17d=$((warn17d+n))
       done
     done
+    # Rebuild the consolidated churn.log artifact from the drop-free per-waiter logs.
+    # 'cat glob > file' is a redirect, not a pipe (no SC2002); then count WAITING from
+    # the single rebuilt file.
+    cat "$WORK"/t17d.*.log > "$LOG" 2>/dev/null || :
+    waited="$(grep -c 'WAITING for lock' "$LOG")"
+    echo "note: T17d outcomes rc0=$n0 rc1=$n1 rc97=$n97 rc98=$n98 other=$nother; WAITING=$waited"
     [ "$warn17d" = 0 ] && ok "12 waiters polled through churn with ZERO spurious non-lock warnings" \
                        || bad "churned regular file fired $warn17d non-lock warning(s) — per-poll guard TOCTOU regression!"
-    [ "$got97" -ge 1 ] && ok "waiters still timed out at 97 under churn ($got97/12)" \
-                       || bad "no waiter reached 97 under churn (got97=$got97/12) — timeout lane bypassed?"
+    # Replaces the old got97>=1 assertion (timeout is only ONE of the correct outcomes;
+    # which one occurs is machine-speed luck). Assert each waiter reached a DESIGNED
+    # terminal state instead — catches a real product regression (crash/139, 96, …).
+    [ "$nother" = 0 ] && ok "all 12 waiters reached a designed terminal state (rc in {0,1,97,98})" \
+                      || bad "waiter(s) hit an undesigned rc under churn:$rc_bad (rc0=$n0 rc1=$n1 rc97=$n97 rc98=$n98)"
+    # Anti-vacuity: WAITING is logged only after a create was blocked by a PRESENT lock,
+    # immediately before the per-poll type guard that warn17d guards — so >=1 proves the
+    # churn produced real contention and the guarded path ran. 0 => dead/absent churner.
+    [ "$waited" -ge 1 ] && ok "churn exercised the blocked-poll type-guard lane ($waited WAITING line(s))" \
+                        || bad "no WAITING logged under churn — contention never happened; test ran vacuously"
   else
     bad "T17d churner never signalled its start marker"
     echo "  diag: churner pid=$churn_pid alive=$(kill -0 "$churn_pid" 2>/dev/null && echo yes || echo no)"
@@ -998,8 +1013,9 @@ if [ -n "$churn_pid" ]; then
 else
   echo "note: $churn_skip — churn-vs-guard regression not exercised here (CI legs cover it)"
 fi
+fi
 
-echo "== Test 18: stale NON-LOCK CONTENT at the lock path is never stolen; torn tokens split on the tok. prefix =="
+if section "Test 18: stale NON-LOCK CONTENT at the lock path is never stolen; torn tokens split on the tok. prefix"; then
 # The content guard (age-gated): steal only an empty file or a line 1 starting
 # "tok.". A real user file at a typo'd AGENT_LOCK_PATH must survive, forever.
 # (a) a user file
@@ -1042,8 +1058,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 \
   && ok "tok.-prefixed torn token IS stolen by staleness (crash-orphan lane)" \
   || bad "tok.-prefixed torn token not stolen (rc=$rc marker=$(cat "$MARKER"))"
 grep -q STOLE "$LOG" && ok "steal of the torn token logged" || bad "no STOLE entry for torn token"
+fi
 
-echo "== Test 19: wire format — token on line 1 (tok.-prefixed), owner on line 2 =="
+if section "Test 19: wire format — token on line 1 (tok.-prefixed), owner on line 2"; then
 # Pins the on-disk format the ps1 port must match, and that token parsing
 # takes LINE 1 only (an owner line present must not pollute the token).
 LOCK="$WORK/wire.lock"; LOG="$WORK/wire.log"; : > "$LOG"
@@ -1059,43 +1076,86 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash -c '
 ' _ "$LIB" "$LOCK"; rc=$?
 [ "$rc" = 0 ] && ok "lock file carries token (line 1, tok.-prefixed) + owner (line 2); release parses line 1 with owner present" \
               || bad "wire-format check failed at step code $rc"
+fi
 
-echo "== Test 20: claim contention — N concurrent stealers, ONE claim winner ($GCL_MODE: $T20_N workers) =="
+if section "Test 20: claim contention — N concurrent stealers, ONE claim winner ($GCL_MODE: $T20_N workers)"; then
 # N stealers race one ancient ghost: exactly one wins the O_EXCL claim and
 # steals (one STOLE-BY-CLAIM); the rest lose the claim create and acquire
 # normally in sequence after the winner releases. No displacement (zero
-# LOST/98), no leftovers. STALE=5 keeps a loaded box from re-stealing the
-# winner's brief hold.
+# LOST/98), no leftovers. STALE keeps a loaded box from re-stealing the
+# winner's brief hold — that bound only holds while STALE >> the winner's
+# effective hold, which (counter-intuitively) grows with N: the WINNER is one
+# of N concurrently-spawned bash processes, so under oversubscription the wall
+# time between its create and its release stretches with the contention. So
+# STALE must scale with N too (see t20_stale below), keeping "exactly one
+# steal" a strict, config-independent correctness invariant at every N.
+#
+# Waiter count is swept (see load-testing-strategy.md). Unlike Test 2b/16, this test's floor is NOT
+# 4 — it is the MODE-driven $T20_N (5 REDUCED / 10 FULL), the count CI already
+# stresses. So instead of iterating the shared T_AXIS_A ("4 ...") it builds its
+# own list: just $T20_N by default (byte-identical), and $T20_N plus the sweep's
+# higher counts (12, 24) under GCL_TEST_SWEEP=1 — preserving today's per-PR AND
+# full-mode coverage while still widening the sweep. Under the sweep MAX_WAIT is
+# 30*N (workers run `true`, so that is ample headroom); the default stays 120.
 LOCK="$WORK/contend.lock"
-fabricate_lock "$LOCK" "tok.ghost.t20" "pid=888 host=ghost"
+T20_FLOOR="$T20_N"
+if [ "$GCL_TEST_SWEEP" = 1 ]; then
+  T20_AXIS="$T20_FLOOR"
+  for _n in 12 24; do [ "$_n" = "$T20_FLOOR" ] || T20_AXIS="$T20_AXIS $_n"; done
+else
+  T20_AXIS="$T20_FLOOR"
+fi
+for T20_N in $T20_AXIS; do
+# N-tag for assertion messages: empty in the default run (byte-identical), set
+# only when sweeping so each N's pass/fail line is attributable.
+if [ "$GCL_TEST_SWEEP" = 1 ]; then t20_ntag=" at N=$T20_N"; else t20_ntag=""; fi
+# MAX_WAIT and STALE: keep today's exact values (120 / 5) in the default
+# (non-sweep) run so the env passed to the library is byte-identical; only the
+# sweep changes them (every N it runs, floor included): MAX_WAIT is 30*N (ample,
+# workers run `true`); STALE is max(5, N) so a fan-out's load-stretched winner hold
+# can NEVER make a live lock look stale -> the "exactly one steal" invariant
+# stays true at N=24 just as at the floor. The fixture ghost token likewise
+# carries an N segment only when sweeping (distinct tokens per N), so the
+# default lock CONTENT the library sees is unchanged too.
+if [ "$GCL_TEST_SWEEP" = 1 ]; then
+  t20_maxwait=$(( 30 * T20_N ))
+  [ "$T20_N" -gt 5 ] && t20_stale="$T20_N" || t20_stale=5
+  t20_ghost="tok.ghost.t20.$T20_N"
+else
+  t20_maxwait=120; t20_stale=5; t20_ghost="tok.ghost.t20"
+fi
+rm -f "$WORK/contend".*.log "$LOCK" "$LOCK.next"
+fabricate_lock "$LOCK" "$t20_ghost" "pid=888 host=ghost"
 backdate "$LOCK" 9999
 pids=(); t20_fail=0
 for i in $(seq 1 "$T20_N"); do
   : > "$WORK/contend.$i.log"
-  AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/contend.$i.log" AGENT_LOCK_STALE_SECS=5 \
-    AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \
+  AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/contend.$i.log" AGENT_LOCK_STALE_SECS="$t20_stale" \
+    AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t20_maxwait" \
     bash "$LIB" run -- bash -c 'true' 2>/dev/null &
   pids+=($!)
 done
 for i in $(seq 1 "$T20_N"); do
   wait "${pids[$((i-1))]}"; rc=$?
-  [ "$rc" = 0 ] || { t20_fail=1; echo "  worker $i rc=$rc (want 0)"; }
+  [ "$rc" = 0 ] || { t20_fail=1; echo "  N=$T20_N worker $i rc=$rc (want 0)"; }
 done
 cat "$WORK/contend."*.log > "$WORK/contend.all.log"
 nst="$(grep -c "STOLE-BY-CLAIM" "$WORK/contend.all.log")"
 nacq="$(grep -c "ACQUIRED" "$WORK/contend.all.log")"
 nrel="$(grep -c "RELEASED" "$WORK/contend.all.log")"
 nlost="$(grep -c "lock LOST" "$WORK/contend.all.log")"
-[ "$t20_fail" = 0 ] && ok "$T20_N concurrent stealers all completed with rc 0" || bad "claim-contention worker failures (see above)"
-[ "$nst" = 1 ] && ok "exactly ONE claim winner stole the ghost (STOLE-BY-CLAIM x$nst)" \
-               || bad "STOLE-BY-CLAIM x$nst (want exactly 1 — the claim must serialize stealers)"
+[ "$t20_fail" = 0 ] && ok "$T20_N concurrent stealers all completed with rc 0" || bad "claim-contention worker failures$t20_ntag (see above)"
+[ "$nst" = 1 ] && ok "exactly ONE claim winner stole the ghost$t20_ntag (STOLE-BY-CLAIM x$nst)" \
+               || bad "STOLE-BY-CLAIM x$nst$t20_ntag (want exactly 1 — the claim must serialize stealers)"
 [ "$nacq" = "$T20_N" ] && [ "$nrel" = "$T20_N" ] && ok "balanced ACQUIRED/RELEASED ($nacq/$nrel of $T20_N)" \
-                                                  || bad "ACQUIRED=$nacq RELEASED=$nrel (want $T20_N each)"
-[ "$nlost" = 0 ] && ok "zero LOST warnings under claim contention" || bad "$nlost LOST warnings under claim contention"
-[ -e "$LOCK" ] && bad "leftover lock after contention" || ok "no leftover lock"
-[ -e "$LOCK.next" ] && bad "leftover claim after contention" || ok "no leftover claim"
+                                                  || bad "ACQUIRED=$nacq RELEASED=$nrel$t20_ntag (want $T20_N each)"
+[ "$nlost" = 0 ] && ok "zero LOST warnings under claim contention$t20_ntag" || bad "$nlost LOST warnings under claim contention$t20_ntag"
+[ -e "$LOCK" ] && bad "leftover lock after contention$t20_ntag" || ok "no leftover lock$t20_ntag"
+[ -e "$LOCK.next" ] && bad "leftover claim after contention$t20_ntag" || ok "no leftover claim$t20_ntag"
+done
+fi
 
-echo "== Test 21: crashed-claimant and empty-claim orphans age out; steals resume =="
+if section "Test 21: crashed-claimant and empty-claim orphans age out; steals resume"; then
 # (a) an aged foreign claim (crashed claimant): cleared by CLAIM-STALE-CLEARED,
 # then the steal completes; recovery latency bounded.
 LOCK="$WORK/cc.lock"; LOG="$WORK/cc.log"; : > "$LOG"
@@ -1109,7 +1169,7 @@ t21_t1=$(date +%s)
 [ "$rc" = 0 ] && ok "waiter recovered through a crashed claimant's claim (rc 0)" || bad "rc=$rc behind a crashed claim"
 grep -q "CLAIM-STALE-CLEARED" "$LOG" && ok "aged claim cleared (CLAIM-STALE-CLEARED logged, with age)" || bad "no CLAIM-STALE-CLEARED entry"
 grep -q "STOLE-BY-CLAIM" "$LOG" && ok "steal completed after the clear" || bad "no STOLE-BY-CLAIM after clearing the crashed claim"
-[ $((t21_t1 - t21_t0)) -le 20 ] && ok "recovery latency bounded ($((t21_t1 - t21_t0))s)" || bad "recovery took $((t21_t1 - t21_t0))s (>20s)"
+[ $((t21_t1 - t21_t0)) -le 20 ] && ok_envelope "recovery latency bounded ($((t21_t1 - t21_t0))s)" || bad_envelope "recovery took $((t21_t1 - t21_t0))s (>20s)"
 [ -e "$LOCK.next" ] && bad "claim leftover after recovery" || ok "claim path clean after recovery"
 # (b) an EMPTY claim file (claimant died between create and write): same lane.
 LOCK="$WORK/ccempty.lock"; LOG="$WORK/ccempty.log"; : > "$LOG"
@@ -1120,8 +1180,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
   bash "$LIB" run -- bash -c 'true' 2>/dev/null; rc=$?
 [ "$rc" = 0 ] && ok "empty claim orphan aged out and recovery completed (rc 0)" || bad "rc=$rc behind an empty claim orphan"
 grep -q "CLAIM-STALE-CLEARED" "$LOG" && ok "empty claim cleared via the same staleness lane" || bad "empty claim was not cleared"
+fi
 
-echo "== Test 22: NON-CLAIM objects at the claim path — never deleted, per-path warn state =="
+if section "Test 22: NON-CLAIM objects at the claim path — never deleted, per-path warn state"; then
 # (a) a directory at ${LOCK}.next blocks steals (waiter reaches 97), is never
 # deleted, and warns once naming the claim path.
 LOCK="$WORK/cwt.lock"; LOG="$WORK/cwt.log"; : > "$LOG"
@@ -1132,10 +1193,16 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
   bash "$LIB" run -- bash -c 'true' 2> "$WORK/t22a.err"; rc=$?
 [ "$rc" = 97 ] && ok "dir at claim path: steals blocked, waiter timed out (97)" || bad "dir at claim path: rc=$rc (want 97)"
 [ -f "$LOCK.next/sub/f" ] && ok "directory at claim path untouched" || bad "directory at claim path was damaged!"
-grep -q "is not a claim file" "$WORK/t22a.err" && ok "loud claim-path config warning on stderr" || bad "no claim-path config warning"
-grep -q "it is a directory" "$WORK/t22a.err" && ok "claim warning names the detected type (directory)" || bad "claim warning does not name the type"
 n="$(grep -c "is not a claim file" "$WORK/t22a.err")"
-[ "$n" = 1 ] && ok "claim-path warning fired exactly once (got $n)" || bad "claim-path warning fired $n times (want 1)"
+# "warning fired at all" is timing-dependent (the two-poll confirmation needs poll
+# headroom before MAX_WAIT, which an oversubscribed runner can starve) -> envelope.
+# The warn-once dedup (never >1) and the type-naming are CORRECTNESS -> strict (the
+# latter only asserted when a warning actually fired).
+[ "$n" -ge 1 ] && ok_envelope "claim-path config warning fired (got $n)" || bad_envelope "no claim-path config warning (n=$n)"
+[ "$n" -le 1 ] && ok "claim-path warning not duplicated (n=$n)" || bad "claim-path warning fired $n times (warn-once broken)"
+if [ "$n" -ge 1 ]; then
+  grep -q "it is a directory" "$WORK/t22a.err" && ok "claim warning names the detected type (directory)" || bad "claim warning does not name the type"
+fi
 grep -q "STOLE-BY-CLAIM" "$LOG" && bad "stole despite a squatted claim path" || ok "no steal through a squatted claim path"
 [ -f "$LOCK" ] && ok "stale lock left in place (cannot be stolen safely)" || bad "lock vanished behind a squatted claim path"
 # (b) a free LOCK path is UNaffected by claim-path junk: normal acquire works.
@@ -1228,8 +1295,9 @@ AGENT_LOCK_PATH="$PPD2/c1.lock" AGENT_LOCK_LOG="$PPD2/ppg2.log" AGENT_LOCK_STALE
 grep -q "is not a claim file" "$PPD2/ba.err" && grep -q "is not a lock file" "$PPD2/ba.err" \
   && ok "claim-path warning did not suppress the lock-path warning (reverse order)" \
   || bad "lock-path warning suppressed after a claim-path warning (shared warn-once state?)"
+fi
 
-echo "== Test 23: live-slow holder — re-verify under the claim sees a fresh lock, CLAIM-ABORT (fresh), no steal =="
+if section "Test 23: live-slow holder — re-verify under the claim sees a fresh lock, CLAIM-ABORT (fresh), no steal"; then
 # Steered deterministically: the lock's mtime is renewed (as a live-slow
 # holder's re-create/renewal would) at the exact step-2 re-verify position,
 # via a sourced shell that wraps the library's verify internal. The claimant
@@ -1260,8 +1328,9 @@ wait "$w23"; rc=$?
 [ "$rc" = 0 ] && ok "waiter then acquired and released normally (rc 0)" || bad "waiter rc=$rc after the slow holder released"
 grep -q "STOLE-BY-CLAIM" "$LOG" && bad "live lock was STOLEN despite the fresh re-verify" || ok "no steal of the live-slow holder's lock"
 [ -e "$LOCK.next" ] && bad "claim leftover after the fresh abort" || ok "claim deleted on the fresh abort"
+fi
 
-echo "== Test 24: OVERAGED own claim — CLAIM-ABORT (contested), no rename =="
+if section "Test 24: OVERAGED own claim — CLAIM-ABORT (contested), no rename"; then
 # A suspended claimant's recheck must refuse to proceed on its own overaged
 # claim (a clearer may be acting on it). Steered: every recheck sees the
 # claim backdated past CLAIM_STALE. Mutation check: an implementation that
@@ -1287,8 +1356,9 @@ l1=""; IFS= read -r l1 < "$LOCK" || true
 [ "$l1" = "tok.ghost.t24" ] && ok "ghost lock untouched by the contested aborts" || bad "ghost lock was modified (line1=$l1)"
 [ -e "$LOCK.next" ] && bad "claim leftover after contested aborts" || ok "claim deleted on each contested abort"
 rm -f "$LOCK"
+fi
 
-echo "== Test 25: discovery-position matrix — own-claim-installed discovered on EVERY exit =="
+if section "Test 25: discovery-position matrix — own-claim-installed discovered on EVERY exit"; then
 # A rival's rename can install OUR claim as the lock while we sit at any
 # post-claim position. Each position steers that rename to the exact spot
 # (wrapping a library internal or shadowing mv/rm/touch in a sourced shell)
@@ -1391,8 +1461,9 @@ for pos in step2-fresh recheck-gone touch-gone lock-gone contested deletion-gone
     bad "position $pos: rc=$rc discovery=$(grep -c DISCOVERY-HOLD "$LOG") expect-line=$(grep -cF "$expect" "$LOG") lock-left=$([ -e "$LOCK" ] && echo yes || echo no) claim-left=$([ -e "$LOCK.next" ] && echo yes || echo no)"
   fi
 done
+fi
 
-echo "== Test 26: delayed claim still installs a FRESH lease (the pre-rename touch) =="
+if section "Test 26: delayed claim still installs a FRESH lease (the pre-rename touch)"; then
 # A claim aged close to CLAIM_STALE (steered: backdated 40s of 60 at the
 # recheck) must still install a lock whose mtime is ~now — the step-3.2
 # touch resets the clock; rename preserves it (probe R2). A no-touch
@@ -1423,8 +1494,9 @@ case "$rc" in
   *)  bad "delayed-claim lease harness rc=$rc" ;;
 esac
 grep -q "STOLE-BY-CLAIM" "$LOG" && ok "the delayed claim still completed its steal" || bad "no STOLE-BY-CLAIM in the lease test"
+fi
 
-echo "== Test 27: lock GONE at re-verify — CLAIM-ABORT (gone), NO rename onto the absent path =="
+if section "Test 27: lock GONE at re-verify — CLAIM-ABORT (gone), NO rename onto the absent path"; then
 # A live-slow holder releasing under a claimant must route to the normal
 # create race, never a rename onto the absent path. Mutation check: a
 # renaming implementation would install the CLAIM token; the correct one
@@ -1455,8 +1527,9 @@ else
   bad "claim token vs acquired token: claim='$ctok' acquired='$atok' (equal or missing => renamed onto the absent path?)"
 fi
 grep -q "DISCOVERY-HOLD" "$LOG" && bad "spurious discovery-HOLD in the gone lane" || ok "no spurious discovery-HOLD"
+fi
 
-echo "== Test 28: SUB-FLOOR claim mtime is never cleared — treated as just-created =="
+if section "Test 28: SUB-FLOOR claim mtime is never cleared — treated as just-created"; then
 LOCK="$WORK/cfloor.lock"
 LOG="$WORK/cfloor.log"
 : >"$LOG"
@@ -1472,8 +1545,9 @@ grep -q "CLAIM-STALE-CLEARED" "$LOG" && bad "sub-floor claim was CLEARED — mti
                                      || ok "sub-floor claim never cleared (floor applies to the claim)"
 [ -f "$LOCK.next" ] && ok "sub-floor claim file untouched" || bad "sub-floor claim file was removed"
 rm -f "$LOCK" "$LOCK.next"
+fi
 
-echo "== Test 29: BLOCKED steal rename — claim deleted IMMEDIATELY, no CLAIM_STALE penalty =="
+if section "Test 29: BLOCKED steal rename — claim deleted IMMEDIATELY, no CLAIM_STALE penalty"; then
 # The rename is forced to fail-with-the-lock-still-present (a shadowed mv —
 # the no-delete-share squat, deterministically). The claimant must delete its
 # own claim at once and re-poll: with CLAIM_STALE=600, a leftover claim would
@@ -1496,14 +1570,15 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
   ' _ "$LIB" 2>/dev/null; rc=$?
 [ "$rc" = 97 ] && ok "blocked-steal waiter honoured MAX_WAIT (97)" || bad "blocked-steal rc=$rc (want 97)"
 nclaim="$(grep -c "] CLAIM " "$LOG")"
-[ "$nclaim" -ge 2 ] && ok "claim re-created on later attempts (x$nclaim) — deleted immediately, no ageout penalty" \
-                    || bad "only $nclaim CLAIM line(s) — the failed steal's claim was left to age out (60s-class penalty)"
+[ "$nclaim" -ge 2 ] && ok_envelope "claim re-created on later attempts (x$nclaim) — deleted immediately, no ageout penalty" \
+                    || bad_envelope "only $nclaim CLAIM line(s) — the failed steal's claim was left to age out (60s-class penalty)"
 grep -q "steal FAILED" "$LOG" && ok "blocked rename logged (damped steal FAILED)" || bad "no steal FAILED log line"
 [ -e "$LOCK.next" ] && bad "claim leftover after the blocked steal attempts" || ok "no claim leftover at exit"
 [ -f "$LOCK" ] && ok "squatted lock left in place" || bad "lock vanished in the blocked lane"
 rm -f "$LOCK"
+fi
 
-echo "== Test 30: static checks — the claim touch is NON-creating with an explicit existence check =="
+if section "Test 30: static checks — the claim touch is NON-creating with an explicit existence check"; then
 grep -q 'touch -c -- "\$_LOCK_CLAIM_PATH"' "$LIB" \
   && ok "claim touch uses 'touch -c --' (non-creating)" \
   || bad "no 'touch -c -- \$_LOCK_CLAIM_PATH' in the implementation"
@@ -1513,11 +1588,22 @@ grep -A3 'touch -c -- "\$_LOCK_CLAIM_PATH"' "$LIB" | grep -q -- '-e "\$_LOCK_CLA
 bad_touch="$(grep 'touch ' "$LIB" | grep '_LOCK_CLAIM_PATH' | grep -v -- '-c')"
 [ -z "$bad_touch" ] && ok "no creating touch of the claim path anywhere" \
                     || bad "creating touch of the claim path found: $bad_touch"
+fi
 
-echo "== Test 31: LEAKED-claim discovery — the leaked-token memory closes the unverified-claim lanes =="
+if section "Test 31: LEAKED-claim discovery — the leaked-token memory closes the unverified-claim lanes"; then
 # (a) main leg: a recheck-unreadable exit leaks the claim token; a rival
-# later installs that claim as the lock; the leaver's per-poll memory check
-# adopts it (HOLD) and release returns 0.
+# (the external mv below) then installs that claim as the lock; the leaver
+# adopts it (HOLD) and release returns 0. Adoption may go through EITHER of
+# the product's two discovery routes — both correct: the inline
+# ownership-discovery read that is the unreadable branch's final act
+# (git-commit-lock.sh:822, "DISCOVERY-HOLD: ...") if the external mv lands
+# before it, or the per-poll leaked-token-memory check
+# (git-commit-lock.sh:1382, "DISCOVERY-HOLD (leaked-token memory)") on a later
+# poll if it lands after. Which wins is a pure scheduling race — the external
+# mv vs the leaver's inline discover ONE statement later (sh:1112 leak-add ->
+# sh:1114 discover) — and is load-sensitive, so this leg accepts either and
+# records which fired. The memory route is pinned DETERMINISTICALLY by
+# sub-leg (b) below; the direct route by Test 25's discovery-position matrix.
 # NB: _lock_read_tok / _lock_cur_token shadows run inside COMMAND
 # SUBSTITUTIONS (subshells), so their fire-once state must live in flag
 # FILES — a variable assignment would be lost when the subshell exits.
@@ -1547,8 +1633,18 @@ else
 fi
 wait "$w31"; rc=$?
 [ "$rc" = 0 ] && ok "leaver discovered its installed leaked claim and released rc 0" || bad "leaked-discovery harness rc=$rc"
-grep -q "DISCOVERY-HOLD (leaked-token memory)" "$LOG" && ok "adoption went through the leaked-token memory" \
-                                                      || bad "no leaked-token-memory DISCOVERY-HOLD"
+# Either discovery route is correct here (see the leg comment); accept both,
+# record which fired, fail only if NEITHER adopted the leaked claim. ("$LOG"
+# is dedicated to this leg, so there is no cross-talk.) "DISCOVERY-HOLD:"
+# (immediate colon) matches ONLY the direct route; the memory route reads
+# "DISCOVERY-HOLD (leaked-token memory):" — disjoint, and checked first.
+if grep -q "DISCOVERY-HOLD (leaked-token memory)" "$LOG"; then
+  ok "adoption went through the leaked-token memory (per-poll route; the mv landed after the inline discover)"
+elif grep -q "DISCOVERY-HOLD:" "$LOG"; then
+  ok "adoption went through the inline ownership-discovery read (direct route; the mv landed first) — memory route pinned by sub-leg (b)"
+else
+  bad "no DISCOVERY-HOLD adoption of the leaked claim by EITHER route"
+fi
 [ -e "$LOCK" ] && bad "lock leftover after leaked-claim adoption" || ok "lock released cleanly after adoption"
 [ -e "$LOCK.next" ] && bad "claim leftover after leaked-claim adoption" || ok "no claim leftover"
 # Hmm wait: STALE=300 — the ghost is backdated 9999 so it IS stale; fine.
@@ -1704,8 +1800,9 @@ case "$(uname -s 2>/dev/null)" in
     echo "note: the blocked-unlink feeder leg is Windows-only by construction (POSIX open handles never block unlink); the read-shadow legs above cover the memory machinery"
     ;;
 esac
+fi
 
-echo "== Test 32: per-attempt tokens — an abandoned own-token lock never aliases discovery or release =="
+if section "Test 32: per-attempt tokens — an abandoned own-token lock never aliases discovery or release"; then
 # Walk: the first CREATE's read-back is forced blank (and the abandoned lock
 # backdated stale). A later CLAIM attempt is steered into a recheck-gone
 # discovery against that abandoned lock: a reused-per-acquire-token
@@ -1748,8 +1845,63 @@ grep -q "DISCOVERY-HOLD" "$LOG" && bad "FALSE discovery-HOLD on the abandoned ow
                                 || ok "no false discovery-HOLD — the abandoned token did not alias the claim attempt"
 grep -q "STOLE-BY-CLAIM" "$LOG" && ok "the abandoned lock was then reclaimed by a normal steal" \
                                 || bad "no STOLE-BY-CLAIM of the abandoned lock"
+fi
+
+if section "Test 32b: steal-path read-back FAILED — rename-over WON but the lock did not read back our token (F2)"; then
+# The steal-path twin of Test 32. Here the stealer WINS the claim race AND wins
+# the rename-over (STOLE-BY-CLAIM is logged, the ghost is destroyed), but the
+# mandatory post-rename read-back verification (git-commit-lock.sh:1171) comes
+# back wrong. The product must NOT take the hold: it clears its claim token and
+# re-enters the wait loop (git-commit-lock.sh:1176-1179) — never a silent
+# false-hold (which, after a STOLE-BY-CLAIM, would mean a mis-attributed hold of
+# a destroyed-ghost path). We fault-inject the read-back with a one-shot
+# _lock_cur_token shadow gated on the claim token being SET (the INVERSE of Test
+# 32's `-z` gate), so it lands at the STEAL read-back (claim token live, not yet
+# held), not the create one. On firing we also backdate the just-installed
+# abandoned lock stale so the re-steal is immediate (same trick as Test 32 —
+# keeps it fast and deterministic). Attempt 2 (shadow spent) reads back the real
+# token and acquires normally.
+LOCK="$WORK/stealrb.lock"; LOG="$WORK/stealrb.log"; : > "$LOG"
+fabricate_lock "$LOCK" "tok.ghost.t32b" "pid=9 host=ghost"; backdate "$LOCK" 9999
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=5 \
+  AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \
+  bash -c '
+    source "$1" || exit 70
+    clone_fn _lock_cur_token _ct_orig
+    SF1="$AGENT_LOCK_PATH.steer1"      # flag FILE: the cur_token shadow runs in subshells
+    _lock_cur_token() {
+      if [ ! -e "$SF1" ] && [ "${_LOCK_HELD:-0}" = 0 ] && [ -n "$_LOCK_CLAIM_TOKEN" ]; then
+        : > "$SF1"
+        backdate "$AGENT_LOCK_PATH" 9999 2>/dev/null || true
+        printf ""
+        return 0
+      fi
+      _ct_orig "$@"
+    }
+    lock_acquire || exit 72
+    lock_release || exit 74
+    exit 0
+  ' _ "$LIB" 2>/dev/null; rc=$?
+[ "$rc" = 0 ] && ok "steal read-back failure re-entered wait; a later steal acquired and released rc 0" \
+              || bad "steal-readback harness rc=$rc"
+grep -q "steal rename completed but read-back" "$LOG" \
+  && ok "the steal-path read-back-verification failure lane ran (F2)" \
+  || bad "F2 lane never ran (the read-back fault did not land at the steal read-back)"
+nstole="$(grep -c "STOLE-BY-CLAIM" "$LOG")"
+[ "$nstole" -ge 2 ] && ok "re-stole after the failed read-back (STOLE-BY-CLAIM x$nstole)" \
+                    || bad "expected >=2 STOLE-BY-CLAIM (won-rename then re-steal), got $nstole"
+warn_line="$(grep -n "steal rename completed but read-back" "$LOG" | head -1 | cut -d: -f1)"
+acq_line="$(grep -n "ACQUIRED " "$LOG" | tail -1 | cut -d: -f1)"
+if [ -n "$warn_line" ] && [ -n "$acq_line" ] && [ "$warn_line" -lt "$acq_line" ]; then
+  ok "no false-hold: the read-back WARNING preceded the eventual ACQUIRED"
+else
+  bad "ordering: expected the F2 WARNING (line $warn_line) before ACQUIRED (line $acq_line)"
+fi
+[ -e "$LOCK" ] && bad "lock leftover after the steal-readback walk" || ok "lock released cleanly"
+[ -e "$LOCK.next" ] && bad "claim leftover after the steal-readback walk" || ok "no claim leftover"
+fi
 
-echo "== Test 33: TERM mid-claim — the trap deletes the claim (token-checked), no 98, no ageout penalty =="
+if section "Test 33: TERM mid-claim — the trap deletes the claim (token-checked), no 98, no ageout penalty"; then
 # (a) main: claimant paused inside its claim window (at the touch), TERM'd.
 # The trap must delete OUR claim, run the discovery read (miss: the ghost is
 # foreign), restore traps, re-raise (143) — and must NOT touch the lock.
@@ -1880,8 +2032,9 @@ case "$(uname -s 2>/dev/null)" in
     echo "note: TERM-blocked-unlink leg is Windows-only by construction (POSIX open handles never block unlink)"
     ;;
 esac
+fi
 
-echo "== Test 34: TERM on a STEAL-acquired hold releases exactly like a create-acquired one =="
+if section "Test 34: TERM on a STEAL-acquired hold releases exactly like a create-acquired one"; then
 # All acquisition paths go through the shared claim-the-hold helper, so a
 # steal-acquired holder must run the same HELD/trap machinery: release on
 # TERM, re-raise, 143 (T11's contract, on a steal-acquired hold).
@@ -1904,8 +2057,9 @@ wait "$w34"; rc=$?
 [ "$rc" = 143 ] && ok "TERM'd steal-acquired holder exited 143 (signal re-raised)" || bad "steal-acquired TERM rc=$rc (want 143)"
 [ -e "$LOCK" ] && bad "lock left held after TERM on a steal-acquired hold" || ok "steal-acquired lock released on TERM"
 grep -q "RELEASED" "$LOG" && ok "release logged on the steal-acquired TERM path" || bad "no RELEASED entry for the steal-acquired hold"
+fi
 
-echo "== Test 35: release-time leaked-claim cleanup — displaced hold cleans its own installed leak, 98 =="
+if section "Test 35: release-time leaked-claim cleanup — displaced hold cleans its own installed leak, 98"; then
 # (a) B leaks token L (recheck-unreadable; the ghost vanishes at the same
 # moment), acquires fresh N normally; a rival installs L over the lock,
 # displacing B's held N. B's release must return 98 AND unlink L (the lock
@@ -1998,8 +2152,9 @@ esac
 grep -q "RELEASE-CLEANED-LEAKED-CLAIM" "$LOG" && bad "boundary variant wrongly logged a leaked-claim cleanup" \
                                               || ok "no cleanup line when the re-read backed off"
 rm -f "$LOCK" "$LOCK.next" "$WORK/t35b.succ"
+fi
 
-echo "== Test 36: arc-end resolution pass — an INCONCLUSIVE lock read keeps the entry pending; conclusive ones drop it =="
+if section "Test 36: arc-end resolution pass — an INCONCLUSIVE lock read keeps the entry pending; conclusive ones drop it"; then
 # The pass's entry-drop is gated on one lock-path read. That read resolves
 # the entry ONLY when it is conclusive: a DIFFERENT readable token, or the
 # path definitively absent. A lock PRESENT but unreadable/empty proves
@@ -2057,6 +2212,943 @@ grep -q "DISCOVERY-HOLD (leaked-token memory)" "$LOG" && ok "the surviving entry
 grep -q "resolved tok=tok.leak.t36.2" "$LOG" && ok "conclusive resolution logged for the dropped entry" \
                                              || bad "no resolution log line for the conclusive drop"
 rm -f "$LOCK" "$LOCK.next"
+fi
+
+if section "Test 37: rename-refused — a directory appearing at the lock path mid-steal aborts the steal, no false hold"; then
+# The only acquire/steal VERDICT branch with no test: a NON-regular object (a
+# directory) appears AT the lock path between the claimant's final re-verify
+# (step 3.3, sees a stale FILE) and its rename-over, so the rename is refused
+# with the lock path occupied by a non-file. The claimant must classify this
+# as rename-refused (non-file at the lock path), delete its claim, take NO
+# hold, and re-poll to MAX_WAIT. Steered deterministically by WRAPPING
+# _lock_rename_over (not by shadowing mv — see the note in the steering shell):
+# the wrapper swaps the stale lock FILE for a DIRECTORY at the lock path, then
+# delegates to the real rename-over, which is refused NATURALLY (a non-directory
+# cannot replace a directory) — exactly the wrong-type rename lane. The swap happens
+# only inside the wrapper, so the lock reads as a stale file through step 3.3.
+# Mutation check: an implementation that mis-classifies the refused rename
+# (e.g. treats it as blocked, or proceeds to STOLE-BY-CLAIM) fails the
+# no-false-hold / rename-refused assertions below.
+LOCK="$WORK/renref.lock"; LOG="$WORK/renref.log"; : > "$LOG"
+fabricate_lock "$LOCK" "tok.ghost.t37" "pid=9 host=ghost"; backdate "$LOCK" 9999
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.2 AGENT_LOCK_MAX_WAIT=3 \
+  bash -c '
+    source "$1" || exit 70
+    # Make a DIRECTORY appear at the lock path BEFORE the real rename-over runs,
+    # by wrapping _lock_rename_over (NOT by shadowing mv). It is refused PORTABLY:
+    # GNU "mv -T" refuses to overwrite a directory with a non-directory, AND the
+    # no-mv-T fallback [ -d ] guard (BSD/macOS) refuses it too. A mv shadow that
+    # mkdirs the dir INSIDE the mv call works only on GNU: it lands AFTER the
+    # fallback [ -d ] check, and BSD "mv file dir" MOVES the file INTO the dir
+    # rather than erroring (this failed the macOS CI leg). NB: no apostrophes
+    # here -- this comment lives inside the bash -c single-quoted steering shell.
+    clone_fn _lock_rename_over _ro_orig
+    _lock_rename_over() {
+      command rm -f -- "$AGENT_LOCK_PATH" 2>/dev/null
+      command mkdir -- "$AGENT_LOCK_PATH" 2>/dev/null
+      _ro_orig
+    }
+    lock_acquire
+    exit $?
+  ' _ "$LIB" 2>/dev/null; rc=$?
+[ "$rc" = 97 ] && ok "rename-refused waiter honoured MAX_WAIT (97), never falsely held" \
+               || bad "rename-refused rc=$rc (want 97 — a false hold would exit 0)"
+grep -q "CLAIM-ABORT (rename-refused)" "$LOG" \
+  && ok "CLAIM-ABORT (rename-refused) logged — the wrong-type rename branch was hit" \
+  || bad "no CLAIM-ABORT (rename-refused) — branch not exercised"
+grep -q "non-file at the lock path" "$LOG" \
+  && ok "rename refusal classified as non-file at the lock path" \
+  || bad "missing 'non-file at the lock path' classification wording"
+grep -q "STOLE-BY-CLAIM" "$LOG" \
+  && bad "spurious STOLE-BY-CLAIM — the steal was claimed despite the refused rename" \
+  || ok "no STOLE-BY-CLAIM (no false steal of the directory-occupied path)"
+grep -q "DISCOVERY-HOLD" "$LOG" \
+  && bad "spurious discovery-HOLD — the victim wrongly believed it acquired" \
+  || ok "no spurious discovery-HOLD — ownership-discovery read found no hold"
+grep -q "acquire verification FAILED" "$LOG" \
+  && bad "read-back path entered — the rename was treated as having succeeded" \
+  || ok "rename treated as refused, not as a completed-then-unverified steal"
+[ -e "$LOCK.next" ] \
+  && bad "claim leftover (\$LOCK.next) after the rename-refused abort" \
+  || ok "claim file cleaned up — no leftover \$LOCK.next"
+[ -d "$LOCK" ] \
+  && ok "directory left in place at the lock path (never overwritten)" \
+  || bad "lock path is no longer the squatting directory"
+rm -rf "$LOCK" "$LOCK.next"
+fi
+
+if section "Test 38: step-3.3 pre-rename re-verify abort — claim cleaned, discovery, no false hold"; then
+# The step-2 re-verify (sh:1075) and the step-3.3 re-verify immediately before
+# the rename (sh:1149) are near-identical abort lanes; Test 23/27 exercise the
+# step-2 lane only, leaving 3.3 untested. Steered with a CALL-COUNTER on
+# _lock_verify_stale: call 1 (step-2) passes through to the REAL verdict
+# (stale — the ghost is backdated 9999s), so the steal proceeds PAST step-2;
+# call 2 (step-3.3) freshens the lock first, so the real verify reports "fresh"
+# and the abort fires SPECIFICALLY at step-3.3. The proof is the log suffix
+# "(lock re-verify before rename: fresh)" — step-2's suffix is "after claim",
+# so the string can only be the 3.3 lane. STALE_SECS=30 keeps the freshened
+# ghost fresh long enough that the post-abort re-poll does NOT re-steal before
+# the test removes the lock — so the waiter then acquires via the CREATE race
+# (no second STOLE-BY-CLAIM), the same shape as Test 23.
+LOCK="$WORK/pr33.lock"; LOG="$WORK/pr33.log"; : > "$LOG"
+fabricate_lock "$LOCK" "tok.ghost.t38" "pid=9 host=slow"; backdate "$LOCK" 9999
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=30 \
+  AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \
+  bash -c '
+    source "$1" || exit 70
+    clone_fn _lock_verify_stale _vs_orig
+    N=0
+    _lock_verify_stale() {
+      N=$((N+1))
+      # call 1 = step-2: pass through to the real verdict (stale). call 2 =
+      # step-3.3: freshen the ghost lock so the real verify now sees "fresh",
+      # tripping the pre-rename abort at the 3.3 position.
+      if [ "$N" = 2 ]; then command touch -- "$AGENT_LOCK_PATH"; fi
+      _vs_orig "$@"
+    }
+    lock_acquire || exit 72
+    lock_release || exit 74
+    exit 0
+  ' _ "$LIB" 2>/dev/null &
+w38=$!
+# Proof the 3.3 lane ran AND the steal got PAST step-2: the "before rename"
+# suffix is unique to the step-3.3 position (step-2 logs "after claim").
+wait_for_grep "lock re-verify before rename: fresh" "$LOG" 20 \
+  && ok "step-3.3 pre-rename re-verify aborted (fresh) — got past step-2 to the 3.3 lane" \
+  || bad "no step-3.3 'before rename' abort — the 3.3 lane did not run"
+grep -q "CLAIM-ABORT (fresh) tok=.* (lock re-verify before rename: fresh)" "$LOG" \
+  && ok "CLAIM-ABORT (fresh) logged at the 3.3 position (reason map: fresh)" \
+  || bad "no CLAIM-ABORT (fresh) with the 'before rename' suffix"
+grep -q "lock re-verify after claim" "$LOG" \
+  && bad "the abort fired at step-2 (after claim) — the call-counter let call 1 trip, not the 3.3 lane" \
+  || ok "no step-2 (after claim) abort — call 1 passed; only the 3.3 lane aborted"
+grep -q "STOLE-BY-CLAIM" "$LOG" \
+  && bad "a rename installed the claim — the 3.3 fresh abort did not prevent the steal" \
+  || ok "no STOLE-BY-CLAIM — no rename onto the lock from the aborted attempt"
+grep -q "DISCOVERY-HOLD" "$LOG" \
+  && bad "spurious DISCOVERY-HOLD — the victim wrongly held after the 3.3 abort" \
+  || ok "no false hold — the discovery read ran and the victim did not wrongly hold"
+[ -e "$LOCK.next" ] && bad "claim leftover immediately after the 3.3 fresh abort" \
+                    || ok "claim deleted on the 3.3 fresh abort"
+rm -f "$LOCK"                       # the slow holder releases normally
+wait "$w38"; rc=$?
+[ "$rc" = 0 ] && ok "waiter re-polled past the 3.3 abort, then acquired/released (rc 0)" \
+              || bad "waiter rc=$rc after the slow holder released (want 0)"
+[ -e "$LOCK.next" ] && bad "claim leftover after the waiter finished" || ok "no claim leftover at exit"
+rm -f "$LOCK" "$LOCK.next"
+fi
+
+
+if section "Test 39: foreign claim at recheck — left intact, discovery, no false 98"; then
+# After winning its claim and passing step-2 re-verify, the claimant rechecks
+# its OWN claim file before installing. The `gone` recheck leg is covered (Test
+# 25 recheck-gone / Test 32); the `foreign` leg is NOT: a waiter judged our
+# claim abandoned, cleared it, and a RIVAL re-claimed in its place, so the
+# recheck reads back a FOREIGN token at the claim path. The claimant must then
+# LEAVE the rival's claim alone, run the ownership-discovery read (the lock is
+# still the ghost, not ours -> no hold), and back off to re-poll — never a 98
+# (a mere claim recheck carries NO stolen-lease semantics) and never a deletion
+# of the rival's claim.
+#
+# Steering (Test 24/25 idiom): clone _lock_claim_state and, on the FIRST recheck
+# only (fire-once via a flag FILE so a subshell can't lose the state), overwrite
+# <lock>.next with a fresh-mtime foreign "tok.rival.*" token before delegating
+# to the original — exactly what a waiter-cleared + rival-reclaimed claim path
+# looks like. The original then classifies it `foreign`. CLAIM_STALE is large
+# and MAX_WAIT small so the freshly-planted rival claim is never aged out: it
+# survives, the create on the next poll loses to it, and the waiter times out
+# 97. Mutation check: an implementation that 98'd on a foreign recheck, or that
+# deleted/overwrote the rival's claim, or that false-HELD, fails the asserts.
+LOCK="$WORK/foreign-recheck.lock"; LOG="$WORK/foreign-recheck.log"; : > "$LOG"
+fabricate_lock "$LOCK" "tok.ghost.t39" "pid=9 host=ghost"; backdate "$LOCK" 9999
+SF="$LOCK.steered"; RIVAL="tok.rival.t39.deadbeef"; rm -f "$SF"
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=3 \
+  SF="$SF" RIVAL="$RIVAL" \
+  bash -c '
+    source "$1" || exit 70
+    clone_fn _lock_claim_state _cs_orig
+    _lock_claim_state() {
+      # Fire ONCE, at the post-win recheck of OUR claim: a waiter cleared ours
+      # and a rival re-claimed. Plant the rival token (fresh mtime => not stale)
+      # then classify via the real function.
+      if [ ! -e "$SF" ] && [ "$1" = "$_LOCK_CLAIM_TOKEN" ] \
+         && [ "$_LOCK_CLAIM_PATH" -ef "$AGENT_LOCK_PATH.next" ] 2>/dev/null; then
+        : > "$SF"
+        printf "%s\n%s\n" "$RIVAL" "pid=4242 host=rival" > "$_LOCK_CLAIM_PATH"
+      fi
+      _cs_orig "$@"
+    }
+    lock_acquire
+    exit $?
+  ' _ "$LIB" 2>/dev/null; rc=$?
+
+# The foreign-recheck branch ran (its log line is the proof the leg executed).
+grep -q "claim recheck: foreign token '$RIVAL' at the claim" "$LOG" \
+  && ok "foreign-recheck branch ran (rival token left at the claim, discovery read)" \
+  || bad "no foreign-recheck log line — branch not executed"
+# A mere claim recheck must NEVER report a stolen-lease 98.
+[ "$rc" = 98 ] && bad "false 98 on a foreign CLAIM recheck (no lease was ever held)" \
+              || ok "no false 98 on the foreign claim recheck (rc=$rc)"
+# No hold was ever taken: discovery saw the ghost, not our token.
+grep -q "DISCOVERY-HOLD" "$LOG" && bad "false discovery-HOLD on the foreign recheck" \
+                               || ok "no false hold (ownership-discovery read found the ghost, not ours)"
+grep -q "STOLE-BY-CLAIM" "$LOG" && bad "claimant stole despite a foreign claim at recheck" \
+                                || ok "no STOLE-BY-CLAIM — claimant backed off the foreign claim"
+# The rival's claim file SURVIVES, unmodified (left intact, never deleted).
+[ -e "$LOCK.next" ] && ok "rival's foreign claim file still present (not deleted)" \
+                    || bad "rival's foreign claim was deleted — must be left alone"
+rl1=""; IFS= read -r rl1 2>/dev/null < "$LOCK.next" || true   # 2>/dev/null FIRST: redirects apply left-to-right, so a missing file stays silent
+[ "$rl1" = "$RIVAL" ] && ok "rival's claim token intact (untouched: $rl1)" \
+                      || bad "rival's claim token modified (line1=$rl1, want $RIVAL)"
+grep -q "CLAIM-STALE-CLEARED" "$LOG" && bad "claimant aged-out/cleared the rival's fresh claim" \
+                                     || ok "rival's fresh claim never cleared as stale"
+# Clean outcome: the lock was never acquired; the waiter timed out (97).
+[ "$rc" = 97 ] && ok "waiter re-polled past the foreign claim and timed out cleanly (97)" \
+              || bad "rc=$rc (want 97 — clean re-poll/timeout behind the surviving rival claim)"
+# The ghost lock is untouched (never stolen).
+gl1=""; IFS= read -r gl1 2>/dev/null < "$LOCK" || true   # same ordering: suppress the open error if the ghost is gone
+[ "$gl1" = "tok.ghost.t39" ] && ok "ghost lock untouched by the foreign-recheck backoff" \
+                             || bad "ghost lock modified (line1=$gl1)"
+rm -f "$LOCK" "$LOCK.next" "$SF"
+fi
+
+if section "Test 40: exec-bypass boundary — exec in the lock-holding shell skips release (OOS-5); exec in a child does not"; then
+# `lock_run` runs the wrapped command vector with `"$@"` IN THE WRAPPER SHELL
+# (git-commit-lock.sh), so a command that is itself an `exec` REPLACES the
+# lock-holding wrapper process: the trailing `lock_release` AND the EXIT trap
+# are both skipped, and the lock is left held with no RELEASED logged. This is
+# the one interleaving that can SILENTLY lose an update (guarantees.md OOS-5) —
+# this test pins the exact boundary so a future change to the release/trap
+# wiring can't quietly widen or close it without a red.
+
+# (a1) BYPASS: `run -- exec true` — the wrapped command IS an exec, so it
+# replaces the wrapper. Release + EXIT trap are skipped: lock LEFT, no RELEASED
+# (ACQUIRED proves the hold was taken, so "no RELEASED" means the trap really
+# was bypassed, not that nothing ran).
+LOCK="$WORK/t40.bypass.lock"; LOG="$WORK/t40.bypass.log"; : > "$LOG"
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- exec true; rc=$?
+[ "$rc" = 0 ] && ok "run -- exec true exits 0 (the exec'd command's code)" \
+              || bad "run -- exec true rc=$rc (want 0)"
+grep -q ACQUIRED "$LOG" && ok "run -- exec true did take the lock (ACQUIRED logged)" \
+                        || bad "run -- exec true: no ACQUIRED — the hold never happened, test is vacuous"
+[ -e "$LOCK" ] && ok "run -- exec true LEFT the lock file (release bypassed by exec)" \
+               || bad "run -- exec true: lock released — exec did NOT bypass (boundary changed)"
+grep -q RELEASED "$LOG" && bad "run -- exec true logged RELEASED — the EXIT trap was NOT skipped (boundary changed)" \
+                        || ok "run -- exec true logged NO RELEASED (EXIT trap skipped — OOS-5 boundary)"
+rm -f "$LOCK"
+
+# (a2) CONTROL — NO bypass: `run -- bash -c 'exec true'` — the exec replaces the
+# CHILD, not the wrapper, so the wrapper releases normally: lock GONE, RELEASED
+# logged. The opposite outcome to (a1) is the whole point; assert both so the
+# test documents the exact boundary.
+LOCK="$WORK/t40.child.lock"; LOG="$WORK/t40.child.log"; : > "$LOG"
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'exec true'; rc=$?
+[ "$rc" = 0 ] && ok "run -- bash -c 'exec true' exits 0" \
+              || bad "run -- bash -c 'exec true' rc=$rc (want 0)"
+[ -e "$LOCK" ] && bad "run -- bash -c 'exec true' LEFT the lock — exec in a child must NOT bypass" \
+               || ok "run -- bash -c 'exec true' released the lock (exec in a child does not bypass)"
+grep -q RELEASED "$LOG" && ok "run -- bash -c 'exec true' logged RELEASED (the control: release ran)" \
+                        || bad "run -- bash -c 'exec true' logged NO RELEASED — the control case did not release"
+rm -f "$LOCK"
+
+# (a3) REALISTIC sourced bypass: `lock_acquire; exec true` in a sourcing shell
+# (a subshell so it can't take the suite down) — the holder execs away before
+# release, leaving the lock held. This is the shape a real caller hits if it
+# execs while holding instead of calling lock_release.
+LOCK="$WORK/t40.sourced.lock"; LOG="$WORK/t40.sourced.log"; : > "$LOG"
+( AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash -c '
+    source "$1" || exit 70
+    lock_acquire || exit 72
+    exec true
+  ' _ "$LIB" ); rc=$?
+[ "$rc" = 0 ] && ok "sourced lock_acquire; exec true exits 0" \
+              || bad "sourced lock_acquire; exec true rc=$rc (want 0)"
+[ -e "$LOCK" ] && ok "sourced lock_acquire; exec true LEFT the lock held (release skipped)" \
+               || bad "sourced lock_acquire; exec true released the lock — exec did not bypass"
+grep -q RELEASED "$LOG" && bad "sourced exec-while-holding logged RELEASED — the trap was not skipped" \
+                        || ok "sourced exec-while-holding logged NO RELEASED (release + trap skipped)"
+rm -f "$LOCK"
+
+# (b) SILENT-LOSS boundary: a DISPLACED holder that execs a 0-exit is UNWARNED.
+# Build a holder H that (sourced) acquires, backdates its OWN lock ancient so a
+# contender steals it (H is now displaced — a rival token sits at the path),
+# then execs a 0-exit. Because the exec skips BOTH release and the EXIT trap,
+# the displacement-detection in lock_release NEVER runs: H exits 0 with no
+# WARNING and no 98. This is exactly the documented silent boundary (OOS-5): a
+# non-unwinding exit while displaced cannot report that the hold was not
+# exclusive. (backdate/epoch_to_stamp are export -f'd by the preamble, so the
+# steering shell inherits them.)
+LOCK="$WORK/t40.silent.lock"; LOG="$WORK/t40.silent.log"; : > "$LOG"
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+  AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 bash -c '
+    source "$1" || exit 70
+    lock_acquire || exit 72             # H holds the lock
+    backdate "$2" 9999                  # H'"'"'s own lock now ancient -> instantly stealable
+    # A contender steals it (separate process) — H is displaced once a rival
+    # token lands at the path.
+    AGENT_LOCK_PATH="$2" AGENT_LOCK_LOG="$3" AGENT_LOCK_STALE_SECS=1 \
+      AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 \
+      bash "$1" run -- true
+    exec true                           # H execs 0 — neither release nor trap runs
+  ' _ "$LIB" "$LOCK" "$LOG"; rc=$?
+[ "$rc" = 0 ] && ok "displaced holder's exec-0 exits 0 (no unwinding ran)" \
+              || bad "displaced holder's exec-0 rc=$rc (want 0)"
+grep -q "STOLE-BY-CLAIM" "$LOG" \
+  && ok "the contender genuinely displaced H (STOLE-BY-CLAIM logged) — H WAS displaced" \
+  || bad "no STOLE-BY-CLAIM — H was not actually displaced, the (b) premise is gone"
+grep -q "lock LOST" "$LOG" \
+  && bad "H logged a 'lock LOST' displacement WARNING — the exec did NOT skip release/trap" \
+  || ok "displaced holder's exec-0 emitted NO 'lock LOST' WARNING (silent boundary — OOS-5)"
+grep -q "WARNING" "$LOG" \
+  && bad "an unexpected WARNING was logged by the displaced exec-0 holder" \
+  || ok "displaced holder's exec-0 emitted NO WARNING at all (unwarned silent loss)"
+rm -f "$LOCK"
+fi
+
+if section "Test 41: forward clock jump steals a live lock — detected as 98, never silent (E2)"; then
+# Staleness is age = now - mtime (git-commit-lock.sh ~:928, ~:1409), where `now`
+# is _lock_now. A process whose clock has LEAPED FORWARD computes an inflated age
+# for everyone's lock, so it can judge a LIVE, fresh lock ancient and steal it.
+# This is correctness-safe but liveness-degraded: it degrades into the already-
+# handled robbed-holder lane (Test 4b) — the displaced holder DETECTS the theft
+# at release and exits 98 with a loud WARNING; it never silently double-commits.
+#
+# Steering (no real sleep/backdate): holder H acquires and HOLDS a fresh lock on
+# a NORMAL clock. Waiter W has _lock_now shadowed to return the real now PLUS a
+# large offset (+9999s), so H's just-created lock looks ~9999s old to W and W
+# steals it. STALE=100 means the lock is genuinely fresh under a normal clock
+# (without the jump W would block, never steal — the jump is what's causal);
+# CLAIM_STALE=99999 keeps W's own just-created claim (also judged ~9999s old by
+# W's jumped clock) well under the claim-stale window, so W's recheck does not
+# self-abort (contested) and the steal proceeds to rename.
+LOCK="$WORK/fwdjump.lock"; LOG="$WORK/fwdjump.log"; : > "$LOG"; OUT="$WORK/fwdjump-out"; : > "$OUT"
+READY="$WORK/t41.ready"; TDONE="$WORK/t41.thief-done"
+# Holder H (sourced, NORMAL clock): create+hold a fresh lock, signal READY, hold
+# until told the waiter is done, then release and exit with the release rc.
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=100 \
+  AGENT_LOCK_CLAIM_STALE_SECS=99999 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=120 \
+  bash -c '
+    source "$1" || exit 70
+    lock_acquire || exit 72
+    echo h-work >> "$2"
+    touch "$3"
+    until [ -e "$4" ]; do sleep 0.05; done
+    lock_release
+    exit $?
+  ' _ "$LIB" "$OUT" "$READY" "$TDONE" &
+hpid=$!
+wait_for_file "$READY" || bad "T41 holder never signalled ready (lock not held)"
+# Waiter W (sourced, clock JUMPED +9999s): _lock_now returns real now + offset, so
+# every age it computes is inflated and H's fresh lock reads as ancient. W acquires
+# (by stealing) then releases; run in the FOREGROUND so its rc is captured.
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=100 \
+  AGENT_LOCK_CLAIM_STALE_SECS=99999 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \
+  bash -c '
+    source "$1" || exit 70
+    clone_fn _lock_now _now_orig
+    _lock_now() { echo $(( $(_now_orig) + 9999 )); }
+    lock_acquire || exit 72
+    echo w-work >> "$2"
+    lock_release
+    exit $?
+  ' _ "$LIB" "$OUT"
+wpid_rc=$?
+touch "$TDONE"
+wait "$hpid"; h_rc=$?
+# W judged H's live, fresh lock ancient under the jumped clock and stole it.
+grep -q "STOLE-BY-CLAIM" "$LOG" \
+  && ok "forward-jumped waiter stole a LIVE fresh lock (STOLE-BY-CLAIM)" \
+  || bad "no STOLE-BY-CLAIM — jumped waiter did not steal the live lock"
+[ "$wpid_rc" = 0 ] && ok "thief (its own fresh hold) released cleanly (rc 0)" \
+                   || bad "thief rc=$wpid_rc (its own fresh hold should release 0)"
+grep -q w-work "$OUT" && ok "thief did its work" || bad "thief work missing"
+# The proof: the premature steal was DETECTED, not silent — H exits exactly 98.
+[ "$h_rc" = 98 ] && ok "robbed holder detected the premature steal — exits exactly 98" \
+                 || bad "robbed holder rc=$h_rc (forward-jump steal must degrade to 98, never silent)"
+grep -q "WARNING: lock LOST" "$LOG" \
+  && ok "robbed holder logged a loud theft WARNING (no silent double-commit)" \
+  || bad "no theft WARNING logged for the forward-jump steal"
+rm -f "$LOCK" "$LOCK.next"
+fi
+
+if section "Test 42: mtime unreadable — staleness disabled, fail-safe (no steal), warn-once, 97 (E3)"; then
+# §E3: if the lock file's mtime cannot be read AT ALL (every probe fails on a
+# PRESENT file), staleness detection is BROKEN. The mtime floor fails closed to
+# "fresh": _lock_verify_stale returns state=fresh, so a crashed/stale holder is
+# NEVER stolen — recovery is disabled and waiters block to MAX_WAIT (97). The
+# tool must say so LOUDLY, exactly once per process. The concurrency canary
+# (formerly Test 1, now tests/git-commit-lock.canary.test.sh) only asserts the
+# NEGATIVE (the warning must NOT fire under healthy contention); this drives the
+# positive lane.
+#
+# Steering: shadow _lock_stat_mtime — the INNER single-probe (sh:606, runs
+# stat/date and prints the epoch) — to return EMPTY for the LOCK path while it
+# is PRESENT. We must NOT shadow _lock_path_mtime (sh:629): that is the 3x-retry
+# wrapper that EMITS the warn-once, so shadowing it would remove the very
+# warning we assert. With the inner probe empty on a present file,
+# _lock_path_mtime retries 3x, sees the file present-but-unreadable, fires the
+# warn-once and sets _LOCK_MTIME="" -> _lock_verify_stale -> fresh -> no steal.
+# The shadow returns empty ONLY for the lock path: _lock_stat_mtime is also used
+# for the CLAIM file's mtime (sh:1120/1230), which must keep working, and other
+# paths fall through to the real probe.
+T42_LOCK="$WORK/t42.lock"; T42_LOG="$WORK/t42.log"; T42_ERR="$WORK/t42.err"
+: > "$T42_LOG"; : > "$T42_ERR"
+# A STALE ghost that WOULD normally be stolen (backdated 9999s, well past STALE):
+# the whole point is that it is NOT stolen because its age can't be established.
+fabricate_lock "$T42_LOCK" "tok.ghost.t42.99999" "pid=99999 host=ghost"
+backdate "$T42_LOCK" 9999
+T42_INNER='
+  source "$1" || exit 70
+  clone_fn _lock_stat_mtime _sm_orig
+  # Return EMPTY for the present lock path; defer to the real probe otherwise
+  # (the claim-file mtime at sh:1120/1230 must stay readable).
+  _lock_stat_mtime() {
+    if [ "$1" = "$AGENT_LOCK_PATH" ]; then printf ""; return 0; fi
+    _sm_orig "$@"
+  }
+  lock_acquire; exit $?
+'
+# Tight timing: small MAX_WAIT so the blocked waiter reaches 97 in ~2-3s.
+AGENT_LOCK_PATH="$T42_LOCK" AGENT_LOCK_LOG="$T42_LOG" AGENT_LOCK_STALE_SECS=2 \
+  AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \
+  bash -c "$T42_INNER" _ "$LIB" 2>"$T42_ERR"; t42_rc=$?
+
+# (1) The fail-safe lane ran: the warn-once line appears. It is logged via
+#     _lock_log (lock log) AND echoed to stderr; assert either surface.
+if grep -q "Staleness detection is BROKEN" "$T42_LOG" "$T42_ERR" 2>/dev/null \
+   || grep -q "cannot read the lock file's mtime" "$T42_ERR" 2>/dev/null; then
+  ok "mtime-unreadable: 'Staleness detection is BROKEN' fail-safe warning fired"
+else
+  bad "mtime-unreadable: no broken-staleness warning (fail-safe lane did not run); err=$(cat "$T42_ERR")"
+fi
+# (2) NO steal: the stale ghost is NOT stolen and is left in place.
+if grep -q "STOLE-BY-CLAIM" "$T42_LOG" 2>/dev/null || grep -q "STOLE" "$T42_LOG" 2>/dev/null; then
+  bad "mtime-unreadable: ghost was STOLEN — staleness should have been disabled"
+else
+  ok "mtime-unreadable: no steal (recovery disabled, ghost not stolen)"
+fi
+g42="$(head -n 1 -- "$T42_LOCK" 2>/dev/null | tr -d '\r')"
+[ "$g42" = "tok.ghost.t42.99999" ] \
+  && ok "mtime-unreadable: stale ghost lock left in place (token unchanged)" \
+  || bad "mtime-unreadable: ghost lock disturbed (line1=$g42, want tok.ghost.t42.99999)"
+# (3) The waiter blocks to MAX_WAIT and exits 97 (recovery disabled).
+[ "$t42_rc" = 97 ] \
+  && ok "mtime-unreadable: waiter blocked to MAX_WAIT and exited 97" \
+  || bad "mtime-unreadable: waiter rc=$t42_rc (want 97 — was the stale ghost stolen?)"
+# (4) Warn-once: the broken-staleness warning must never REPEAT — the check below allows AT MOST one occurrence on stderr (a count of 0 also passes).
+t42_warns="$(grep -c "Staleness detection is BROKEN" "$T42_ERR" 2>/dev/null)"; t42_warns="${t42_warns:-0}"
+[ "$t42_warns" -le 1 ] \
+  && ok "mtime-unreadable: broken-staleness warning fired at most once on stderr ($t42_warns)" \
+  || bad "mtime-unreadable: warning repeated ($t42_warns times — warn-once broken)"
+rm -f "$T42_LOCK" "$T42_LOCK.next"
+fi
+
+if section "Test 43: malformed/unreadable lock content at the poll guard — never stolen, warned/skipped"; then
+# Two sibling branches of the in-acquire steal CONTENT GUARD (git-commit-lock.sh
+# ~:1419-1444), both gated on an already-stale candidate, neither of which the
+# torn/empty/tok.-prefixed cases (Tests 17/18) reach:
+#   (a) #18 — line 1 is NON-EMPTY but BLANK (whitespace/CR only): the trim at
+#       :1421 reduces it to empty, but the file is NOT empty (`-s` true) and the
+#       read SUCCEEDED, so it lands in the final `else` -> _lock_warn_nonlock
+#       "its content is not lock-shaped" (the `is not a lock file` config
+#       warning). NO steal; waiters reach 97.
+#   (b) #17 — the content read FAILS on a present, non-empty regular file (the
+#       `[ "$rdrc" -ne 0 ]` lane at :1432): logs "steal skipped: stale lock
+#       content unreadable"; NO steal; waiters reach 97. We can't make a real
+#       file unreadable on every platform (a chmod-000 file still reads for its
+#       owner on Windows/Cygwin), so we STEER it: source the lib in-process and
+#       shadow the `read` builtin to fail ONLY for the inline steal-guard read,
+#       identified by its direct caller `lock_acquire` (FUNCNAME[1]) — the
+#       _lock_read_tok / _lock_verify_stale reads delegate to `builtin read`, so
+#       only the :1420 site is perturbed.
+
+# (a) #18 — whitespace-only line 1: non-empty, blank, read OK -> never stolen, warned.
+LOCK="$WORK/t43blank.lock"; LOG="$WORK/t43blank.log"; : > "$LOG"
+printf ' \n' > "$LOCK"; backdate "$LOCK" 9999          # one space + LF: non-empty, blank line 1
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+  AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \
+  bash "$LIB" run -- bash -c 'true' 2> "$WORK/t43a.err"; rc=$?
+[ "$rc" = 97 ] && ok "#18 blank line 1: waiter timed out (97) instead of stealing" \
+               || bad "#18 blank line 1: rc=$rc (want 97)"
+grep -q "is not a lock file" "$WORK/t43a.err" \
+  && ok "#18 config warning fired (line 1 not lock-shaped)" || bad "#18 no config warning for blank line 1"
+grep -q "non-lock object at lock path (its content is not lock-shaped)" "$LOG" \
+  && ok "#18 log records the non-lock-shaped classification (branch ran)" \
+  || bad "#18 missing the non-lock-shaped log line (branch did not run)"
+grep -q "STOLE" "$LOG" && bad "#18 blank-content file was STOLEN" || ok "#18 no steal of the blank-content file"
+[ -f "$LOCK" ] && ok "#18 blank-content file left in place" || bad "#18 blank-content file was removed"
+rm -f "$LOCK"
+
+# (b) #17 — steal-guard content read FAILS on a present, non-empty file.
+# Steering shell: source the lib, shadow the `read` builtin to fail ONLY when
+# invoked directly by lock_acquire (the inline steal read at sh:1420). The ghost
+# is tok.-prefixed and ancient, so absent the shadow it WOULD be stolen — the
+# 97 outcome plus the "steal skipped ... unreadable" line prove the failed-read
+# lane (not some other refusal) is what blocked the steal.
+LOCK="$WORK/t43unread.lock"; LOG="$WORK/t43unread.log"; : > "$LOG"
+fabricate_lock "$LOCK" "tok.ghost.t43" "pid=9 host=ghost"; backdate "$LOCK" 9999
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+  AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \
+  bash -c '
+    source "$1" || exit 70
+    # Shadow the read builtin; reach the real one via `builtin read`. Fail only
+    # the steal-guard read (its direct caller is lock_acquire) so the
+    # _lock_read_tok / _lock_verify_stale reads stay intact.
+    read() {
+      if [ "${FUNCNAME[1]:-}" = lock_acquire ]; then return 1; fi
+      builtin read "$@"
+    }
+    lock_acquire || exit $?    # propagate the REAL acquire status: a hard-coded 97 would make the rc assertion below vacuous
+    lock_release || exit 74
+    exit 0
+  ' _ "$LIB" 2> "$WORK/t43b.err"; rc=$?
+[ "$rc" = 97 ] && ok "#17 unreadable steal content: waiter timed out (97) instead of stealing" \
+               || bad "#17 unreadable steal content: rc=$rc (want 97)"
+grep -q "steal skipped: stale lock content unreadable" "$LOG" \
+  && ok "#17 log records the skipped steal (unreadable branch ran)" \
+  || bad "#17 missing the 'steal skipped ... unreadable' log line (branch did not run)"
+grep -q "STOLE" "$LOG" && bad "#17 ghost was STOLEN despite the unreadable content read" \
+                       || ok "#17 no steal while the steal-guard read fails"
+[ -f "$LOCK" ] && ok "#17 stale ghost left in place" || bad "#17 stale ghost was removed"
+rm -f "$LOCK"
+fi
+
+if section "Test 44: socket & device-node at the lock path — never stolen/deleted, refused (97)"; then
+# The never-steal wrong-type guard (git-commit-lock.sh ~:1557-1567) classifies
+# NON-regular objects at the lock path so they are NEVER stolen and NEVER
+# deleted: a real config error (a typo'd AGENT_LOCK_PATH, a stray special file)
+# must wedge waiters to 97 with a loud one-time config warning, not get
+# clobbered. Test 17 covers the directory / symlink / FIFO arms of that
+# classifier; this test covers the two remaining arms — the SOCKET (-S) and the
+# DEVICE NODE (-b/-c) — both of which name their detected type in the warning.
+# For each: rc 97, the object survives unchanged (same type), the warning fires
+# naming the type, and nothing is ever stolen.
+
+# (a) a UNIX-DOMAIN SOCKET at the lock path. Fabricated with a backgrounded
+# python3 AF_UNIX bind (the socket inode persists while the process holds it).
+# Skipped wherever a real socket cannot be both created and classified -S by
+# the running shell — notably default Git-Bash on Windows, whose bundled python
+# is a native build with no socket.AF_UNIX (probed: bind raises AttributeError,
+# so no inode appears). CI's POSIX legs exercise this arm. The listener is
+# reaped by its EXACT pid at the end (never by name).
+LOCK="$WORK/sock.lock"; LOG="$WORK/sock.log"; : > "$LOG"
+SOCKERR="$WORK/sock.py.err"; sock_pid=""; sock_ok=0
+if command -v python3 >/dev/null 2>&1; then
+  rm -f "$LOCK"
+  python3 -c 'import socket,sys,time
+s=socket.socket(socket.AF_UNIX)
+s.bind(sys.argv[1])
+sys.stderr.write("bound\n"); sys.stderr.flush()
+time.sleep(30)' "$LOCK" 2> "$SOCKERR" &
+  sock_pid=$!   # exact pid of the listener; the only handle used to reap it below
+  # Gate on the socket actually existing AND classifying -S (not just the pid
+  # being alive): on a no-AF_UNIX build the process exits immediately with no
+  # inode, so we must positively confirm the object before relying on it.
+  for _ in $(seq 1 100); do
+    [ -S "$LOCK" ] && { sock_ok=1; break; }
+    kill -0 "$sock_pid" 2>/dev/null || break   # listener is gone: stop polling for the socket
+    sleep 0.05
+  done
+fi
+if [ "$sock_ok" = 1 ]; then
+  AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+    AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=3 \
+    bash "$LIB" run -- bash -c 'true' 2> "$WORK/t44a.err"; rc=$?
+  [ "$rc" = 97 ] && ok "socket at lock path: waiter timed out (97), command never ran" \
+                 || bad "socket at lock path: rc=$rc (want 97)"
+  [ -S "$LOCK" ] && ok "socket untouched (never stolen/deleted, still a socket)" \
+                 || bad "socket at lock path was removed/replaced!"
+  grep -q "is not a lock file" "$WORK/t44a.err" && ok "loud config warning on stderr (socket)" \
+                                                || bad "no config warning for socket at lock path"
+  grep -q "it is a socket" "$WORK/t44a.err" && ok "warning names the detected type (socket)" \
+                                            || bad "warning does not name the socket type"
+  n="$(grep -c "is not a lock file" "$WORK/t44a.err")"
+  [ "$n" = 1 ] && ok "socket config warning fired exactly once per process (got $n)" \
+               || bad "socket config warning fired $n times (want 1)"
+  grep -q STOLE "$LOG" && bad "socket was STOLEN" || ok "no steal attempted on a socket"
+else
+  echo "note: cannot create a unix-domain socket here (no socket.AF_UNIX / not classified -S) — socket guard not exercised (CI POSIX legs cover it)"
+fi
+# Reap the listener by ITS exact pid only (bounded wait, then hard-kill of the
+# same pid as a last resort) — never by name. Harmless if it already exited.
+if [ -n "$sock_pid" ]; then
+  kill "$sock_pid" 2>/dev/null
+  for _ in $(seq 1 40); do kill -0 "$sock_pid" 2>/dev/null || break; sleep 0.05; done
+  kill -0 "$sock_pid" 2>/dev/null && kill -9 "$sock_pid" 2>/dev/null
+  wait "$sock_pid" 2>/dev/null
+fi
+rm -f "$LOCK"
+
+# (b) a DEVICE NODE at the lock path. mknod needs root, but /dev/null is a
+# character device that always exists, so we point AGENT_LOCK_PATH straight at
+# it: the -c arm of the classifier must refuse it. This is SAFE precisely
+# because the guard refuses — it is never opened-for-write, stolen, or deleted —
+# which the post-run assertion below proves (/dev/null is still a char device).
+# Skipped only if /dev/null somehow isn't a char device on this platform.
+if [ -c /dev/null ]; then
+  LOG="$WORK/dev.log"; : > "$LOG"
+  AGENT_LOCK_PATH="/dev/null" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \
+    AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=3 \
+    bash "$LIB" run -- bash -c 'true' 2> "$WORK/t44b.err"; rc=$?
+  [ "$rc" = 97 ] && ok "device node (/dev/null) at lock path: waiter timed out (97), command never ran" \
+                 || bad "device node at lock path: rc=$rc (want 97)"
+  [ -c /dev/null ] && ok "/dev/null untouched (never stolen/deleted, still a char device)" \
+                   || bad "/dev/null was damaged — the guard must NEVER touch a device node!"
+  grep -q "is not a lock file" "$WORK/t44b.err" && ok "loud config warning on stderr (device node)" \
+                                                || bad "no config warning for device node at lock path"
+  grep -q "it is a device node" "$WORK/t44b.err" && ok "warning names the detected type (device node)" \
+                                                 || bad "warning does not name the device-node type"
+  n="$(grep -c "is not a lock file" "$WORK/t44b.err")"
+  [ "$n" = 1 ] && ok "device-node config warning fired exactly once per process (got $n)" \
+               || bad "device-node config warning fired $n times (want 1)"
+  grep -q STOLE "$LOG" && bad "device node was STOLEN" || ok "no steal attempted on a device node"
+else
+  echo "note: /dev/null is not a char device here — device-node guard not exercised (CI POSIX legs cover it)"
+fi
+fi
+
+
+if section "Test 45: log self-truncates past ~1 MB (rotation, not unbounded growth)"; then
+# _lock_log starts the log over (truncates in place; it does not rotate) once
+# the log grows past ~1MB: the size check at the top of _lock_log empties the
+# file before the write, so a normal log-producing op on an oversized log leaves
+# a small, well-formed log carrying only the fresh protocol lines. Pre-fill
+# > 1MB, run one clean acquire+release, assert the log SHRANK and the lock worked.
+LOCK="$WORK/t45.lock"; LOG="$WORK/t45.log"
+# Pre-fill comfortably above the 1048576-byte (1MB) threshold (~1.2MB of 'x').
+head -c 1200000 /dev/zero | tr '\0' 'x' > "$LOG"
+before=$(wc -c < "$LOG")   # log size in bytes before the lock op
+[ "$before" -gt 1048576 ] && ok "pre-fill exceeds the 1MB threshold (${before} bytes)" \
+                          || bad "pre-fill not over threshold (${before} bytes)"
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'true'; rc=$?
+[ "$rc" = 0 ] && ok "lock op succeeded over an oversized log (rc=0)" \
+             || bad "lock op rc=$rc over oversized log (want 0)"
+after=$(wc -c < "$LOG")    # log size in bytes after the lock op
+# Truncation fired iff the log is now far below the threshold (it holds only a
+# handful of fresh lines). Use 1MB as the boundary: any non-truncation leaves
+# it at/above the 1.2MB pre-fill.
+[ "$after" -lt 1048576 ] && ok "log shrank below threshold after the op (${before} -> ${after} bytes — rotation fired)" \
+                         || bad "log did NOT shrink (${before} -> ${after} bytes — truncation never fired)"
+# Well-formed: the new log carries the fresh protocol lines, not the old giant
+# 'x' content, and records the truncation.
+grep -q 'log exceeded 1MB; truncated' "$LOG" && ok "log records the self-truncation notice" \
+                                             || bad "no truncation notice in the restarted log"
+grep -q 'ACQUIRED' "$LOG" && grep -q 'RELEASED' "$LOG" \
+  && ok "restarted log carries fresh ACQUIRED + RELEASED protocol lines" \
+  || bad "restarted log missing fresh protocol lines (ACQUIRED/RELEASED)"
+grep -q 'xxxx' "$LOG" && bad "old oversized 'x' content survived into the restarted log" \
+                      || ok "old oversized content is gone (clean restart, not appended)"
+[ -e "$LOCK" ] && bad "lock left held after run" || ok "lock released after the over-threshold run"
+rm -f "$LOCK" "$LOG"
+fi
+
+if section "Test 46: EXIT while waiting (no hold) — no-hold trap arc, no spurious release"; then
+# Covers _lock_on_exit's no-hold arc-end (sh:1009,1017-1018).
+# A sourced waiter, blocked in the wait loop against a LIVE held lock, exits 0
+# while still parked — the EXIT trap is STILL '_lock_on_exit' (the timeout's
+# trap-restore has NOT run, because we never time out), so EXIT fires the
+# handler on the NO-HOLD path: claim-trap cleanup (no token => no-op),
+# leaked-resolve, restore traps. NO release semantics may run (we never held).
+#
+# Why interposition and not "lock_acquire times out 97 then exit": the 97
+# timeout path itself runs _lock_restore_traps BEFORE returning, so by the time
+# the caller exits the EXIT trap is already gone and _lock_on_exit never fires
+# (verified: post-97 `trap -p EXIT` is empty). To exercise the EXIT-while-
+# WAITING arc the process must leave the loop via `exit` with the trap still
+# armed — so W shadows `sleep` (called once per poll inside the wait loop) to
+# park on a marker, then `exit 0` from inside that first poll-sleep. At that
+# point _LOCK_HELD=0 and no claim is in flight (the live lock is never stale, so
+# no steal/claim was attempted), which is exactly the no-hold arc.
+T46_INNER='
+  source "$1" || exit 70
+  F46=0
+  sleep() {
+    if [ "$F46" = 0 ]; then
+      F46=1
+      command touch "$T46R"                 # signal: parked in the wait loop
+      until [ -e "$T46G" ]; do command sleep 0.05; done
+      # Record the live EXIT trap so the assertions can prove _lock_on_exit
+      # (not a bare/restored trap) is what fires on the exit below.
+      trap -p EXIT > "$T46T"
+      exit 0                                  # EXIT while waiting, no hold held
+    fi
+    command sleep "$@"
+  }
+  lock_acquire
+  echo "REACHED-UNEXPECTED rc=$?" >&2        # the shadowed sleep must exit first
+'
+LOCK="$WORK/exitwait.lock"; LOG="$WORK/exitwait.log"; : > "$LOG"
+HLOG="$WORK/exitwait.h.log"; : > "$HLOG"
+T46R="$WORK/t46.ready"; T46G="$WORK/t46.go"; T46T="$WORK/t46.trap"
+rm -f "$T46R" "$T46G" "$T46T" "$LOCK" "$LOCK.next"
+# H: holder — sourced, takes a FRESH live lock and parks until released. STALE is
+# huge so the lock is never judged stealable; W therefore stays a pure waiter.
+HR="$WORK/t46.hready"; HG="$WORK/t46.hgo"; rm -f "$HR" "$HG"
+HR="$HR" HG="$HG" \
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$HLOG" AGENT_LOCK_STALE_SECS=600 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \
+  bash -c '
+    source "$1" || exit 70
+    lock_acquire || exit 72
+    touch "$HR"
+    until [ -e "$HG" ]; do sleep 0.05; done
+    lock_release
+  ' _ "$LIB" 2>/dev/null &
+h46=$!
+wait_for_file "$HR" 30 || bad "T46 holder never acquired the lock"
+htok=""; IFS= read -r htok 2>/dev/null < "$LOCK" || true   # the live holder's token (2> first: see below)
+# W: the waiter that will exit while parked in the wait loop (no hold).
+T46R="$T46R" T46G="$T46G" T46T="$T46T" \
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=600 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \
+  bash -c "$T46_INNER" _ "$LIB" 2>/dev/null &
+w46=$!
+# Gate on W proving it reached the wait-loop poll (its WAITING line is logged,
+# and its shadowed sleep touched the ready marker) before releasing it to exit.
+wait_for_grep "WAITING for lock" "$LOG" 30 || bad "T46 waiter never logged WAITING"
+wait_for_file "$T46R" 30 || bad "T46 waiter never reached its wait-loop poll"
+touch "$T46G"
+wait "$w46"; rc=$?
+# Core assertion: W exited cleanly via the EXIT no-hold arc, with NO release
+# semantics — it never held the lock, so a RELEASED or a 98/'lock LOST' would
+# mean the handler wrongly ran the holding branch.
+[ "$rc" = 0 ] && ok "waiter exited 0 via the EXIT-while-waiting no-hold arc" \
+              || bad "T46 waiter rc=$rc (want 0; EXIT trap mishandled the no-hold arc?)"
+grep -q RELEASED "$LOG" && bad "spurious RELEASED on the no-hold EXIT arc (release ran without a hold)" \
+                        || ok "no RELEASED on the no-hold EXIT arc (no release semantics)"
+grep -q "lock LOST" "$LOG" && bad "98-classification ran on the no-hold EXIT arc" \
+                           || ok "no 98 classification on the no-hold EXIT arc"
+# The trap that fired was our handler, not a bare/restored one — this is the
+# discriminator that the EXIT-WHILE-WAITING arc ran (vs a post-97 exit, where
+# the trap is already empty). Mirrors Test 12d's trap-restoration idiom.
+grep -q "_lock_on_exit" "$T46T" && ok "EXIT trap still armed as _lock_on_exit at exit (no-hold arc, not post-97)" \
+                                || bad "EXIT trap was not _lock_on_exit at exit (got: $(cat "$T46T" 2>/dev/null))"
+# The waiter left no claim behind (it never claimed — the live lock is not stale).
+[ -e "$LOCK.next" ] && bad "waiter left a claim file behind on the no-hold EXIT arc" \
+                    || ok "no leftover claim from the no-hold EXIT waiter"
+# H's lock is untouched — still the holder's original token, still held.
+l1=""; IFS= read -r l1 2>/dev/null < "$LOCK" || true   # redirections apply left-to-right: 2> must precede <
+[ -n "$htok" ] && [ "$l1" = "$htok" ] && ok "holder's lock untouched by the dying waiter (token intact)" \
+                                      || bad "holder's lock changed by the dying waiter (was=$htok now=$l1)"
+# Release H and confirm it shut down cleanly (no fallout from W's exit).
+touch "$HG"; wait "$h46" 2>/dev/null
+grep -q "lock LOST" "$HLOG" && bad "holder saw a stolen lease (98) — the waiter's exit disturbed the hold" \
+                            || ok "holder released its still-held lock cleanly (no 98)"
+rm -f "$LOCK" "$LOCK.next" "$T46R" "$T46G" "$T46T" "$HR" "$HG"
+fi
+
+if section "Test 47: no-mv-T rename-over fallback (BSD/macOS lane) forced via _LOCK_MVT=0 — steal still installs"; then
+# _lock_rename_over (git-commit-lock.sh ~:961-979) probes once for GNU `mv -T`
+# and caches the verdict in _LOCK_MVT (""=unprobed, 1=supported, 0=not). On
+# Linux/MINGW the probe ALWAYS picks `mv -T`, so the no-`-T` fallback lane
+# (~:976-977: a last-instant `[ -d "$dst" ]` guard + a bare `mv`) is NEVER
+# executed in CI except on a real BSD/macOS runner. Pre-seeding _LOCK_MVT=0 in
+# the sourced steal shell BEFORE any acquire makes the `[ -z "$_LOCK_MVT" ]`
+# probe short-circuit (the var is already non-empty), forcing the fallback on
+# the common leg. Two scenarios:
+#   (a) a normal steal of a stale ghost under _LOCK_MVT=0 installs the lock via
+#       the unlink-free bare-`mv` fallback (STOLE-BY-CLAIM, the steal acquires);
+#   (b) a DIRECTORY squatting the lock path under _LOCK_MVT=0 is refused by the
+#       fallback's `[ -d ]` last-instant guard (no clobber) — the fallback-path
+#       analogue of Test 37's `mv -T` natural refusal.
+# Determinism proof that the fallback truly ran (not GNU `mv -T`): scenario (a)
+# shadows `mv` to record, per invocation touching ".next", whether `-T` was
+# passed; under _LOCK_MVT=0 the steal's claim->lock rename MUST be a bare `mv`
+# (no `-T`). A control run WITHOUT the override is asserted to still steal, so a
+# pass cannot come from the override having silently broken the steal entirely.
+
+# ---- (a) forced-fallback steal of a stale ghost: STOLE-BY-CLAIM via bare mv ----
+LOCK="$WORK/mvt0.lock"; LOG="$WORK/mvt0.log"; : > "$LOG"
+MVTRACE="$WORK/mvt0.mvtrace"; : > "$MVTRACE"
+fabricate_lock "$LOCK" "tok.ghost.t47" "pid=9 host=ghost"; backdate "$LOCK" 9999
+# Sourced steal shell: pre-seed _LOCK_MVT=0, shadow `mv` to log the flags it was
+# called with on the ".next" (claim->lock) rename, then call the real `mv`.
+AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 \
+  bash -c '
+    source "$1" || exit 70
+    _LOCK_MVT=0                                  # force the no-mv-T fallback lane
+    export MVTRACE_PATH="$2"                     # pass the trace path into mv() via env
+    mv() {
+      case "$*" in
+        *".next"*) printf "%s\n" "$*" >> "$MVTRACE_PATH" ;;  # record claim->lock rename flags
+      esac
+      command mv "$@"
+    }
+    lock_acquire || exit 72
+    lock_release || exit 74
+    exit 0
+  ' _ "$LIB" "$MVTRACE" 2>/dev/null; rc=$?
+[ "$rc" = 0 ] && ok "T47(a): forced-fallback steal acquired+released rc 0 (_LOCK_MVT=0)" \
+              || bad "T47(a): forced-fallback steal rc=$rc (want 0)"
+grep -q "STOLE-BY-CLAIM" "$LOG" \
+  && ok "T47(a): stale ghost stolen via the no-mv-T fallback (STOLE-BY-CLAIM logged)" \
+  || bad "T47(a): no STOLE-BY-CLAIM under _LOCK_MVT=0 — fallback did not install the lock"
+grep -q "ACQUIRED" "$LOG" && grep -q "RELEASED" "$LOG" \
+  && ok "T47(a): fallback steal produced a clean ACQUIRED/RELEASED pair" \
+  || bad "T47(a): missing ACQUIRED/RELEASED after the fallback steal"
+# The mv trace proves the fallback lane (bare mv, no -T) carried the claim->lock
+# rename. Match -T only as an option WORD: the traced paths may contain "-T" too.
+[ -s "$MVTRACE" ] \
+  && ok "T47(a): claim->lock rename went through the shadowed mv (trace non-empty)" \
+  || bad "T47(a): no .next rename recorded — the steal did not rename-over as expected"
+if grep -qE -- '(^| )-[A-Za-z]*T[A-Za-z]*( |$)' "$MVTRACE"; then
+  bad "T47(a): claim->lock rename used 'mv -T' — the GNU fast path ran, fallback NOT forced"
+else
+  ok "T47(a): claim->lock rename used a BARE mv (no -T) — the BSD/macOS fallback lane was taken"
+fi
+{ [ -e "$LOCK" ] || [ -e "$LOCK.next" ]; } \
+  && bad "T47(a): leftover lock/claim after the fallback steal+release" \
+  || ok "T47(a): clean final state (no lock, no claim) after fallback steal+release"
+
+# ---- (a-control) same steal WITHOUT the override still succeeds ----
+# Guards against a false pass where _LOCK_MVT=0 silently broke the steal: the
+# unmodified library must steal the identical ghost too (here via mv -T).
+LOCKC="$WORK/mvt0c.lock"; LOGC="$WORK/mvt0c.log"; : > "$LOGC"
+fabricate_lock "$LOCKC" "tok.ghost.t47c" "pid=9 host=ghost"; backdate "$LOCKC" 9999
+AGENT_LOCK_PATH="$LOCKC" AGENT_LOCK_LOG="$LOGC" AGENT_LOCK_STALE_SECS=2 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 \
+  bash -c 'source "$1" || exit 70; lock_acquire || exit 72; lock_release || exit 74; exit 0' \
+  _ "$LIB" 2>/dev/null; rcc=$?
+[ "$rcc" = 0 ] && grep -q "STOLE-BY-CLAIM" "$LOGC" \
+  && ok "T47(a-control): unmodified steal of the same ghost also succeeds (override didn't trivially break it)" \
+  || bad "T47(a-control): control steal rc=$rcc / no STOLE-BY-CLAIM (the (a) pass may be vacuous)"
+
+# ---- (b) directory at the lock path under _LOCK_MVT=0: [ -d ] guard refuses ----
+# The fallback's last-instant `[ -d "$dst" ]` guard (sh:976) must refuse to
+# rename a file over a directory — Test 37's no-clobber outcome, reached via the
+# fallback rather than `mv -T`'s natural directory refusal. Test 37 shadows `mv`
+# so the directory appears just before the real `mv -T` refuses it; that timing
+# does NOT exercise the fallback's `[ -d ]` because the swap lands AFTER the
+# library has already passed line 976. To hit the fallback guard itself we wrap
+# `_lock_rename_over`: the wrapper installs the directory and pins _LOCK_MVT=0,
+# THEN calls the unmodified original — whose own `[ -d "$dst" ]` check (line 976)
+# now sees the directory and returns 1, with NO library `mv`/`mv -T` ever run.
+# The verifies (step 3.3) ran before the wrapper, so they saw a stale FILE; the
+# directory exists only from the wrapper's first line onward. This is the
+# fallback-lane analogue of Test 37's wrong-type refusal.
+LOCKB="$WORK/mvt0dir.lock"; LOGB="$WORK/mvt0dir.log"; : > "$LOGB"
+fabricate_lock "$LOCKB" "tok.ghost.t47b" "pid=9 host=ghost"; backdate "$LOCKB" 9999
+AGENT_LOCK_PATH="$LOCKB" AGENT_LOCK_LOG="$LOGB" AGENT_LOCK_STALE_SECS=1 \
+  AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.2 AGENT_LOCK_MAX_WAIT=3 \
+  bash -c '
+    source "$1" || exit 70
+    clone_fn _lock_rename_over _ro_orig
+    _lock_rename_over() {
+      # Land a DIRECTORY at the lock path, then force the fallback lane and run
+      # the REAL rename-over: its own `[ -d ]` guard (sh:976) must refuse (rc 1).
+      command rm -f -- "$AGENT_LOCK_PATH" 2>/dev/null
+      command mkdir -- "$AGENT_LOCK_PATH" 2>/dev/null
+      _LOCK_MVT=0
+      _ro_orig "$@"   # forward the caller args (if any) so the original sees its real inputs
+    }
+    lock_acquire
+    exit $?
+  ' _ "$LIB" 2>/dev/null; rcb=$?
+[ "$rcb" = 97 ] && ok "T47(b): fallback [ -d ] guard refused; waiter honoured MAX_WAIT (97), no false hold" \
+               || bad "T47(b): rc=$rcb (want 97 — a clobber/false hold would differ)"
+grep -q "CLAIM-ABORT (rename-refused)" "$LOGB" \
+  && ok "T47(b): CLAIM-ABORT (rename-refused) logged — fallback guard hit the wrong-type lane" \
+  || bad "T47(b): no CLAIM-ABORT (rename-refused) — fallback guard branch not exercised"
+grep -q "non-file at the lock path" "$LOGB" \
+  && ok "T47(b): refusal classified as non-file at the lock path" \
+  || bad "T47(b): missing 'non-file at the lock path' classification"
+grep -q "STOLE-BY-CLAIM" "$LOGB" \
+  && bad "T47(b): spurious STOLE-BY-CLAIM — the directory-occupied path was falsely stolen" \
+  || ok "T47(b): no STOLE-BY-CLAIM (the [ -d ] guard prevented a false steal)"
+[ -d "$LOCKB" ] \
+  && ok "T47(b): directory left in place at the lock path (never clobbered by the fallback mv)" \
+  || bad "T47(b): lock path no longer the squatting directory — the guard failed to protect it"
+[ -e "$LOCKB.next" ] \
+  && bad "T47(b): claim leftover (\$LOCK.next) after the fallback rename-refused abort" \
+  || ok "T47(b): claim file cleaned up — no leftover \$LOCK.next"
+rm -rf "$LOCK" "$LOCK.next" "$LOCKC" "$LOCKC.next" "$LOCKB" "$LOCKB.next"
+fi
+
+
+if section "Test 48: unwritable lock dir -> clean 97, command never runs, no false hold (F4)"; then
+# F4 (failure-modes.md §4 item 5): a read-only / unwritable lock-dir parent makes the
+# O_EXCL create fail every poll, so the waiter times out at 97 — no corruption, no
+# false hold, and the wrapped command never runs. Needs chmod 0555 to really deny
+# writes: it does not on Git-Bash/NTFS, nor for uid 0 (root bypasses directory
+# permissions), so skip-with-note there; the non-root Linux/macOS CI legs exercise it.
+case "$(uname -s):$(id -u 2>/dev/null)" in
+  MINGW*|MSYS*|CYGWIN*|*:0)
+    echo "note: Test 48 skipped on Windows / as root — chmod 0555 does not deny writes there; the non-root POSIX CI legs cover it" ;;
+  *)
+    T48DIR="$WORK/t48.nowrite"; T48LOG="$WORK/t48.log"; mkdir -p "$T48DIR"; : > "$T48LOG"
+    T48MARK="$WORK/t48.ran"; rm -f "$T48MARK"
+    chmod 0555 "$T48DIR"
+    AGENT_LOCK_PATH="$T48DIR/commit.lock" AGENT_LOCK_LOG="$T48LOG" \
+      AGENT_LOCK_STALE_SECS=1 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \
+      bash "$LIB" run -- bash -c "touch '$T48MARK'" 2> "$WORK/t48.err"; rc=$?
+    [ "$rc" = 97 ] && ok "F4 unwritable lock dir: waiter timed out (97)" \
+                   || bad "F4 unwritable lock dir: rc=$rc (want 97)"
+    [ ! -e "$T48MARK" ] && ok "F4: the wrapped command never ran" \
+                        || bad "F4: the wrapped command ran despite no lock"
+    [ ! -e "$T48DIR/commit.lock" ] && ok "F4: no lock file created in the unwritable dir" \
+                                   || bad "F4: a lock file appeared in an unwritable dir"
+    grep -q "WAITING for lock" "$T48LOG" && ok "F4: logged WAITING (the create kept failing)" \
+                                         || bad "F4: no WAITING log"
+    grep -q "TIMEOUT after" "$T48LOG" && ok "F4: logged the TIMEOUT" || bad "F4: no TIMEOUT log"
+    chmod 0755 "$T48DIR" 2>/dev/null; rm -rf "$T48DIR"   # restore so cleanup() can rm -rf $WORK
+    ;;
+esac
+fi
+
+if section "Test 49: failing log path -> lock still works, the log write is swallowed (F2/J1)"; then
+# F2/J1 (failure-modes.md §4 item 5): logging is best-effort (every write ends || true).
+# Point AGENT_LOCK_LOG under a REGULAR FILE so every append/open fails ENOTDIR — the
+# lock must still acquire+release cleanly (rc 0) with the log write swallowed.
+# Portable (no chmod/perms). NOTE: bash's redirection-OPEN failure leaks to stderr
+# (the ||true is on the write, not the open), so stderr is discarded rather than
+# asserted clean; the log is NOT grepped either (nothing is ever written to it).
+T49P="$WORK/t49.notadir"; : > "$T49P"          # a regular FILE; using it as a dir -> ENOTDIR
+T49LOG="$T49P/x.log"                            # every open/append under it fails ENOTDIR
+T49MARK="$WORK/t49.ran"; rm -f "$T49MARK"
+AGENT_LOCK_PATH="$WORK/t49.lock" AGENT_LOCK_LOG="$T49LOG" \
+  bash "$LIB" run -- bash -c "touch '$T49MARK'" 2>/dev/null; rc=$?
+[ "$rc" = 0 ] && ok "F2/J1 failing log: lock acquired+released, command ran (rc 0)" \
+             || bad "F2/J1 failing log: rc=$rc (want 0 — a bad log must not fail the lock)"
+[ -e "$T49MARK" ] && ok "F2/J1: the wrapped command ran" \
+                  || bad "F2/J1: the wrapped command did not run"
+[ ! -e "$WORK/t49.lock" ] && ok "F2/J1: lock released/cleaned up despite the failing log" \
+                          || bad "F2/J1: lock left behind"
+[ ! -e "$T49LOG" ] && ok "F2/J1: the log write was swallowed (no log file under the non-dir)" \
+                   || bad "F2/J1: a log file was created under a non-dir"
+rm -f "$T49P" "$WORK/t49.lock"
+fi
+
+if section "Test 50: ENOSPC on lock create/write -> wait then 97, no false hold (F1)"; then
+# F1 (failure-modes.md §4 item 5): a full filesystem makes the create's write fail
+# (ENOSPC); the created-but-write-failed file is an empty orphan and the waiter
+# times out at 97 — no corruption, no false hold. Real ENOSPC needs a full FS, which
+# needs root (a small tmpfs); `ulimit -f` is NOT usable (SIGXFSZ kills the wrapper,
+# the wrong lane). So: Linux + passwordless sudo only, skip-with-note otherwise (the
+# ubuntu CI leg has it). Every sudo call is -n: never block on a password prompt.
+if [ "$(uname -s)" = Linux ] && sudo -n true 2>/dev/null; then
+  T50MNT="$WORK/t50.full"; T50LOG="$WORK/t50.log"; mkdir -p "$T50MNT"; : > "$T50LOG"
+  T50MARK="$WORK/t50.ran"; rm -f "$T50MARK"
+  if sudo -n mount -t tmpfs -o size=64k tmpfs "$T50MNT" 2>/dev/null; then
+    dd if=/dev/zero of="$T50MNT/fill" bs=1k count=256 2>/dev/null || true   # fill to ENOSPC
+    AGENT_LOCK_PATH="$T50MNT/commit.lock" AGENT_LOCK_LOG="$T50LOG" \
+      AGENT_LOCK_STALE_SECS=1 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \
+      bash "$LIB" run -- bash -c "touch '$T50MARK'" 2> "$WORK/t50.err"; rc=$?
+    [ "$rc" = 97 ] && ok "F1 ENOSPC: waiter timed out (97)" \
+                   || bad "F1 ENOSPC: rc=$rc (want 97)"
+    [ ! -e "$T50MARK" ] && ok "F1: the wrapped command never ran under ENOSPC" \
+                        || bad "F1: the wrapped command ran despite ENOSPC"
+    sudo -n umount "$T50MNT" 2>/dev/null
+  else
+    echo "note: Test 50 skipped — could not mount a tmpfs (sudo mount failed); covered where mountable"
+  fi
+  rmdir "$T50MNT" 2>/dev/null || true
+else
+  echo "note: Test 50 skipped — ENOSPC injection needs Linux + passwordless sudo (a small tmpfs); the Linux CI leg covers it"
+fi
+fi
 
 # NOTES (deliberately untested here):
 # * lock_release's LEFTOVER lane (the unlink blocked persistently) needs a
@@ -2064,10 +3156,23 @@ rm -f "$LOCK" "$LOCK.next"
 #   blocker is most naturally a pwsh FileShare.Read holder, so the interop
 #   suite owns that test (on POSIX, unlink never blocks on open handles and
 #   the lane is unreachable).
-# * lock_acquire's read-back-verification failure lane needs fault injection
-#   to make a winning create read back wrong; it is defence in depth (see the
-#   ACQUIRE VERIFICATION header section), not suite-covered.
+# * lock_acquire's read-back-verification failure lanes (defence in depth; see
+#   the ACQUIRE VERIFICATION header section) are covered via _lock_cur_token
+#   fault injection: the create-path lane (create won, read-back wrong) by
+#   Test 32, the steal-path lane (F2 — rename-over won, read-back wrong) by
+#   Test 32b.
+
+# Zero-match guard + selector-report line (shared helper in _harness.sh): a
+# set-but-non-matching GCL_TEST_ONLY ran NO test block, which without the guard
+# would fall through to a vacuous PASS=0 FAIL=0 "green" — a typo'd selector regex
+# would silently look like success; bail loudly instead. (The finish EXIT trap
+# also fires there since DONE is still 0; that exit is non-zero regardless.) When
+# the selector matched, it reports how many blocks ran. Both are gated on
+# GCL_TEST_ONLY being non-empty, so a default run stays byte-identical.
+selector_report
 
+DONE=1
 echo
-echo "==== RESULT: $PASS passed, $FAIL failed (fan-out: $GCL_MODE) ===="
+echo "==== RESULT: $PASS passed, $FAIL failed, $ENV_WARN envelope warning(s) (fan-out: $GCL_MODE) ===="
+[ "$GCL_TAP" = 1 ] && echo "1..$TAPN"
 [ "$FAIL" = 0 ]
diff --git a/tests/with-load.sh b/tests/with-load.sh
new file mode 100644
index 0000000..077511f
--- /dev/null
+++ b/tests/with-load.sh
@@ -0,0 +1,308 @@
+#!/usr/bin/env bash
+# with-load.sh — run a command under a calibrated, reproducible background load.
+#
+# Usage:   bash tests/with-load.sh <cmd> [args...]
+# Example: bash tests/with-load.sh bash tests/git-commit-lock.test.sh
+#
+# Wraps "$@", applies artificial background load for the command's lifetime, then
+# tears the load down (by EXACT spawned PIDs — never by name, so it is safe on a
+# shared dev box and doubly safe on an ephemeral CI runner) and exits with the
+# wrapped command's exit code.
+#
+# WHY load exists here (see docs/load-testing-strategy.md §1): this protocol's
+# *correctness* is load-independent (O_EXCL + atomic rename + per-attempt tokens
+# never consult the clock for a correctness decision), so load cannot break
+# exclusion. Load's only jobs are (J1) perturb scheduling so the protocol's
+# multi-syscall sequences get preempted at adversarial points, and (J2) stretch
+# the few genuinely timing-derived decisions. Magnitude past ~2x CPU
+# oversubscription mostly manufactures harness wall-clock flakes, not bugs — which
+# is why load is expressed as an oversubscription RATIO and the total ratio is
+# CAPPED.
+#
+# ── Calibrated interface (the contract nightly/deep-sweep CI calls against) ──────
+#
+#   GCL_STRESS_KIND        none | cpu | disk | both        (default: none)
+#                          none/unset => CLEAN PASS-THROUGH: zero added load, the
+#                          command's exit code is propagated verbatim.
+#
+#   GCL_STRESS_RATIO       Oversubscription ratio R = stressors / nproc, PER KIND.
+#                          (default: 1)  Non-negative INTEGER only (anything else falls
+#                          back to 1); stressors-per-kind = R * nproc, and R=0 spawns
+#                          none. Runner-independent: "R=2" means the same pressure on a
+#                          2-core and a 32-core box, whereas a raw hog count does not.
+#
+#   GCL_STRESS_RATIO_MAX   Cap on the TOTAL oversubscription ratio across all kinds
+#                          (default: 2). `both` runs cpu + disk, so its total ratio is
+#                          2*R; this cap scales each kind's stressor count down
+#                          proportionally so the runner is never wedged. Set the
+#                          deep-sweep flake-hunt higher deliberately.
+#
+#   GCL_STRESS_LOAD        BACK-COMPAT raw-count override. If set to a positive
+#                          integer it REPLACES the ratio computation: exactly N
+#                          stressors per selected kind (still capped by RATIO_MAX
+#                          unless GCL_STRESS_RATIO_MAX is also raised). Empty/unset =>
+#                          use the ratio. Kept so the existing deep-sweep
+#                          `stress_load=N` dispatch input keeps working.
+#
+#   GCL_STRESS_CGROUP      1 => on Linux with a writable cgroup v2 cpu controller,
+#                          PROBE the calibrated cgroup CPU-quota path (envelope leg).
+#                          The probe is recorded in the manifest. cgroup IO throttling
+#                          is experimental and intentionally NOT attempted here.
+#                          (default: 0)  Absent/unwritable => fall back to spinners.
+#
+#   GCL_LOAD_MANIFEST      Path for the per-run load-manifest JSON
+#                          (default: test-output/load-manifest.<pid>.json, created
+#                          under a known dir so CI can upload it). One file per run,
+#                          capturing {kind, R, nproc, stressor counts, achieved
+#                          slowdown, tool versions, os/arch, git sha} so any flake is
+#                          reproducible. Written on success too.
+#
+# CPU stressor: `stress-ng --cpu` when available (calibrated, measurable), else a
+#               portable bash spin loop (one busy core each).
+# Disk stressor: a tight create / write+fsync / delete loop over a small file on the
+#               same volume as the test scratch dir — metadata + write-back pressure
+#               that contends with the lock-file create/delete the suite itself does.
+#               (Always the portable shell hog; cross-platform, low-fidelity but real
+#               metadata-op pressure — see strategy §4.)
+set -uo pipefail
+
+# ── Inputs ───────────────────────────────────────────────────────────────────
+# Each numeric knob is digit-checked, then forced to base 10 via $((10#…)): a leading
+# zero (e.g. "08") would otherwise be read as invalid octal by later $(( )) arithmetic.
+kind="${GCL_STRESS_KIND:-none}"
+nproc_count="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"
+case "$nproc_count" in ''|*[!0-9]*) nproc_count=4 ;; *) nproc_count=$((10#$nproc_count)) ;; esac
+[ "$nproc_count" -lt 1 ] && nproc_count=1
+
+ratio="${GCL_STRESS_RATIO:-1}"   # integer ratios only (R in {0,1,2,…}); anything else => 1
+case "$ratio" in ''|*[!0-9]*) ratio=1 ;; *) ratio=$((10#$ratio)) ;; esac
+ratio_max="${GCL_STRESS_RATIO_MAX:-2}"
+case "$ratio_max" in ''|*[!0-9]*) ratio_max=2 ;; *) ratio_max=$((10#$ratio_max)) ;; esac
+raw_load="${GCL_STRESS_LOAD:-}"   # empty or non-numeric => no override, use the ratio
+case "$raw_load" in *[!0-9]*) raw_load="" ;; ?*) raw_load=$((10#$raw_load)) ;; esac
+
+manifest="${GCL_LOAD_MANIFEST:-test-output/load-manifest.$$.json}"
+
+# ── Stressor-count calibration ─────────────────────────────────────────────────
+# Stressors per kind: the raw-count override if one was given, otherwise R * nproc
+# (bumped to 1 should a positive R ever yield less).
+per_kind="$raw_load"
+if [ -z "$per_kind" ]; then
+  per_kind=$(( ratio * nproc_count ))
+  if [ "$ratio" -gt 0 ] && [ "$per_kind" -lt 1 ]; then per_kind=1; fi
+fi
+
+# Number of stressor kinds in play: 1 for cpu or disk, 2 for both, 0 otherwise.
+if [ "$kind" = both ]; then
+  n_kinds=2
+elif [ "$kind" = cpu ] || [ "$kind" = disk ]; then
+  n_kinds=1
+else
+  n_kinds=0
+fi
+
+# Total cap: all kinds together may not exceed ratio_max * nproc stressors (but every
+# active kind always keeps at least one). Over the cap, each kind gets an equal share.
+cpu_count=0
+disk_count=0
+capped="no"
+if [ "$n_kinds" -gt 0 ] && [ "$per_kind" -gt 0 ]; then
+  total_cap=$(( ratio_max * nproc_count ))
+  if [ "$total_cap" -lt "$n_kinds" ]; then total_cap="$n_kinds"; fi
+  requested_total=$(( per_kind * n_kinds ))
+  if [ "$requested_total" -gt "$total_cap" ]; then
+    capped="yes"
+    per_kind=$(( total_cap / n_kinds ))
+    if [ "$per_kind" -lt 1 ]; then per_kind=1; fi
+  fi
+  # Hand the (possibly capped) per-kind count to whichever kinds are selected.
+  if [ "$kind" != disk ]; then cpu_count="$per_kind"; fi
+  if [ "$kind" != cpu ]; then disk_count="$per_kind"; fi
+fi
+
+# ── Tool discovery (stress-ng path is empty when absent; versions feed the manifest) ──
+stress_ng_bin="$(command -v stress-ng 2>/dev/null || true)"
+stress_ng_ver="none"
+if [ -n "$stress_ng_bin" ]; then stress_ng_ver="$("$stress_ng_bin" --version 2>/dev/null | head -1 | tr -d '\r')"; fi
+git_sha="$(git rev-parse --short HEAD 2>/dev/null || echo unknown)"
+os_uname="$(uname -srm 2>/dev/null | tr -d '\r' || echo unknown)"
+bash_ver="$(bash --version 2>/dev/null | head -1 | tr -d '\r')"
+
+# CPU mechanism actually used: none without CPU stressors, else stress-ng if found, else spinner.
+cpu_mech="none"
+if [ "$cpu_count" -gt 0 ]; then if [ -n "$stress_ng_bin" ]; then cpu_mech="stress-ng"; else cpu_mech="spinner"; fi; fi
+
+# ── cgroup v2 CPU-quota probe (Linux envelope leg only; probe-gated) ───────────
+# We only PROBE writability + record it; we do not create scopes here (that needs a
+# usable systemd manager — see strategy §3). IO throttling is experimental: skipped.
+# Outcome recorded in cgroup_probe (the first matching rule wins):
+#   not-requested          GCL_STRESS_CGROUP is not 1
+#   no-cgroup-v2           not Linux, or /sys/fs/cgroup/cgroup.controllers unreadable
+#   no-cpu-controller      that file does not list the cpu controller
+#   writable               /sys/fs/cgroup/cgroup.subtree_control is writable — the
+#                          calibrated quota path is reachable on this leg
+#   present-not-delegated  cpu controller listed, but subtree_control is not writable
+if [ "${GCL_STRESS_CGROUP:-0}" != 1 ]; then
+  cgroup_probe="not-requested"
+elif [ "$(uname -s 2>/dev/null)" != "Linux" ] || [ ! -r /sys/fs/cgroup/cgroup.controllers ]; then
+  cgroup_probe="no-cgroup-v2"
+elif ! grep -qw cpu /sys/fs/cgroup/cgroup.controllers 2>/dev/null; then
+  cgroup_probe="no-cpu-controller"
+elif [ -w /sys/fs/cgroup/cgroup.subtree_control ] 2>/dev/null; then
+  cgroup_probe="writable"
+else
+  cgroup_probe="present-not-delegated"
+fi
+
+# ── Stressor scratch dir (same volume as the test scratch) ─────────────────────
+# mktemp -d: a fresh, unpredictably-named dir, so a pre-existing path is never adopted.
+hogdir="$(mktemp -d "${TMPDIR:-/tmp}/gcl-stress.XXXXXX" 2>/dev/null)" && [ -n "$hogdir" ] || hogdir="."
+
+# ── Spawn / teardown (track EXACT PIDs; kill only those) ───────────────────────
+hogs=()
+
+spawn_cpu() {   # start the CPU load; append each spawned PID to the global hogs array
+  local left="$cpu_count"
+  if [ "$cpu_mech" = "stress-ng" ]; then
+    # A single stress-ng manager is asked for $cpu_count workers: track only its PID.
+    "$stress_ng_bin" --cpu "$cpu_count" --cpu-load 100 >/dev/null 2>&1 &
+    hogs+=("$!")
+    return
+  fi
+  while ((left-- > 0)); do   # fallback: one busy-looping bash per requested stressor
+    bash -c 'while :; do :; done' &
+    hogs+=("$!")
+  done
+}
+
+spawn_disk() {   # launch $disk_count disk hogs in $hogdir; append each PID to hogs
+  local remaining="$disk_count"
+  while ((remaining-- > 0)); do   # each hog loops: write a 256 KiB file, delete it, repeat
+    bash -c '
+      d="$1"; j=0
+      while :; do
+        f="$d/dh.$$.$((j % 24))"
+        dd if=/dev/zero of="$f" bs=32k count=8 conv=fsync 2>/dev/null
+        rm -f "$f"
+        j=$((j + 1))
+      done' _ "$hogdir" &
+    hogs+=("$!")
+  done
+}
+
+cleanup() {   # idempotent teardown: signal, then reap, exactly the PIDs recorded in hogs
+  local p
+  for p in "${hogs[@]:-}"; do
+    [ -n "$p" ] || continue
+    kill "$p" 2>/dev/null
+    # stress-ng forks workers under its manager: also signal the process group led by
+    # that PID (a PID we spawned, never a name match). Fails harmlessly if no such group.
+    [ "$cpu_mech" = "stress-ng" ] && kill -- "-$p" 2>/dev/null
+  done
+  # Reap before deleting: a still-running disk hog could otherwise re-create files
+  # under $hogdir mid-removal. wait on an already-reaped PID just returns non-zero.
+  for p in "${hogs[@]:-}"; do [ -n "$p" ] && wait "$p" 2>/dev/null; done
+  [ "$hogdir" = "." ] || rm -rf -- "$hogdir" 2>/dev/null   # never aim rm -rf at the cwd fallback
+}
+trap cleanup EXIT INT TERM
+
+# ── Achieved-slowdown micro-benchmark (cheap fixed busy-loop, baseline vs loaded) ─
+# A small fixed integer loop timed once unloaded (baseline) and once mid-load gives a
+# coarse "how much did this load slow a CPU-bound task" figure for the manifest. Pure
+# bash, no deps; only run when load is actually applied (pure overhead otherwise).
+# Prints elapsed ms — or 0 ("no data") when date cannot supply a nanosecond reading.
+micro_bench() {
+  local start end k=0
+  start="$(date +%s%N 2>/dev/null)"; case "$start" in ''|*[!0-9]*) echo 0; return 0 ;; esac   # a date lacking %N exits 0 but prints non-digits
+  while [ "$k" -lt 50000 ]; do k=$((k + 1)); done
+  end="$(date +%s%N 2>/dev/null)"; case "$end" in ''|*[!0-9]*) echo 0; return 0 ;; esac
+  echo $(( (end - start) / 1000000 ))   # ns -> ms
+}
+
+# Will any stressors spawn? (kind selected AND a positive per-kind count.)
+will_load="no"
+case "$kind" in
+  cpu)  [ "$cpu_count"  -gt 0 ] && will_load="yes" ;;
+  disk) [ "$disk_count" -gt 0 ] && will_load="yes" ;;
+  both) { [ "$cpu_count" -gt 0 ] || [ "$disk_count" -gt 0 ]; } && will_load="yes" ;;
+esac
+
+base_ms=0; loaded_ms=0
+slowdown="1.00"
+[ "$will_load" = yes ] && base_ms="$(micro_bench)"
+case "$base_ms" in ''|*[!0-9]*) base_ms=0 ;; esac   # unusable reading => 0 ("no data"), stays numeric
+
+# ── Apply load ─────────────────────────────────────────────────────────────────
+case "$kind" in
+  cpu)  spawn_cpu ;;
+  disk) spawn_disk ;;
+  both) spawn_cpu; spawn_disk ;;
+  none) : ;;
+  *) echo "with-load: unknown GCL_STRESS_KIND='$kind' — running with NO load" >&2; kind="none" ;;
+esac
+
+if [ "${#hogs[@]}" -gt 0 ] && [ "$base_ms" -gt 0 ]; then
+  loaded_ms="$(micro_bench)"
+  case "$loaded_ms" in ''|*[!0-9]*) loaded_ms=0 ;; esac   # same guard as for base_ms
+  # slowdown = loaded/base to 2 dp, integer-only. Zero-pad the centi-value to >=3 digits
+  # so the integer part is whatever precedes the last 2 digits (noise 80 -> "0.80").
+  centi="$(( loaded_ms * 100 / base_ms ))"
+  while [ "${#centi}" -lt 3 ]; do centi="0$centi"; done
+  slowdown="${centi%??}.${centi: -2}"
+fi
+
+# ── Write the load-manifest (best-effort; never fails the run) ──────────────────
+write_manifest() {   # args: the wrapped command line, recorded under "command"
+  local dir
+  dir="$(dirname "$manifest")"
+  mkdir -p "$dir" 2>/dev/null || return 0
+  # Hand-rolled JSON (no jq/python dependency on the runner). esc escapes backslash,
+  # double-quote and the control chars a command line can carry (newline/tab/CR) with
+  # bash parameter expansion only: awk implementations disagree on a "\\\\" gsub
+  # replacement (POSIX rules, as in gawk, collapse it to ONE backslash = no escaping).
+  esc() {
+    local s="$1"
+    s=${s//\\/\\\\}        # backslash first, so the escapes added below are not doubled
+    s=${s//\"/\\\"}
+    s=${s//$'\n'/\\n}
+    s=${s//$'\t'/\\t}
+    s=${s//$'\r'/\\r}
+    printf '%s' "$s"
+  }
+  {
+    printf '{\n'
+    printf '  "kind": "%s",\n'            "$(esc "$kind")"
+    printf '  "ratio_R": %s,\n'          "$ratio"
+    printf '  "ratio_max": %s,\n'        "$ratio_max"
+    printf '  "raw_load_override": "%s",\n' "$(esc "${raw_load:-}")"
+    printf '  "nproc": %s,\n'            "$nproc_count"
+    printf '  "cpu_stressors": %s,\n'    "$cpu_count"
+    printf '  "disk_stressors": %s,\n'   "$disk_count"
+    printf '  "total_stressors": %s,\n'  "${#hogs[@]}"
+    printf '  "ratio_total_capped": "%s",\n' "$capped"
+    printf '  "cpu_mechanism": "%s",\n'  "$(esc "$cpu_mech")"
+    printf '  "cgroup_cpu_probe": "%s",\n' "$(esc "$cgroup_probe")"
+    printf '  "baseline_ms": %s,\n'      "${base_ms:-0}"
+    printf '  "loaded_ms": %s,\n'        "${loaded_ms:-0}"
+    printf '  "achieved_slowdown": %s,\n' "$slowdown"
+    printf '  "stress_ng_version": "%s",\n' "$(esc "$stress_ng_ver")"
+    printf '  "bash_version": "%s",\n'   "$(esc "$bash_ver")"
+    printf '  "os_arch": "%s",\n'        "$(esc "$os_uname")"
+    printf '  "git_sha": "%s",\n'        "$(esc "$git_sha")"
+    printf '  "command": "%s"\n'         "$(esc "$*")"
+    printf '}\n'
+  } > "$manifest" 2>/dev/null || true
+}
+write_manifest "$@"
+
+printf '%s %s\n' "stress: kind=$kind R=$ratio nproc=$nproc_count cpu=$cpu_count disk=$disk_count" \
+     "mech=$cpu_mech capped=$capped slowdown=${slowdown}x manifest=$manifest :: $*"
+
+# ── Run the wrapped command, tear down, propagate its exit code ─────────────────
+"$@"
+rc=$?   # captured immediately: the teardown below would overwrite $?
+
+cleanup
+hogs=()   # emptied so the EXIT trap's second cleanup pass has no PIDs left to signal
+printf '%s\n' "stress: hogs reaped; wrapped command rc=$rc"
+exit "$rc"