diff --git a/.agent/.gitkeep b/.agent/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/.github/scripts/nightly-triage.sh b/.github/scripts/nightly-triage.sh new file mode 100644 index 0000000..0a3964b --- /dev/null +++ b/.github/scripts/nightly-triage.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +# nightly-triage.sh — classify a nightly stress run's results and file/append a +# single labelled GitHub issue per (date, class), idempotently. +# +# Invoked by the `triage` job in .github/workflows/nightly.yml AFTER it has +# downloaded every matrix cell's `test-output/` artifact (each into a directory +# named `nightly-logs-<id>/`, each carrying that cell's own +# `cell-conclusion.txt`). It reads only files on disk + `gh`; it makes no test +# decisions of its own beyond parsing the preserved logs. +# +# CLASSIFICATION: +# correctness — any `^FAIL:` line in a suite log (a genuine assertion failure). +# Files/appends a `nightly-correctness` issue. The one class that +# demands investigation. (A job that concluded `failure`/timed out +# WITHOUT a `^FAIL:` line is infra, not correctness — see below.) +# envelope — no FAIL anywhere, but at least one `WARN[env-relaxed]` line in a +# log of a cell that *succeeded*. Tracked (`nightly-envelope`); the +# three wall-clock envelope assertions stretched under load — by +# design under GCL_ENVELOPE_TIER=relax — so NO investigation action. +# infra — a cell's artifact is missing, the cell job neither succeeded nor +# cleanly failed-on-an-assertion (timeout / cancelled / checkout +# failure / errored before any suite ran), OR — the EMPTY-ROUND +# GUARD — *no* cell produced any log at all. Filed `nightly-infra`. +# Crucially, "0 FAIL across 0 logs" is NEVER read as green: with no +# evidence we classify infra, not success. +# +# Idempotency: one open issue per (run-date, class). We search open issues by +# label + a stable, exact title; if one exists we append a comment, else we create. 
# Re-running triage for the same date therefore appends rather than spamming.
#
# All-green (every cell success, no FAIL, no env warn, every artifact present) ⇒
# NO issue of any kind is filed.
#
# Inputs (environment):
#   ARTIFACTS_DIR   dir holding the downloaded per-cell artifact directories
#                   (default: ./artifacts). Each cell dir is `nightly-logs-<id>/`.
#                   (Per-cell job conclusions are read from FILES, not env: each
#                   stress cell writes its own `result` — success|failure|
#                   cancelled|skipped — to `nightly-logs-<id>/cell-conclusion.txt`
#                   under always(), and the script reads that file directly.
#                   Ground truth PER CELL, never a matrix aggregate.)
#   EXPECTED_CELLS  space-separated list of cell ids that were supposed to run
#                   (default: the six N1..N6 ids). Lets the empty-round / missing-
#                   artifact guard know what to expect.
#   RUN_DATE        UTC date stamp for the issue title (default: today, UTC).
#   GITHUB_REPOSITORY / GH_TOKEN(GITHUB_TOKEN)   the usual `gh` env.
#   DRY_RUN=1       print the `gh` actions instead of running them (for local tests).
#
# NOTE: no `-e` on purpose — triage is best-effort and must run to completion so
# that every class gets a chance to be filed even if one `gh` call fails.
set -uo pipefail

ARTIFACTS_DIR="${ARTIFACTS_DIR:-./artifacts}"
EXPECTED_CELLS="${EXPECTED_CELLS:-N1 N2 N3 N4 N5 N6}"
RUN_DATE="${RUN_DATE:-$(date -u +%Y-%m-%d)}"
DRY_RUN="${DRY_RUN:-0}"

# Write one diagnostic line to stderr (stdout is reserved for function results).
log() { printf '%s\n' "$*" >&2; }

# Print the artifact directory of cell id $1 (the directory may be absent ⇒ infra).
cell_logdir() { printf '%s/nightly-logs-%s' "$ARTIFACTS_DIR" "$1"; }

# ── Read a cell's OWN recorded conclusion from its artifact (ground truth: each
#    stress cell writes job.status to cell-conclusion.txt under always()).
#    Arguments: $1 - cell id
#    Outputs:   the recorded conclusion with all whitespace stripped, or `unknown`
#               when the file is absent or empty (handled like a missing artifact).
cell_conclusion() {
  local cell="$1" f val=""
  f="$(cell_logdir "$cell")/cell-conclusion.txt"
  if [ -f "$f" ]; then
    val="$(tr -d '[:space:]' < "$f" 2>/dev/null)"
  fi
  printf '%s' "${val:-unknown}"
}

# Print up to 5 `^FAIL:` lines of log file $1 as indented evidence lines of the
# form `    <basename>: <lineno>:FAIL: …`. Prints nothing when there is no FAIL.
#   -a  : a log holding a stray NUL byte must still yield its FAIL lines rather
#         than grep's "Binary file … matches" placeholder.
#   -m 5: cap inside grep itself (no `| head`, hence no SIGPIPE under pipefail).
# The prefix is added with printf, so a log name containing `&`, `#` or `\`
# cannot corrupt the output the way a sed replacement string could.
fail_evidence() {
  local file="$1" name line
  name="${file##*/}"
  while IFS= read -r line; do
    printf '    %s: %s\n' "$name" "$line"
  done < <(grep -anE -m 5 '^FAIL:' -- "$file" 2>/dev/null)
}

# Classify one expected cell and append its evidence to the matching global.
#   Arguments: $1 - cell id
#   Globals:   correctness_evidence / envelope_evidence / infra_evidence (appended),
#              any_log_seen (set to 1 once any suite log is found)
#   Priority:  FAIL line ⇒ correctness; else non-success conclusion ⇒ infra;
#              else WARN[env-relaxed] ⇒ envelope; else OK (no evidence recorded).
classify_cell() {
  local cell="$1" dir concl f evidence
  local cell_fail=0 cell_envwarn=0 fail_lines=""
  local -a logs=()

  dir="$(cell_logdir "$cell")"
  concl="$(cell_conclusion "$cell")"

  # Gather this cell's suite logs (unit/interop/integration *.log under the
  # artifact). Sorted so the evidence order — and thus the issue body — is
  # stable across re-runs (find's own order is unspecified).
  if [ -d "$dir" ]; then
    while IFS= read -r f; do
      logs+=("$f")
    done < <(find "$dir" -type f -name '*.log' 2>/dev/null | LC_ALL=C sort)
  fi

  if [ "${#logs[@]}" -eq 0 ]; then
    # No artifact / no logs for an expected cell. Even a clean job that somehow
    # uploaded nothing is suspect ⇒ infra (we cannot prove it green).
    infra_evidence+="- ${cell}: no logs found (artifact missing or empty; job conclusion='${concl}')"$'\n'
    log "[$cell] INFRA: no logs (conclusion=$concl)"
    return 0
  fi
  any_log_seen=1

  # Scan the logs.
  for f in "${logs[@]}"; do
    evidence="$(fail_evidence "$f")"
    if [ -n "$evidence" ]; then
      cell_fail=1
      fail_lines+="${evidence}"$'\n'
    fi
    if grep -aqF 'WARN[env-relaxed]' -- "$f" 2>/dev/null; then
      cell_envwarn=1
    fi
  done

  if [ "$cell_fail" -eq 1 ]; then
    # A real `^FAIL:` assertion line ⇒ correctness, regardless of job conclusion.
    correctness_evidence+="- ${cell}: job='${concl}', FAIL lines present:"$'\n'"${fail_lines}"
    log "[$cell] CORRECTNESS (cell_fail=$cell_fail conclusion=$concl)"
  elif [ "$concl" != "success" ]; then
    # Logs exist but the job did not cleanly succeed and there is no assertion
    # FAIL: failure-without-^FAIL / timeout / cancelled / errored late ⇒ infra,
    # not correctness and not green.
    infra_evidence+="- ${cell}: logs present but job conclusion='${concl}' (failure/timeout/cancel without ^FAIL: line)"$'\n'
    log "[$cell] INFRA (conclusion=$concl, no FAIL)"
  elif [ "$cell_envwarn" -eq 1 ]; then
    envelope_evidence+="- ${cell}: succeeded with WARN[env-relaxed] (envelope assertion(s) stretched under load — expected)"$'\n'
    log "[$cell] ENVELOPE (success + env-relaxed warn)"
  else
    log "[$cell] OK (success, no FAIL, no env warn)"
  fi
}

# ── Classify each expected cell. Accumulate evidence lines per class. ───────────
correctness_evidence=""
envelope_evidence=""
infra_evidence=""

any_log_seen=0 # for the empty-round guard

# shellcheck disable=SC2086 — EXPECTED_CELLS is a whitespace-separated id list;
# word-splitting is the intended way to iterate it.
for cell in $EXPECTED_CELLS; do
  classify_cell "$cell"
done

# ── EMPTY-ROUND GUARD: if not a single expected cell produced any log, the run
#    errored before any suite ran (checkout failure, total infra collapse). That is
#    INFRA, never green — do not let "0 FAIL across 0 logs" pass as success. ──────
if [ "$any_log_seen" -eq 0 ]; then
  empty_msg="EMPTY ROUND: none of the expected cells (${EXPECTED_CELLS}) produced any suite log. The workflow errored before any suite ran (checkout failure / total infra collapse) — this is NOT a passing nightly."
  infra_evidence="${empty_msg}"$'\n'"${infra_evidence}"
  log "EMPTY-ROUND GUARD fired: no logs from any cell."
fi

# Print the number of the first issue whose title is EXACTLY $1.
#   stdin:   one `<number><TAB><title>` record per line
#   Returns: 0 and prints the number on a match; 1 (printing nothing) otherwise.
# The comparison happens in the shell, so the title is never spliced into a jq
# program: quotes or backslashes in it cannot break or alter the filter.
issue_number_for_title() {
  local wanted="$1" number title
  while IFS=$'\t' read -r number title; do
    if [ "$title" = "$wanted" ]; then
      printf '%s\n' "$number"
      return 0
    fi
  done
  return 1
}

# ── File/append issues, idempotently, one per (date, class). ────────────────────
# The title is stable per class+date, so search-then-append is reliable.
#   Arguments: $1 - class label   $2 - issue title (idempotency key)   $3 - body
#   Returns:   0 after attempting the action (gh failures are logged as WARN, not
#              fatal); 1 only when `gh` itself is not installed.
file_issue() { # $1=class-label  $2=title  $3=body
  local label="$1" title="$2" body="$3" listing="" existing=""

  if [ "$DRY_RUN" = 1 ]; then
    log "DRY_RUN: would search open issues label=$label title~='$title'"
    log "DRY_RUN: title='$title'"
    log "DRY_RUN: body:"; printf '%s\n' "$body" >&2
    return 0
  fi

  if ! command -v gh >/dev/null 2>&1; then
    log "WARN: gh CLI not found on PATH; cannot file '$title' ($label)"
    return 1
  fi

  # Search OPEN issues with this label via GitHub search, then keep only the one
  # whose title matches exactly (search alone is a fuzzy/substring match).
  # gh's stderr is deliberately NOT silenced: a failed lookup must be visible,
  # because it means idempotency cannot be guaranteed for this run.
  if listing="$(gh issue list --state open --label "$label" \
    --search "$title in:title" --json number,title \
    --jq '.[] | "\(.number)\t\(.title)"')"; then
    existing="$(issue_number_for_title "$title" <<< "$listing")"
  else
    log "WARN: open-issue lookup failed ($label); cannot confirm an existing issue — creating a new one rather than dropping the report (may duplicate)"
  fi

  if [ -n "$existing" ]; then
    log "Appending to existing issue #$existing ($label)"
    if gh issue comment "$existing" --body "$body" >/dev/null; then
      log "Appended comment to #$existing"
    else
      log "WARN: failed to append to #$existing"
    fi
  else
    log "Creating new issue ($label): $title"
    if gh issue create --title "$title" --label "$label" --body "$body" >/dev/null; then
      log "Created issue ($label)"
    else
      log "WARN: failed to create issue ($label)"
    fi
  fi
}

run_url="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-}/actions/runs/${GITHUB_RUN_ID:-}"
filed=0

if [ -n "$correctness_evidence" ]; then
  body="Nightly stress run on **${RUN_DATE}** has CORRECTNESS failures (a \`^FAIL:\` assertion line in a suite log). **Investigate.**

$correctness_evidence
Run: ${run_url}

(Auto-filed by nightly-triage.sh; idempotent per (date, class) — re-runs append.)"
  file_issue "nightly-correctness" "Nightly correctness failure — ${RUN_DATE}" "$body"
  filed=1
fi

if [ -n "$infra_evidence" ]; then
  body="Nightly stress run on **${RUN_DATE}** had INFRA issues (missing artifact / timeout / cancel / a cell job that failed or errored WITHOUT any \`^FAIL:\` line). Not a product failure, but the run did not produce trustworthy results — re-dispatch or investigate the runner.

$infra_evidence
Run: ${run_url}

(Auto-filed by nightly-triage.sh; idempotent per (date, class).)"
  file_issue "nightly-infra" "Nightly infra issue — ${RUN_DATE}" "$body"
  filed=1
fi

# Envelope is filed ONLY when there is no correctness failure (a correctness issue
# subsumes it — under a red run the env warns are noise). Tracked, no action.
+if [ -z "$correctness_evidence" ] && [ -n "$envelope_evidence" ]; then + body="Nightly stress run on **${RUN_DATE}**: no correctness failures, but envelope (wall-clock) assertions were relaxed under load (\`WARN[env-relaxed]\`). This is EXPECTED under GCL_ENVELOPE_TIER=relax — tracked, **no investigation needed** unless it becomes persistent at low load. + +$envelope_evidence +Run: ${run_url} + +(Auto-filed by nightly-triage.sh; idempotent per (date, class).)" + file_issue "nightly-envelope" "Nightly envelope warning — ${RUN_DATE}" "$body" + filed=1 +fi + +if [ "$filed" -eq 0 ]; then + log "ALL GREEN: every expected cell succeeded, no FAIL, no env warn, all artifacts present. No issue filed." +fi + +# Triage itself succeeds whenever it ran to completion — it must not red the +# workflow for finding failures (those are surfaced as issues). It only fails if it +# could not run at all (handled by `set -uo pipefail` on a genuine scripting error). +exit 0 diff --git a/.github/workflows/deep-sweep.yml b/.github/workflows/deep-sweep.yml new file mode 100644 index 0000000..2877b52 --- /dev/null +++ b/.github/workflows/deep-sweep.yml @@ -0,0 +1,214 @@ +# deep-sweep — Tier D of the load-testing strategy (see docs/load-testing-strategy.md). +# +# ON-DEMAND ONLY. This workflow is `workflow_dispatch`-only: it NEVER runs on push +# or pull_request, and it NEVER gates anything (it is not a required check — this is +# a single-dev project with no branch protection). It exists purely as a deep +# flake-hunting tool — the +# "50-clean hunt" instrument from the load-testing strategy: dispatch it (often many +# times in parallel), pick a stress kind/magnitude, and repeat the full suite N +# times per job to surface intermittent, scheduling-sensitive flakes that a single +# zero-load per-PR run would never reproduce. 
+# +# Deep + loaded runs are SLOW (heavy CPU/disk oversubscription stretches every +# wall-clock-derived step), so timeouts here are deliberately generous and the +# envelope tier defaults to `relax` (an oversubscribed runner must not turn a +# latency miss into a red — only a real correctness FAIL should). +# +# The job names are intentionally distinct (`deep-*`). With no branch protection +# there is no required `tests-passed` context to avoid publishing, so this is now +# only cosmetic / for clarity — but kept so a deep run is never confused with the +# per-PR `tests` matrix in the checks UI. + +name: deep-sweep + +on: + workflow_dispatch: + inputs: + stress_kind: + description: 'Background load kind to apply via tests/with-load.sh' + type: choice + options: [none, cpu, disk, both] + default: both + stress_load: + description: 'Raw per-kind hog count override (GCL_STRESS_LOAD). Blank = use the ratio.' + type: string + default: '' + repeat: + description: 'How many times to repeat the suite run within each job (intermittent-flake hunt).' + type: string + default: '1' + envelope_tier: + description: 'GCL_ENVELOPE_TIER — relax (default) warns on latency misses; strict fails them.' + type: string + default: relax + +# Per-run-unique group so MANY parallel dispatches each get their own group and run +# concurrently (a fresh dispatch never cancels or is cancelled by an in-flight one); +# cancel-in-progress:false means a re-dispatch into the same run_id (impossible — +# run_id is unique per run) would still queue rather than cancel. In practice every +# dispatch is its own run, so the deep sweeps fan out freely and accept queue waves. 
+concurrency: + group: deep-${{ github.run_id }} + cancel-in-progress: false + +permissions: + contents: read + +jobs: + deep: + name: deep-${{ matrix.os }}${{ matrix.leg != 'all' && format(' ({0})', matrix.leg) || '' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false # every cell's verdict is a useful deep signal; let the rest finish + matrix: + # Mirrors tests.yml's per-OS legs (ubuntu all / macos all / windows unit / + # windows interop+integration); the canary runs as a step within the unit/all + # legs here, not a separate cell. Windows stays split because the bash-only + # unit suite is the wall-clock bottleneck there and the suites must not run + # concurrently inside one timing-sensitive 2-core runner. Generous deep + # timeouts: deep + loaded + repeated is far slower than the per-PR gate. + include: + - { os: ubuntu-24.04, leg: all, job_timeout: 180 } + - { os: macos-15, leg: all, job_timeout: 180 } + - { os: windows-2025, leg: unit, job_timeout: 120 } + - { os: windows-2025, leg: interop-integration, job_timeout: 120 } + timeout-minutes: ${{ matrix.job_timeout }} # backstop only: repeat * (loaded suite budgets) + upload headroom + defaults: + run: + shell: bash # on windows-2025 this is Git Bash (MINGW) — what the interop suite requires + env: + GCL_TEST_FULL: 1 # full fan-out — CI runners are dedicated + GCL_TEST_SWEEP: 1 # deep runs exercise the Axis-A waiter-count sweep too + GCL_ENVELOPE_TIER: ${{ inputs.envelope_tier }} + GCL_STRESS_KIND: ${{ inputs.stress_kind }} + GCL_STRESS_LOAD: ${{ inputs.stress_load }} # blank => with-load.sh falls back to the ratio + steps: + - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0 # v6.0.3, SHA-pinned + with: + persist-credentials: false # no job uses the token after fetch + + - name: Toolchain versions (for reconstructing failures) + run: | + uname -a + bash --version | head -1 + git --version + if command -v pwsh >/dev/null; then + pwsh -NoProfile -Command '"pwsh " + 
$PSVersionTable.PSVersion.ToString()' + else + echo "pwsh: NOT FOUND (interop suite will skip; integration runs bash-only)" + fi + if command -v powershell >/dev/null; then + powershell -NoProfile -Command '"powershell " + $PSVersionTable.PSVersion.ToString()' + else + echo "powershell (Windows PowerShell 5.1): NOT FOUND (interop Test 17 skips; expected on POSIX legs)" + fi + stat --version 2>/dev/null | head -1 || echo "stat: BSD variant" + command -v stress-ng >/dev/null && stress-ng --version | head -1 || echo "stress-ng: NOT FOUND (with-load.sh uses the portable spinner)" + echo "dispatch inputs: kind=${GCL_STRESS_KIND} load='${GCL_STRESS_LOAD}' repeat=${{ inputs.repeat }} envelope=${GCL_ENVELOPE_TIER}" + + # Each suite is repeated `repeat` times under load. The loop fails fast: the + # first failing iteration `exit 1`s the step (so the step — and job — go red on + # the earliest flake), and every iteration names its index in the log so a + # failure is attributable to a specific repeat. Under `shell: bash` (-eo + # pipefail) a failing suite pipeline already trips the step; the explicit + # PIPESTATUS check is a defensive backstop that also names the failing iteration. 
+ - name: Unit suite (deep, looped x repeat, under load) + if: ${{ matrix.leg == 'all' || matrix.leg == 'unit' }} + timeout-minutes: ${{ matrix.os == 'windows-2025' && 100 || 90 }} + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/unit + run: | + mkdir -p test-output + n='${{ inputs.repeat }}' + case "$n" in ''|*[!0-9]*) n=1 ;; esac + [ "$n" -lt 1 ] && n=1 + echo "== unit: repeating $n time(s) under load ==" + for i in $(seq 1 "$n"); do + echo "== unit iteration $i/$n ==" + bash tests/with-load.sh bash tests/git-commit-lock.test.sh 2>&1 \ + | tee "test-output/unit-suite.iter$i.log" + rc=${PIPESTATUS[0]} + if [ "$rc" -ne 0 ]; then + echo "== unit iteration $i/$n FAILED (rc=$rc) — stopping deep sweep ==" + exit 1 + fi + done + + - name: Canary suite (deep, looped x repeat, under load) + # The concurrency canary moved into its own file; the deep flake hunt should + # exercise it (a concurrency canary is exactly what a deep+loaded+repeated hunt + # is for). Same legs as the unit suite, same loop/fail-fast wrapping. 
+ if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'unit') }} + timeout-minutes: ${{ matrix.os == 'windows-2025' && 100 || 90 }} + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/canary + run: | + mkdir -p test-output + n='${{ inputs.repeat }}' + case "$n" in ''|*[!0-9]*) n=1 ;; esac + [ "$n" -lt 1 ] && n=1 + echo "== canary: repeating $n time(s) under load ==" + for i in $(seq 1 "$n"); do + echo "== canary iteration $i/$n ==" + bash tests/with-load.sh bash tests/git-commit-lock.canary.test.sh 2>&1 \ + | tee "test-output/canary-suite.iter$i.log" + rc=${PIPESTATUS[0]} + if [ "$rc" -ne 0 ]; then + echo "== canary iteration $i/$n FAILED (rc=$rc) — stopping deep sweep ==" + exit 1 + fi + done + + - name: Interop suite (deep, looped x repeat, under load) + if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }} # run even if an earlier suite failed — every signal is useful + timeout-minutes: 90 + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/interop + run: | + mkdir -p test-output + n='${{ inputs.repeat }}' + case "$n" in ''|*[!0-9]*) n=1 ;; esac + [ "$n" -lt 1 ] && n=1 + echo "== interop: repeating $n time(s) under load ==" + for i in $(seq 1 "$n"); do + echo "== interop iteration $i/$n ==" + bash tests/with-load.sh bash tests/git-commit-lock.interop.test.sh 2>&1 \ + | tee "test-output/interop-suite.iter$i.log" + rc=${PIPESTATUS[0]} + if [ "$rc" -ne 0 ]; then + echo "== interop iteration $i/$n FAILED (rc=$rc) — stopping deep sweep ==" + exit 1 + fi + done + + - name: Integration suite (deep, looped x repeat, under load) + if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }} + timeout-minutes: 60 # its internal AGENT_LOCK_MAX_WAIT cap is 240s; x repeat under load + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/integration + run: | + mkdir -p test-output + n='${{ inputs.repeat }}' + case "$n" in 
''|*[!0-9]*) n=1 ;; esac + [ "$n" -lt 1 ] && n=1 + echo "== integration: repeating $n time(s) under load ==" + for i in $(seq 1 "$n"); do + echo "== integration iteration $i/$n ==" + bash tests/with-load.sh bash tests/git-commit-lock.integration.test.sh 2>&1 \ + | tee "test-output/integration-suite.iter$i.log" + rc=${PIPESTATUS[0]} + if [ "$rc" -ne 0 ]; then + echo "== integration iteration $i/$n FAILED (rc=$rc) — stopping deep sweep ==" + exit 1 + fi + done + + - name: Upload deep-sweep artifacts (logs + load manifests, on success too) + if: ${{ always() }} # deep runs want the negatives to read the positives; upload even when green or cancelled + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1, SHA-pinned + with: + name: deep-logs-${{ matrix.os }}-${{ matrix.leg }}-${{ inputs.stress_kind }} # unique per (os, leg, kind) + path: test-output/ + include-hidden-files: true # lock logs + the load-manifest live under the scratch .git/ and test-output/; suite-generated, no secrets + if-no-files-found: warn + retention-days: 14 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 0000000..238d234 --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,316 @@ +name: nightly + +# Scheduled stress run: the test suites under calibrated background load (the +# `tests/with-load.sh` wrapper) at one oversubscription level R≈2, plus a kcov +# line-coverage gate and auto-triage of the results into labelled issues. +# +# This is NON-BLOCKING: there is no branch protection on this single-dev project +# (decision 2026-06-18), so nightly never gates a PR. Its job is to catch +# load-sensitive flakes and coverage regressions that the per-PR `tests.yml` +# (no-load, strict) cannot. +# +# NOTE for a future maintainer: GitHub auto-DISABLES a `schedule` trigger after +# ~60 days of repo inactivity. 
If the nightly history is empty, that may mean the +# schedule was disabled (not that every run passed) — re-enable / revive it with a +# manual `workflow_dispatch` run from the Actions tab. Rely on `workflow_dispatch` +# as the always-available manual trigger. + +on: + schedule: + - cron: '23 8 * * *' # 08:23 UTC daily — off-peak (low GitHub-hosted-runner contention) + workflow_dispatch: + +# One nightly at a time; a newer run supersedes an in-flight one. +concurrency: + group: nightly + cancel-in-progress: true + +permissions: + contents: read + +env: + # The suites run at full fan-out, with the envelope (wall-clock) assertions + # RELAXED so an oversubscribed runner cannot turn a latency stretch into a red + # (only correctness assertions can fail the suite under load), and with the + # Axis-A waiter-count sweep {4,12,24} enabled. + GCL_TEST_FULL: 1 + GCL_ENVELOPE_TIER: relax + GCL_TEST_SWEEP: 1 + # One oversubscription level R≈2 (stressors ≈ 2 * nproc per kind, total capped at + # GCL_STRESS_RATIO_MAX * nproc by with-load.sh). + GCL_STRESS_RATIO: 2 + +jobs: + # ── The 6 stress cells. Each runs the relevant suite(s) wrapped in with-load.sh + # under one GCL_STRESS_KIND. `leg` selects which suites run (the all/unit/ + # interop-integration legs as in tests.yml; the canary runs as a step here, not a + # leg): ubuntu/macos run the full set; windows splits unit vs interop-integration. 
── + stress: + name: ${{ matrix.id }} ${{ matrix.os }} (${{ matrix.kind }}${{ matrix.leg != 'all' && format(', {0}', matrix.leg) || '' }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false # every cell's verdict is signal — and triage needs them all + matrix: + include: + - { id: N1, os: ubuntu-24.04, leg: all, kind: cpu, job_timeout: 70 } + - { id: N2, os: ubuntu-24.04, leg: all, kind: disk, job_timeout: 70 } + - { id: N3, os: ubuntu-24.04, leg: all, kind: both, job_timeout: 70 } + - { id: N4, os: macos-15, leg: all, kind: disk, job_timeout: 70 } + - { id: N5, os: windows-2025, leg: interop-integration, kind: disk, job_timeout: 55 } + - { id: N6, os: windows-2025, leg: unit, kind: both, job_timeout: 60 } + timeout-minutes: ${{ matrix.job_timeout }} # generous: load slows everything; backstop only + defaults: + run: + shell: bash # on windows-2025 this is Git Bash (MINGW) — what the interop suite requires + env: + GCL_STRESS_KIND: ${{ matrix.kind }} + steps: + - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0 # v6.0.3, SHA-pinned + with: + persist-credentials: false + + - name: Toolchain versions (for reconstructing failures) + run: | + uname -a + bash --version | head -1 + git --version + command -v stress-ng >/dev/null && stress-ng --version | head -1 || echo "stress-ng: NOT FOUND (with-load.sh uses the portable bash spinner)" + if command -v pwsh >/dev/null; then + pwsh -NoProfile -Command '"pwsh " + $PSVersionTable.PSVersion.ToString()' + else + echo "pwsh: NOT FOUND (interop suite will skip; integration runs bash-only)" + fi + if command -v powershell >/dev/null; then + powershell -NoProfile -Command '"powershell " + $PSVersionTable.PSVersion.ToString()' + else + echo "powershell (Windows PowerShell 5.1): NOT FOUND (interop Test 17 skips; expected on POSIX legs)" + fi + stat --version 2>/dev/null | head -1 || echo "stat: BSD variant" + + - name: Unit suite (under load) + if: ${{ matrix.leg == 'all' || matrix.leg == 'unit' }} + 
timeout-minutes: ${{ matrix.os == 'windows-2025' && 40 || 25 }} # raised: load + the N=24 sweep stretch wall-clock; a step timeout FAILS the step so the upload still runs + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/unit + run: | + mkdir -p test-output + bash tests/with-load.sh bash tests/git-commit-lock.test.sh 2>&1 | tee test-output/unit-suite.log + + - name: Canary suite (under load) + # The concurrency canary moved out of the unit suite into its own file; still + # exercise it under oversubscription here (concurrency + load is the highest-value + # canary scenario). Runs in the same legs the unit suite does (sequentially after + # it — nightly is non-blocking, so no separate parallel cell is warranted). + if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'unit') }} + timeout-minutes: ${{ matrix.os == 'windows-2025' && 20 || 12 }} # load stretches the full-width canary; a step timeout FAILS the step so the upload still runs + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/canary + run: | + mkdir -p test-output + bash tests/with-load.sh bash tests/git-commit-lock.canary.test.sh 2>&1 | tee test-output/canary-suite.log + + - name: Interop suite (under load; bash + pwsh) + if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }} # run even if an earlier suite failed — every signal is useful + timeout-minutes: 30 + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/interop + run: | + mkdir -p test-output + bash tests/with-load.sh bash tests/git-commit-lock.interop.test.sh 2>&1 | tee test-output/interop-suite.log + + - name: Integration suite (under load; real concurrent commits) + if: ${{ !cancelled() && (matrix.leg == 'all' || matrix.leg == 'interop-integration') }} + timeout-minutes: 20 # its internal AGENT_LOCK_MAX_WAIT cap is 240s; load + sweep stretch it + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace 
}}/test-output/failed-work/integration + run: | + mkdir -p test-output + bash tests/with-load.sh bash tests/git-commit-lock.integration.test.sh 2>&1 | tee test-output/integration-suite.log + + - name: Record this cell's conclusion (ground truth for triage) + if: ${{ always() }} # capture the cell's own status — even on timeout/cancel — into its artifact + run: | + mkdir -p test-output + # job.status here reflects THIS cell's run so far: success | failure | cancelled. + # A step timeout fails the step, which makes the job status `failure` by the time + # this always() step runs — so a no-FAIL timeout is recorded as `failure`, and the + # triage script (seeing logs present but conclusion!=success and no ^FAIL:) classes + # it infra. The per-cell status file is the authoritative signal triage reads. + printf '%s' "${{ job.status }}" > test-output/cell-conclusion.txt + echo "cell ${{ matrix.id }} conclusion: $(cat test-output/cell-conclusion.txt)" + + - name: Upload cell logs + load-manifest (on success too — we read the positives by the negatives) + if: ${{ always() }} # upload whether the cell passed, failed, or timed out — triage needs every cell's evidence + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1, SHA-pinned + with: + name: nightly-logs-${{ matrix.id }} # unique per cell; the triage job downloads these by name + path: test-output/ + include-hidden-files: true # lock logs live under the scratch repo's .git/ (hidden); suite-generated, no secrets + if-no-files-found: warn + retention-days: 14 + + # ── kcov line-coverage gate. Linux-only, no load, strict, unit + canary at FULL. + # Build kcov v43 from source (no apt package / prebuilt). Gate at 0.80. 
────── + kcov: + name: kcov coverage (Linux, no load, strict) + runs-on: ubuntu-24.04 + timeout-minutes: 30 + env: + COVERAGE_FLOOR: '0.80' # tracks achieved (~83%) — RATCHET UP toward ~0.90 as Tier-A tests land; do not let it lead coverage + steps: + - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0 # v6.0.3, SHA-pinned + with: + persist-credentials: false + + - name: Install kcov build dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + cmake g++ make pkg-config \ + libdw-dev libelf-dev binutils-dev libcurl4-openssl-dev zlib1g-dev libiberty-dev + + - name: Build kcov v43 from source + run: | + set -euo pipefail + cd /tmp + curl -fsSL https://github.com/SimonKagstrom/kcov/archive/refs/tags/v43.tar.gz | tar xz + mkdir kcov-build && cd kcov-build + cmake ../kcov-43 + make -j"$(nproc)" + ./src/kcov --version + + - name: Run unit + canary suites under kcov (FULL, strict, no load) + env: + GCL_TEST_FULL: 1 + # Set strict EXPLICITLY here to override the workflow-level GCL_ENVELOPE_TIER: relax + # (which this step would otherwise inherit) — we want a true, clean coverage run with + # the wall-clock envelope assertions enforced, no load applied. + GCL_ENVELOPE_TIER: strict + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/kcov-unit + run: | + set -euo pipefail + mkdir -p test-output coverage + # The concurrency canary now lives in its own file; run BOTH the unit suite + # and the canary under kcov into the SAME output dir. kcov ACCUMULATES + # coverage across multiple runs that share one output dir, writing the union + # to coverage/kcov-out/kcov-merged/cobertura.xml (NOT a top-level + # cobertura.xml — the enforcement step below reads the merged union by + # selecting the highest lines-covered), so the canary's coverage of + # git-commit-lock.sh is preserved and the 0.80 floor cannot regress from the split. 
+ /tmp/kcov-build/src/kcov --include-path="$(pwd)/git-commit-lock.sh" \ + coverage/kcov-out tests/git-commit-lock.test.sh 2>&1 | tee test-output/kcov-unit-suite.log + /tmp/kcov-build/src/kcov --include-path="$(pwd)/git-commit-lock.sh" \ + coverage/kcov-out tests/git-commit-lock.canary.test.sh 2>&1 | tee test-output/kcov-canary-suite.log + + - name: Enforce coverage floor (parse cobertura line-rate) + run: | + set -euo pipefail + # kcov does NOT write a top-level coverage/kcov-out/cobertura.xml. The two runs + # (unit + canary) into one outdir produce per-binary reports under + # coverage/kcov-out/./cobertura.xml and a merged union at + # coverage/kcov-out/kcov-merged/cobertura.xml. All cover the same source + # git-commit-lock.sh, so they share an identical lines-valid — a lines-valid + # tie-break would keep whatever find returns first (a single-suite report). + # Pick the highest lines-COVERED instead: the merged union has the most covered + # lines, so this robustly selects it (for a single run there's just one report). + cob="" + best_covered=-1 + while IFS= read -r f; do + c="$(grep -oE 'lines-covered="[0-9]+"' "$f" 2>/dev/null | head -1 | grep -oE '[0-9]+')" + c="${c:-0}" + if [ "$c" -gt "$best_covered" ]; then best_covered="$c"; cob="$f"; fi + done < <(find coverage/kcov-out -name cobertura.xml 2>/dev/null) + if [ -z "$cob" ] || [ ! -f "$cob" ]; then + echo "::error::no cobertura.xml found under coverage/kcov-out — kcov produced no report" + find coverage/kcov-out -maxdepth 3 -type f 2>/dev/null | sed 's/^/ /' + exit 1 + fi + echo "Parsing coverage from: $cob (lines-covered=$best_covered)" + # Prefer the precise lines-covered/lines-valid ratio (exact); fall back to the + # rounded line-rate attribute. Both live on the top-level tag. 
+ covered="$(grep -oE 'lines-covered="[0-9]+"' "$cob" | head -1 | grep -oE '[0-9]+')" + valid="$(grep -oE 'lines-valid="[0-9]+"' "$cob" | head -1 | grep -oE '[0-9]+')" + rate="$(grep -oE 'line-rate="[0-9.]+"' "$cob" | head -1 | grep -oE '[0-9.]+')" + if [ -n "$covered" ] && [ -n "$valid" ] && [ "$valid" -gt 0 ]; then + # exact ratio to 4 dp, integer arithmetic (no bc/python dependency) + rate="$(awk -v c="$covered" -v v="$valid" 'BEGIN { printf "%.4f", c / v }')" + echo "Line coverage: $covered / $valid = $rate" + else + echo "Line coverage (from line-rate attribute): $rate (lines-covered/valid unavailable)" + fi + floor="$COVERAGE_FLOOR" + # Compare rate >= floor with awk (float-safe). + if awk -v r="$rate" -v f="$floor" 'BEGIN { exit !(r + 0 >= f + 0) }'; then + echo "PASS: line coverage $rate >= floor $floor" + echo "NOTE: the floor ($floor) tracks the achieved coverage (~0.83); ratchet it up toward ~0.90 as more Linux-coverable tests land. The Linux ceiling is ~0.94 (~30 lines are platform-gated)." + else + echo "::error::line coverage $rate is BELOW the floor $floor — coverage regressed" + echo "The floor tracks achieved coverage (~0.83) and should only ratchet UP as tests land. A drop means a test stopped exercising lines it used to. Investigate before lowering the floor." + exit 1 + fi + + - name: Upload coverage report (HTML + cobertura) + if: ${{ !cancelled() }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1, SHA-pinned + with: + name: kcov-coverage + path: | + coverage/kcov-out/ + test-output/kcov-unit-suite.log + test-output/kcov-canary-suite.log + include-hidden-files: true + if-no-files-found: warn + retention-days: 30 + + # ── Auto-triage. Downloads every cell's artifact, classifies (correctness / + # envelope / infra), and files/append ONE labelled issue per (date, class). + # Runs always() so a failed/cancelled cell is still triaged; the empty-round + # guard prevents "0 FAIL across 0 logs" being read as green. 
─────────────────
+  triage:
+    name: Triage nightly results
+    needs: [stress, kcov]
+    if: ${{ always() }}  # run even when a needed job failed or was cancelled, so that outcome is still triaged
+    runs-on: ubuntu-24.04
+    timeout-minutes: 10
+    # Token scope for this job: issues:write for the gh label/issue calls below,
+    # contents:read for the checkout.
+    permissions:
+      issues: write
+      contents: read
+    steps:
+      - uses: actions/checkout@9f698171ed81b15d1823a05fc7211befd50c8ae0 # v6.0.3, SHA-pinned
+        with:
+          persist-credentials: false
+
+      - name: Download all cell artifacts
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0, SHA-pinned
+        with:
+          path: artifacts
+          # pattern restricts to the per-cell logs (not kcov-coverage); merge-multiple off
+          # so each lands in its own `nightly-logs-<cell>/` dir, as the triage script expects.
+          pattern: nightly-logs-*
+        continue-on-error: true # a totally-missing artifact set must reach the empty-round guard, not error the job
+
+      - name: Ensure triage labels exist (idempotent)
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -uo pipefail
+          # Best-effort setup: `--force` updates a label that already exists instead of
+          # failing, and `|| true` keeps a label-API error from stopping the job before
+          # the classification step runs.
+          gh label create nightly-correctness -c '#d73a4a' -d 'Nightly stress: a correctness assertion failed — investigate' --force || true
+          gh label create nightly-envelope -c '#fbca04' -d 'Nightly stress: wall-clock envelope relaxed under load — expected, tracked' --force || true
+          gh label create nightly-infra -c '#0e8a16' -d 'Nightly stress: infra issue (missing artifact / timeout / errored) — not a product failure' --force || true
+
+      - name: Classify results and file/append issues
+        env:
+          GH_TOKEN: ${{ github.token }}
+          ARTIFACTS_DIR: artifacts
+          EXPECTED_CELLS: 'N1 N2 N3 N4 N5 N6'
+          GITHUB_SERVER_URL: ${{ github.server_url }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+        run: |
+          set -uo pipefail
+          # Each cell's status is ground truth from its OWN artifact
+          # (nightly-logs-<cell>/cell-conclusion.txt, written by the stress job under
+          # always()), so the script never relies on the misleading matrix-aggregate
+          # `needs.stress.result`. The empty-round guard fires if NO cell artifact exists.
+          # The listing below is diagnostic only; the triage script is the last command,
+          # so its exit status is the step's result.
+          echo "Artifacts present:"; ls -la artifacts 2>/dev/null || echo " (none)"
+          bash .github/scripts/nightly-triage.sh
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3d1424c..1b579e2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -32,16 +32,23 @@ jobs:
     strategy:
       fail-fast: false # an OS-specific failure is the signal we want; let the others finish
       matrix:
-        # Windows splits into two parallel jobs — the bash-only unit suite is the
-        # wall-clock bottleneck there (~309s vs interop 100s + integration 28s;
-        # process-spawn overhead, not the PowerShell engines). Suites must NOT run
-        # concurrently inside one runner: they're timing-sensitive on 2-core
-        # runners. POSIX legs are fast enough to stay single-job.
+        # The concurrency CANARY (Test 1, full-width 25x8) is its OWN suite file and
+        # runs as a separate parallel `canary` cell on EVERY arch — it is ~half the
+        # Windows unit wall-clock (process-spawn overhead, not the PowerShell engines)
+        # and cheap on POSIX, so parallelising it is the per-PR wall-clock win.
+        # Windows otherwise splits unit vs interop-integration. Suites must NOT run
+        # concurrently inside one runner: they're timing-sensitive on 2-core runners.
+        # `leg: all` runs unit+interop+integration but NOT canary (the canary step
+        # gates on `leg == 'canary'` only). The job-name + artifact-name templates
+        # already key on matrix.leg, so the `canary` leg is named/uploaded uniquely.
include: - { os: ubuntu-24.04, leg: all, job_timeout: 35 } + - { os: ubuntu-24.04, leg: canary, job_timeout: 15 } - { os: macos-15, leg: all, job_timeout: 35 } + - { os: macos-15, leg: canary, job_timeout: 15 } - { os: windows-2025, leg: unit, job_timeout: 20 } - { os: windows-2025, leg: interop-integration, job_timeout: 22 } + - { os: windows-2025, leg: canary, job_timeout: 15 } timeout-minutes: ${{ matrix.job_timeout }} # backstop only: sum of the leg's step budgets + upload headroom defaults: run: @@ -70,6 +77,15 @@ jobs: fi stat --version 2>/dev/null | head -1 || echo "stat: BSD variant" + - name: Canary suite (full-width concurrency canary) + if: ${{ matrix.leg == 'canary' }} + timeout-minutes: ${{ matrix.os == 'windows-2025' && 7 || 6 }} # ~151s on Windows + headroom; a step timeout FAILS the step (not the job) so the upload still runs + env: + GCL_TEST_PRESERVE_DIR: ${{ github.workspace }}/test-output/failed-work/canary + run: | + mkdir -p test-output + bash tests/git-commit-lock.canary.test.sh 2>&1 | tee test-output/canary-suite.log + - name: Unit suite if: ${{ matrix.leg == 'all' || matrix.leg == 'unit' }} timeout-minutes: ${{ matrix.os == 'windows-2025' && 15 || 10 }} # a step timeout FAILS the step (not the job), so the upload step reliably runs; sized from run 27325978197 + one internal MAX_WAIT hang @@ -129,9 +145,13 @@ jobs: /tmp/shellcheck-v0.11.0/shellcheck --version /tmp/shellcheck-v0.11.0/shellcheck -S style \ git-commit-lock.sh \ + tests/_harness.sh \ tests/git-commit-lock.test.sh \ + tests/git-commit-lock.canary.test.sh \ tests/git-commit-lock.interop.test.sh \ tests/git-commit-lock.integration.test.sh \ + tests/with-load.sh \ + .github/scripts/nightly-triage.sh \ install.sh - name: PSScriptAnalyzer (gate at warning severity) diff --git a/.gitignore b/.gitignore index be293f3..6ab470c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,7 @@ # OS / editor cruft .DS_Store Thumbs.db -/.agent/review-queue -/.agent/review-queue.lock 
-/.agent/review-queue.lock.* -/.agent/last-opened -/.agent/.tmp.* +*.stackdump + +# Test/CI artifact output (manifests, suite logs); created at runtime, never committed. +test-output/ diff --git a/README.md b/README.md index 5bebc3a..2027cd0 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,11 @@ atomic create-or-fail open (`O_CREAT|O_EXCL` / `FileMode.CreateNew`) — atomic on local POSIX filesystems and NTFS alike, with no dependency on `flock` — whose content is the holder's unique token. Every worktree has its own git dir, so independent worktrees get independent locks, while all agents sharing -one checkout contend on the same lock. The lock is deliberately a stealable +one checkout contend on the same lock. The protocol's correctness rests on these +operations being atomic, which holds on local filesystems (ext4, APFS, NTFS, and +kin) but **not** on network or sync-backed storage — NFS, SMB shares, +Dropbox/OneDrive-synced directories — where exclusion may silently fail. Keep the +repo (and so its `.git/`) on a local disk. The lock is deliberately a stealable **lease**, not a kernel lock: in unattended agent fleets a hung-but-alive holder is at least as common as a crashed one, and a lock that can't be taken from a stuck holder halts the whole run — while a rare collision costs little @@ -94,6 +98,12 @@ against each other on all three OSes — not as platform support, but because two independent implementations hammering one lock is cheap adversarial verification of the protocol. +**Upgrade both implementations together.** Older releases stole with an +unserialized move-aside instead of the claim protocol, so the +no-displacement-during-recovery guarantee holds only when every party in a tree +runs a current version; a mixed-version tree degrades that prevention to +detection (exit 98) and can leave `.dead.*` files current versions don't clean. + ## Suggested agent instructions Agents only benefit from the lock if their instructions tell them to use it. 
@@ -246,9 +256,10 @@ knobs and how staleness and stealing work. ## Tests -Three suites — bash unit, bash + PowerShell interop, and an end-to-end -integration run of concurrent real commits — cover the tool, and CI runs -them on Linux, macOS, and Windows. How to run them and what each covers: +Four suites — bash unit, a bash concurrency canary, bash + PowerShell +interop, and an end-to-end integration run of concurrent real commits — +cover the tool, and CI runs them on Linux, macOS, and Windows. How to run +them and what each covers: [`docs/git-commit-lock.md#tests`](docs/git-commit-lock.md#tests). ## Licence diff --git a/docs/failure-modes.md b/docs/failure-modes.md new file mode 100644 index 0000000..e82810c --- /dev/null +++ b/docs/failure-modes.md @@ -0,0 +1,862 @@ +# git-commit-lock: failure-mode map and scope decisions + +**Status:** decision-support document. For each failure mode it states the +tool's *current* behavior (grounded in the product code and tests), classifies +it into one of three robustness tiers, and recommends whether it should be an +in-scope guarantee. The owner uses this to deliberately decide, per mode, "yes, +we guarantee this" or "no, out of scope." + +**Sources of truth, in order:** the product code +(`git-commit-lock.sh`, `git-commit-lock.ps1`) and the test suites +(`tests/git-commit-lock.test.sh`, `tests/git-commit-lock.canary.test.sh`, +`tests/git-commit-lock.interop.test.sh`, +`tests/git-commit-lock.integration.test.sh`). Every claim below cites +`file:line`. The narrative docs (`README.md`, `docs/git-commit-lock.md`) and +the implementation header comments are corroborating, not authoritative — where +this document relies on a header comment it has verified the comment against the +code. (Cited line numbers are against the tree at commit `c762899`; treat them +as anchors, not exact addresses, if the files move.) 
+ +A note on epistemics: the bash file's header (`git-commit-lock.sh:1-426`) is +itself an exhaustive design narrative and the ps1 header +(`git-commit-lock.ps1:41-177`) mirrors it. They are unusually trustworthy as +documentation *because* the tests pin the behaviors they describe. This document +does not re-derive the protocol; it re-classifies it for a scope decision and +flags the boundaries the headers state but a reader might skip. + +--- + +## 1. The core guarantee (what must hold under ANY conditions) + +**No silent lost update — given cooperative wrapper unwind.** The absolute safety +property is that the tool never reports a *serialized* critical section that +wasn't: a holder whose lease was taken from it learns so — `lock_release` returns +**98** and logs a loud WARNING — rather than exiting success +(`git-commit-lock.sh:1607-1688`; `git-commit-lock.ps1:1717-1837`). The two +reserved failure codes mean the wrapped command was provably *not* run (96 usage, +97 timeout) or provably *not serialized* (98) (`git-commit-lock.sh:392-415`). + +Two honest qualifications make this a precise property rather than a slogan, and +both matter for the scope decision: + +- **It is a lease, not a kernel lock** (`docs/git-commit-lock.md:60-126` explains + why no OS primitive spans bash-on-MINGW and PowerShell/.NET). **Strict mutual + exclusion holds only *within* the staleness window** (default 300s): a hold that + overruns it *can* be stolen mid-work — "fail-open" — so two processes can + briefly *both* believe they hold the lock. That overlap is accepted by design + and made *detectable* (the displaced holder's 98 at release), not prevented + (`git-commit-lock.sh:213-227`). At most one process is ever the *legitimate* + holder; a displaced believer finds out at release. So "mutual exclusion" is a + Tier-1 guarantee **within the envelope (commits faster than STALE)**, not an + unconditional one. 
+- **Detection requires the wrapper to actually reach release.** The 98 path fires + on normal return and on trapped signals. It does **not** fire if the held process + is terminated or *replaced* without unwinding — an external SIGKILL, a bash + `exec` in the wrapped command (which replaces the holding shell, so neither + `lock_release` nor the EXIT trap runs), or PowerShell `[Environment]::Exit()` + (bypasses `Lock-Release`, the `finally`, and the `PowerShell.Exiting` backstop, + `git-commit-lock.ps1:221-245`). A *plain* `exit` is safe — it unwinds. A + non-unwinding exit returning 0 *while displaced* can report success without the + 98 (see **§H4**). The *next* holder still recovers via staleness, but the + abruptly-exiting one is not warned. Hence the precise statement: **no silent lost + update, provided the wrapper unwinds cooperatively.** + +Liveness (eventual recovery) and bounded stalls are best-effort within an +operating envelope (Tier 2), not absolute — and "recovery" means lock-shaped +orphans get reclaimed, **not** that every bad state self-heals (a foreign object +at the path is deliberately never auto-removed; see the tier split). + +The integration suite is the end-to-end witness for this guarantee on the real +use case: many workers committing into one repo, audited for "every commit +lands, history linear, no sweep-up, no `index.lock` races, no stolen leases, +clean tree" (`tests/git-commit-lock.integration.test.sh:10-12, 226-283`). + +### The three tiers used throughout + +1. **Correctness guarantee** — must hold under *any* conditions (load, slow FS, + adversarial scheduling). Two kinds, and the distinction matters: + - **Safety (unconditional):** no corruption, and **no silent lost update** — + the displaced holder detects the loss (98) *provided its wrapper reaches + release* (§1's hard-kill/`Exit()` caveat). Strict **mutual exclusion holds + within the staleness window**; beyond it the lease is + fail-open-but-detectable. 
+ - **Recovery (for lock-shaped stale state, under the supported FS/clock/tooling + envelope):** a crashed holder's stale lock, an orphaned claim, and an empty + crash-orphan are eventually reclaimed. This does **not** extend to *foreign* + objects at the path — a directory, a real user file, or non-`tok.` junk + content are deliberately *never* auto-removed; they wait at 97 for an + operator. "Eventual recovery" means lock-shaped orphans self-clear, not that + every bad state self-heals. + If a *safety* property can break, it is a bug; a *recovery* property failing + outside its envelope (e.g. a foreign object, an unreadable clock) is a + classified Tier-2/3 degradation, not a Tier-1 violation. +2. **Best-effort within a stated envelope** — holds under normal/expected + conditions, degrades gracefully (and *detectably*) under pathological ones. + Everything wall-clock-bounded lives here, because wall-clock bounds depend on + scheduling: timeouts, recovery latency, the diagnostic warnings that depend + on timing. Correctness is preserved; only liveness/latency degrades. +3. **Out of scope** — explicitly not handled; the operating envelope excludes + it. Damage, if any, is bounded and documented. + +--- + +## 2. Summary table + +Legend — **Tier:** 1 correctness / 2 best-effort-in-envelope / 3 out-of-scope. +**Tested:** ✓ deterministic test · ~ load/timing-sensitive or partial · ○ +robust-by-code-but-unverified · S static/grep check · (plat) platform-gated. + +| # | Failure mode | Current behavior | Tier | Tested | Recommendation | +|---|---|---|---|---|---| +| A1 | Clean high contention (N workers, no crashes) | Serialized; no lost update | 1 | ✓ C:81-111 (canary), I:227-261/341-386, integ | **In scope.** Keep. | +| A2 | Thundering herd recovering one dead lock | Claim serializes; exactly one steal, zero displacement | 1 | ✓ U:212-346, I:884-1015 | **In scope.** Keep. 
| +| A3 | Many concurrent stealers on one ghost | One O_EXCL claim winner | 1 | ✓ U:1095-1128, I:1017-1088 | **In scope.** Keep. | +| B1 | Holder dies (crash/SIGKILL/power) mid-hold | Lease ages out; stolen after STALE | 1 (recovery) / 2 (latency) | ✓ U:197-210/348-361 | **In scope** (recovery). Latency = Tier 2. | +| B2 | Holder dies mid-CLAIM (trappable: INT/TERM) | Trap deletes claim, token-checked; discovery read | 1 | ✓ U:1857-1928, I:1151-1244 | **In scope.** Keep. | +| B3 | Holder dies mid-claim (untrappable: SIGKILL) | Claim ages out ≤ CLAIM_STALE; rival rename can install unowned lock, recovered ≤ STALE | 2 | ✓ U:1648-1677 (forensics) | **Accept** (residual 5). Bounded, no false success. | +| B4 | Slow but UNCONTENDED holder overruns STALE | Keeps its lock (nothing moved it) | 1 | ✓ U:419-429, I:494-499 | **In scope.** Keep. | +| B5 | Slow CONTENDED holder overruns STALE | Stolen; robbed holder detects at release → 98 | 1 (detection) | ✓ U:387-417, I:460-492 | **In scope.** This *is* fail-open-but-detectable. | +| C1 | Orphaned/stale lock | mtime-stale → stolen via claim | 1 | ✓ U:197-210 | **In scope.** Keep. | +| C2 | Empty lock (crash between create+write) | Empty + stale → stealable | 1 | ✓ U:348-361 | **In scope.** Keep. | +| C3 | Crashed-claimant / empty claim orphan | Ages out ≤ CLAIM_STALE; cleared | 1 (recovery) / 2 (latency) | ✓ U:1130-1154 | **In scope.** Keep. | +| C4 | Leaked claim (unverifiable unlink) | Leaked-token memory keeps ownership discoverable | 1 | ✓ U:1549-1758, U:2013-2164 | **In scope.** Keep. | +| D1 | Atomic rename-over (steal install) | `mv -T` / `File.Move(...,true)` / 5.1 unlink+move | 1 (local FS) | ✓ U:212-346, I:16d S:1141 | **In scope on local FS.** Boundary = D-axis. 
| +| D2 | O_EXCL atomic create | `set -C` redirect / `FileMode.CreateNew` | 1 (local FS) | ✓ throughout | **In scope on local FS.** | +| D3 | Wrong-type at path (dir/symlink/FIFO/dev/socket) | Never stolen/deleted; loud warn; waiters → 97 | 1 (bash + ps1-on-Win) / 2 (ps1-on-POSIX) | ✓ U:818-892/1156-1262/Test 37 (rename-refused mid-steal)/Test 44 (socket+device), ~(plat) | **In scope.** ps1-on-POSIX residual = accept. | +| D4 | Non-lock CONTENT at path (user file) | Never stolen (content guard); warn | 1 | ✓ U:1034-1076 | **In scope.** Two accepted residuals (§D4). | +| D5 | Case-insensitive FS path collision | Not handled explicitly | 3 | ✗ | **Likely non-issue;** see §D5. Decide. | +| E1 | Network/shared FS (NFS/SMB/9p/Dropbox) | Outside design guarantees (stated) | 3 | ✗ | **Out of scope** (stated). See §E — decide whether to *enforce*. | +| E2 | Multi-host clock skew / NTP jump | Single-clock assumption; documented (local jump → detected-98, safe) | 3 | ✗ | **Out of scope**; single-clock assumption documented. See §E2. | +| E3 | mtime probe unreadable (staleness clock broken) | Warns loudly once; treats as not-stale → safe, recovery disabled → 97 | 2 | ✓ U:Test 42 | **Accept** — fails safe + announced. See §E3. | +| F1 | Disk full (ENOSPC) during create/write | Create fails → wait; torn write ages out | 2/3 | ✓ U:Test 50 (Linux+sudo tmpfs; (plat) skip elsewhere) | **Tested** (§4 item 5) + document. See §F1. | +| F2 | ENOSPC during LOG write | Swallowed (`|| true`); silent log loss | 2 | ✓ U:Test 49 (portable failing-log path) | **Tested** (§4 item 5); logging best-effort, lock unaffected. | +| F3 | Inode / FD exhaustion | Create fails → wait → 97 | 2 | ○ (document-only) | **Document-only**: no deterministic portable injection. See §F3. | +| F4 | Read-only / unwritable lock dir or parent | `mkdir -p` best-effort; create fails → wait → 97 | 2 | ✓ U:Test 48 (POSIX `chmod 0555`; (plat) skip on Windows) | **Tested** (§4 item 5, highest-value). See §F4. 
| +| G1 | Lock path = a directory / `$HOME` typo | Never stolen/deleted; loud warn; → 97 | 1 | ✓ U:818-840 | **In scope.** Keep. | +| G2 | Garbage numeric config | Falls back to default + stderr note | 1 | ✓ U:695-703, I:554-608 | **In scope.** Keep. | +| G3 | `run` outside a git repo, no `AGENT_LOCK_PATH` | Refuses (96) | 1 | ✓ U:705-712 | **In scope.** Keep. | +| G4 | `MAX_WAIT ≤ STALE + CLAIM_STALE` (default MW) | Startup warning | 2 | ✓ U:497-522 | **In scope.** Keep. | +| H1 | SIGINT/SIGTERM mid-hold | Release + re-raise (143); traps restored | 1 | ✓ U:577-600/1989-2011 | **In scope.** Keep (bash). ps1 = §H. | +| H2 | EXIT-while-holding | Release + chain caller's EXIT trap | 1 | ✓ U:633-648 | **In scope.** Keep. | +| H3 | ps1 process death under `-File` | `PowerShell.Exiting` does NOT fire; relies on stale window | 2 | ○ (limit documented) | **Accept;** `run` path is covered. See §H. | +| H4 | Non-unwinding exit while held (SIGKILL / bash `exec` / `[Environment]::Exit()`) | Skips release → a displaced holder is unwarned (no 98); plain `exit` is safe | 2 | ~ (I:308-334 indirect) | **Document** the no-silent-loss boundary. See §H4. | +| I1 | bash⇄pwsh wire/format compatibility | Shared format; token grammar tightened to match | 1 | ✓ I:* throughout | **In scope.** Keep. | +| I2 | Mixed-VERSION tree (old unserialized steal) | Prevention degrades to detection (98); `.dead.*` litter | 3 | ✗ | **Out of scope:** "upgrade both together." Residual 4. | +| J1 | Logging subsystem failure | All log writes `|| true`; 1 MB self-truncate | 2 | ✓ U:Test 49 (via F2) | **Tested** (§4 item 5, via F2); logging never blocks the lock. | +| K1 | Extreme load / CPU oversubscription / slow FS | Correctness holds; wall-clock bounds stretch | 2 | ~ (CI stress) | **Envelope defined** (design doc + envelope tier). See §K — the key analytical section. | +| K2 | Internal time budgets (poll, MAX_WAIT, read ladder) | Fixed schedules; tunable | 2 | ✓/~ | **In scope** as Tier-2 envelope. 
See §K. | + +U = `tests/git-commit-lock.test.sh`, I = `tests/git-commit-lock.interop.test.sh`, +C = `tests/git-commit-lock.canary.test.sh` (the concurrency canary), +integ = `tests/git-commit-lock.integration.test.sh`. + +--- + +## 3. Per-mode detail + +### A. High contention / thundering herd + +**A1 — Clean contention, no crashes.** N processes race to acquire a free or +held-then-released lock. The acquire loop is one O_EXCL create attempt per poll; +exactly one creator wins, the rest poll and take turns +(`git-commit-lock.sh:1312-1361`). After winning, the acquirer re-reads its own +token (read-back verification, `git-commit-lock.sh:1352-1361`) before claiming +the hold — so even a create that "won" but whose file was concurrently +clobbered does not produce a false hold. +*Tier 1.* Tested heavily: the concurrency canary — mutual exclusion under many +concurrent workers, 8 rounds × 25 at FULL (`tests/git-commit-lock.canary.test.sh` +Test 1, `C:81-111`) — interop Test 1/Test 6 mixed bash+pwsh (`I:227-261`, the +strict deterministic counter `I:341-386`), and the integration suite's real-commit +swarm. (Crash-recovery / claim-contention witnesses stay in the unit suite: A2's +Test 2b, A3's Test 20.) **Recommend: in scope, keep.** This is the tool's whole reason to exist. + +**A2 — Thundering herd recovering one dead lock.** After a holder dies, *every* +waiter judges the same lock stale off the same mtime in the same poll window — +the worst case for displacement. The **claim protocol** is the answer: to steal, +a waiter must first win an O_EXCL claim file `.next`, re-verify staleness +under the claim, then install by one atomic rename-over +(`git-commit-lock.sh:1070-1218`, the steps narrated at `:82-115`). This +*prevents* the straggler-robs-recovery-winner race rather than detecting and +repairing it. 
*Tier 1.* Tested: unit Test 2b asserts zero spurious 98s, exactly +one `STOLE-BY-CLAIM` per round, and — via a background sampler — that **no +move-aside `.dead.*` file ever exists** (`U:212-346`); interop Test 16 proves +the same across mixed impls (`I:884-1015`). The header records the unserialized +baseline was probed to displace 5/5 with 4 waiters (`git-commit-lock.sh:233-234`). +**Recommend: in scope, keep — this is a load-bearing correctness property.** + +**A3 — Many concurrent stealers.** Distilled A2: N stealers, one O_EXCL claim +winner, the rest wait and acquire in sequence. *Tier 1.* Tested: unit Test 20 +(`U:1095-1128`), interop Test 16b (one bash claimant vs one ps1 claimant on one +ghost, cross-parsing each other's claim files, `I:1017-1088`). +**Recommend: in scope, keep.** + +> **Load caveat on A2/A3 (see §K):** *correctness* is load-independent (it rests +> on O_EXCL + atomic rename, not timing). What stretches under load is the +> *latency* to recover, and the *test harness's* ability to set up the race +> deterministically — Test 2b/16 carry heavy sync scaffolding and bounded +> discard-and-retry precisely because a fast waiter can complete an entire steal +> before the harness finishes backdating the ghost (`U:70-104, 285-336`). That +> is a test-harness envelope concern, not a protocol gap. + +### B. Holder death + +**B1 — Crash/SIGKILL/power loss mid-hold.** The lease ages out: once the lock +file's mtime is older than `STALE_SECS`, a waiter steals it. *Recovery is Tier +1; recovery latency is Tier 2* (bounded by STALE + poll cadence under normal +load). Tested via the stale-lock and empty-orphan steals (`U:197-210, 348-361`). +**Recommend: in scope (recovery); latency bound documented (§K).** + +**B2 — Trappable death mid-claim (INT/TERM).** The EXIT/INT/TERM handlers are +armed at acquire *start*, not at hold, in "claim-window mode" +(`git-commit-lock.sh:1299-1310, 987-997`). 
A trappable exit while a claim is in +flight runs the token-checked claim deletion (one bounded retry) and a final +discovery read; it never runs lock-release (98) semantics on a *mere claim*. +*Tier 1.* Tested: unit Test 33 — TERM mid-claim deletes our claim, leaves a +*foreign* claim intact, no 98, no ageout penalty (`U:1857-1928`); the matching +ps1 lane is interop Test 16e (`I:1151-1244`). **Recommend: in scope, keep.** + +**B3 — Untrappable death mid-claim (SIGKILL between claim and rename).** +Deliberately **accepted, not prevented** (residual 5, +`git-commit-lock.sh:266-282`). The orphaned claim normally just ages out at +CLAIM_STALE; the rare bad case is a suspended rival's rename installing it as an +*unowned* lock that stalls waiters ≤ STALE before the lease recovers it. Crucial +property: **no false success anywhere** — nobody believes they hold; the only +cost is a bounded stall, same class as B1 at far lower probability. The preventing +alternative (a two-rename compare-and-swap) was evaluated and rejected because it +reintroduces crash litter (`git-commit-lock.sh:276-282`). *Tier 2.* Tested for +forensics/recovery via the crashed-leaver leg of Test 31 (`U:1648-1677`). +**Recommend: accept as a documented bounded residual. Do not build the +two-rename CAS** — the cure is worse than the disease and the failure is already +false-success-free. + +**B4 — Slow but uncontended holder.** With no waiter, nothing moves the file; +the token still matches at release; success. *Tier 1.* Tested: unit Test 4c, +interop Test 9 (`U:419-429`, `I:494-499`). **Recommend: in scope, keep** — this +is what stops the lock punishing every slow-but-safe hold. 
+ +**B5 — Slow CONTENDED holder (the fail-open ceiling).** A hold past STALE *with* +a contender gets stolen; the robbed holder detects it at release (file gone, or +a foreign token — both definitive because acquire's read-back proved our token +was at the path) and returns exactly **98** plus a WARNING +(`git-commit-lock.sh:1620-1688`). *Tier 1 for detection.* Tested: unit Test 4b, +interop Test 8 both directions (`U:387-417`, `I:460-492`). **Recommend: in +scope, keep.** This is the deliberate fail-open-but-detectable contract; the +mitigation is operational — "commits must be fast" (the golden rule, +`docs/git-commit-lock.md:433-458`), and raise STALE for a genuinely slow hold. + +### C. Orphaned / stale locks and claims + +**C1/C2 — Stale or empty lock.** Staleness is judged by the lock file's own +mtime; a lock older than STALE and *lock-shaped* (empty, or line 1 starts +`tok.`) is stealable (`git-commit-lock.sh:1408-1446`). The empty case is the +crash-between-create-and-write orphan and is explicitly stealable. *Tier 1.* +Tested: Test 2 (stale), Test 3 (empty orphan regression) (`U:197-210, 348-361`). +**Recommend: in scope, keep.** + +**C3 — Crashed-claimant / empty-claim orphan.** A claim older than CLAIM_STALE +(default 60s; claims are normally held for ms) is cleared by any waiter, which +re-races the claim create (`git-commit-lock.sh:1228-1267`). A crashed claimant +therefore delays only *steals*, only by ≤ the claim window; a free lock path is +never blocked by a claim. *Recovery Tier 1, latency Tier 2.* Tested: Test 21 +(aged foreign claim and empty claim both age out and recovery completes, +`U:1130-1154`). **Recommend: in scope, keep.** + +> **Test 21's `≤20s` latency assertion is Tier 2, not Tier 1.** `U:1144` asserts +> wall-clock recovery `≤20s` with STALE=1, CLAIM_STALE=2, MAX_WAIT=30. The +> *protocol* recovers correctly regardless; the 20s number is a generous +> envelope bound that a sufficiently oversubscribed runner (e.g. 
8 CPU hogs on a +> 2-core box under the stress wrapper) can blow without any protocol defect. +> This is exactly the kind of bound §K says to treat as a test-harness envelope: +> if it flakes under extreme artificial load, **relax the test's bound or scope +> the stress level — do not harden the code.** + +**C4 — Leaked claim.** A few exits must leave a claim behind without a verifiable +unlink (an unreadable claim; an unlink blocked by a foreign handle — exactly +three feeders, `git-commit-lock.sh:138-157`). These append the attempt token to +an in-process **leaked-token memory**. While non-empty, every poll (and a pass +at release/timeout) also reads the lock's line 1: a listed token there means a +rival's rename installed *our* leaked claim as the lock → adopt the hold, or, at +release, recognise our real hold was displaced, clean the leaked file +best-effort, and report 98. The result is structural: **no process inside an +acquire/hold/release arc can leave an *unowned* lock** (per-attempt tokens make +the discovery read conclusive). One scope nuance worth stating, because the +memory is **process-local**: only the leaking process can *adopt* its own +installed claim. If that process exits the arc first — times out (97), releases +cleanly, or dies — *before* adopting, the installed claim becomes an unowned lock +recovered by the ordinary staleness lane, never adopted by another process (this +is exactly residual 5 / §B3). Per-attempt-token uniqueness still guarantees that +lock can never be *mistaken* for owned by anyone, so there is **no false +success** — the only cost is a bounded stall. *Tier 1.* Tested extensively: Test 31 (the four +leaked lanes, including a real Windows no-delete-share feeder), Test 35 +(release-time cleanup of a leak installed over a held hold → 98), Test 36 +(inconclusive-read keeps the entry) (`U:1549-1758, 2013-2164`); ps1 parity in +interop Test 16e. 
**Recommend: in scope, keep.** This is the most intricate +machinery in the tool and the most thoroughly tested. + +### D. Filesystem semantics the protocol depends on + +These are the **load-bearing FS assumptions**. Where one does not hold, that is a +real robustness boundary, not a bug to fix. + +**D1 — Steal install: atomic overwrite vs. the 5.1 fallback.** The steal installs +its lock at the path by replacing whatever is there. There are two engine classes +and they differ in atomicity — so this row is *not* uniformly "atomic rename": +- **Atomic overwrite (the guaranteed lane):** one `rename(2)`-class replace with + no path-absent window. bash uses GNU `mv -T` where available, probed once, with + a guarded `[ -d ]` + bare-`mv` fallback on BSD/macOS + (`git-commit-lock.sh:954-979`); pwsh 7 uses the 3-arg `File.Move(src,dst,true)` + (`git-commit-lock.ps1:941-982`). Atomic replace is guaranteed on local POSIX FS + and NTFS (probe R1: 400 replaces, zero absent reads, + `git-commit-lock.sh:380-382`); *not* guaranteed on some network FS (§E). +- **Windows PowerShell 5.1 fallback (NOT atomic, but claim-guarded):** 5.1 has no + 3-arg overload, so it unlinks then does a 2-arg `Move` (`git-commit-lock.ps1:941-982`). + This lane has a real path-absent window in which a rival's *create* can win the + recovered path — a **fairness loss, never a clobber** (claim serialization still + admits one stealer; the loser re-polls), documented at + `docs/git-commit-lock.md:471-476`. +`File.Replace` is *deliberately never used* (throws on read-only dest; +partial-failure states) — pinned by a static grep in interop Test 16d +(`I:1141-1149`). 
*The atomic lane is Tier 1 on local FS; the 5.1 fallback is Tier +1 for safety (no clobber) but gives up rename atomicity (fairness only).* +**Recommend: in scope on local FS; the network-FS boundary is §E.** + +**D2 — O_EXCL atomic create.** `set -C` noclobber redirect (bash) / +`FileMode.CreateNew` with `FileShare.ReadWrite|Delete` (ps1, +`git-commit-lock.ps1:650-670`). Atomic create-or-fail on local POSIX and NTFS; +exactly one creator wins. *Tier 1 on local FS.* **Recommend: in scope on local +FS.** Boundary: O_EXCL is the classic NFS weak spot (§E). + +**D3 — Wrong-type object at the lock or claim path.** A directory, symlink, FIFO, +socket, or device at the path is **never stolen or deleted**. bash has a +pre-create type guard (`[ -f ] && ! [ -L ]`) plus a per-poll wrong-type +classifier with two-consecutive-poll confirmation to survive Windows +delete-pending ghosts (`git-commit-lock.sh:1322-1327, 1518-1570`); the same +guards apply to the claim path with independent per-path warn-once state +(`:1458-1487`). The FIFO case is *why the pre-create guard is mandatory*: a +noclobber `>` onto a FIFO blocks in `open(2)` before any timeout logic — a hang, +not a warning. *Tier 1 on bash, and on ps1-on-Windows.* Tested: Test 17 +(dir/symlink/FIFO at lock path), Test 22 (claim path), Test 17d (churn must not +false-warn), and Test 44 (the socket & device-node arms of the same classifier, +bash; POSIX CI legs) (`U:818-892, 1156-1262, 894-1032`). + +> **The one real D3 boundary — ps1 on POSIX (Tier 2, accepted).** The .NET API +> exposes no portable type bit for FIFO/device/socket on Unix; they stat as size +> 0 and take the **empty-orphan steal lane** (lock path) or empty-claim clear +> lane (`git-commit-lock.ps1:62-78, 520-525`; `docs/git-commit-lock.md:215-222`). +> Damage is capped at the one misconfigured inode (consumed by the rename). 
This +> is an **unsupported configuration** (ps1 is Windows-only; POSIX runs it solely +> as cross-impl protocol verification, `README.md:91-95`). **Recommend: accept, +> as documented.** Closing it would need a `stat(2)` shell-out the port avoids; +> not worth it for an unsupported config. + +**D4 — Non-lock CONTENT at the path.** An age-gated content guard steals only +empty or `tok.`-prefixed line-1 content; a real user file at a typo'd path +survives forever (`git-commit-lock.sh:1411-1444`). *Tier 1.* Tested: Test 18 +(user file untouched; sub-prefix torn write `to` never stolen; `tok.`-prefixed +torn write *is* stolen) (`U:1034-1076`). **Two accepted residuals** make the +guarantee precise (`git-commit-lock.sh:298-311`): (a) a stale **empty** user +file is indistinguishable from the crash orphan and *is* stolen; (b) a stale +user file whose line 1 happens to start `tok.` passes the wire test and *is* +stolen. Both are deliberate (a fuller shape check buys near-zero protection for a +harder-bound wire format). **Recommend: in scope, keep, with the two residuals +documented** (already are). + +**D5 — Case-insensitive filesystem.** Not handled explicitly. The lock and claim +paths differ only by the `.next` suffix (`<lock>` vs `<lock>.next`), which never +collide under case folding, and the token content is case-exact regardless of FS +case sensitivity. The only theoretical exposure is two *different* configured +`AGENT_LOCK_PATH` values that differ only in case resolving to one file on +NTFS/APFS — but that would be a single shared lock, which is *correct* behavior +(they'd serialize), not a break. *Tier 3 (non-issue).* **Recommend: out of +scope as a non-issue; no action.** (Cheap to add one sentence to the design doc +if desired.) + +### E.
Network / shared filesystems and clocks + +**E1 — Network/shared FS (NFS, SMB/CIFS, 9p, Dropbox/OneDrive sync).** The design +doc states this plainly: the repo must live on a **local FS with atomic +create/rename and sane mtimes**; "repos on network or sync-backed storage … are +outside the design's guarantees" (`docs/git-commit-lock.md:122-126`). This is the +honest boundary, because the protocol's *correctness* rests on D1 (atomic +rename-over) and D2 (O_EXCL create), and both are exactly the operations network +filesystems weaken: +- **NFS:** `O_EXCL` create is famously unreliable on older NFS (the client can't + guarantee exclusive create across the network); `rename` atomicity and mtime + granularity vary by version/server. On such a mount, **D2 can let two creators + both "win"** → two live holders, and the read-back verification + (`:1352-1361`) is the only backstop (it would catch *some* but not all + interleavings). +- **SMB/CIFS:** delete/rename semantics and the no-delete-share handle behavior + differ from both POSIX and local NTFS; mtime resolution and clock source may be + the *server's*, not the client's. +- **Sync folders (Dropbox/OneDrive):** asynchronous replication means the lock + file's existence and content are *not* globally consistent — two machines can + both create "the" lock locally before sync reconciles. Fundamentally broken; + not a tunable. + +*Tier 3 (out of scope, stated).* Untested (CI runs local FS only). **Recommend: +keep out of scope — but consider making it harder to *fall into* accidentally.** +The current failure mode on a bad FS is *silent* (the tool runs, exclusion may +just not hold). 
Options, in increasing cost: (i) leave as-is, documented — the +default lock lives in `.git`, which is almost always local, so accidental +network use is rare; (ii) a one-line caveat in `README.md` (since done — +`README.md:60-64`; previously only in the deeper design doc); (iii) an optional +best-effort startup probe of the lock dir's +FS type with a stderr warning on a known-network type (cheap on Linux via +`stat -f`, awkward cross-platform, and inherently incomplete). **My +recommendation: (ii) now** (surface the boundary in the README, where an operator +actually looks), and treat (iii) as optional polish — do *not* try to *support* +network FS. + +**E2 — Multi-host clock skew / NTP jumps / timezone.** *This is the one place +the documentation is genuinely thin, and it deserves a deliberate decision.* +Staleness is mtime-vs-`now` arithmetic (`git-commit-lock.sh:928, 1409`). The +lock file records `host=` (`:519`), which *suggests* cross-host use — +but the staleness math implicitly assumes **the mtime and the comparing +process's clock come from the same time source.** Reasoning from first +principles about what can go wrong: +- On a **single host** (the actual supported case — all contenders share one + checkout, hence one machine), mtime and `now` are the same clock; skew is a + non-issue, and the **mtime floor** (946684800 / 2000-01-01, + `git-commit-lock.sh:925`) already absorbs the only real local clock glitch: + the Windows FILETIME-zero (1601) transient on fresh files + (`docs/git-commit-lock.md:283-293`, probed at 0.04–0.5% of readings). +- A **large local clock correction** on the one host splits by sign, because + staleness is `age = now - mtime` (`git-commit-lock.sh:928, 1409`): a **forward** + jump (now leaps ahead) inflates the computed age, so a *live* lock can look + stale → premature steal; a **backward** jump (NTP steps back) shrinks the age, + so a genuinely *stale* lock can look fresh → delayed recovery. 
The + forward/premature-steal case is the only worrying one — and it degrades into the + *already handled* B5 lane: a premature steal of a still-live hold is detected at + release as 98 (given cooperative unwind), never a silent double-commit. So even + a local clock jump is **correctness-safe, liveness-degraded** — Tier 2. +- **Cross-host** use over a shared FS (already E1-out-of-scope) is where skew + would actually bite: host A's mtime compared against host B's `now` with + minutes of skew could steal live locks wholesale. But this only arises *on a + network FS*, which is already excluded. +- **Timezone** is a non-factor: all arithmetic is in epoch seconds + (`git-commit-lock.sh:439-449`, `git-commit-lock.ps1:448-451`), never local + time. + +*Tier 3 for cross-host (rides on E1); Tier 2 for a local NTP jump.* Untested — and +no code change is warranted (see below). **Documented:** the design doc now states +explicitly that the tool assumes a single time source — single-host use (the common +case) or a shared FS with a single server clock — and that this is *why* +network/multi-host is out of scope (`git-commit-lock.md`, "One time source"). It +also records the reassuring part: a *local* clock jump is correctness-safe — a +forward jump can prematurely steal a still-live lock, but that degrades to the +detected exit-98 lane, never a silent double-commit. A doc matter, not a code gap. + +**E3 — mtime probe fails entirely (the staleness clock is unreadable).** Distinct +from a *wrong* clock (E2): here the lock file's mtime cannot be read at all. 
Both +ports retry three times on a *present* file, then warn loudly once per process — +bash via `stat -c %Y` / `stat -f %m` / `date -r` (`git-commit-lock.sh:629-645`), +pwsh via `Get-Item.LastWriteTimeUtc` (`git-commit-lock.ps1:531-560`): *"Staleness +detection is BROKEN: stale locks will never be stolen, so a crashed holder wedges +waiters until MAX_WAIT."* The stale check then treats an unreadable mtime as **not +stale** — the floor guard `[ "$mt" -gt 946684800 ]` fails closed to "fresh" +(`git-commit-lock.sh:925-927`). **Safety is preserved**: the tool never steals a +lock whose age it cannot establish, so no premature steal and no corruption — but +**recovery of a genuinely crashed holder is disabled**, and waiters block to +MAX_WAIT (97). *Tier 2 (safety held, recovery lost — and loudly announced).* +Tested: unit Test 42 shadows the inner mtime probe to return empty on a present, +stale ghost and asserts the fail-safe lane — the "Staleness detection is BROKEN" +warn-once fires, the ghost is NOT stolen (left in place), and the waiter blocks to +MAX_WAIT → 97. **Recommend: accept; documented (§E3, `guarantees.md` BE-3)** — it is a +host/FS-health failure the tool already detects and announces, and it fails *safe* +(no false steal); the loud warning is the right behavior. This is also the clean +reason recovery is a *Tier-1-within-envelope* property, not unconditional (see the +tier split under §1): it presumes a readable clock. + +### F. Resource exhaustion + +**F1 — Disk full (ENOSPC) during a claim/lock create or write.** The create is +one open+write+close in a subshell; if the write fails (ENOSPC), the subshell +fails and the acquirer falls through to wait (`git-commit-lock.sh:1336-1361`, +comment at `:1341-1343`). A created-but-write-failed file is an empty orphan that +ages into the steal lane. A torn write *shorter than `tok.`* (e.g. `to`) is the +accepted residual at `:299-304`: non-empty, non-prefixed → never stolen, loud, +fixed by one manual `rm`. 
*Tier 2 (degrades to wait/97) / Tier 3 (the torn-write +manual-fix residual).* **Tested** (per §4 item 5): unit Test 50 mounts a small 64k +tmpfs, fills it to ENOSPC, and asserts the waiter times out at 97 with the wrapped +command never running — no corruption, no false hold. ENOSPC injection needs a full +FS (root via a tmpfs; `ulimit -f` raises SIGXFSZ — the wrong lane), so the test runs +on **Linux with passwordless sudo** (the Linux CI leg) and skips-with-note elsewhere. +ENOSPC is a host-health failure; the tool degrades safely (no corruption, no false +hold) and the one sharp edge (sub-`tok.` torn write needing manual `rm`) is already +documented. + +**F2 — ENOSPC during a LOG write.** All log writes end in `|| true` +(`git-commit-lock.sh:561`); a failed log write is silently lost. *Tier 2.* +**Tested** (per §4 item 5): unit Test 49 points `AGENT_LOCK_LOG` at a path *under a +regular file*, so every open/append fails ENOTDIR, and asserts the lock still +acquires + releases cleanly (rc 0), the wrapped command runs, the lock is cleaned +up, and no log file appears — i.e. the failing log write is swallowed and the lock +is unaffected. This is a portable injection (no chmod/perms), and it **also covers +J1**. Logging is best-effort by explicit design (it must never block or fail the +lock); the only downside is reduced post-mortem signal under disk pressure. + +**F3 — Inode / FD exhaustion.** Same shape as F1: a create that can't get an +inode fails → wait → eventually 97. The tool holds at most a couple of FDs +briefly. *Tier 2.* **Document-only — no deterministic portable injection.** A +`ulimit -n` FD cap can't be driven deterministically here: the create needs only +~1 FD, so an FD-exhaustion test would have to pin the process at *exactly* the +limit across a poll loop without starving the harness itself — not portable or +stable. 
Inode exhaustion needs a full FS the way F1 does (and F1/Test 50 already +exercises the create-fails-→-wait-→-97 lane that F3 shares). So F3 is recorded as +a reasoned-but-untested boundary rather than given a flaky test; the safe-degrade +behaviour is the same as F1, which is tested. + +**F4 — Read-only / unwritable lock dir or parent.** `lock_acquire` does a +best-effort `mkdir -p "$(dirname …)"` (`git-commit-lock.sh:1278`); if the dir is +unwritable the create fails every poll and the waiter times out at 97. No +corruption, no false hold. A *release* unlink blocked by an unwritable parent +routes to the LEFTOVER lane (`:1699-1711`). *Tier 2.* **Tested** (per §4 item 5 — the +highest-value one): unit Test 48 `chmod 0555`s the lock-dir parent and asserts the +waiter times out at 97, the wrapped command never runs, no lock file is created, +and the WAITING/TIMEOUT lines are logged — no corruption, no false hold. POSIX-only +(`chmod 0555` is a no-op for writes on Git-Bash/NTFS, so it skips-with-note on +Windows; the Linux/macOS CI legs exercise it). A correct, if blunt, outcome (97); an +*earlier, clearer* error would be nicer but is optional polish, low priority. + +**F5 — Memory exhaustion.** The scripts allocate trivially (a few shell vars; the +leaked-token list is "almost always empty"). Not a meaningful failure surface. +*Tier 3 / non-issue.* **Recommend: no action.** + +### G. Misconfiguration + +**G1 — Lock path is a directory / `$HOME` / a real file.** Covered by D3/D4: +never stolen or deleted, loud one-time warning, waiters reach 97 +(`U:818-840`). *Tier 1.* The security note (`docs/git-commit-lock.md:530-541`) +bounds the worst case even for a *hostile* repo redirecting the git dir: the tool +only ever creates its own small set of files at its own names and never deletes +recursively. 
**Recommend: in scope, keep.** + +**G2 — Garbage numeric config.** Each knob is validated at source time; invalid +values fall back to default with a stderr note (`git-commit-lock.sh:481-500`). +The ps1 port *tightens* .NET's permissive parser to bash's grammar so the same +env var configures the same value on both impls — e.g. rejecting `"1e3"`, +trailing newlines, whitespace (`git-commit-lock.ps1:327-359`). *Tier 1.* Tested: +unit Test 13, interop Test 12 (cross-impl parity, including `1e3`/`+2`/`' '`/ +trailing-newline) (`U:695-703`, `I:554-608`). **Recommend: in scope, keep.** + +**G3 — `run` outside a git repo, no `AGENT_LOCK_PATH`.** Refused with 96 — a +CWD-scoped lock would serialize against nobody (`git-commit-lock.sh:1768-1773`). +Sourcing keeps a CWD fallback with a stderr warning and creates no files +(`:570-572`; unit Test 14/14b). *Tier 1.* **Recommend: in scope, keep.** + +**G4 — `MAX_WAIT ≤ STALE + CLAIM_STALE`.** A startup warning, gated on MAX_WAIT +being left at its default (a caller who set it chose the relationship). The +relation is the stacked worst-case recovery: a crashed holder *plus* a crashed +claimant (`git-commit-lock.sh:502-514`). *Tier 2 (advisory).* Tested: Test 8 +exercises the gate and the stacking (`U:497-522`). **Recommend: in scope, +keep.** + +### H. Signals, interrupts, cleanup-on-exit + +**H1/H2 — bash INT/TERM/EXIT.** Handlers armed at acquire start; on a held lock +they release and re-raise the signal (wrapper dies 143, what a watchdog needs); +they restore the caller's pre-acquire traps exactly (`git-commit-lock.sh:1037- +1054, 1002-1023, 780-784`). *Tier 1.* Tested: Test 11 (TERM mid-hold → 143, +released), Test 12c (exit-while-holding chains the caller's EXIT trap), Test 12d/e +(trap restoration), Test 34 (TERM on a *steal*-acquired hold behaves identically +— all acquisition paths funnel through one hold helper) (`U:577-600, 633-693, +1989-2011`). 
One documented caveat: a SIGINT delivered to the `run` wrapper alone +while its foreground child survives is discarded by bash before any trap +(`git-commit-lock.sh:1030-1036`) — a real Ctrl+C hits the whole group and does +take the path. **Recommend: in scope, keep.** + +**H3 — ps1 process death.** PowerShell has no `trap SIGTERM`. The port substitutes +(a) `try/finally` inside `Lock-Acquire`, which runs on Ctrl+C/pipeline-stop/ +terminating errors and does the claim-window cleanup + discovery read +(`git-commit-lock.ps1:1378, 1672-1683, 1240-1295`); and (b) a `PowerShell.Exiting` +engine-event backstop for a *held* lock (`:704, 1303-1324`). **Documented limit:** +`PowerShell.Exiting` fires under `-Command` and interactively but **NOT under +`-File`**, and not on hard kill / `[Environment]::Exit()` +(`git-commit-lock.ps1:241-245, 1298-1302`). So a held lock abandoned by a +forgetful dot-source `-File` caller relies on the stale window, not the backstop. +The **`run` contract path is unaffected** — it pairs Acquire/Release in +try/finally (`:1928-1979`). *Tier 2 (for the dot-source `-File` gap).* The happy +path and trap-time claim cleanup are tested (interop Test 16e); the `-File` +non-firing is documented, not test-pinned. **Recommend: accept the `-File` +backstop gap as documented** — the stale window recovers it, and the supported +`run`/try-finally paths are covered. If you want to close it, the documented +option is handle-based ops (`git-commit-lock.ps1:146-151`), a larger change not +worth it for a forgetful-caller edge. + +**H4 — Process termination/replacement *without wrapper unwind* (the no-silent-loss +boundary).** §1's safety guarantee — a displaced holder reports 98 rather than a +false success — relies on the wrapper *reaching its release path*. The bypass class +is any termination or replacement of the holding process that skips that unwind; +crucially it is **not** triggered by a normal `exit`. 
The instances: +- **External SIGKILL** — untrappable; no handler runs in either port. +- **bash `exec` that replaces the lock-holding shell** — `run` executes `"$@"` + *in the wrapper shell itself* (`git-commit-lock.sh:1733`), so the bypass needs the + exec to run in *that* shell: the wrapped command *is* an exec (`run -- exec …`), + or a **sourced** caller does `lock_acquire; exec …` in its own shell. Then the + exec replaces that shell's process image and *neither* the trailing `lock_release` + *nor* the `EXIT` trap (`git-commit-lock.sh:1002-1013`, armed at `:1308`) runs. An + exec **nested in a child** — the ordinary `run -- bash -c 'exec …'` — does **not** + bypass (the child is replaced; the wrapper waits and releases normally). *Verified + empirically 2026-06-17.* +- **PowerShell `[Environment]::Exit(n)`** — a CLR hard-exit that bypasses + `Lock-Release`, the `finally`, *and* the `PowerShell.Exiting` backstop + (`git-commit-lock.ps1:221-245`). + +The useful contrast: a **plain `exit` is safe** — bash `exit` fires the EXIT trap +(which releases), and a plain `exit` inside the pwsh `run` body unwinds its +`finally` (`git-commit-lock.ps1:1928-1979`). Only *non-unwinding* termination or +replacement escapes. If such a process was *already displaced* (its lease stolen +past STALE) and exits **0**, its caller sees success with no 98 — the one +interleaving that defeats "no silent lost update." What keeps it narrow: an external +SIGKILL yields a non-zero wait status (`128+9`), so a caller checking exit codes does +*not* see success; the leak needs a command that *deliberately* replaces or +hard-exits the process **and** returns 0 **while displaced**. The *next* holder +still recovers via staleness; only the abruptly-exiting one is unwarned. *Tier 2 — +the residual edge of the fail-open lease.* Exercised indirectly: interop Test 5 +*uses* `[Environment]::Exit()` to fabricate a no-release orphan, confirming the +bypass (`I:308-334`). 
**Recommend: accept; documented as the explicit boundary of the +no-silent-loss guarantee** (`guarantees.md` OOS-5 / G-S1), alongside the "commits must be fast" golden rule — a +command that replaces/hard-exits the process mid-critical-section *after being +displaced* is exactly the fail-open case the STALE budget exists to make rare. No +code change closes it without the handle-based ops the design rejected (§H3). + +### I. Cross-implementation + +**I1 — Wire/format compatibility.** One on-disk format (token line 1, owner line +2, `tok.` prefix as wire contract), one read-retry schedule (8 attempts, +20/40/80/160/320/320/320 ms — verified byte-identical between +`git-commit-lock.sh:670` and `git-commit-lock.ps1:597-629`), one set of release +verdicts, one config grammar. *Tier 1.* The interop suite is built to break this: +mixed bash+pwsh exclusion (T1/T6), each side steals the other's genuine stale +lock (T4/T5), robbed-holder 98 both directions (T8), release-classification +agreement (T11), cross-impl claim staleness clearing (T16c), and a Windows +PowerShell 5.1 smoke lane (T17). **Recommend: in scope, keep — and keep the +interop suite as the guard.** Two independent implementations hammering one lock +is "cheap adversarial verification of the protocol" (`README.md:94`). + +**I2 — Mixed-version tree.** Prevention (the claim protocol) holds only when +*all* parties run it; older releases stole with an unserialized move-aside, so a +mixed tree degrades prevention to detection (98) and can leave `.dead.*` litter +current versions don't clean (residual 4, `git-commit-lock.sh:261-265`). *Tier +3.* Untested (would require shipping an old version into the suite). **Recommend: +out of scope; keep the "upgrade both implementations together" deployment note** +— in the design doc (`docs/git-commit-lock.md:251-255`) and now also surfaced in +`README.md:101-106`, where operators actually look. Acceptable +because the degraded mode is still *detected* (98), never silent. 
+ +### J. Logging subsystem failure + +**J1.** Every log write is `|| true`; the log self-truncates past ~1 MB rather +than rotating (`git-commit-lock.sh:554-562`). A broken log never blocks or fails +the lock. Under a redirected git dir, log *content* (the owner line) is +attacker-influenceable — one-line text spoofing, no execution; the tool itself +writes only its token, owner line, and protocol events, never secrets +(`docs/git-commit-lock.md:543-551`). *Tier 2.* **Tested — covered by the F2 +log-failure test (per §4 item 5): unit Test 49** proves a failing log path leaves the +lock fully working. Logging is best-effort by design, which is the right call for a +lock that must keep working when the disk is full or the log path is bad. The +follow-on (unchanged): don't build automation that *trusts* log text from an +untrusted repo (already documented). + +### K. Behavior under extreme load / scheduling pressure, and internal time budgets + +**This is the most important analytical section** — it separates "must hold under +any load" from "holds within an envelope," and tells the owner which apparent +flakes are real gaps vs harness concerns. + +**The clean split: correctness is load-independent; liveness/latency is not.** + +- **Load-independent (Tier 1 *safety*, must always hold):** no silent lost update + (given cooperative unwind, §1/§H4), no corruption, and strict mutual exclusion + *within the staleness window*. These rest on O_EXCL create + atomic rename + + per-attempt-token discovery — *structural* properties that do not reference the + clock for their *correctness*. (Recovery of lock-shaped orphans is also + load-independent in *correctness* — only its latency degrades — but it presumes + a readable clock, §E3, and does not extend to foreign objects, per the tier + split under §1.) 
The mtime + floor + (`:925`) and the read-retry ladder (`:668-684`) exist precisely so that the + one timing-sensitive input (mtime, and transient empty reads) cannot corrupt a + correctness decision: a sub-floor or unsettled reading is treated as "wait," + never "steal." A 25-worker round can go 3s → 41s under load + and *still* lose no update. + +- **Load-dependent (Tier 2, best-effort in an envelope):** every wall-clock bound. + - **Recovery latency** ≈ STALE (+ CLAIM_STALE if a claimant also crashed) + + poll cadence. Under CPU oversubscription or a slow FS, polls stretch, so + recovery takes longer — but still completes. + - **`MAX_WAIT` timeout (97):** a waiter on a genuinely squatted/blocked lock + gives up at MAX_WAIT. Under load the *real* time to MAX_WAIT stretches with + poll cadence; the guarantee is "bounded by MAX_WAIT polls," not "exactly + MAX_WAIT seconds." Interop Test 14b explicitly checks that a blocked steal + **never busy-spins past MAX_WAIT** and logs in a damped, bounded way + (`I:746-817`) — a real correctness-adjacent property (no busy-spin), with a + timing-dependent upper bound on the STALE-line count (`[1,8]`). + - **The read-retry ladder (~1.26s budget):** sized to ride out a sub-second + transient (AV scanner handle, probe-F create→write gap). Under pathological + load a transient *longer* than ~1.26s would surface as the unverifiable-2 / + run-1 verdict (a detected, non-corrupting outcome), not a wrong hold. Test + 16c pins that a 0.4s transient is ridden out (`U:784-817`). 
+ +**Internal time budgets, enumerated** (all tunable via `AGENT_LOCK_*`): + +| Budget | Default | Role | Load sensitivity | +|---|---|---|---| +| `STALE_SECS` | 300s | steal threshold (the lease length) | the fail-open ceiling; raise for slow holds | +| `CLAIM_STALE_SECS` | 60s | crashed-claimant ageout | delays only steals | +| `POLL_SECS` | 2s | poll interval | cadence stretches under load | +| `MAX_WAIT` | 420s | total wait cap → 97 | real wall-clock stretches with cadence | +| read-retry ladder | ~1.26s | ride out transient empty reads | a longer transient → detected-2, not wrong hold | +| mtime floor | 2000-01-01 | reject FILETIME-zero | static, not load-sensitive | + +**Judgments on the load-sensitive behaviors — gap, degradation, or harness +concern:** + +1. **Protocol correctness under load — (c) non-issue / already guaranteed.** + The stress branch wraps every suite in artificial CPU+disk load + (`tests/with-load.sh`) specifically to widen timing windows and surface + *latency/race flakes*, and the protocol assertions (exclusion, one-steal, + zero-98) are written to hold regardless. **Recommend: nothing to harden.** + +2. **Wall-clock test *bounds* under extreme load — (b) acceptable degradation; + fix the TEST, not the code.** Three examples surfaced by the prior stress + effort (which I verified independently against the code, not adopted): + - *Test 21's `≤20s` recovery-latency assertion* (`U:1144`), + - *Test 22(a)'s claim-path warning* — the warning relies on the + two-consecutive-poll confirmation (the mechanism Test 17d pins for the lock + path) having poll *headroom* before MAX_WAIT, which an oversubscribed runner + can starve (`U:1156-1172`); the test asserts the warning fires, not a specific + poll count, + - and *Test 29's `≥2 CLAIM lines` discriminator* (explicitly given `MAX_WAIT=6` + headroom, `U:1514-1518`). + + Each asserts a wall-clock or poll-count bound that an oversubscribed runner + (e.g.
8 hogs on 2 cores) can blow *without any protocol defect* — the + protocol still recovers/warns correctly, just slower. **Recommend: where these + flake only under extreme artificial load, relax the bound or scope the stress + level for that test; do NOT change product code.** The correctness assertions + in the same tests must stay strict. + +3. **Test-*harness* race setup under load — (c) harness concern, already + mitigated.** Tests 2b/16/16b carry heavy sync scaffolding (`sync_waiting_fresh`, + token-guarded `backdate_ghost`, bounded discard-and-retry, `U:70-151`) because + a fast waiter can complete an entire steal before the harness finishes setting + up the race. This is purely about *constructing* the scenario deterministically; + the protocol is fine. **Recommend: keep the scaffolding; it is the right fix.** + +4. **No-busy-spin under a permanently blocked lock — (a) a real property, and + it's guarded.** A failed-steal lane that `continue`d past the timeout+sleep + would busy-spin and never reach 97 — a genuine bug class. Interop Test 14b is + the regression guard (`I:746-817`). **Recommend: keep that test; treat any + regression here as Tier 1.** + +**Net K — the envelope, now adopted.** The explicit envelope — *"correctness holds +under any load; wall-clock recovery/timeout latency scales with poll cadence and +scheduling, bounded by the configured knobs"* — is stated in the design doc +(`git-commit-lock.md`, "operating envelope") and in `load-testing-strategy.md` §1. +The suite's wall-clock assertions are scoped to a load level via the envelope tier +(`GCL_ENVELOPE_TIER` strict/relax, `ok_envelope`/`bad_envelope`): an oversubscribed +runner's latency miss warns rather than reds, while the correctness asserts stay +strict. 
So the stress branch's extreme `both/8-hog` mode is a flake-hunting tool, +not a contract the product must meet on a 2-core runner — which structurally ends +the chasing of "flakes" that are really a test asserting a Tier-1 bound on a +Tier-2 quantity. + +--- + +## 4. Open questions / recommended scope decisions + +Ordered by how much they need an explicit owner decision. + +**Status (Ben, 2026-06-17): reviewed and accepted — with two changes marked below.** +Item 3 (network FS) is **document-only**: do not build the FS-type probe. Item 5 is +**overridden** — the untested-but-robust lanes *will* get test coverage (actually-tested +edge cases make the tool more maintainable and give future users confidence), rather than +"accept untested". Every other recommendation is accepted as written. + +1. **The load/timing envelope (§K) — highest value.** + *Recommendation:* state in `docs/git-commit-lock.md` that correctness + (exclusion, no silent loss, eventual recovery) is load-independent, while all + wall-clock bounds (recovery latency, MAX_WAIT, the read ladder) are + best-effort and scale with scheduling. Then **scope the suite's wall-clock + assertions to a defined load level** so extreme-stress flakes (Test 21's 20s, + Test 22a's warning timing, Test 29's poll count) are recognised as Tier-2 + envelope misses, not product regressions. *This resolves the recurring + "flake" question structurally.* Cost: doc + a test-bound audit; no product + change. + *Status (done):* the envelope is stated in `docs/git-commit-lock.md` ("operating + envelope" — correctness load-independent, wall-clock bounds best-effort) and + `docs/load-testing-strategy.md` §1, and the suite's wall-clock assertions are + scoped to a load level via the envelope tier (`GCL_ENVELOPE_TIER`). + +2. 
**Multi-host / clock-skew assumption (§E2) — a doc matter, not a + code gap.** The tool implicitly assumes a single time source; a *local* NTP + jump is correctness-safe (degrades to the detected-98 lane), and cross-host + skew only bites on a network FS that's already out of scope. *Recommendation:* + add one explicit sentence — "assumes a single clock, i.e. single-host (the + common case) or a shared FS with one server clock" — and the reassurance that + a local clock jump cannot cause a silent double-commit. No code change. + *Status (done):* the single-clock sentence + local-jump reassurance are in + `docs/git-commit-lock.md` ("One time source"). + +3. **Network/shared FS is out of scope but fails *silently* if entered (§E1).** + The boundary is correctly stated in the design doc but only there. + *Decision (Ben — document-only):* surface the boundary in `README.md` (where + operators look), since the failure on a bad FS is silent loss of exclusion. Do + **not** attempt to *support* network FS, and **do not build** the optional + FS-type startup probe — just document. (It would be cross-platform-awkward and + incomplete anyway; Ben: "don't do the polish, just document.") + *Status (done):* the network/sync-FS boundary is stated in `README.md` (the + "local filesystems only" note); the FS-type probe was deliberately not built. + +4. **ps1-on-POSIX FIFO/device residual (§D3) and ps1 `-File` exit backstop gap + (§H3) — accept as documented.** Both are real but confined to an unsupported + config (ps1-on-POSIX) or a forgetful-caller edge that the stale window + recovers. *Recommendation:* no code change; confirm they stay documented. + Reconsider only if PowerShell-on-POSIX ever becomes supported (it isn't, + `README.md:91-95`). + +5. 
**Untested-but-robust-by-code lanes (resource exhaustion F1/F3/F4, log-write + failure F2/J1).** These degrade safely (wait/97, or silent best-effort log + loss) but had **no fault-injection tests** — they were reasoned-correct, not + verified. *Decision (Ben — overrides the prior "accept untested"):* **add test + coverage** for these lanes. Rationale: actually-tested edge cases make the + project easier to maintain and give future users confidence, versus + "reasoned-correct but untested." Add deterministic fault-injection tests where + feasible — **unwritable lock dir → clean 97** (F4, cheapest/highest-value and + the most likely real-world misconfig); an **unwritable log path → the lock + still works, the log write is swallowed** (F2/J1); and the **ENOSPC / inode / + FD-exhaustion** lanes (F1/F3) where they can be injected deterministically and + portably (e.g. a small dedicated tmpfs or quota for ENOSPC, `ulimit -n` for + FDs). Flag in the plan any lane that proves genuinely impractical to fault-inject + portably, rather than forcing a flaky test. + + *Status (done):* coverage added — **F4** unit Test 48 (POSIX `chmod 0555`, + skip-with-note on Windows), **F2/J1** unit Test 49 (portable failing-log path via + ENOTDIR), **F1** unit Test 50 (Linux + passwordless-sudo 64k tmpfs filled to + ENOSPC; skip-with-note elsewhere). **F3** (inode/FD exhaustion) proved impractical + to fault-inject deterministically and portably — the create needs only ~1 FD, so a + `ulimit -n` cap can't be driven deterministically across a poll loop without + starving the harness, and inode exhaustion needs a full FS the way F1 does (F1/Test + 50 already exercises the shared create-fails-→-wait-→-97 lane). Per the "flag any + impractical lane" instruction above, F3 stays **document-only**, not a flaky test. + +6. 
**Mixed-version tree (§I2) and case-insensitive FS (§D5) — out of scope, + confirm.** The first degrades to detection (98), never silent, and is covered + by the "upgrade both together" note. The second is a non-issue. *Recommendation:* + leave both out of scope; optionally one sentence each in the design doc. + +### Things explicitly NOT to do (the design already considered and rejected them) + +- **A background heartbeat** to refresh the lease — would make the tool more than + a single synchronous script; the fail-open-but-detectable lease is the + deliberate alternative (`git-commit-lock.sh:217-218`). +- **A two-rename compare-and-swap** to prevent residual 5 (B3) — reintroduces + crash litter + a sweep, for a failure that is already bounded and + false-success-free (`git-commit-lock.sh:276-282`). +- **`File.Replace` in the ps1 port** — pinned out by interop Test 16d for good + reasons (read-only-dest throw, partial-failure states). +- **Trying to support network/shared filesystems** — the protocol's correctness + rests on local-FS atomic create/rename; this is a boundary to *document*, not + to engineer around. diff --git a/docs/git-commit-lock.md b/docs/git-commit-lock.md index 828cfc4..c8dc29b 100644 --- a/docs/git-commit-lock.md +++ b/docs/git-commit-lock.md @@ -292,6 +292,22 @@ settles in milliseconds. The same floor governs the claim file's ageout: a sub-floor claim mtime reads as "just created", never "ancient — clear". +**The operating envelope — correctness is load-independent; latency is not.** +Exclusion, no-silent-loss, and eventual recovery rest on atomic create/rename +plus per-attempt tokens, and hold under any load. The wall-clock bounds — +recovery latency (≈ `STALE_SECS` + poll cadence), the `MAX_WAIT` timeout, and the +~1.3 s read-retry ladder — are best-effort and scale with scheduling: under CPU +oversubscription or a slow filesystem they stretch, but the protocol still +recovers and never loses an update. 
(For the precise guarantee/scope split, see +[`guarantees.md`](guarantees.md).) + +**One time source.** The tool assumes a single clock — single-host use (the +common case: all contenders share one checkout, hence one machine and one clock), +or a shared filesystem with one server clock. A local clock jump is +correctness-safe: a forward jump can make a live lock look stale and be +prematurely stolen, but that degrades to the detected exit-98 lane (the robbed +holder's release fails loudly), never a silent double-commit. + ## The PowerShell port (`git-commit-lock.ps1`) Some agents (Codex on Windows, for example) run their commands in @@ -561,6 +577,7 @@ unavailable): | `git-commit-lock.sh` | the mutex (bash; the authoritative implementation): source for `lock_acquire/lock_release/lock_run`, or `git-commit-lock.sh run -- ` | | `git-commit-lock.ps1` | wire-compatible PowerShell port (see [The PowerShell port](#the-powershell-port-git-commit-lockps1) above): `git-commit-lock.ps1 run ""`, or dot-source for `Lock-Acquire`/`Lock-Release` | | `tests/git-commit-lock.test.sh` | self-contained bash tests (throwaway temp dirs); exit 0 == all pass | +| `tests/git-commit-lock.canary.test.sh` | bash concurrency canary: mutual exclusion under many concurrent workers over repeated rounds — the statistical full-fan-out scenario (throwaway temp dirs) | | `tests/git-commit-lock.interop.test.sh` | cross-impl tests: pwsh + bash workers share one lock and serialise; run from MINGW/Git-Bash | | `tests/git-commit-lock.integration.test.sh` | end-to-end: many concurrent workers make real commits into one shared repo; the history is audited for the tool's guarantees | @@ -571,20 +588,25 @@ Run the suites from a clone of this repository (they are not installed to ```sh bash tests/git-commit-lock.test.sh # bash implementation +bash tests/git-commit-lock.canary.test.sh # bash concurrency canary (mutual exclusion under many concurrent workers) bash tests/git-commit-lock.interop.test.sh # bash + 
PowerShell interop (skips if pwsh is absent) bash tests/git-commit-lock.integration.test.sh # end-to-end: concurrent real commits into one repo (pwsh half skips if absent) ``` Each suite prints a result summary line and exits 0 when everything passes. -All three use throwaway temp dirs and never touch the repo you launch them +All four use throwaway temp dirs and never touch the repo you launch them from. The heavy fan-out tests run at a REDUCED width by default, so a routine run doesn't lag a shared development machine; each suite prints a `fan-out mode:` line at the start and tags its result line with the mode, so check those say `FULL` when you ran `GCL_TEST_FULL=1` for the full-strength canary (CI does). -`tests/git-commit-lock.test.sh` covers the bash implementation: mutual exclusion -under many concurrent workers (clean acquire/release path), stale-lock theft, +`tests/git-commit-lock.canary.test.sh` is the concurrency canary: mutual +exclusion under many concurrent workers (clean acquire/release path) over +repeated rounds — the statistical scenario that needs the full 8×25 fan-out +(`GCL_TEST_FULL=1`, which CI runs) for a pass to be trusted as evidence against a rare exclusion race. + +`tests/git-commit-lock.test.sh` covers the bash implementation: stale-lock theft, crash recovery under contention (several waiters racing one dead lock — claim-serialized: exactly one steal, zero displacements, zero spurious 98s, and no move-aside file ever created), claim contention (many concurrent @@ -648,7 +670,7 @@ is audited for the guarantees this document claims — every commit lands, history stays linear, no commit sweeps up another worker's file, no `index.lock` races, no stolen leases, and a clean tree at the end. -The same three suites run in CI on Linux, macOS, and Windows +The same four suites run in CI on Linux, macOS, and Windows (`.github/workflows/tests.yml`), at full fan-out strength, alongside a shellcheck + PSScriptAnalyzer lint job. 
The POSIX legs exercise the PowerShell implementation purely as cross-implementation protocol @@ -664,9 +686,10 @@ heavy process fan-out is environmental, not a lock failure — but only the interop suite's exclusion test tolerates it (scoring by violations/steals, with a minimum-acquired floor so a collapsed fan-out cannot pass vacuously); the integration suite is deliberately strict per worker (every worker must -launch and commit), and the unit suite's counts are exact. +launch and commit), and the unit and canary suites' counts are exact (the +canary requires every worker to acquire and release in each round). -For debugging, all three suites copy their logs and work dirs to +For debugging, all four suites copy their logs and work dirs to `$GCL_TEST_PRESERVE_DIR` when it is set, and keep the work dir on disk on any failure. diff --git a/docs/guarantees.md b/docs/guarantees.md new file mode 100644 index 0000000..d27aab0 --- /dev/null +++ b/docs/guarantees.md @@ -0,0 +1,423 @@ +# git-commit-lock: guarantees and scope (the normative contract) + +**Status: normative.** This document states *what the tool guarantees*, *under +what conditions* (the operating envelope), and *what is explicitly out of +scope*. It is the contract a user or a CI gate can point at: a behavior listed +under [Guarantees](#2-guarantees) is a property the code must uphold and the +tests defend; a behavior under [Out of scope](#5-out-of-scope-not-guaranteed) is +one the tool deliberately does not promise. + +**How this relates to the other two docs.** This is the *contract*; +[`failure-modes.md`](failure-modes.md) is the *analysis* behind it (per-mode +current behavior, tier classification, and the scope decisions that produced +this contract); [`git-commit-lock.md`](git-commit-lock.md) is the *design +reference* (why the protocol is shaped this way and how it works). 
Where they +appear to disagree, the **code and tests are authoritative**, then this contract, +then the analysis, then the design narrative. Each guarantee below cites its +witnessing test(s) and the failure-modes section that justifies it; the +[Verification map](#7-verification-map) collects those pointers. (Test and +`file:line` citations are **anchors, not exact addresses**: find a test by its +name/number — the line numbers reflect the tree when written and drift as files +move.) + +This contract makes **no new claims** about behavior — it is a re-statement of +the decisions recorded in `failure-modes.md` §4 as commitments. It does not +re-derive the protocol (see the design doc) or re-argue the tiers (see the +analysis). + +--- + +## 1. The operating envelope + +Every guarantee in §2 holds **within this envelope**. Outside it, the tool +degrades as described in §4 (best-effort) or §5 (out of scope) — in most cases +*detectably and without corruption*, but the strict guarantees are not promised. +The envelope is not a disclaimer bolted on; it is the precise set of assumptions +the filesystem-lease design rests on. + +**E1 — Single host, single time source.** All contenders share one working tree, +hence one machine, hence one clock. Staleness is `age = now − mtime` arithmetic +(`git-commit-lock.sh:928,1409`); it assumes the mtime and the comparing process's +`now` come from the *same* clock. Single-host use satisfies this. A *local* clock +jump remains correctness-safe (it degrades to the detected-98 lane, never a +silent double-commit; see G-S1 and `failure-modes.md` §E2). Multi-host use over a +shared FS does not satisfy it and is out of scope (§5, OOS-2). + +**E2 — Local filesystem with atomic create/rename and sane mtimes.** The protocol +is built from three filesystem operations — atomic create-or-fail (`O_EXCL` / +`FileMode.CreateNew`), atomic rename-over, and unlink — each atomic on local +POSIX filesystems and NTFS (ext4, APFS, NTFS, and kin). 
(The one exception is the +Windows PowerShell 5.1 steal, which lacks the atomic 3-arg move and uses a +claim-guarded unlink-then-move — a fairness loss, never a clobber; see BE-5.) +Network and sync-backed storage (NFS, SMB/CIFS, 9p, Dropbox/OneDrive) weaken +exactly these operations and are out of scope (§5, OOS-1; +`git-commit-lock.md:122-126`). + +**E3 — Cooperative wrapper unwind.** The theft-detection guarantee (G-S1) fires +when the lock-holding shell *reaches its release path* — on normal return, on a +handled INT/TERM, or on a plain `exit` (all of which unwind). It is **not** +triggered by a termination or replacement that skips the unwind: an external +SIGKILL, an `exec` that replaces the lock-holding shell itself, or PowerShell +`[Environment]::Exit()`. (An `exec` nested in a child — the ordinary +`run -- bash -c 'exec …'` — does *not* skip release.) See §5, OOS-5 for the +precise boundary. + +**E4 — Commits fast relative to the staleness window (for *strict* exclusion).** +The lease is fail-open: a hold older than `AGENT_LOCK_STALE_SECS` (default 300s) +can be stolen mid-work. *Strict* mutual exclusion (G-S3) is therefore guaranteed +only for holds that complete within the staleness window. A hold that overruns it +is still *safe* — a displaced holder is detected (G-S1) — but two processes can +briefly both believe they hold the lock. Keep commits well inside the window, or +raise `AGENT_LOCK_STALE_SECS` for a deliberately slow hold (the golden rule, +`git-commit-lock.md:433-458`). + +**E5 — Matching protocol version on all parties.** Prevention of the +crash-recovery-under-contention race (G-S3's no-displacement property) holds only +when every contender runs the claim protocol. A mixed-version tree degrades +prevention to detection and is out of scope (§5, OOS-3). + +**E6 — Supported platforms.** `git-commit-lock.sh` (bash) is supported on Linux, +macOS, and Windows under Git-for-Windows' MINGW bash. 
`git-commit-lock.ps1` +(PowerShell) is supported on **Windows only**. Running the `.ps1` port on POSIX is +a CI-only cross-implementation protocol check, not a supported configuration (§5, +OOS-4; `README.md:91-95`). + +**E7 — Cooperating, non-hostile agents.** The lock is advisory: it serializes +*cooperating* agents. It detects interference where it can (token checks; exit 98) +but cannot prevent a process running as the same user from deleting or +overwriting the lock file. The threat model is honest agents racing each other, +not an actively hostile local process (§5, OOS-6; +`git-commit-lock.md:520-528`). + +--- + +## 2. Guarantees + +Each guarantee holds **within the envelope (§1)**. The defaults named are knobs +(`AGENT_LOCK_*`); the guarantee is in terms of the configured value, not a fixed +number of seconds. + +### 2A. Safety (unconditional within the envelope) + +These are correctness properties. If one can break inside the envelope, that is a +bug. + +- **G-S1 — No silent lost update.** A holder whose lease is taken from it never + reports a serialized critical section that wasn't. On release, a **definitive** + theft (the lock file is gone, or carries a foreign token) returns **98** with a + loud WARNING rather than success (`git-commit-lock.sh:1607-1688`; + `git-commit-lock.ps1:1717-1837`); a state the release cannot disambiguate (the + file is present but reads **empty** after the retry ladder — possibly a successor + mid-create after a boundary steal) returns the distinct **unverifiable** code + (`lock_release` 2; `run` maps it to 1 when the command itself succeeded, else + keeps the command's code) — still **never** a silent success. *Condition:* the + wrapper unwinds cooperatively (E3). *Witness:* unit Test 4b (98 + WARNING), Test + 16 (unverifiable lane), interop Test 8 (98 both directions) (`U:387-417`, + `I:460-492`). *Basis:* `failure-modes.md` §1, §B5. 
+ +- **G-S2 — No corruption and no false hold.** An acquirer that cannot prove its + own token is at the lock path (after the read-back retry ladder) treats the lock + as **not** acquired and logs loudly; it never "repairs" a failed read-back by + rewriting the path (`git-commit-lock.sh:1352-1361`). Every path that cannot + establish a fact fails toward "wait", never toward "steal" or "hold". This + extends to resource-exhaustion lanes: a create that fails (ENOSPC, FD/inode + exhaustion, an unwritable lock dir) **never produces a false hold or corruption** + — it falls through to wait/97 (an empty orphan ages into the recovery lane). The + guarantee is *no false hold*, not a uniformly clean 97: a torn write shorter than + `tok.` is a non-lock-shaped residual, never stolen, that needs manual removal + (`failure-modes.md` §F1 — an accepted residual). *Witness:* the read-back-failure lanes — + create-path Test 32, steal-path Test 32b (`U:1760-1855`); resource lanes — + unwritable lock dir Test 48 (F4), ENOSPC Test 50 (F1, Linux+sudo; skip-with-note + elsewhere) (`failure-modes.md` §4 item 5); FD/inode exhaustion (F3) is document-only + (no portable injection). *Basis:* §1, §A1, §F. + +- **G-S3 — Strict mutual exclusion within the staleness window, with no + displacement during crash recovery.** Within `AGENT_LOCK_STALE_SECS` no steal + occurs at all, so at most one process holds the lock. When a holder dies and a + herd of waiters recovers the one stale lock, the **claim protocol** admits + exactly one stealer and the recovering waiter keeps the lock it recovered — a + straggler whose stale judgement predates the recovery cannot displace it + (`git-commit-lock.sh:1070-1218`). At most one process is ever the *legitimate* + holder. (On the supported Windows PowerShell 5.1 unlink-then-move lane the + recovering waiter can *lose* the recovered path to a rival's create in the + transient absent window — a fairness loss, never a clobber; see BE-5.) 
+ *Condition:* holds complete within the window (E4); a stable clock (E1) — a local + clock jump preserves *no silent loss* (G-S1) but can break *strict exclusion* by + making a live lock look stale (a premature, but detected, steal); and matching + version (E5). *Witness:* the concurrency canary (mutual exclusion under many + concurrent workers, 8 rounds × 25 at FULL, `C:81-111`), unit Tests 2b/20 + (claim-recovery and many-stealers), interop Tests 1/6/16/16b, integration suite + (`U:212-346,1095-1128`; `I:227-261,341-386,884-1088`). *Basis:* + §A1/§A2/§A3. + +- **G-S4 — Never destroys a non-lock-shaped object.** A directory, symlink, FIFO, + device, socket, or a regular file whose line 1 is neither empty nor `tok.`- + prefixed is **never** stolen or deleted, at either the lock path or the claim + path (`git-commit-lock.sh:1322-1327,1411-1444,1458-1487,1518-1570`). The + never-steal *safety* is unconditional; the *warning* is best-effort — it normally + fires once and names the object, but an **actively-rewritten** user file may never + age into the content guard and then times out at 97 *without* the warning + (`git-commit-lock.sh:308`). Deletion is + never recursive; the tool only ever removes its own named lock-protocol files. + *Two accepted residuals* bound this and are documented, not bugs: a stale + *empty* user file, and a stale file whose line 1 happens to start `tok.`, are + stolen (`git-commit-lock.sh:298-311`). *Witness:* unit Tests 17/17d/18/22 + (dir/symlink/FIFO/content) and Test 44 (socket & device-node, bash; POSIX CI) + (`U:818-892,894-1032,1034-1076,1156-1262`). *Basis:* §D3/§D4/§G1. *Scoped + exception:* ps1-on-POSIX has no .NET type probe for FIFO/device/socket (§5, + OOS-4). 
+ +- **G-S5 — Truthful exit codes.** The three reserved high codes from `run` are + exact: **96** = usage error (command **not** run), **97** = acquisition timed + out (command **not** run), **98** = lock stolen mid-hold (command **ran but was + not serialized** — redo it) (`git-commit-lock.sh:392-415`). A `run` exit of the + command's own code (including 0) means the command was serialized — *subject to + the one carve-out in OOS-5* (a non-unwinding exit returning 0 while displaced). + *Two stated assumptions* keep the high-code contract exact: the wrapped command + must not itself exit 96/97/98 (such an exit is indistinguishable from a tool + verdict, `git-commit-lock.sh:392`), and an **unverifiable** release maps a + *successful* command to **1** (G-S1), so 0 is never reported over an unverifiable + hold. *Witness:* Test 7 (96), Test 8 (97), Test 4b (98), Test 5 (propagation), + Test 16 (unverifiable→1), interop `run` verdict tests. *Basis:* §1, §H4. + +### 2B. Recovery (within the FS/clock/tooling envelope) + +These hold given a readable clock (E1) and lock-shaped state; latency is +best-effort (§4). + +- **G-R1 — Lock-shaped orphans are reclaimed.** A crashed holder's stale lock, an + orphaned or empty claim, and an empty crash-orphan (a crash between create and + content write) all eventually become stealable and are recovered, bounded by + `STALE` (+ `CLAIM_STALE` if a claimant also crashed) plus poll cadence + (`git-commit-lock.sh:1408-1446,1228-1267`). This does **not** extend to *foreign* + objects (G-S4) — those wait for an operator. *Witness:* unit Tests 2/3/21 + (`U:197-210,348-361,1130-1154`). *Basis:* §B1/§C1/§C2/§C3. + +- **G-R2 — One stuck agent cannot wedge the fleet.** Because the lock is a lease + and the claim is itself leased, a hung-but-alive holder or claimant is recovered + within its window; the fleet does not deadlock behind it. *Witness:* the stale- + steal and crashed-claimant lanes above. 
*Basis:* §1, `git-commit-lock.md:60-82` + (the explicit reason for a lease over a kernel lock). + +- **G-R3 — No busy-spin; bounded wait.** A waiter on a genuinely squatted or + delete-blocked lock gives up at `MAX_WAIT` and never busy-spins past it; the + failed-steal lane logs in a damped, bounded way (`I:746-817`). *Witness:* interop + Test 14b. *Basis:* §K(4). + +- **G-R4 — No process leaves an *unowned* lock behind.** Per-attempt tokens make + the ownership-discovery read conclusive, so no process inside an + acquire/hold/release arc can install a lock nobody owns and walk away: it either + discovers it holds, or the lock is recovered by staleness, and in no case is a + steal-installed lock mistaken for owned by the wrong process + (`git-commit-lock.sh:138-157` + the leaked-token memory). The one bounded + residual — an untrappably-killed claimant's claim installed as an unowned lock — + stalls waiters ≤ one stale window with **no false success** (accepted; §B3). + *Witness:* unit Tests 31/35/36 (`U:1549-1758,2013-2164`). *Basis:* §C4. + +### 2C. Interoperation + +- **G-I1 — bash and PowerShell take the same lock.** One on-disk wire format + (`tok.`-prefixed line 1, owner line 2), one read-retry ladder + (8 attempts, 20/40/80/160/320/320/320 ms — byte-identical between ports), one + set of release verdicts, one config grammar. A `.sh` holder and a `.ps1` holder + in one tree serialize against each other and steal each other's genuinely stale + locks. *Condition:* Windows for the supported ps1 config (E6). *Witness:* the + interop suite throughout (`I:*`). *Basis:* §I1. + +--- + +## 3. Failure semantics (the shape of every degradation) + +When the tool cannot uphold a property it fails in one of these bounded, +documented ways — **never** silently: + +- **Detect, don't pretend** — a displaced holder returns 98 + WARNING (G-S1). +- **Wait, don't guess** — an unprovable state routes to poll/wait → 97, never to + a steal or a hold (G-S2). 
+- **Refuse, don't destroy** — a non-lock-shaped object is left in place (and + normally warned about — the warning is best-effort, see G-S4); waiters reach 97. +- **Announce, don't hide** — a broken staleness clock (unreadable mtime) warns + loudly once and disables stealing (fails safe; §4, BE-3). + +**Within the operating envelope**, the only place a *correctness* degradation can +be silent — a non-unwinding exit returning 0 while displaced — is carved out +explicitly in OOS-5. Two silences fall *outside* that scope and are disclosed +separately: a degradation **outside** the envelope (a network/sync FS silently +losing exclusion, OOS-1), and a **non-correctness** loss (a swallowed log write, +BE-4). Logging is best-effort by design; correctness is not. + +--- + +## 4. Best-effort (within the envelope, not a hard guarantee) + +These hold under normal conditions and degrade *gracefully and detectably* under +pathological scheduling or host-health failures. **Correctness (§2) is preserved +throughout; only liveness/latency degrades.** This tier is what the suite's +wall-clock test assertions are scoped against (the strict/envelope test split; see +`failure-modes.md` §K and §4 item 1). + +- **BE-1 — Wall-clock latency bounds are in poll-count, not seconds.** Recovery + latency (≈ `STALE` + poll cadence), the `MAX_WAIT` timeout, and the ~1.26s + read-retry ladder all *stretch* under CPU oversubscription or a slow FS while + still completing. The guarantee is "bounded by the configured knobs in + poll-count," not "exactly N seconds." Tests asserting a specific wall-clock or + poll-count number (Test 21's ≤20s, Test 22a's warning timing, Test 29's ≥2-CLAIM + count) assert an *envelope* bound, not a correctness bound, and may be relaxed or + gated to a defined load level (`GCL_ENVELOPE_TIER=relax`) without any product + change. *Basis:* `failure-modes.md` §K and §4 item 1. 
+ +- **BE-2 — Diagnostic warnings are best-effort.** The wrong-type config warning + and the claim-path warning rely on poll headroom that an oversubscribed runner + can starve; the guarantee is that the *condition is handled safely*, not that a + specific warning fires within a specific time. *Basis:* §K(2), §D3. + +- **BE-3 — Recovery presumes a readable clock; an unreadable mtime fails safe.** + If the lock's mtime cannot be read at all, both ports retry three times, then + warn loudly once per process and treat the lock as **not** stale (the mtime floor + fails closed to "fresh"): no premature steal, no corruption — but recovery of a + genuinely crashed holder is *disabled* and waiters block to `MAX_WAIT` (97). + Safety is preserved; recovery is lost and announced. *Witness:* unit Test 42 + (shadows the mtime probe to return empty on a present stale ghost; the + "Staleness detection is BROKEN" warn-once fires, the ghost is left in place, + the waiter blocks to 97). *Basis:* §E3. + +- **BE-4 — Logging is best-effort and never blocks the lock.** Every log write + ends `|| true`; a failed or unwritable log write is swallowed and the lock works + unaffected (the log self-truncates past ~1 MB). *Witness:* unit Test 49 (points + `AGENT_LOCK_LOG` under a regular file so every append fails ENOTDIR; the lock + still acquires + releases cleanly with the log write swallowed — also covers + J1). *Basis:* §F2/§J1. + +- **BE-5 — The PowerShell 5.1 steal is claim-guarded, not atomic.** Windows + PowerShell 5.1 lacks the 3-arg `File.Move` overload, so its steal is + unlink-then-move with a transient absent window. Under the claim this is a + *fairness loss* (a rival's create can win the recovered path; the claimant backs + off cleanly), **never a clobber**. *Basis:* §D1, `git-commit-lock.md:471-476`. + +--- + +## 5. Out of scope (not guaranteed) + +The tool deliberately does not promise the following. 
Where it can, it still fails +*safely and detectably*; the point of listing them is that the strict guarantees +of §2 are **not** claimed here. + +- **OOS-1 — Network / shared / sync-backed filesystems.** NFS, SMB/CIFS, 9p, + Dropbox/OneDrive. These weaken the atomic create/rename the protocol rests on, so + exclusion may silently not hold. Documented boundary only — surfaced in the + README; **no** FS-type probe is built (decision: `failure-modes.md` §4 item 3). + *Basis:* §E1. + +- **OOS-2 — Multi-host use / clock skew across hosts.** Rides on OOS-1 (only arises + on a shared FS). A *local* clock jump on the single host is **in scope and + correctness-safe** (degrades to the detected-98 lane). *Basis:* §E2. + +- **OOS-3 — Mixed-version trees.** If contenders run different protocol versions, + the no-displacement prevention (G-S3) degrades to detection (98), and old-style + stealers can leave `.dead.*` litter. Never silent, but the prevention property is + not guaranteed. Deployment rule: **upgrade both implementations together** + (`git-commit-lock.md:251-256`; also surfaced in `README.md:101-106`). + *Basis:* §I2. + +- **OOS-4 — PowerShell port on POSIX.** Supported on Windows only; on POSIX it runs + solely as a cross-implementation protocol check. Its one residual there + (FIFO/device/socket stat as empty and take the empty-orphan lane, capping damage + at the one misconfigured inode) is accepted and documented. *Basis:* §D3. + +- **OOS-5 — A non-unwinding exit returning 0 while displaced (the no-silent-loss + boundary).** G-S1's detection requires the *lock-holding shell* to reach release + (E3). If a *displaced* holder is terminated or replaced **without unwinding** — + external SIGKILL, an `exec` that replaces the **lock-holding shell itself**, or + PowerShell `[Environment]::Exit()` — *and* the resulting process exits **0**, the + caller can see success with no 98. 
The `exec` case is **narrower than it looks** + (verified empirically): `lock_run` runs the wrapped command vector in the wrapper + shell (`git-commit-lock.sh:1733`), so the bypass needs the exec to run in *that* + shell — a **sourced** caller doing `lock_acquire; exec …` in its own shell, or + the contrived `run -- exec …` where the wrapped command *is* an exec. An exec + **nested in a child** — the normal `run -- bash -c 'exec …'` — does **not** + bypass: the child is replaced, the wrapper waits and releases normally. A **plain + `exit` is safe** (it unwinds). What keeps the whole class narrow: an external + SIGKILL yields a non-zero wait status (POSIX `128+9`), so a caller checking exit + codes does not see success; the hole needs a process that *deliberately* replaces + or hard-exits the lock-holding shell **and** returns 0 **while displaced**. The + *next* holder still recovers via staleness; only the abruptly-exiting one is + unwarned. No code change closes this without the handle-based ops the design + rejected. *Witness:* the §H4 non-unwinding-exit boundary is pinned by interop + Test 5 (`I:308-334`, ps1 `[Environment]::Exit()`) and unit Test 40 (bash `exec` + in the lock-holding shell, OOS-5). *Basis:* §H4. + +- **OOS-6 — Adversarial / hostile local processes.** The lock is advisory. Against + a process actively trying to break it (deleting/overwriting the lock file, or a + hostile repo redirecting the git dir), the tool *detects* interference where it + can but does not prevent it; damage from a redirected git dir is bounded to the + tool's own named files with non-recursive deletion. *Basis:* + `git-commit-lock.md:520-551`. + +- **OOS-7 — Non-issues, explicitly.** A case-insensitive FS path collision (the + lock and claim paths never collide under case folding; two case-differing + configured paths resolving to one file is *correct* shared-lock behavior) and + memory exhaustion (the scripts allocate trivially). No action. *Basis:* §D5/§F5. 
+ +### Things deliberately NOT built (and why) + +The design considered and rejected each of these; they are not roadmap items +(`failure-modes.md` §4 "Things explicitly NOT to do"): + +- A **background heartbeat** to refresh the lease — would make the tool more than a + single synchronous script; the fail-open-but-detectable lease is the deliberate + alternative. +- A **two-rename compare-and-swap** to prevent the B3 residual — reintroduces crash + litter and a sweep, for a failure that is already bounded and false-success-free. +- **`File.Replace`** in the ps1 port — throws on a read-only destination and has + partial-failure states (pinned out by interop Test 16d). +- **Supporting network/shared filesystems** — correctness rests on local-FS atomic + create/rename; this is a boundary to document, not to engineer around. + +--- + +## 6. Staying inside the envelope (operating rules) + +- **Hold the lock only to commit.** Decide what to stage, build any patch, and + resolve failures *outside* the lock; a normal stage+commit holds it for seconds + (the golden rule, `git-commit-lock.md:433-458`). This keeps holds inside the + staleness window (E4) so G-S3 applies. +- **For a deliberately slow hold, raise `AGENT_LOCK_STALE_SECS`** for that + invocation rather than risking a fail-open steal. +- **Keep the lock on a local filesystem** (the default `/commit.lock` + almost always is) so E2 holds. +- **Upgrade both implementations together** (E5) so G-S3's prevention holds. +- **Never `git stash` in a shared checkout** — it rewrites the working tree and + clobbers other agents' edits (orthogonal to the lock, but part of operating in a + shared tree). + +--- + +## 7. Verification map + +Each guarantee → its witnessing test(s) and the failure-modes section. `U` = +`tests/git-commit-lock.test.sh`, `I` = `tests/git-commit-lock.interop.test.sh`, +`C` = `tests/git-commit-lock.canary.test.sh` (the concurrency canary), `integ` = +`tests/git-commit-lock.integration.test.sh`. 
The former resource-exhaustion and +diagnostic-clock coverage gaps are now closed by the fault-injection tests +(`failure-modes.md` §4 item 5): F4 (Test 48), F2/J1 (Test 49), F1 (Test 50), and the +unreadable-mtime fail-safe (Test 42). The one remaining document-only lane is F3 +(FD/inode exhaustion), which has no portable deterministic injection. + +| Guarantee | Witness | failure-modes § | +|---|---|---| +| G-S1 no silent lost update | U Test 4b + Test 16 (unverifiable lane); I Test 8 (both dirs) | §1, §B5 | +| G-S2 no corruption / no false hold | U Tests 32/32b (read-back failure); **resource lanes: Test 48 (F4), Test 50 (F1); F3 document-only** | §1, §A1, §F | +| G-S3 strict exclusion in window + no displacement | C Test 1 (8×25 canary); U Tests 2b/20; I Tests 1/6/16/16b; integ | §A1/§A2/§A3 | +| G-S4 never destroys non-lock-shaped | U Tests 17/17d/18/22 (dir/symlink/FIFO/content) + Test 44 (socket/device) | §D3/§D4/§G1 | +| G-S5 truthful exit codes | U Tests 7/8/4b/5/16; I run-verdict tests | §1, §H4 | +| G-R1 lock-shaped orphans reclaimed | U Tests 2/3/21 | §B1/§C1/§C2/§C3 | +| G-R2 one stuck agent can't wedge | stale-steal + crashed-claimant lanes | §1 | +| G-R3 no busy-spin; bounded wait | I Test 14b | §K(4) | +| G-R4 no unowned lock left behind | U Tests 31/35/36 | §C4 | +| G-I1 bash⇄pwsh same lock | I suite throughout | §I1 | +| BE-3 unreadable mtime fails safe | U Test 42 | §E3 | +| BE-4 logging best-effort | U Test 49 (F2/J1) | §F2/§J1 | + +The resource-exhaustion and diagnostic-clock lanes (Tests 42/48/49/50) are the +fault-injection coverage added per `failure-modes.md` §4 item 5; F3 (FD/inode +exhaustion) stays document-only for want of a portable deterministic injection. 
diff --git a/docs/load-testing-strategy.md b/docs/load-testing-strategy.md new file mode 100644 index 0000000..22ea393 --- /dev/null +++ b/docs/load-testing-strategy.md @@ -0,0 +1,236 @@ +# git-commit-lock: CI & load-testing strategy + +This is the rationale for *why the CI is shaped the way it is* — the principles +behind the three workflows (`tests.yml`, `nightly.yml`, `deep-sweep.yml`), the load +wrapper (`tests/with-load.sh`), and the two test-level levers (the Axis-A sweep and +the envelope tier). It describes the system as it stands; for the correctness +guarantees the suites assert against, see `docs/guarantees.md` and +`docs/failure-modes.md`. + +--- + +## 1. The principle: correctness is load-independent + +This is not a throughput-bound system whose correctness degrades under load. Safety +and exclusion rest on structural primitives — `O_EXCL` create, atomic `rename(2)`, +per-attempt token discovery — that never consult the clock for a *correctness* +decision (`guarantees.md` §2A, BE-1; `failure-modes.md` §K). No amount of CPU or IO +pressure makes a rename non-atomic or lets two `O_EXCL` creates both win on a local +filesystem. + +So load does not *change what is correct* — it only *surfaces races*. Its sole job +is to widen the timing windows in the protocol's multi-syscall sequences (which are +not individually atomic) so that the inter-process interleavings the code claims to +handle are actually exercised. The right question to ask of a load regime is "does +this raise the probability that process A is suspended between syscall N and N+1 +while process B advances?" — not "does it consume the box?". Past roughly 2× CPU +oversubscription, more load finds no new correctness bugs; it only stretches +wall-clock latency and starts tripping the suite's best-effort timing assertions. + +Two consequences shape the whole design: + +- **The per-PR gate runs no load** (strict, fast). 
A red required check is then + always actionable — a real correctness bug or genuine infra drift, never a + stress-manufactured wall-clock flake. +- **Load lives in non-blocking tiers** (nightly, deep-sweep), where the + load-sensitive timing assertions are relaxed to warnings so an oversubscribed + runner cannot turn a latency stretch into a red. + +## 2. Deterministic steering is the primary race-coverage lever + +The protocol's genuinely dangerous windows — create → read-back verify; the claim +recheck → touch → re-verify → rename residual; rename-over → read-back on a steal; +the release boundary — are ones where a *wrong interleaving could actually corrupt +state*. External load can only reach those windows *probabilistically*: it raises +the background chance of hitting an interleaving nobody scripted. + +The suite reaches them *deterministically* instead, by in-process function +interposition. `clone_fn` (a test-suite helper — it is not among the shared helpers in `tests/_harness.sh`, so presumably it is defined beside the steered tests) clones a library internal (or +shadows a command like `mv`/`rm`/`touch` with a shell function) so a steering test +can land "the rival's rename" at an exact protocol position, then call the original +through the clone (the Test 23–36 steered scenarios in +`tests/git-commit-lock.test.sh`). This hits the exact protocol window every run, +attributably — which is why it, not external load, is the primary race-coverage +investment. + +External load is the secondary, broad-net lever. It earns its place mainly on the +one window it can genuinely move: the mtime-staleness / fail-open boundary, where +CPU/IO pressure stretches a contended holder past the STALE threshold and exercises +the detected-98 lane. A corollary for triage: because external load *cannot* break +correctness, a load run that produces a *correctness* failure is surfacing either a +real logic bug in a steering-reachable window (high value) or a test-harness setup +race (a harness fix, not a code fix). + +## 3.
The three tiers + +### Tier R — required, per-PR (`tests.yml`) + +The blocking gate. It runs every suite (unit, interop, integration, and the +full-width concurrency canary as its own parallel cell) at full fan-out +(`GCL_TEST_FULL=1`) with **no load** and the **strict** envelope tier (the default — +the workflow sets no `GCL_ENVELOPE_TIER`, so every timing assertion is hard). The +matrix is: + +| Cell | OS | Engines / leg | Buys | +|---|---|---|---| +| ubuntu-24.04 `all` + `canary` | Linux | bash + pwsh7 | Linux correctness + interop baseline | +| macos-15 `all` + `canary` | macOS | bash + pwsh7 | BSD `stat`/`mv` lanes | +| windows-2025 `unit` | Windows | bash (MINGW) | delete-pending ghosts, FILETIME floor | +| windows-2025 `interop-integration` | Windows | bash + pwsh7 + **PowerShell 5.1** | the 5.1 non-atomic-fallback path + real NTFS commit swarm | +| windows-2025 `canary` | Windows | bash (MINGW) | full-width concurrency under process-spawn overhead | + +The canary runs as a separate parallel cell on every arch because it is about half +the Windows unit wall-clock; suites must *not* run concurrently inside one runner +(they are timing-sensitive on 2-core runners). Triggers: `pull_request` and +`push: main` (both `paths-ignore` docs/`.plans`/license), a weekly `schedule` to +catch runner-image and tool drift, and `workflow_dispatch`. The concurrency group is +`${{ github.workflow }}-${{ github.ref }}` with `cancel-in-progress: true`, so rapid +pushes coalesce. A separate `lint` job gates shellcheck (pinned v0.11.0, `-S style`) +and PSScriptAnalyzer (warning severity). + +### Tier N — nightly, scheduled (`nightly.yml`) + +A non-blocking scheduled stress run (08:23 UTC daily, plus `workflow_dispatch`). +This project has **no branch protection** (single-dev, decision 2026-06-18), so +nightly never gates a PR; its job is to catch the load-sensitive flakes and coverage +regressions the no-load per-PR gate cannot. 
+ +Six `stress` cells run the suites wrapped in `tests/with-load.sh` at one +oversubscription level (`GCL_STRESS_RATIO=2`, R≈2), one `GCL_STRESS_KIND` each: +ubuntu×{cpu, disk, both}, macos×disk, windows interop-integration×disk, windows +unit×both. macOS gets a single cell (it is the scarce, slow pool); ubuntu absorbs +the extra kinds (cheapest). The whole workflow runs with two test-level levers +turned on (§4): `GCL_ENVELOPE_TIER=relax` (the three load-sensitive timing +assertions warn instead of failing; correctness assertions stay hard) and +`GCL_TEST_SWEEP=1` (the Axis-A waiter-count sweep). Each cell writes its own +`cell-conclusion.txt` (ground truth, captured under `always()`) and uploads its logs +plus the load-manifest on success too — the negatives are needed to read the +positives. + +A separate `kcov` job runs the unit + canary suites under kcov v43 (built from +source) on Linux, **no load, strict envelope, full fan-out**, and gates line +coverage of `git-commit-lock.sh` at a 0.80 floor (tracks ~0.83 achieved; ratchets up +as tests land). It explicitly overrides the workflow-level `relax` back to `strict` +so coverage is measured on a clean run. + +A `triage` job (`always()`) downloads every cell's artifact, classifies the results, and files — or appends to — at most +one open labelled issue per (date, class): `nightly-correctness` (a correctness assertion +failed — investigate), `nightly-envelope` (a relaxed timing miss — expected, +tracked), or `nightly-infra` (missing artifact / timeout / errored — not a product +failure). An all-green run files no issue. An empty-round guard prevents "0 FAIL across 0 logs" being misread as +green when an artifact set is entirely missing. + +### Tier D — on-demand deep sweep (`deep-sweep.yml`) + +`workflow_dispatch`-only; it never runs on push/PR and never gates anything. This is +the deep flake-hunting instrument — the "50-clean hunt". A dispatch picks a +`stress_kind`, an optional raw `stress_load` override, a `repeat` count, and an +`envelope_tier` (defaults to `relax`).
Each suite is run `repeat` times under load in a +fail-fast loop that names the failing iteration. The concurrency group is per-run +(`deep-${{ github.run_id }}`) so many parallel dispatches fan out freely and accept +queue waves rather than cancelling each other. Timeouts are deliberately generous +(deep + loaded + repeated is far slower than the gate). + +## 4. The two test-level levers + +These let the existing tests yield more under load without touching the per-PR +gate's behaviour. + +**The Axis-A waiter-count sweep** (`GCL_TEST_SWEEP`, `T_AXIS_A` in +`tests/_harness.sh`). By default `T_AXIS_A="4"`, so per-PR and plain dev runs are +byte-identical to the historical behaviour. Under `GCL_TEST_SWEEP=1` (nightly and +deep only) it becomes `"4 12 24"`, and the fan-out/contention tests iterate over it — +unit Test 2b, unit Test 20 (which composes its own list from its mode-driven floor +plus the sweep's higher counts), and interop Test 16 — each naming N in every +assertion message so a sweep failure says which N broke. This widens the +thundering-herd / claim-serialization and displacement windows that re-running N=4 +never will. Correctness assertions are kept config-independent (e.g. hold ≫ STALE so +"zero-98 / one-steal" stays a pure correctness statement) and MAX_WAIT scales with N, +so a large-N run doesn't time out and *look* like a product failure. + +**The envelope tier** (`GCL_ENVELOPE_TIER`, default `strict`, in +`tests/git-commit-lock.test.sh`). A wall-clock or poll-count bound is a best-effort +liveness property (`guarantees.md` BE-1), not a correctness one. The `ok_envelope` / +`bad_envelope` assertion helpers behave exactly like the hard `ok`/`bad` under +`strict`; under `relax` a `bad_envelope` becomes a `WARN` that does not increment +FAIL. Three assertions are tiered this way — recovery latency ≤20s (Test 21), the +claim-path config warning firing (Test 22a), and the failed-steal's claim being +re-created rather than left to age out (Test 29). 
Nightly and deep set `relax`; +per-PR and the kcov job never do. So an oversubscribed runner can stretch wall-clock +to a warning without reddening correctness, while correctness assertions stay hard in +both tiers. + +## 5. How load is calibrated (`tests/with-load.sh`) + +The wrapper runs a command under a calibrated, reproducible background load, then +tears it down by *exact spawned PIDs* (never by name — safe on a shared box and on an +ephemeral runner) and propagates the wrapped command's exit code. + +- **Load is an oversubscription ratio**, not an absolute hog count: + `GCL_STRESS_RATIO` (R, default 1) gives stressors-per-kind = `round(R × nproc)`, + floored at 1 for a selected kind. "R=2" means the same pressure on a 2-core and a + 32-core runner, where a raw hog count would not. +- **The total ratio is capped** by `GCL_STRESS_RATIO_MAX` (default 2). `both` runs + cpu + disk, so its total would be 2R; the cap scales each kind down proportionally + so the runner is never wedged. The deep-sweep flake hunt can raise it deliberately. +- **`GCL_STRESS_KIND`** selects `none` (clean pass-through, zero added load), + `cpu`, `disk`, or `both`. **`GCL_STRESS_LOAD`** is a back-compat raw per-kind + count override (kept so the deep-sweep `stress_load` input keeps working); empty + ⇒ use the ratio. +- **CPU stressor:** `stress-ng --cpu` when available (calibrated, measurable), else a + portable bash spin loop. **Disk stressor:** a tight create / write+fsync / delete + loop over a small file on the test scratch volume — real metadata + write-back + pressure that contends with the lock-file create/delete the suite itself does + (always the portable shell hog; cross-platform, low-fidelity but real). 
+- **A per-run `load-manifest` JSON** is written next to the suite logs (on success + too): `{kind, R, ratio_max, raw-load override, nproc, cpu/disk/total stressor + counts, capped?, cpu mechanism, cgroup probe, baseline/loaded ms, achieved + slowdown, tool versions, os/arch, git sha, command}`, so any flake is reproducible. + A cheap fixed bash micro-benchmark, timed unloaded then mid-load, records a coarse + achieved-slowdown figure (only when load is actually applied). + +### Platform asymmetry (current operating facts) + +The platforms diverge too much for a uniform calibrated injection layer, so the +wrapper is honest about which regime ran: + +- Deterministic steering is portable (bash everywhere; pwsh equivalent) — the real + race-coverage tool, on every leg. +- Calibrated CPU throttling via a cgroup v2 quota is **Linux-only and probe-gated**: + `GCL_STRESS_CGROUP=1` makes the wrapper *probe* for a writable cgroup v2 cpu + controller and record the result in the manifest (`writable` / + `present-not-delegated` / `no-cpu-controller` / `no-cgroup-v2`); it does not create + scopes here (that needs a usable systemd manager). IO cgroup throttling is + experimental and intentionally not attempted. +- Everywhere else (macOS, Windows) load is blunt CPU/disk oversubscription — + uncalibrated but real pressure. + +## 6. GitHub Actions operating facts + +- **Minutes are free on public repos; concurrency is the real ceiling.** Free-plan + accounts cap concurrent jobs (~20 total, with a smaller macOS sub-limit). A matrix + past that *queues* into waves, it doesn't fail. The required gate stays small + enough to run in one wave; the deep sweep intentionally exceeds it and accepts + waves. macOS is the slowest and scarcest pool, so it is kept sparse across all + tiers; ubuntu (cheapest) is used liberally. +- **`fail-fast: false`** on every matrix — an OS-specific failure is exactly the + signal we want, so the other legs finish. 
+- **`paths-ignore` and required checks:** `tests.yml` filters docs/`.plans`/license + paths. A workflow whose jobs are *required* checks would leave those checks + Pending (blocking merge) when skipped by a path filter — but this project has no + branch protection, so the filter just saves runner minutes on doc-only pushes + without that hazard. +- **Artifacts** are uploaded with `include-hidden-files: true` (the integration + suite's key diagnostics — lock log, repo state — live under the scratch repo's + `.git/`) and named uniquely per cell so parallel uploads never collide. +- All actions are SHA-pinned. + +## 7. The discipline: required = always-meaningful-red + +The invariant that ties it together: **required is always-meaningful-red; nightly is +triaged-amber-tolerant; deep is noise-by-design.** Keeping artificial load off the +required gate is what makes a red gate trustworthy; putting all load in non-blocking +tiers with the envelope assertions relaxed is what stops load from manufacturing +flakes that erode that trust. The required tier is never retry-masked — a retry that +hid a 1-in-20 real race would defeat the silent-loss class this tool exists to +prevent. diff --git a/tests/_harness.sh b/tests/_harness.sh new file mode 100644 index 0000000..7529cca --- /dev/null +++ b/tests/_harness.sh @@ -0,0 +1,193 @@ +# shellcheck shell=bash +# tests/_harness.sh — shared test harness for the git-commit-lock suites. +# +# Sourced by all four suites (git-commit-lock.test.sh, .canary.test.sh, +# .interop.test.sh, .integration.test.sh) to share the bits they all +# copy-pasted: the PASS/FAIL/TAP counters, the GCL_TAP / GCL_TEST_ONLY reads, +# ok()/bad(), section(), the end-of-suite DONE sentinel (finish), and the +# per-test selector verdict helper. +# Pure deduplication — ZERO behaviour change vs the inline copies it replaces. 
+# +# Contract for sourcing suites: +# * Source this EARLY (before any use of the inits/helpers below), CWD- +# independently — resolve it from the sourcing script's own location: +# _HARNESS_DIR="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +# # shellcheck source=tests/_harness.sh +# . "$_HARNESS_DIR/_harness.sh" +# * Each suite still defines its OWN cleanup() (it closes over the suite's +# $WORK and the bodies genuinely differ); finish() below calls whatever +# cleanup() is in scope when the EXIT trap fires. +# * Each suite installs the trap itself: `trap finish EXIT`. +# * The suite reaching its end sets DONE=1 before its verdict line. +# +# The whole project runs its suites under `set -uo pipefail` (NOT set -e); these +# helpers are written for that (they assert on values, never on implicit exit +# propagation), and the disables below cover the idioms that pervade the suites. +# +# shellcheck disable=SC2015 # The pervasive `COND && ok ... || bad ...` +# idiom is deliberate throughout: ok/bad are echo+counter helpers that cannot +# fail, so the classic A && B || C pitfall (C running after B fails) is moot. +# shellcheck disable=SC2310,SC2312 # info-level, deliberate: helper functions +# and command substitutions run inside conditions all over a test suite; the +# suites run WITHOUT errexit (set -uo only) and assert on values, not on +# implicit exit propagation. + +PASS=0; FAIL=0; TAPN=0; DONE=0; SECTIONS_RUN=0 # ok()/bad() tallies; running TAP assertion number; end-of-suite sentinel read by finish(); count of blocks section() let run +GCL_TAP="${GCL_TAP:-0}" # CI sets GCL_TAP=1 for machine-readable TAP13 output +GCL_TEST_ONLY="${GCL_TEST_ONLY:-}" # if set, run ONLY test blocks whose label REGEX-matches (single-test selector) + +# Axis-A waiter-count sweep (see load-testing-strategy.md). GCL_TEST_SWEEP=1 (nightly/deep CI) widens +# the fan-out/contention tests over several waiter counts to wring more coverage +# from the existing tests; unset/0 (per-PR default + plain dev) keeps the floor so +# default runs are byte-identical to today.
T_AXIS_A is the shared waiter-count +# list the contention tests (unit Test 2b, interop Test 16) iterate N over; each +# names N in every assertion message so a sweep failure says which N broke. The +# floor is 4 — the count those two tests hardcode today, so the single-element +# default reproduces today's behaviour exactly. (Test 20's floor is mode-driven +# `$T20_N` (5 REDUCED / 10 FULL), not 4, so it composes its own list from $T20_N + +# the sweep's higher counts rather than from T_AXIS_A — see that test.) +GCL_TEST_SWEEP="${GCL_TEST_SWEEP:-0}" +# shellcheck disable=SC2034 # T_AXIS_A is consumed by the sourcing suites (unit +# Test 2b, interop Test 16), not within this harness file. +if [ "$GCL_TEST_SWEEP" = 1 ]; then T_AXIS_A="4 12 24"; else T_AXIS_A="4"; fi # space-separated waiter counts; any value other than exactly 1 keeps the floor + +# ok/bad are TAP-aware (gated by GCL_TAP so plain dev runs are byte-unchanged) and +# bump the running assertion number TAPN. The trailing `1..$TAPN` plan line (emitted +# by each suite after its RESULT line, just before its final exit-status test) lets a TAP consumer fail on a short count; +# together with the DONE sentinel below this closes the silent-undercount gap. +# `return 0` preserves the "ok/bad cannot fail" property the +# `COND && ok ... || bad ...` idiom relies on. +ok() { PASS=$((PASS+1)); TAPN=$((TAPN+1)); echo "PASS: $*" + [ "$GCL_TAP" = 1 ] && echo "ok $TAPN - $*"; return 0; } +bad() { FAIL=$((FAIL+1)); TAPN=$((TAPN+1)); echo "FAIL: $*" + [ "$GCL_TAP" = 1 ] && echo "not ok $TAPN - $*"; return 0; } + +# Per-test gate: echoes the block header (so a normal run is byte-unchanged) and +# returns success iff GCL_TEST_ONLY is unset/empty OR its regex matches the label. +# Each top-level `== Test N: TITLE ==` block is wrapped `if section "..."; then ... fi`. +# Bumps SECTIONS_RUN on a match so the verdict's zero-match guard (selector_report) +# can catch a selector regex that matched nothing.
+section() { + echo "== $1 ==" # the header is printed before the selector check, so skipped blocks still show it + if [ -z "${GCL_TEST_ONLY:-}" ] || [[ "$1" =~ $GCL_TEST_ONLY ]]; then # RHS left unquoted so it is matched as a regex + SECTIONS_RUN=$((SECTIONS_RUN + 1)); return 0 + fi + return 1 +} + +# Sentinel: the suite reaching its end sets DONE=1. If the EXIT trap fires with +# DONE!=1, the suite died early (a stray exit/crash) and the assertion count is +# unreliable — fail loudly even if the pre-trap code was 0. A bare trap `return` +# is IGNORED (the script keeps its pre-trap code), so the guard must `exit 1`. +# Calls the suite-local cleanup() (each suite defines its own, closing over its +# own $WORK); whatever cleanup is in scope when the trap fires is used. +finish() { + cleanup # always runs first, whether the suite finished or died early + if [ "${DONE:-0}" != 1 ]; then + echo "Bail out! suite terminated early before the plan line; ran ${TAPN:-0} assertion(s), count unreliable" >&2 + exit 1 + fi +} + +# Selector verdict helper, called by the section-using suites just before their +# verdict line. Two parts, both gated on GCL_TEST_ONLY being non-empty so a +# default run stays byte-identical: +# 1. Zero-match guard: a set-but-non-matching GCL_TEST_ONLY ran NO test block, +# so the (vacuously green) verdict would lie — a typo'd selector regex must +# FAIL, not pass with zero assertions. Bail loudly. (The finish EXIT trap +# also fires here since DONE is still 0; this exit is non-zero regardless.) +# 2. Report how many blocks the selector matched. +# Integration does NOT call this — it is one indivisible scenario that does not +# use section(), so it note-and-ignores GCL_TEST_ONLY at its top instead. +selector_report() { + if [ -n "${GCL_TEST_ONLY:-}" ] && [ "$SECTIONS_RUN" = 0 ]; then + echo "Bail out!
GCL_TEST_ONLY=\"$GCL_TEST_ONLY\" matched no test" >&2 + exit 1 + fi + [ -n "${GCL_TEST_ONLY:-}" ] && echo "selector GCL_TEST_ONLY=\"$GCL_TEST_ONLY\" ran $SECTIONS_RUN test block(s)" + return 0 +} + +# --- Shared timing/lock helpers (unit + interop; integration uses none) ------- +# Backdate a path's mtime by $2 seconds — how a test fakes a stale lock (the +# lock's staleness clock is the lock FILE's own mtime, stamped by the creating +# write). Portable: BSD/macOS touch has no `-d @epoch`, so convert the target +# epoch to a `touch -t` stamp via GNU `date -d @EPOCH` with BSD `date -r EPOCH` as +# fallback. +epoch_to_stamp() { # $1=epoch-secs -> prints a CCYYMMDDhhmm.SS stamp; non-zero if neither date dialect accepts it + date -d "@$1" +%Y%m%d%H%M.%S 2>/dev/null || date -r "$1" +%Y%m%d%H%M.%S 2>/dev/null +} +backdate() { touch -t "$(epoch_to_stamp "$(( $(date +%s) - $2 ))")" "$1"; } # $1=path $2=age-secs + +# Token-guarded backdate for the contended-recovery rounds (unit T2b / +# interop T16/T16b). Why: under load a fast waiter can complete its ENTIRE steal +# (claim -> rename-over -> ACQUIRED) before the harness's `touch` executes, so a +# blind backdate lands on the WINNER'S freshly installed lock, making it +# instantly stale for every rival — a legitimate re-steal then fails the round's +# "zero 98s / exactly one STOLE-BY-CLAIM" assertions although the protocol +# behaved exactly as designed (observed 2026-06-12 on a loaded box). Verdicts: +# * pre-read not the ghost: a waiter stole the ghost BEFORE the touch (it +# aged stale naturally during a stalled sync); no touch is performed and +# the round premise is gone — invalid, the caller retries the round. +# * post-read the ghost: conclusive — nothing ever rewrites the ghost +# token at the path, so the touch verifiably hit the ghost; any steal +# after the post-read steals an ALREADY-ancient ghost, exactly the +# scenario the round wants. Valid.
+# * post-read anything else: a steal raced the touch->re-read window — +# COMMON under load (waiters poll every 0.05s; the post-read costs +# subprocess spawns), so it must not blindly invalidate. The lock's +# MTIME arbitrates which file the touch hit: a winner's installed lock +# is FRESH (the rename carries the claim file's just-created mtime), so +# fresh => the touch hit the GHOST and a legitimate steal followed — +# valid; ancient => the touch landed on the WINNER'S live lock and +# corrupted the round — invalid, retry. Vanished => cannot arbitrate — +# invalid, retry. +backdate_ghost() { # $1=lock $2=ghost token $3=age-secs -> 0 iff the round premise is intact + local pre post now mt + pre="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')" # first line is the token; CR stripped before comparing + [ "$pre" = "$2" ] || return 1 # pre-read is not the ghost: no touch is performed + backdate "$1" "$3" 2>/dev/null || return 1 # the touch itself failed + post="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')" + [ "$post" = "$2" ] && return 0 # ghost token still in place: the touch verifiably hit the ghost + [ -e "$1" ] || return 1 # vanished: cannot arbitrate + now="$(date +%s)" + mt="$(stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null)" || return 1 # GNU stat form first, BSD form as fallback + [ $(( now - mt )) -lt $(( $3 / 2 )) ] # younger than half the backdate age counts as fresh (valid); older means the touch hit the winner's lock +} + +# Wait for every waiter's WAITING line while keeping the ghost lock FRESH +# (touch -c to now, no-create so a released path is never resurrected): a +# fresh ghost cannot be judged stale, so no waiter can steal it before the +# guarded backdate — without this, a sync stalled past STALE (slow worker +# cold starts on a loaded box) lets the ghost age stale naturally and a +# waiter steals it mid-sync. Freshening is race-safe: if a steal slipped in +# anyway, touching the winner's (already fresh) live lock to "now" is a +# harmless no-op, and backdate_ghost's pre-read catches the broken premise.
+sync_waiting_fresh() { # $1=lock $2=timeout-secs $3..=waiter logs -> 0 iff all logged WAITING + local lock="$1" deadline f ok=1 # `ok` here is a local flag variable; it does not shadow the ok() helper function + deadline=$(( $(date +%s) + $2 )); shift 2 + for f in "$@"; do + until grep -q "WAITING for lock" "$f" 2>/dev/null; do + touch -c "$lock" 2>/dev/null + if [ "$(date +%s)" -ge "$deadline" ]; then ok=0; break; fi # break leaves only the inner until-loop; each remaining log is still checked once + sleep 0.2 + done + done + [ "$ok" = 1 ] +} + +# Fabricate a lock file the way a real (foreign) holder would have written it: +# token line + owner line. The token MUST be "tok."-prefixed (wire format) or +# the steal's content guard will — correctly — refuse to steal it. +fabricate_lock() { # $1=path $2=token $3=owner + printf '%s\n%s\n' "$2" "$3" > "$1" +} + +# Wait (up to $3 seconds, default 15) for a pattern to appear in a file. Used to +# gate on the WAITING log line: proof the waiter actually contended, without a +# fixed-length hold. +wait_for_grep() { # $1=grep pattern $2=file $3=timeout-secs (default 15) -> 0 iff the pattern is present at the end + local pat="$1" f="$2" tries=$(( ${3:-15} * 20 )) # 20 polls of 0.05s per requested second + while ! grep -q "$pat" "$f" 2>/dev/null && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done + grep -q "$pat" "$f" 2>/dev/null +} diff --git a/tests/git-commit-lock.canary.test.sh b/tests/git-commit-lock.canary.test.sh new file mode 100755 index 0000000..0f30461 --- /dev/null +++ b/tests/git-commit-lock.canary.test.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# git-commit-lock.canary.test.sh — the concurrency CANARY, extracted from the +# unit suite (git-commit-lock.test.sh) into its own file so it runs as a +# naturally-parallel CI job. +# +# Runs entirely against throwaway temp dirs, so it never touches the repo you +# launch it from. Exit 0 == pass. +# bash tests/git-commit-lock.canary.test.sh +# +# This is a STATISTICAL concurrency canary — N workers race the lock over +# repeated rounds; repetition at width is its coverage. It is cheap on +# Linux/macOS (fast process spawn) but pathological on Windows (~half the +# Windows unit wall-clock), which is exactly why it lives in its own cell.
+# +# Fan-out: defaults to REDUCED width so routine dev runs don't lag a live shared +# machine; set GCL_TEST_FULL=1 (CI does) for the full-strength 8x25 canary. The +# file prints which mode ran — a reduced pass must never masquerade as the full one. +# +# On failure the work dir is PRESERVED (path printed) for post-mortem; set +# GCL_TEST_PRESERVE_DIR=DIR to additionally copy all logs/outputs into DIR +# regardless of outcome (used by CI). +# +# shellcheck disable=SC2015 # The pervasive `COND && ok ... || bad ...` +# idiom is deliberate throughout: ok/bad are echo+counter helpers that cannot +# fail, so the classic A && B || C pitfall (C running after B fails) is moot. +# shellcheck disable=SC2310,SC2312 # info-level, deliberate: helper functions +# and command substitutions run inside conditions all over a test suite; the +# suite runs WITHOUT errexit (set -uo only) and asserts on values, not on +# implicit exit propagation. +# shellcheck disable=SC2016 # $INCR is single-quoted on purpose: it expands +# inside the worker's `bash -c`, not here. +set -uo pipefail + +# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad, +# section, the finish EXIT-trap sentinel (calls our cleanup below). Resolved from +# THIS script's own dir so it sources regardless of CWD; sourced EARLY (before any +# use of the inits/helpers below). +_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=tests/_harness.sh +. "$_HARNESS_DIR/_harness.sh" + +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$DIR/.."
&& pwd)" # the implementations live at the repo root +LIB="$ROOT/git-commit-lock.sh" + +if [ "${GCL_TEST_FULL:-0}" = 1 ]; then + GCL_MODE="FULL"; T1_ROUNDS=8; T1_N=25 +else + GCL_MODE="REDUCED"; T1_ROUNDS=3; T1_N=8 +fi +echo "fan-out mode: $GCL_MODE (T1 ${T1_ROUNDS} rounds x ${T1_N} workers)" +[ "$GCL_MODE" = REDUCED ] && echo " (set GCL_TEST_FULL=1 for the full-strength 8x25 canary — CI runs it)" + +WORK="$(mktemp -d 2>/dev/null || echo "${TMPDIR:-/tmp}/git-commit-lock-test.$$")" # PID-suffixed fallback name when mktemp -d fails; created by the mkdir below +mkdir -p "$WORK" +cleanup() { + if [ -n "${GCL_TEST_PRESERVE_DIR:-}" ]; then + mkdir -p "$GCL_TEST_PRESERVE_DIR" 2>/dev/null || true + cp -R "$WORK"/. "$GCL_TEST_PRESERVE_DIR"/ 2>/dev/null || true # best-effort: a failed copy is ignored and the note below still prints + echo "note: copied test artifacts to $GCL_TEST_PRESERVE_DIR" + fi + if [ "${FAIL:-0}" -gt 0 ]; then + echo "note: failures detected — work dir preserved for post-mortem: $WORK" + else + rm -rf "$WORK" 2>/dev/null || true + fi +} +# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup() +# above and fails loudly if the suite died before setting DONE=1. +trap finish EXIT + +# The RESULT line below expands $ENV_WARN, which in the unit suite is maintained +# by the envelope-tier assertions (ok_envelope/bad_envelope). The canary uses +# only plain ok/bad (no envelope assertions), so define it to 0 here so the +# standard RESULT line works unchanged under set -u. +ENV_WARN=0 + +# Critical section that loses updates without a mutex: read, gap, write+1. +INCR='n="$(cat "$1")"; sleep 0.03; echo $((n+1)) > "$1"' + +if section "Test 1: concurrent workers, mutual exclusion (repeated rounds, $GCL_MODE width)"; then +# A single pass is too weak to trust a rare exclusion race (the release-steal +# bug found 2026-05-30 lost ~1 update per 25 only intermittently). Repeat +# several rounds; ANY lost update across ALL rounds fails the test.
+# MAX_WAIT caps a regression at 180s per worker instead of the 420s default; +# STALE stays comfortably above any realistic hold so nothing is ever stolen. +N=$T1_N; ROUNDS=$T1_ROUNDS; t1_fail=0; T1ERR="$WORK/excl.err"; : > "$T1ERR" +for r in $(seq 1 "$ROUNDS"); do + COUNTER="$WORK/counter.$r"; echo 0 > "$COUNTER" + LOCK="$WORK/excl.$r.lock"; LOG="$WORK/excl.$r.log"; : > "$LOG"; pids=() + for _ in $(seq 1 "$N"); do + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=120 \ + AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=180 \ + bash "$LIB" run -- bash -c "$INCR" _ "$COUNTER" 2>> "$T1ERR" & + pids+=($!) # remembered for the per-round barrier below + done + for p in "${pids[@]}"; do wait "$p"; done # barrier: every worker has exited before the round is judged + c="$(cat "$COUNTER")"; a="$(grep -c ACQUIRED "$LOG")"; rl="$(grep -c RELEASED "$LOG")" + if [ "$c" != "$N" ] || [ "$a" != "$N" ] || [ "$rl" != "$N" ] || [ -e "$LOCK" ]; then # lost update, unbalanced acquire/release log, or a leftover lock file + t1_fail=1; echo " round $r: counter=$c acquired=$a released=$rl leftover=$([ -e "$LOCK" ] && echo yes || echo no)" + fi +done +[ "$t1_fail" = 0 ] && ok "$ROUNDS rounds x $N workers ($GCL_MODE): no lost updates, balanced acquire/release, no leftover lock" \ + || bad "mutual-exclusion failure in at least one round (see above)" +# Regression: under contention the lock file routinely vanishes mid-mtime-probe; +# that must NOT be misdiagnosed as "staleness detection broken" (false WARNING +# observed 2026-06-10 before the probe got its retry loop). +grep -q "Staleness detection is BROKEN" "$T1ERR" \ + && bad "spurious mtime-probe WARNING under contention (see $T1ERR)" \ + || ok "no spurious mtime-probe warnings under contention" +fi + +# Zero-match guard + selector-report line (shared helper in _harness.sh): a +# set-but-non-matching GCL_TEST_ONLY ran NO test block, which without the guard +# would fall through to a vacuous PASS=0 FAIL=0 "green". Near-pointless in a +# one-test file, but zero-cost and keeps the finish/zero-match scaffolding +# uniform with the other suites.
+selector_report + +DONE=1 # end reached: finish() will no longer treat the exit as an early death +echo +echo "==== RESULT: $PASS passed, $FAIL failed, $ENV_WARN envelope warning(s) (fan-out: $GCL_MODE) ====" +[ "$GCL_TAP" = 1 ] && echo "1..$TAPN" +[ "$FAIL" = 0 ] # last command: its status becomes the suite's exit status diff --git a/tests/git-commit-lock.integration.test.sh b/tests/git-commit-lock.integration.test.sh index a142bba..49badf8 100644 --- a/tests/git-commit-lock.integration.test.sh +++ b/tests/git-commit-lock.integration.test.sh @@ -36,6 +36,13 @@ # they expand inside a worker's `bash -c` invocation, not here. set -uo pipefail +# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad, +# section, the finish EXIT-trap sentinel (calls our cleanup below). Resolved from +# THIS script's own dir so it sources regardless of CWD. +_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=tests/_harness.sh +. "$_HARNESS_DIR/_harness.sh" + DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT="$(cd "$DIR/.." && pwd)" # the implementations live at the repo root LIB="$ROOT/git-commit-lock.sh" @@ -59,11 +66,9 @@ cleanup() { rm -rf "$WORK" 2>/dev/null || true fi } -trap cleanup EXIT - -PASS=0; FAIL=0 -ok() { echo "PASS: $*"; PASS=$((PASS+1)); } -bad() { echo "FAIL: $*"; FAIL=$((FAIL+1)); } +# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup() +# above and fails loudly if the suite died before setting DONE=1. +trap finish EXIT # --- sizing ------------------------------------------------------------------ # Commits serialise (that's the whole point), so wall time ≈ workers x commit @@ -95,6 +100,15 @@ echo "fan-out mode: $GCL_MODE (bash swarm ${BROUNDS}x${BN}, mixed swarm ${MSH}+$ # bounded max wait so a wedge fails the suite instead of hanging it.
LK_ENV=(AGENT_LOCK_STALE_SECS=300 AGENT_LOCK_POLL_SECS=0.2 AGENT_LOCK_MAX_WAIT=240) +# Note-and-ignore the per-test selector the unit/interop suites honour: this +# suite is ONE indivisible scenario (Tests 1-3 share a single repo + the ALL_IDS +# accumulator, and Test 3 audits Tests 1+2's output), so a per-block selector +# can't apply. If GCL_TEST_ONLY is set (read by _harness.sh), say so loudly on +# stderr and run the whole scenario as normal. +if [ -n "$GCL_TEST_ONLY" ]; then + echo "NOTE: integration suite ignores GCL_TEST_ONLY=\"$GCL_TEST_ONLY\" — Tests 1-3 are one indivisible scenario (shared repo + ALL_IDS audit); running the whole suite." >&2 +fi + # --- scratch repo ------------------------------------------------------------ REPO="$WORK/repo"; OUTD="$WORK/out"; NOHOOKS="$WORK/nohooks" mkdir -p "$REPO" "$OUTD" "$NOHOOKS" @@ -301,5 +315,7 @@ done || bad "$n_next leftover claim file(s) beside the lock" echo +DONE=1 echo "==== INTEGRATION RESULT: $PASS passed, $FAIL failed (fan-out: $GCL_MODE) ====" +[ "$GCL_TAP" = 1 ] && echo "1..$TAPN" [ "$FAIL" = 0 ] diff --git a/tests/git-commit-lock.interop.test.sh b/tests/git-commit-lock.interop.test.sh index 06fe746..bfb0e44 100644 --- a/tests/git-commit-lock.interop.test.sh +++ b/tests/git-commit-lock.interop.test.sh @@ -40,6 +40,16 @@ # they expand inside a worker's `bash -c` or pwsh invocation, not here. set -uo pipefail +# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad, +# section, the finish EXIT-trap sentinel (calls our cleanup below), and the +# shared timing/lock helpers (epoch_to_stamp, backdate, backdate_ghost, +# sync_waiting_fresh, fabricate_lock, wait_for_grep). Resolved from THIS +# script's own dir so it sources regardless of CWD; sourced EARLY (before any +# use of the inits/helpers below). +_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=tests/_harness.sh +. 
"$_HARNESS_DIR/_harness.sh" + DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT="$(cd "$DIR/.." && pwd)" # the implementations live at the repo root SH="$ROOT/git-commit-lock.sh" @@ -67,9 +77,11 @@ WORK="$(pwsh -NoProfile -Command '[IO.Path]::Combine([IO.Path]::GetTempPath(), " WORK="${WORK//\\//}" mkdir -p "$WORK" -PASS=0; FAIL=0 -ok() { echo "PASS: $*"; PASS=$((PASS+1)); } -bad() { echo "FAIL: $*"; FAIL=$((FAIL+1)); } +# The PASS/FAIL/TAP/SECTIONS_RUN inits, the GCL_TAP/GCL_TEST_ONLY reads, ok/bad, +# and section() all come from _harness.sh (sourced above). GCL_TEST_ONLY is the +# single-test selector: a that runs only the `== Test N: ==` +# blocks whose label matches (BASH =~); unset/empty runs every block; a typo'd +# regex that matches nothing bails out loudly at the verdict (selector_report). # Failure post-mortems need the logs: keep $WORK when anything failed, and # honour GCL_TEST_PRESERVE_DIR (the CI preserve-logs knob) by copying @@ -86,7 +98,9 @@ cleanup() { fi rm -rf "$WORK" 2>/dev/null || true } -trap cleanup EXIT +# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup() +# above and fails loudly if the suite died before setting DONE=1. +trap finish EXIT # Poll for a marker file: ready-markers replace fixed head-start sleeps so a # slow pwsh cold-start (1-3s+ under load) can't fake an ordering failure. @@ -95,88 +109,11 @@ wait_for() { # $1=file $2=max iterations of 50ms (default 200 = 10s) return 1 } -# Wait (up to $3 seconds, default 15) for a pattern to appear in a file — -# used to gate on the WAITING log line (proof a waiter actually contended) -# without a fixed-length hold. Same helper as the unit suite. -wait_for_grep() { - local pat="$1" f="$2" tries=$(( ${3:-15} * 20 )) - while ! 
grep -q "$pat" "$f" 2>/dev/null && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done - grep -q "$pat" "$f" 2>/dev/null -} - -# Backdate a path's mtime by $2 seconds — how a test fakes a stale lock (the -# staleness clock is the lock FILE's own mtime, stamped by the creating -# write). Portable: BSD/macOS touch has no `-d @epoch`, so convert the target -# epoch to a `touch -t` stamp via GNU `date -d @` with BSD `date -r` as -# fallback (same helper as the unit suite). -epoch_to_stamp() { - date -d "@$1" +%Y%m%d%H%M.%S 2>/dev/null || date -r "$1" +%Y%m%d%H%M.%S 2>/dev/null -} -backdate() { touch -t "$(epoch_to_stamp "$(( $(date +%s) - $2 ))")" "$1"; } - -# Token-guarded backdate for the contended-recovery tests (T16/T16b; same -# guard as the unit suite's T2b — full rationale there). Why: under load a -# fast waiter can complete its ENTIRE steal (claim -> rename-over -> -# ACQUIRED) before the harness's `touch` executes, so a blind backdate lands -# on the WINNER'S freshly installed lock, making it instantly stale for -# every rival — a legitimate re-steal then fails the test's "zero 98s / -# exactly one STOLE-BY-CLAIM" assertions although the protocol behaved -# exactly as designed (observed 2026-06-12 on a loaded box: a fast pwsh -# waiter judged the FRESH ghost at age==STALE, stole and ACQUIRED before the -# touch, which then aged its live lock to 10000s and a rival re-stole it). -# Verdicts: -# * pre-read not the ghost: stolen BEFORE the touch (no touch performed) — -# invalid, the caller retries the run. -# * post-read the ghost: conclusive — the touch hit the ghost. Valid. -# * post-read anything else: a steal raced the touch->re-read window — -# COMMON under load (waiters poll every 0.05s; the post-read costs -# subprocess spawns), so it must not blindly invalidate. 
The lock's -# MTIME arbitrates which file the touch hit: a winner's installed lock -# is FRESH (the rename carries the claim file's just-created mtime), so -# fresh => the touch hit the GHOST and a legitimate steal followed — -# valid; ancient => the touch landed on the WINNER'S live lock and -# corrupted the run — invalid, retry. Vanished => cannot arbitrate — -# invalid, retry. -backdate_ghost() { # $1=lock $2=ghost token $3=age-secs -> 0 iff the run premise is intact - local pre post now mt - pre="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')" - [ "$pre" = "$2" ] || return 1 - backdate "$1" "$3" 2>/dev/null || return 1 - post="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')" - [ "$post" = "$2" ] && return 0 - [ -e "$1" ] || return 1 - now="$(date +%s)" - mt="$(stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null)" || return 1 - [ $(( now - mt )) -lt $(( $3 / 2 )) ] -} - -# Wait for every waiter's WAITING line while keeping the ghost lock FRESH -# (touch -c to now, no-create so a released path is never resurrected): a -# fresh ghost cannot be judged stale, so no waiter can steal it before the -# guarded backdate — without this, a sync stalled past STALE (slow pwsh cold -# starts on a loaded box) lets the ghost age stale naturally and a waiter -# steals it mid-sync. Freshening is race-safe: if a steal slipped in anyway, -# touching the winner's (already fresh) live lock to "now" is a harmless -# no-op, and backdate_ghost's pre-read catches the broken premise. -sync_waiting_fresh() { # $1=lock $2=timeout-secs $3..=waiter logs -> 0 iff all logged WAITING - local lock="$1" deadline f ok=1 - deadline=$(( $(date +%s) + $2 )); shift 2 - for f in "$@"; do - until grep -q "WAITING for lock" "$f" 2>/dev/null; do - touch -c "$lock" 2>/dev/null - if [ "$(date +%s)" -ge "$deadline" ]; then ok=0; break; fi - sleep 0.2 - done - done - [ "$ok" = 1 ] -} - -# Fabricate a lock file the way a real (foreign) holder would have written it: -# token line + owner line. 
The token MUST be "tok."-prefixed (wire format) or -# the steal's content guard will — correctly — refuse to steal it. -fabricate_lock() { # $1=path $2=token $3=owner - printf '%s\n%s\n' "$2" "$3" > "$1" -} +# wait_for_grep, epoch_to_stamp, backdate, backdate_ghost, sync_waiting_fresh, +# and fabricate_lock now live in _harness.sh (sourced above) — shared +# byte-for-byte with the unit suite. (wait_for above is interop-only: its arg-2 +# is a count of 50ms iterations, distinct from the unit suite's wait_for_file +# whole-seconds semantics, so the two poll helpers stay separate.) # A pwsh process that holds the lock FILE open with FileShare.Read — the # no-delete-share handle class that blocks unlink AND rename alike (probe @@ -224,7 +161,7 @@ ps_worker() { # $1=lock $2=log $3=holder $4=violations $5=id pwsh -NoProfile -File "$PS1WIN" run "$body" } -echo "== Test 1: mixed pwsh+bash workers, mutual exclusion across implementations ($GCL_MODE width) ==" +if section "Test 1: mixed pwsh+bash workers, mutual exclusion across implementations ($GCL_MODE width)"; then NSH=$T1_NSH; NPS=$T1_NPS; TOT=$((NSH+NPS)) LOCK="$WORK/excl.lock" HOLDER="$WORK/holder"; : > "$HOLDER"; VIOL="$WORK/violations"; : > "$VIOL" @@ -259,8 +196,9 @@ else [ "$st" != 0 ] && { echo " STALE/STEAL log lines:"; grep -E "STALE|STOLE" "$WORK/excl-all.log" | sed 's/^/ /'; } bad "cross-impl exclusion/balance: violations=$nv steals=$st acquired=$a (floor $((TOT/2))) released=$rl leftover=$([ -e "$LOCK" ] && echo yes || echo no)" fi +fi -echo "== Test 2: a bash holder blocks a pwsh waiter (no concurrent hold, no wrongful steal) ==" +if section "Test 2: a bash holder blocks a pwsh waiter (no concurrent hold, no wrongful steal)"; then LOCK="$WORK/b2.lock"; LOG="$WORK/b2.log"; : > "$LOG"; ORDER="$WORK/b2.order"; : > "$ORDER" READY="$WORK/b2.ready"; rm -f "$READY" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \ @@ -276,8 +214,9 @@ wait 
"$holder" got="$(tr '\n' ',' < "$ORDER")" [ "$got" = "sh-start,sh-end,ps-ran," ] && ok "bash-holds / pwsh-waits ordering correct" || bad "ordering wrong: $got" grep -q STOLE "$LOG" && bad "pwsh wrongly STOLE a live bash lock" || ok "pwsh did not steal the live bash lock" +fi -echo "== Test 3: a pwsh holder blocks a bash waiter ==" +if section "Test 3: a pwsh holder blocks a bash waiter"; then LOCK="$WORK/b3.lock"; LOG="$WORK/b3.log"; : > "$LOG"; ORDER="$WORK/b3.order"; : > "$ORDER" READY="$WORK/b3.ready"; rm -f "$READY" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \ @@ -290,8 +229,9 @@ wait "$holder" got="$(tr '\n' ',' < "$ORDER")" [ "$got" = "ps-start,ps-end,sh-ran," ] && ok "pwsh-holds / bash-waits ordering correct" || bad "ordering wrong: $got" grep -q STOLE "$LOG" && bad "bash wrongly STOLE a live pwsh lock" || ok "bash did not steal the live pwsh lock" +fi -echo "== Test 4: pwsh steals a STALE lock fabricated as bash's (old file mtime) ==" +if section "Test 4: pwsh steals a STALE lock fabricated as bash's (old file mtime)"; then # AGENT_LOCK_MAX_WAIT caps the run so a steal regression fails in ~20s, not 420s. 
LOCK="$WORK/b4.lock"; LOG="$WORK/b4.log"; : > "$LOG"; MARK="$WORK/b4.mark"; printf '%s' before > "$MARK" fabricate_lock "$LOCK" "tok.sh.ghost.1" "pid=99999 host=ghost" @@ -304,22 +244,28 @@ grep -q STOLE "$LOG" && ok "log records the cross-impl steal" || bad "no STOLE e grep -q "holder=pid=99999 host=ghost" "$LOG" \ && ok "STALE log line carries the holder parsed from line 2 (cross-impl wire format)" \ || bad "holder from line 2 missing in pwsh's STALE log line" +fi -echo "== Test 5: bash steals a STALE lock GENUINELY created by pwsh (holder killed mid-hold) ==" -# The stale lock really is pwsh's: a pwsh process dot-sources the lock, acquires, -# signals ready, then is hard-killed by PID mid-hold (TerminateProcess — no -# release, no exit event), leaving its live lock FILE (token line 1) behind. +if section "Test 5: bash steals a STALE lock GENUINELY created by pwsh (holder killed mid-hold)"; then +# The stale lock really is pwsh's: a pwsh process dot-sources the lock, acquires (writing +# its tok.ps.* token to line 1 and flushing+closing the file), signals ready, then +# SELF-EXITS via [Environment]::Exit(0) — the port's documented hard-exit that bypasses +# BOTH Lock-Release AND the PowerShell.Exiting backstop — leaving its live token'd lock +# FILE behind with no release. This is DETERMINISTIC: the same on-disk state as a holder +# killed mid-hold, but without an external kill. (An MSYS `kill -9 "$!"` does NOT reliably +# terminate the native pwsh.exe under load — it survived, ran to completion, and its +# graceful-exit backstop DELETED the lock, leaving an empty file to steal; observed under +# CPU load, run 27621668323. See the Test 5 de-flake plan.) LOCK="$WORK/b5.lock"; LOG="$WORK/b5.log"; : > "$LOG"; MARK="$WORK/b5.mark"; printf '%s' before > "$MARK" READY="$WORK/b5.ready"; rm -f "$READY" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 \ - pwsh -NoProfile -Command ". 
'$PS1WIN'; Lock-Acquire | Out-Null; [IO.File]::WriteAllText('$READY','r'); Start-Sleep 60" & + pwsh -NoProfile -Command ". '$PS1WIN'; if (-not (Lock-Acquire)) { [Environment]::Exit(3) }; [IO.File]::WriteAllText('$READY','r'); [Environment]::Exit(0)" & hpid=$! if wait_for "$READY"; then - kill -9 "$hpid" 2>/dev/null; wait "$hpid" 2>/dev/null - sleep 0.3 + wait "$hpid" 2>/dev/null # holder self-exited via [Environment]::Exit (no release); reap it tok="$(head -n 1 "$LOCK" 2>/dev/null | tr -d '\r\n')" case "$tok" in - tok.ps.*) ok "dead pwsh holder left its own lock file behind (token $tok)" ;; + tok.ps.*) ok "self-exited pwsh holder left its own token'd lock behind (token $tok)" ;; *) bad "expected a tok.ps.* token on line 1 of the orphan lock, got '$tok'" ;; esac backdate "$LOCK" 9999 # age the orphan past any stale window @@ -332,8 +278,9 @@ else kill -9 "$hpid" 2>/dev/null; wait "$hpid" 2>/dev/null bad "T5 pwsh holder never acquired/signalled ready" fi +fi -echo "== Test 6: deterministic lost-update counter, mixed bash+pwsh increments ($GCL_MODE width) ==" +if section "Test 6: deterministic lost-update counter, mixed bash+pwsh increments ($GCL_MODE width)"; then # The deterministic complement to Test 1's exclusion probe (which has a blind # window and tolerates launch flakiness): every worker MUST launch (strict rc # checks) and the final counter MUST equal the total increments — any lost @@ -379,8 +326,9 @@ cat "$WORK"/cnt-*.log > "$WORK/cnt-all.log" 2>/dev/null || : > "$WORK/cnt-all.lo a="$(grep -c ACQUIRED "$WORK/cnt-all.log")"; rl="$(grep -c RELEASED "$WORK/cnt-all.log")" [ "$a" = "$CTOT" ] && [ "$rl" = "$CTOT" ] && ok "lock logs balanced ($a acquired / $rl released)" || bad "lock logs unbalanced: acquired=$a released=$rl want=$CTOT" [ -e "$LOCK" ] && bad "leftover counter lock" || ok "no leftover lock" +fi -echo "== Test 7: pwsh run propagates the command's exit code (two contending runs in parallel) ==" +if section "Test 7: pwsh run propagates the command's 
exit code (two contending runs in parallel)"; then LOCK="$WORK/rc.lock"; LOG="$WORK/rc.log"; : > "$LOG" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_MAX_WAIT=60 \ pwsh -NoProfile -File "$PS1WIN" run "exit 0" & p0=$! @@ -391,8 +339,9 @@ wait "$p7"; rc7=$? [ "$rc0" = 0 ] && ok "pwsh exit 0 propagated" || bad "pwsh exit 0 not propagated (rc=$rc0)" [ "$rc7" = 7 ] && ok "pwsh exit 7 propagated" || bad "pwsh exit code not propagated ($rc7)" [ -e "$LOCK" ] && bad "lock left held after pwsh run" || ok "lock released after pwsh run (success and failure)" +fi -echo "== Test 7b: ps1 run verdicts for PowerShell-NATIVE failure (a failing cmdlet must not exit 0) ==" +if section "Test 7b: ps1 run verdicts for PowerShell-NATIVE failure (a failing cmdlet must not exit 0)"; then # A cmdlet's non-terminating error never sets LASTEXITCODE, so a runner # consulting only LASTEXITCODE would return 0 for a failed command. The # runner must consult the staged script's FINAL '$?' when no nonzero native @@ -430,8 +379,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_MAX_WAIT=20 \ [ "$rc" = 0 ] && ok "mid-command cmdlet failure + succeeding final statement -> 0 (the documented final-statement limitation)" \ || bad "limitation pin: rc=$rc (want 0 — has the final-statement contract changed?)" [ -e "$LOCK" ] && bad "lock left held after the failing-cmdlet verdict runs" || ok "no leftover lock after the failing-cmdlet verdict runs" +fi -echo "== Test 7c: ps1 CLI help/usage convention — explicit help -> stdout + exit 0; usage errors -> stderr + 96 ==" +if section "Test 7c: ps1 CLI help/usage convention — explicit help -> stdout + exit 0; usage errors -> stderr + 96"; then # (bash's side of the same convention is pinned in the unit suite, Test 7.) for h in --help -h; do pwsh -NoProfile -File "$PS1WIN" "$h" > "$WORK/t7c.out" 2> "$WORK/t7c.err"; rc=$? 
@@ -451,8 +401,9 @@ pwsh -NoProfile -File "$PS1WIN" > "$WORK/t7c-noargs.out" 2> "$WORK/t7c-noargs.er || bad "ps1 no-args rc=$rc (want 96) stderr-usage=$(grep -c '^usage:' "$WORK/t7c-noargs.err")" pwsh -NoProfile -File "$PS1WIN" frobnicate >/dev/null 2>&1; rc=$? [ "$rc" = 96 ] && ok "ps1 unknown subcommand -> 96" || bad "ps1 unknown subcommand rc=$rc (want 96)" +fi -echo "== Test 8: a ROBBED holder exits 98 — pwsh victim/bash thief, then bash victim/pwsh thief ==" +if section "Test 8: a ROBBED holder exits 98 — pwsh victim/bash thief, then bash victim/pwsh thief"; then # Fail-open ceiling, cross-impl: the victim holds past its 1s stale window # UNTIL THE THIEF IS DONE (marker, not a fixed sleep — a fixed hold once let a # slow-starting thief arrive after the victim had already released), the other @@ -485,15 +436,17 @@ touch "$TDONE" wait "$vic"; vic_rc=$? [ "$vic_rc" = 98 ] && ok "robbed bash holder exited 98" || bad "robbed bash holder exited $vic_rc (want 98)" [ "$thief_rc" = 0 ] && ok "pwsh thief exited 0" || bad "pwsh thief exited $thief_rc" +fi -echo "== Test 9: a slow but UNCONTENDED pwsh holder keeps its lock (slowness != failure) ==" +if section "Test 9: a slow but UNCONTENDED pwsh holder keeps its lock (slowness != failure)"; then LOCK="$WORK/slow.lock"; LOG="$WORK/slow.log"; : > "$LOG" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \ pwsh -NoProfile -File "$PS1WIN" run "Start-Sleep 2"; rc=$? 
[ "$rc" = 0 ] && ok "uncontended slow pwsh holder exited 0" || bad "uncontended slow pwsh holder exited $rc" grep -q "WARNING" "$LOG" && bad "spurious theft WARNING with no contender" || ok "no spurious WARNING when uncontended" +fi -echo "== Test 10: default lock location is /commit.lock for BOTH impls (regression: item 1) ==" +if section "Test 10: default lock location is /commit.lock for BOTH impls (regression: item 1)"; then # The BLOCKER this guards against: the .ps1 silently fell back to a CWD lock at # default config, so the two impls never contended. Run BOTH impls from a # SUBDIRECTORY of a scratch repo with AGENT_LOCK_PATH/LOG unset; each command @@ -515,8 +468,9 @@ nps="$(grep -c "ACQUIRED.*tok=tok\.ps\." "$DLOG" 2>/dev/null)" && ok "shared log shows 1 bash + 1 pwsh acquisition" \ || bad "default-log evidence wrong: ACQUIRED=$na (want 2), pwsh tokens=$nps (want 1) in $DLOG" [ -e "$GITDIR2/commit.lock" ] && bad "leftover default lock" || ok "no leftover default lock" +fi -echo "== Test 11: release-time classification agrees across impls — truncated => unverifiable (1); deleted => theft (98) ==" +if section "Test 11: release-time classification agrees across impls — truncated => unverifiable (1); deleted => theft (98)"; then # (i) TRUNCATED at release: the file still exists but reads EMPTY after the # retry ladder. NOT provable theft (it is the probe-F create->write window of # a successor after a boundary steal, or external truncation), so BOTH impls @@ -545,8 +499,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_MAX_WAIT=20 \ pwsh -NoProfile -File "$PS1WIN" run "Remove-Item -LiteralPath '$LOCK' -Force" 2>/dev/null; rc_ps=$? 
[ "$rc_sh" = 98 ] && ok "bash: lock GONE at release -> exit 98 (theft)" || bad "bash gone-at-release rc=$rc_sh (want 98)" [ "$rc_ps" = 98 ] && ok "pwsh: lock GONE at release -> exit 98 (theft)" || bad "pwsh gone-at-release rc=$rc_ps (want 98)" +fi -echo "== Test 12: fractional STALE/MAX_WAIT rejected identically by both impls (note + default) ==" +if section "Test 12: fractional STALE/MAX_WAIT rejected identically by both impls (note + default)"; then # These two knobs are integers in both impls; a fractional value silently # rounded by one side but rejected by the other would give the two impls # DIFFERENT steal thresholds for the same env. Both must note + use defaults. @@ -601,10 +556,11 @@ n_ps="$(grep -c 'ignoring invalid' "$WORK/poll-ps.err")" [ "$rc_sh" = 0 ] && [ "$n_sh" = 0 ] && [ "$rc_ps" = 0 ] && [ "$n_ps" = 0 ] \ && ok "POLL_SECS='' (empty): silent default in BOTH impls (no note)" \ || bad "POLL_SECS='' parity: sh rc=$rc_sh notes=$n_sh; pwsh rc=$rc_ps notes=$n_ps (want rc 0 + 0 notes each)" +fi if [ "$GCL_WINDOWS" = 1 ]; then -echo "== Test 13: blocked release (no-delete-share handle) — deterministic LEFTOVER, run keeps the command's code, then recovery ==" +if section "Test 13: blocked release (no-delete-share handle) — deterministic LEFTOVER, run keeps the command's code, then recovery"; then # Probe D1 made this lane deterministically testable (TODO #30): a pwsh # FileShare.Read handle on the lock file blocks the release unlink (and any # steal rename) until it closes. 
(a) sourced bash: lock_release returns 1 and @@ -708,8 +664,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 AGENT_LOCK [ "$rc" = 0 ] && ok "leftover reclaimed once the handle closed + stale window elapsed (TODO #30 lane)" \ || bad "leftover recovery rc=$rc (want 0)" grep -q STOLE "$LOG" && ok "recovery steal logged" || bad "no STOLE entry during leftover recovery" +fi -echo "== Test 14: blocked steal — a no-delete-share handle on a STALE lock defers the steal until it closes ==" +if section "Test 14: blocked steal — a no-delete-share handle on a STALE lock defers the steal until it closes"; then # Same handle class against a stale lock: the stealer's rename keeps failing # while the handle is open (probe D1), so it re-polls — and acquires promptly # once the handle closes. Run with the ps1 stealer: this exercises its @@ -737,8 +694,9 @@ else touch "$BGO"; wait "$blk14" 2>/dev/null bad "T14 blocker never signalled its handle open" fi +fi -echo "== Test 14b: blocked steal NEVER bypasses MAX_WAIT — squatted stale lock => 97 with bounded logging (regression: busy-spin) ==" +if section "Test 14b: blocked steal NEVER bypasses MAX_WAIT — squatted stale lock => 97 with bounded logging (regression: busy-spin)"; then # Discriminator: when the steal rename keeps # failing with the lock file still present (a no-delete-share handle squatting # it), a failed-steal lane that `continue`s past the timeout check AND the @@ -810,13 +768,14 @@ else bad "T14b squatter never signalled its handle open" fi rm -f "$LOCK" +fi else echo "== Tests 13/14/14b SKIPPED (POSIX): open handles never block unlink/rename here ==" echo "note: the LEFTOVER and blocked-steal lanes are Windows-only by construction (.NET's Unix FileShare gates no namespace operation); the Windows CI leg covers them" fi -echo "== Test 15: ps1-side never-steal guards — dir, dangling symlink, non-lock content (parity with the bash guards) ==" +if section "Test 15: ps1-side never-steal guards — dir, 
dangling symlink, non-lock content (parity with the bash guards)"; then # The ps1 guards use different APIs than bash (PSIsContainer, reparse # attributes, the catch-all CreateNew exception), so bash coverage proves # nothing about them. The wrong-type warning needs the SAME concrete type on @@ -875,21 +834,22 @@ grep -q "is not a lock file" "$WORK/psuser.err" && ok "ps1: config warning names || bad "ps1: no config warning for non-lock content" grep -q STOLE "$LOG" && bad "ps1 STOLE the user file" || ok "ps1: no steal of the user file" rm -f "$LOCK" +fi -echo "== Test 16: crash recovery under CONTENTION, mixed impls — claim-serialized: zero displacement, zero 98s ==" +if section "Test 16: crash recovery under CONTENTION, mixed impls — claim-serialized: zero displacement, zero 98s"; then # Cross-impl variant of the unit suite's Test 2b (which carries the full -# rationale): 2 bash + 2 pwsh waiters race ONE crashed lock. Under the claim -# protocol the straggler-robs-recovery-winner race is PREVENTED (the claim -# serializes stealers across the wire format), not detected-and-repaired, so -# the assertions are strict: every waiter exits 0 (zero spurious 98s — an -# unserialized implementation displaces the recovery winner near-certainly), -# exactly ONE STOLE-BY-CLAIM, NO move-aside file ever exists (an -# implementation that staged the steal through an intermediate .dead.* file -# would re-open the displacement race; a background sampler proves no such -# file ever appears — and the unserialized "STOLE stale lock" line shape and -# any STEAL-DISPLACED repair line must never appear), and the final state -# is clean (no lock, no claim). Sync: waiters launch against a FRESH -# fabricated lock and only once all four have logged WAITING is it +# rationale): N waiters split half bash / half pwsh race ONE crashed lock. 
+# Under the claim protocol the straggler-robs-recovery-winner race is +# PREVENTED (the claim serializes stealers across the wire format), not +# detected-and-repaired, so the assertions are strict: every waiter exits 0 +# (zero spurious 98s — an unserialized implementation displaces the recovery +# winner near-certainly), exactly ONE STOLE-BY-CLAIM, NO move-aside file ever +# exists (an implementation that staged the steal through an intermediate +# .dead.* file would re-open the displacement race; a background sampler proves +# no such file ever appears — and the unserialized "STOLE stale lock" line +# shape and any STEAL-DISPLACED repair line must never appear), and the final +# state is clean (no lock, no claim). Sync: waiters launch against a FRESH +# fabricated lock and only once all have logged WAITING is it # backdated, so all judge stale within one poll window despite pwsh's slow # cold start; the sync keeps the ghost fresh while it waits # (sync_waiting_fresh) so a stalled sync can't let the ghost age stale on @@ -901,13 +861,34 @@ echo "== Test 16: crash recovery under CONTENTION, mixed impls — claim-seriali # the run's premise is broken (the touch may have aged the WINNER'S live # lock), so the run is discarded and retried (bounded) instead of failing # assertions the protocol never violated. +# +# Waiter count is swept over $T_AXIS_A (see load-testing-strategy.md): one iteration at N=4 by +# default (2 bash + 2 pwsh — byte-identical to today) and at N=4,12,24 under +# GCL_TEST_SWEEP=1. N is split into a bash half (N/2) and a pwsh half (the +# remainder); at N=4 that is 2+2 exactly. The correctness invariants stay strict +# at EVERY N — but that needs STALE >> the winner's EFFECTIVE hold, which grows +# with N under load (the winner is one of N concurrent processes), so STALE is +# floored to N when sweeping (t16_stale); at the default floor it is the same 8 +# as today. 
MAX_WAIT scales too (30*N => 120 at N=4) so a wide, pwsh-cold-start- +# heavy sweep has time to drain. The per-N tag on the non-count-naming +# assertions is suppressed in the default run so the messages stay byte-identical. LOCK="$WORK/recov.lock" T16_TRIES=3 T16_GRAVESEEN="$WORK/recov.graveseen"; T16_SAMPSTOP="$WORK/recov.sampstop" +for T16_N in $T_AXIS_A; do +t16_nsh=$(( T16_N / 2 )); t16_nps=$(( T16_N - t16_nsh )) # bash half + pwsh half (2+2 at N=4) +t16_maxwait=$(( 30 * T16_N )) +# STALE budget: today's 8 in the default (non-sweep) run for byte-identical +# behaviour; when sweeping, floor it to N so a wide fan-out's load-stretched +# winner hold can never make its own live lock look stale (a legitimate but +# unwanted second steal), keeping "exactly one steal" strict at every N. +if [ "$GCL_TEST_SWEEP" = 1 ] && [ "$T16_N" -gt 8 ]; then t16_stale="$T16_N"; else t16_stale=8; fi +if [ "$GCL_TEST_SWEEP" = 1 ]; then t16_ntag=" at N=$T16_N"; else t16_ntag=""; fi t16_valid=0; t16_sync=1; t16_fail=0; n98=0 for t16_try in $(seq 1 "$T16_TRIES"); do - T16_GHOST="tok.ghost.recov.$t16_try" - rm -f "$WORK"/recov.ran.* "$T16_GRAVESEEN" "$T16_SAMPSTOP" "$LOCK" "$LOCK.next" 2>/dev/null + T16_GHOST="tok.ghost.recov.$T16_N.$t16_try" + rm -f "$WORK"/recov.ran.* "$WORK"/recov-sh*.log "$WORK"/recov-ps*.log \ + "$T16_GRAVESEEN" "$T16_SAMPSTOP" "$LOCK" "$LOCK.next" 2>/dev/null fabricate_lock "$LOCK" "$T16_GHOST" "pid=999 host=ghost" # fresh mtime: not yet stale ( while [ ! -e "$T16_SAMPSTOP" ]; do @@ -918,41 +899,45 @@ for t16_try in $(seq 1 "$T16_TRIES"); do done ) & t16_sampler=$! 
- pids=() - for i in 1 2; do + pids=(); t16_logs=() + for i in $(seq 1 "$t16_nsh"); do : > "$WORK/recov-sh$i.log" # per-waiter logs: concurrent appends to one log drop lines - AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-sh$i.log" AGENT_LOCK_STALE_SECS=8 \ - AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \ + t16_logs+=("$WORK/recov-sh$i.log") + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-sh$i.log" AGENT_LOCK_STALE_SECS="$t16_stale" \ + AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t16_maxwait" \ bash "$SH" run -- bash -c 'touch "$1"; sleep 0.1' _ "$WORK/recov.ran.sh$i" 2>/dev/null & pids+=($!) done - for i in 1 2; do + for i in $(seq 1 "$t16_nps"); do : > "$WORK/recov-ps$i.log" - AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-ps$i.log" AGENT_LOCK_STALE_SECS=8 \ - AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \ + t16_logs+=("$WORK/recov-ps$i.log") + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov-ps$i.log" AGENT_LOCK_STALE_SECS="$t16_stale" \ + AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t16_maxwait" \ pwsh -NoProfile -File "$PS1WIN" run "[IO.File]::WriteAllText('$WORK/recov.ran.ps$i', 'x'); Start-Sleep -Milliseconds 100" 2>/dev/null & pids+=($!) done t16_sync=1 - if ! sync_waiting_fresh "$LOCK" 90 "$WORK/recov-sh1.log" "$WORK/recov-sh2.log" \ - "$WORK/recov-ps1.log" "$WORK/recov-ps2.log"; then + if ! sync_waiting_fresh "$LOCK" 90 "${t16_logs[@]}"; then t16_sync=0 - for f in "$WORK/recov-sh1.log" "$WORK/recov-sh2.log" "$WORK/recov-ps1.log" "$WORK/recov-ps2.log"; do - grep -q "WAITING for lock" "$f" 2>/dev/null || echo " T16 waiter never contended (no WAITING in ${f##*/})" + for f in "${t16_logs[@]}"; do + grep -q "WAITING for lock" "$f" 2>/dev/null || echo " T16 N=$T16_N waiter never contended (no WAITING in ${f##*/})" done fi - backdate_ghost "$LOCK" "$T16_GHOST" 9999; t16_bd=$? 
# all four now judge the ghost stale together + backdate_ghost "$LOCK" "$T16_GHOST" 9999; t16_bd=$? # all waiters now judge the ghost stale together t16_fail=0; n98=0 for p in "${pids[@]}"; do wait "$p"; rc=$? case "$rc" in 0) ;; - 98) n98=$((n98+1)); echo " T16 waiter rc=98 — displacement under the claim protocol" ;; - *) t16_fail=1; echo " T16 waiter rc=$rc (want 0)" ;; + 98) n98=$((n98+1)); echo " T16 N=$T16_N waiter rc=98 — displacement under the claim protocol" ;; + *) t16_fail=1; echo " T16 N=$T16_N waiter rc=$rc (want 0)" ;; esac done touch "$T16_SAMPSTOP"; wait "$t16_sampler" 2>/dev/null - cat "$WORK"/recov-*.log > "$WORK/recov-all.log" 2>/dev/null || : > "$WORK/recov-all.log" + # Aggregate from the explicit per-waiter log list, NOT a recov-*.log glob: the + # glob would also match recov-all.log itself, which now persists across sweep N + # iterations, so a glob could self-cat a stale aggregate into the count. + cat "${t16_logs[@]}" > "$WORK/recov-all.log" 2>/dev/null || : > "$WORK/recov-all.log" if [ "$t16_bd" != 0 ]; then # The backdate was NOT conclusively clean (see backdate_ghost; under # load the whole steal+release cycle often completes before the @@ -969,7 +954,7 @@ for t16_try in $(seq 1 "$T16_TRIES"); do [ "$(grep -c "lock LOST" "$WORK/recov-all.log")" = 0 ] || t16_dirty=1 { [ -e "$LOCK" ] || [ -e "$LOCK.next" ]; } && t16_dirty=1 if [ "$t16_dirty" = 1 ]; then - echo " T16 try $t16_try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying" + echo " T16 N=$T16_N try $t16_try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying" rm -f "$LOCK" "$LOCK.next" 2>/dev/null continue fi @@ -984,32 +969,34 @@ if [ "$t16_valid" = 1 ]; then nold="$(grep -c "STOLE stale lock" "$WORK/recov-all.log")" ndisp="$(grep -c "STEAL-DISPLACED" "$WORK/recov-all.log")" [ "$t16_fail" = 0 ] && [ "$t16_sync" = 1 ] \ - && ok "2 bash + 2 pwsh waiters on one crashed lock: every waiter exited 0" \ - || bad "mixed crash-recovery exits wrong 
(see above)" - [ "$n98" = 0 ] && ok "zero spurious 98s — the claim serialized recovery across implementations" \ - || bad "$n98 waiter(s) exited 98 — displacement happened under the claim protocol" - [ "$nran" = 4 ] && ok "all 4 waiter commands ran" || bad "only $nran/4 waiter commands ran" - [ "$nstole" = 1 ] && ok "exactly ONE STOLE-BY-CLAIM (the claim serialized the cross-impl recovery)" \ - || bad "STOLE-BY-CLAIM x$nstole (want exactly 1)" + && ok "$t16_nsh bash + $t16_nps pwsh waiters on one crashed lock: every waiter exited 0" \ + || bad "mixed crash-recovery exits wrong$t16_ntag (see above)" + [ "$n98" = 0 ] && ok "zero spurious 98s$t16_ntag — the claim serialized recovery across implementations" \ + || bad "$n98 waiter(s) exited 98$t16_ntag — displacement happened under the claim protocol" + [ "$nran" = "$T16_N" ] && ok "all $T16_N waiter commands ran" || bad "only $nran/$T16_N waiter commands ran" + [ "$nstole" = 1 ] && ok "exactly ONE STOLE-BY-CLAIM$t16_ntag (the claim serialized the cross-impl recovery)" \ + || bad "STOLE-BY-CLAIM x$nstole$t16_ntag (want exactly 1)" grep -q "STOLE-BY-CLAIM.*ghost=pid=999 host=ghost" "$WORK/recov-all.log" \ - && ok "the steal line attributes the crashed ghost cross-impl (wire-format line 2 parsed)" \ - || bad "STOLE-BY-CLAIM does not carry the ghost's line-2 attribution" + && ok "the steal line attributes the crashed ghost cross-impl (wire-format line 2 parsed)$t16_ntag" \ + || bad "STOLE-BY-CLAIM does not carry the ghost's line-2 attribution$t16_ntag" grep -q "CLAIM .*tok=tok\." "$WORK/recov-all.log" \ - && ok "claim create logged with its per-attempt token (CLAIM ... 
tok=)" \ - || bad "no CLAIM line with a token in the recovery logs" - [ "$nold" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged" \ - || bad "'STOLE stale lock' shape appeared x$nold — an unserialized steal lane is present" - [ "$ndisp" = 0 ] && ok "zero STEAL-DISPLACED lines (prevention, not detect-and-repair)" \ - || bad "STEAL-DISPLACED fired x$ndisp — displacement-repair machinery present?" - [ -e "$T16_GRAVESEEN" ] && bad "a move-aside file (.dead.*) existed during recovery — the steal is staged through an intermediate file!" \ - || ok "no move-aside file (.dead.*) ever existed during recovery (sampler)" - [ -e "$LOCK" ] && bad "leftover crash-recovery lock" || ok "no leftover lock" - [ -e "$LOCK.next" ] && bad "leftover claim after recovery" || ok "no leftover claim" + && ok "claim create logged with its per-attempt token (CLAIM ... tok=)$t16_ntag" \ + || bad "no CLAIM line with a token in the recovery logs$t16_ntag" + [ "$nold" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged$t16_ntag" \ + || bad "'STOLE stale lock' shape appeared x$nold$t16_ntag — an unserialized steal lane is present" + [ "$ndisp" = 0 ] && ok "zero STEAL-DISPLACED lines (prevention, not detect-and-repair)$t16_ntag" \ + || bad "STEAL-DISPLACED fired x$ndisp$t16_ntag — displacement-repair machinery present?" + [ -e "$T16_GRAVESEEN" ] && bad "a move-aside file (.dead.*) existed during recovery$t16_ntag — the steal is staged through an intermediate file!" 
\ + || ok "no move-aside file (.dead.*) ever existed during recovery (sampler)$t16_ntag" + [ -e "$LOCK" ] && bad "leftover crash-recovery lock$t16_ntag" || ok "no leftover lock$t16_ntag" + [ -e "$LOCK.next" ] && bad "leftover claim after recovery$t16_ntag" || ok "no leftover claim$t16_ntag" else - bad "T16: no clean run under a conclusive backdate in $T16_TRIES attempts (see above)" + bad "T16: no clean run under a conclusive backdate in $T16_TRIES attempts$t16_ntag (see above)" +fi +done fi -echo "== Test 16b: bash claimant vs ps1 claimant racing ONE ghost — one claim winner, cross-impl wire parity ==" +if section "Test 16b: bash claimant vs ps1 claimant racing ONE ghost — one claim winner, cross-impl wire parity"; then # The 1+1 distilled version of Test 16: one bash and one pwsh waiter race the # same ancient ghost. Exactly one wins the O_EXCL claim and steals # (STOLE-BY-CLAIM x1); the loser either loses the claim create (a young @@ -1081,8 +1068,9 @@ if [ "$t16b_valid" = 1 ]; then else bad "T16b: no clean run under a conclusive backdate in $T16B_TRIES attempts (see above)" fi +fi -echo "== Test 16c: cross-impl claim staleness — each side clears the OTHER side's aged claim; young foreign claims are respected ==" +if section "Test 16c: cross-impl claim staleness — each side clears the OTHER side's aged claim; young foreign claims are respected"; then # (a) bash clears an aged ps1-tokened claim, then completes the steal. 
LOCK="$WORK/cstale.lock"; LOG="$WORK/cstale.log"; : > "$LOG" fabricate_lock "$LOCK" "tok.ghost.cstale" "pid=9 host=ghost"; backdate "$LOCK" 9999 @@ -1132,8 +1120,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ && ok "ps1 respected a young bash claim (97, claim intact, no clear/steal)" \ || bad "ps1 young-bash-claim handling: rc=$rc intact=$([ -f "$LOCK.next" ] && echo yes || echo no)" rm -f "$LOCK" "$LOCK.next" +fi -echo "== Test 16d: static checks — no File.Replace in the ps1 port ==" +if section "Test 16d: static checks — no File.Replace in the ps1 port"; then # File.Replace is deliberately never used: it throws on a # read-only destination and has partial-failure states when called without a # backup file. The 5.1 lane must stay unlink + fail-if-exists Move. @@ -1142,8 +1131,9 @@ if grep -qE 'File\]?::Replace' "$ROOT/git-commit-lock.ps1"; then else ok "git-commit-lock.ps1 contains no File.Replace call" fi +fi -echo "== Test 16e: ps1 arc-end pass keeps INCONCLUSIVE entries; trap-time discovery-HOLD releases per normal release semantics ==" +if section "Test 16e: ps1 arc-end pass keeps INCONCLUSIVE entries; trap-time discovery-HOLD releases per normal release semantics"; then # Driven directly via a dot-sourcing pwsh driver — the ps1 side's # unit-equivalent steering mechanism (the lib skips its CLI when # dot-sourced). 
Part 1: the arc-end resolution pass's entry-drop is gated @@ -1237,8 +1227,9 @@ PSEOF else echo "note: the blocked trap-time release leg is Windows-only by construction (POSIX open handles never block unlink); the happy-path leg above pins the honest-log contract" fi +fi -echo "== Test 16f: ps1 claim-gone-at-touch — the SetLastWriteTimeUtc FileNotFound gone signal fires; no resurrection ==" +if section "Test 16f: ps1 claim-gone-at-touch — the SetLastWriteTimeUtc FileNotFound gone signal fires; no resurrection"; then # The unit suite's discovery-position matrix (T25) covers bash's # touch-gone lane; this is the ps1 counterpart: the claim passes the # step-3.1 recheck, vanishes before the step-3.2 touch (steered via the @@ -1297,9 +1288,10 @@ PSEOF else echo "== Test 16f SKIPPED: claim-gone-at-touch steering uses Windows pwsh (POSIX legs cover the protocol via the bash matrix; the ps1 gone-catch is probed Q1) ==" fi +fi if command -v powershell >/dev/null 2>&1; then -echo "== Test 17: Windows PowerShell 5.1 smoke lane — the ps1 must run, not just parse, on the in-box engine ==" +if section "Test 17: Windows PowerShell 5.1 smoke lane — the ps1 must run, not just parse, on the in-box engine"; then # Everything above runs the port under pwsh (7+). 5.1 ships in every Windows # 10/11 box and stays supported, so its claim is tested, not asserted: the # run lane's exit-code contract (0 / exit 7 / the failing-cmdlet -> 1) and @@ -1369,11 +1361,20 @@ AGENT_LOCK_PATH="$LOCK51" AGENT_LOCK_LOG="$LOG51" AGENT_LOCK_STALE_SECS=2 \ grep -q "CLAIM .*tok=tok\.ps\." 
"$LOG51" && ok "5.1: claim create logged with its per-attempt token" || bad "5.1: no CLAIM line with a tok.ps.* token" [ -e "$LOCK51" ] && bad "5.1: leftover lock after the steal ladder" || ok "5.1: no leftover lock" [ -e "$LOCK51.next" ] && bad "5.1: leftover claim after the steal ladder" || ok "5.1: no leftover claim" +fi else echo "== Test 17 SKIPPED: Windows PowerShell 5.1 (powershell) not on PATH — POSIX leg; the Windows CI leg covers it ==" echo "note: the 5.1 unlink+Move steal-ladder leg is part of this lane and is covered by the Windows CI leg" fi echo +# Zero-match guard + selector-report line (shared helper in _harness.sh): a +# set-but-non-matching GCL_TEST_ONLY ran no test block, so the (vacuously green) +# verdict below would lie — bail loudly; a typo'd selector regex must FAIL, not +# pass with zero assertions. When the selector matched, report how many blocks +# ran. Both gated on GCL_TEST_ONLY non-empty so the default run stays unchanged. +selector_report +DONE=1 echo "==== INTEROP RESULT: $PASS passed, $FAIL failed (fan-out: $GCL_MODE) ====" +[ "$GCL_TAP" = 1 ] && echo "1..$TAPN" [ "$FAIL" = 0 ] diff --git a/tests/git-commit-lock.test.sh b/tests/git-commit-lock.test.sh index 021ea22..3a41419 100755 --- a/tests/git-commit-lock.test.sh +++ b/tests/git-commit-lock.test.sh @@ -7,7 +7,7 @@ # # Fan-out: heavy concurrency tests default to REDUCED width so routine dev # runs don't lag a live shared machine; set GCL_TEST_FULL=1 (CI does) for the -# full-strength canary. The suite prints which mode ran — a reduced pass must +# full-strength fan-out. The suite prints which mode ran — a reduced pass must # never masquerade as the full one. # # On failure the work dir is PRESERVED (path printed) for post-mortem; set @@ -21,21 +21,36 @@ # and command substitutions run inside conditions all over a test suite; the # suite runs WITHOUT errexit (set -uo only) and asserts on values, not on # implicit exit propagation. 
-# shellcheck disable=SC2016 # $INCR is single-quoted on purpose: it expands -# inside the worker's `bash -c`, not here. +# shellcheck disable=SC2016 # Single-quoted strings carrying `$…` on purpose — +# steering-shell bodies (the T*_INNER `bash -c` programs) and grep patterns that +# match literal `$_LOCK_*` text in the library — expand in their own context, not +# here. set -uo pipefail +# Shared harness: PASS/FAIL/TAP counters, GCL_TAP/GCL_TEST_ONLY reads, ok/bad, +# section, the finish EXIT-trap sentinel (calls our cleanup below), and the +# shared timing/lock helpers (epoch_to_stamp, backdate, backdate_ghost, +# sync_waiting_fresh, fabricate_lock, wait_for_grep). Resolved from THIS +# script's own dir so it sources regardless of CWD; sourced EARLY (before any +# use of the inits/helpers below). +_HARNESS_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=tests/_harness.sh +. "$_HARNESS_DIR/_harness.sh" + DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT="$(cd "$DIR/.." && pwd)" # the implementations live at the repo root LIB="$ROOT/git-commit-lock.sh" if [ "${GCL_TEST_FULL:-0}" = 1 ]; then - GCL_MODE="FULL"; T1_ROUNDS=8; T1_N=25; T2B_ROUNDS=4; T20_N=10 + GCL_MODE="FULL"; T2B_ROUNDS=4; T20_N=10 else - GCL_MODE="REDUCED"; T1_ROUNDS=3; T1_N=8; T2B_ROUNDS=2; T20_N=5 + GCL_MODE="REDUCED"; T2B_ROUNDS=2; T20_N=5 fi -echo "fan-out mode: $GCL_MODE (T1 ${T1_ROUNDS} rounds x ${T1_N} workers)" -[ "$GCL_MODE" = REDUCED ] && echo " (set GCL_TEST_FULL=1 for the full-strength 8x25 canary — CI runs it)" +# (The full-width concurrency canary, formerly Test 1, now lives in its own file +# tests/git-commit-lock.canary.test.sh; this suite's heavy fan-out is Test 2b / +# Test 20.) 
+echo "fan-out mode: $GCL_MODE (Test 2b ${T2B_ROUNDS} rounds, Test 20 ${T20_N} concurrent workers)" +[ "$GCL_MODE" = REDUCED ] && echo " (set GCL_TEST_FULL=1 for full-strength fan-out — CI runs it)" WORK="$(mktemp -d 2>/dev/null || echo "${TMPDIR:-/tmp}/git-commit-lock-test.$$")" mkdir -p "$WORK" @@ -51,78 +66,32 @@ cleanup() { rm -rf "$WORK" 2>/dev/null || true fi } -trap cleanup EXIT - -PASS=0; FAIL=0 -ok() { echo "PASS: $*"; PASS=$((PASS+1)); } -bad() { echo "FAIL: $*"; FAIL=$((FAIL+1)); } - -# Backdate a path's mtime by $2 seconds — the lock's staleness clock is the -# lock FILE's own mtime (stamped by the creating write), so this is how a -# test fakes a stale lock. Portable: BSD touch has no `-d @epoch`, so convert -# the target epoch to a `touch -t` stamp via GNU `date -d @` with BSD -# `date -r` as fallback. -epoch_to_stamp() { - date -d "@$1" +%Y%m%d%H%M.%S 2>/dev/null || date -r "$1" +%Y%m%d%H%M.%S 2>/dev/null -} -backdate() { touch -t "$(epoch_to_stamp "$(( $(date +%s) - $2 ))")" "$1"; } - -# Token-guarded backdate for the contended-recovery rounds (T2b). Why: under -# load a fast waiter can complete its ENTIRE steal (claim -> rename-over -> -# ACQUIRED) before the harness's `touch` executes, so a blind backdate lands -# on the WINNER'S freshly installed lock, making it instantly stale for every -# rival — a legitimate re-steal then fails the round's "zero 98s / exactly -# one STOLE-BY-CLAIM" assertions although the protocol behaved exactly as -# designed (observed 2026-06-12 on a loaded box). Verdicts: -# * pre-read not the ghost: a waiter stole the ghost BEFORE the touch (it -# aged stale naturally during a stalled sync); no touch is performed and -# the round premise is gone — invalid, the caller retries the round. -# * post-read the ghost: conclusive — nothing ever rewrites the ghost -# token at the path, so the touch verifiably hit the ghost; any steal -# after the post-read steals an ALREADY-ancient ghost, exactly the -# scenario the round wants. 
Valid. -# * post-read anything else: a steal raced the touch->re-read window — -# COMMON under load (waiters poll every 0.05s; the post-read costs -# subprocess spawns), so it must not blindly invalidate. The lock's -# MTIME arbitrates which file the touch hit: a winner's installed lock -# is FRESH (the rename carries the claim file's just-created mtime), so -# fresh => the touch hit the GHOST and a legitimate steal followed — -# valid; ancient => the touch landed on the WINNER'S live lock and -# corrupted the round — invalid, retry. Vanished => cannot arbitrate — -# invalid, retry. -backdate_ghost() { # $1=lock $2=ghost token $3=age-secs -> 0 iff the round premise is intact - local pre post now mt - pre="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')" - [ "$pre" = "$2" ] || return 1 - backdate "$1" "$3" 2>/dev/null || return 1 - post="$(head -n 1 -- "$1" 2>/dev/null | tr -d '\r')" - [ "$post" = "$2" ] && return 0 - [ -e "$1" ] || return 1 - now="$(date +%s)" - mt="$(stat -c %Y -- "$1" 2>/dev/null || stat -f %m -- "$1" 2>/dev/null)" || return 1 - [ $(( now - mt )) -lt $(( $3 / 2 )) ] -} +# The finish EXIT-trap sentinel (defined in _harness.sh) calls the cleanup() +# above and fails loudly if the suite died before setting DONE=1. +trap finish EXIT -# Wait for every waiter's WAITING line while keeping the ghost lock FRESH -# (touch -c to now, no-create so a released path is never resurrected): a -# fresh ghost cannot be judged stale, so no waiter can steal it before the -# guarded backdate — without this, a sync stalled past STALE (slow worker -# cold starts on a loaded box) lets the ghost age stale naturally and a -# waiter steals it mid-sync. Freshening is race-safe: if a steal slipped in -# anyway, touching the winner's (already fresh) live lock to "now" is a -# harmless no-op, and backdate_ghost's pre-read catches the broken premise. 
-sync_waiting_fresh() { # $1=lock $2=timeout-secs $3..=waiter logs -> 0 iff all logged WAITING - local lock="$1" deadline f ok=1 - deadline=$(( $(date +%s) + $2 )); shift 2 - for f in "$@"; do - until grep -q "WAITING for lock" "$f" 2>/dev/null; do - touch -c "$lock" 2>/dev/null - if [ "$(date +%s)" -ge "$deadline" ]; then ok=0; break; fi - sleep 0.2 - done - done - [ "$ok" = 1 ] -} +# Envelope-tier assertions (see failure-modes.md §K / §4 item 1). A wall-clock or poll-count +# bound is a Tier-2 (best-effort latency) property, NOT a correctness one (see +# guarantees.md BE-1). In the default 'strict' tier these behave exactly like +# ok/bad. Under GCL_ENVELOPE_TIER=relax (nightly/deep stress runs) an envelope FAIL +# becomes a WARN that does NOT increment FAIL — so an oversubscribed runner can't +# turn a latency miss into a red — while every CORRECTNESS assertion keeps ok/bad +# and stays hard in both tiers. TAP-aware so envelope assertions still count toward 1..N. +ENVELOPE_TIER="${GCL_ENVELOPE_TIER:-strict}" +ENV_WARN=0 +ok_envelope() { PASS=$((PASS+1)); TAPN=$((TAPN+1)); echo "PASS[env]: $*" + [ "$GCL_TAP" = 1 ] && echo "ok $TAPN - $*"; return 0; } +bad_envelope() { + if [ "$ENVELOPE_TIER" = relax ]; then + ENV_WARN=$((ENV_WARN+1)); TAPN=$((TAPN+1)); echo "WARN[env-relaxed]: $*" + [ "$GCL_TAP" = 1 ] && echo "ok $TAPN - $* # env-relaxed" + else + FAIL=$((FAIL+1)); TAPN=$((TAPN+1)); echo "FAIL: $*" + [ "$GCL_TAP" = 1 ] && echo "not ok $TAPN - $*" + fi; return 0; } + +# epoch_to_stamp, backdate, backdate_ghost, and sync_waiting_fresh now live in +# _harness.sh (sourced above) — shared byte-for-byte with the interop suite. 
# Clone a shell function under a new name — the steering tests' interposition # mechanism: a sourced test shell wraps a library internal (or a command like @@ -135,66 +104,25 @@ clone_fn() { # $1=existing function $2=new name } export -f clone_fn epoch_to_stamp backdate -# Fabricate a lock file the way a real (foreign) holder would have written it: -# token line + owner line. The token MUST be "tok."-prefixed (wire format) or -# the steal's content guard will — correctly — refuse to steal it. -fabricate_lock() { # $1=path $2=token $3=owner - printf '%s\n%s\n' "$2" "$3" > "$1" -} +# fabricate_lock and wait_for_grep now live in _harness.sh (sourced above) — +# shared byte-for-byte with the interop suite. # Wait (up to $2 seconds, default 15) for a marker file to appear. Holders # touch a ready-marker as their first act INSIDE the lock; tests gate on that -# instead of sleep-margin head starts, which flaked under load. +# instead of sleep-margin head starts, which flaked under load. Unit-only: the +# interop suite has its own poll helper (wait_for, 50ms-iteration semantics). wait_for_file() { local f="$1" tries=$(( ${2:-15} * 20 )) while [ ! -e "$f" ] && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done [ -e "$f" ] } -# Wait (up to $3 seconds, default 15) for a pattern to appear in a file. -# Used to gate on the WAITING log line: proof the waiter actually contended, -# without a fixed-length hold. -wait_for_grep() { - local pat="$1" f="$2" tries=$(( ${3:-15} * 20 )) - while ! grep -q "$pat" "$f" 2>/dev/null && [ "$tries" -gt 0 ]; do sleep 0.05; tries=$((tries-1)); done - grep -q "$pat" "$f" 2>/dev/null -} +# NB: Test 1 (the full-width concurrency CANARY) now lives in its own suite file, +# tests/git-commit-lock.canary.test.sh, so it runs as a naturally-parallel CI job +# (it is ~half the Windows unit wall-clock). The $INCR critical-section string it +# used moved out with it (no other unit test uses it). 
-# Critical section that loses updates without a mutex: read, gap, write+1. -INCR='n="$(cat "$1")"; sleep 0.03; echo $((n+1)) > "$1"' - -echo "== Test 1: concurrent workers, mutual exclusion (repeated rounds, $GCL_MODE width) ==" -# A single pass is too weak to trust a rare exclusion race (the release-steal -# bug found 2026-05-30 lost ~1 update per 25 only intermittently). Repeat -# several rounds; ANY lost update across ALL rounds fails the test. -# MAX_WAIT caps a regression at 180s per worker instead of the 420s default; -# STALE stays comfortably above any realistic hold so nothing is ever stolen. -N=$T1_N; ROUNDS=$T1_ROUNDS; t1_fail=0; T1ERR="$WORK/excl.err"; : > "$T1ERR" -for r in $(seq 1 "$ROUNDS"); do - COUNTER="$WORK/counter.$r"; echo 0 > "$COUNTER" - LOCK="$WORK/excl.$r.lock"; LOG="$WORK/excl.$r.log"; : > "$LOG"; pids=() - for _ in $(seq 1 "$N"); do - AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=120 \ - AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=180 \ - bash "$LIB" run -- bash -c "$INCR" _ "$COUNTER" 2>> "$T1ERR" & - pids+=($!) - done - for p in "${pids[@]}"; do wait "$p"; done - c="$(cat "$COUNTER")"; a="$(grep -c ACQUIRED "$LOG")"; rl="$(grep -c RELEASED "$LOG")" - if [ "$c" != "$N" ] || [ "$a" != "$N" ] || [ "$rl" != "$N" ] || [ -e "$LOCK" ]; then - t1_fail=1; echo " round $r: counter=$c acquired=$a released=$rl leftover=$([ -e "$LOCK" ] && echo yes || echo no)" - fi -done -[ "$t1_fail" = 0 ] && ok "$ROUNDS rounds x $N workers ($GCL_MODE): no lost updates, balanced acquire/release, no leftover lock" \ - || bad "mutual-exclusion failure in at least one round (see above)" -# Regression: under contention the lock file routinely vanishes mid-mtime-probe; -# that must NOT be misdiagnosed as "staleness detection broken" (false WARNING -# observed 2026-06-10 before the probe got its retry loop). 
-grep -q "Staleness detection is BROKEN" "$T1ERR" \ - && bad "spurious mtime-probe WARNING under contention (see $T1ERR)" \ - || ok "no spurious mtime-probe warnings under contention" - -echo "== Test 2: stale lock (old file mtime) is stolen; holder comes from line 2 ==" +if section "Test 2: stale lock (old file mtime) is stolen; holder comes from line 2"; then LOCK="$WORK/steal.lock"; LOG="$WORK/steal.log"; : > "$LOG"; MARKER="$WORK/steal-marker" fabricate_lock "$LOCK" "tok.fake.99999.1" "pid=99999 host=ghost" backdate "$LOCK" 9999 # make the FILE mtime ancient -> stale @@ -208,8 +136,9 @@ grep -q STOLE "$LOG" && ok "log records a steal" || bad "no STOLE entry" grep -q "holder=pid=99999 host=ghost" "$LOG" \ && ok "STALE log line carries the holder parsed from line 2" \ || bad "holder from line 2 missing in the STALE log line" +fi -echo "== Test 2b: crash recovery under CONTENTION — claim-serialized: zero displacement, zero 98s ($GCL_MODE: $T2B_ROUNDS rounds) ==" +if section "Test 2b: crash recovery under CONTENTION — claim-serialized: zero displacement, zero 98s ($GCL_MODE: $T2B_ROUNDS rounds)"; then # The claim SERIALIZES stealers, so the straggler-robs-recovery-winner race # is PREVENTED, not detected-and-repaired. Scenario: one crashed lock, N # waiters judging stale in the same poll window (the launch/backdate sync @@ -230,16 +159,49 @@ echo "== Test 2b: crash recovery under CONTENTION — claim-serialized: zero dis # WINNER'S live lock), the attempt is kept only if its outcome is clean and # otherwise discarded and retried (bounded), instead of failing assertions # the protocol never violated. -T2B_N=4 +# +# Waiter count is swept over $T_AXIS_A (see load-testing-strategy.md): one iteration at N=4 by +# default (byte-identical to today) and at N=4,12,24 under GCL_TEST_SWEEP=1. 
+# Every sweep iteration's assertions carry an " at N=" tag so a sweep +# failure says which N broke; that tag is SUPPRESSED in the default (non-sweep) +# run (t2b_ntag empty) so the messages are byte-identical to today — the first +# assertion already names the count via "$T2B_N waiters". The correctness +# invariants asserted here (zero 98, exactly one steal, no move-aside, clean +# final state) stay ok/bad strict (not envelope) at all N — but that requires +# STALE >> the winner's EFFECTIVE hold, which grows with N under load (the +# winner is one of N concurrent processes; oversubscription stretches the wall +# time between its create and release), so STALE is floored to N when sweeping +# (t2b_stale) — at the default floor it is the same 8 as today. The per-waiter +# wall-clock budget scales too: MAX_WAIT = 30*N (=> 120 at N=4, today's value) +# so a wide sweep, where the losing waiters acquire in sequence after the winner +# releases, has time to drain instead of timing out and looking like a product +# failure. T2B_TRIES=3 # per-round attempts; see the backdate_ghost note +for T2B_N in $T_AXIS_A; do +# MAX_WAIT and STALE: today's exact values (120 / 8) in the default (non-sweep) +# run so the env passed to the library is byte-identical; only the sweep's wider +# N raise them. MAX_WAIT scales 30*N (=> 120 at N=4 anyway). STALE floors to N so +# a wide fan-out's load-stretched winner hold (the winner is one of N concurrent +# processes) can never make its own live lock look stale and trigger a +# legitimate-but-unwanted second steal. 
+if [ "$GCL_TEST_SWEEP" = 1 ]; then + t2b_maxwait=$(( 30 * T2B_N )) + [ "$T2B_N" -gt 8 ] && t2b_stale="$T2B_N" || t2b_stale=8 + t2b_ntag=" at N=$T2B_N" +else + t2b_maxwait=120; t2b_stale=8; t2b_ntag="" +fi t2b_fail=0; t2b_stole=0; t2b_old_shape=0; t2b_disp=0; t2b_98=0; t2b_retried=0 for r in $(seq 1 "$T2B_ROUNDS"); do t2b_valid=0 for try in $(seq 1 "$T2B_TRIES"); do - GHOST="tok.ghost.t2b.$r.$try" + # Ghost token carries an N segment only when sweeping (distinct per N); the + # default keeps today's exact "tok.ghost.t2b.$r.$try" so the lock CONTENT + # the library sees is byte-identical. + if [ "$GCL_TEST_SWEEP" = 1 ]; then GHOST="tok.ghost.t2b.$T2B_N.$r.$try"; else GHOST="tok.ghost.t2b.$r.$try"; fi LOCK="$WORK/recov.$r.lock"; RAN="$WORK/recov.$r.ran"; : > "$RAN" GRAVESEEN="$WORK/recov.$r.graveseen"; SAMPSTOP="$WORK/recov.$r.sampstop" - rm -f "$GRAVESEEN" "$SAMPSTOP" "$LOCK" "$LOCK.next" + rm -f "$GRAVESEEN" "$SAMPSTOP" "$LOCK" "$LOCK.next" "$WORK/recov.$r".*.log fabricate_lock "$LOCK" "$GHOST" "pid=999 host=ghost" # fresh mtime: not yet stale # Move-aside sampler: ANY .dead.* sighting at ANY moment during the round # means the implementation stages the steal through an intermediate file @@ -253,21 +215,21 @@ for r in $(seq 1 "$T2B_ROUNDS"); do done ) & sampler=$! - pids=() + pids=(); waiter_logs=() for i in $(seq 1 "$T2B_N"); do : > "$WORK/recov.$r.$i.log" # per-waiter logs: concurrent appends to one log drop lines - AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov.$r.$i.log" AGENT_LOCK_STALE_SECS=8 \ - AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \ + waiter_logs+=("$WORK/recov.$r.$i.log") + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/recov.$r.$i.log" AGENT_LOCK_STALE_SECS="$t2b_stale" \ + AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t2b_maxwait" \ bash "$LIB" run -- bash -c 'echo ran >> "$1"; sleep 0.1' _ "$RAN" 2>/dev/null & pids+=($!) done t2b_sync=1 - if ! 
sync_waiting_fresh "$LOCK" 60 "$WORK/recov.$r.1.log" "$WORK/recov.$r.2.log" \ - "$WORK/recov.$r.3.log" "$WORK/recov.$r.4.log"; then + if ! sync_waiting_fresh "$LOCK" 60 "${waiter_logs[@]}"; then t2b_sync=0 for i in $(seq 1 "$T2B_N"); do grep -q "WAITING for lock" "$WORK/recov.$r.$i.log" 2>/dev/null \ - || echo " round $r: waiter $i never logged WAITING" + || echo " N=$T2B_N round $r: waiter $i never logged WAITING" done fi backdate_ghost "$LOCK" "$GHOST" 9999; bd=$? # all waiters now judge the ghost stale together @@ -276,8 +238,8 @@ for r in $(seq 1 "$T2B_ROUNDS"); do wait "${pids[$((i-1))]}"; rc=$? case "$rc" in 0) ;; - 98) round_98=$((round_98+1)); echo " round $r: waiter $i rc=98 — displacement under the claim protocol" ;; - *) round_badrc=$((round_badrc+1)); echo " round $r: waiter $i rc=$rc (want 0)" ;; + 98) round_98=$((round_98+1)); echo " N=$T2B_N round $r: waiter $i rc=98 — displacement under the claim protocol" ;; + *) round_badrc=$((round_badrc+1)); echo " N=$T2B_N round $r: waiter $i rc=$rc (want 0)" ;; esac done touch "$SAMPSTOP"; wait "$sampler" 2>/dev/null @@ -300,7 +262,7 @@ for r in $(seq 1 "$T2B_ROUNDS"); do { [ -e "$LOCK" ] || [ -e "$LOCK.next" ]; } && round_dirty=1 if [ "$round_dirty" = 1 ]; then t2b_retried=$((t2b_retried+1)) - echo " round $r try $try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying" + echo " N=$T2B_N round $r try $try: non-conclusive backdate AND dirty outcome — attempt discarded, retrying" rm -f "$LOCK" "$LOCK.next" "$RAN" "$GRAVESEEN" "$SAMPSTOP" continue fi @@ -312,40 +274,42 @@ for r in $(seq 1 "$T2B_ROUNDS"); do nran="$(grep -c ran "$RAN")" [ "$nran" = "$T2B_N" ] || { t2b_fail=1 - echo " round $r: only $nran/$T2B_N commands ran" + echo " N=$T2B_N round $r: only $nran/$T2B_N commands ran" } [ -e "$LOCK" ] && { t2b_fail=1 - echo " round $r: leftover lock" + echo " N=$T2B_N round $r: leftover lock" } [ -e "$LOCK.next" ] && { t2b_fail=1 - echo " round $r: leftover claim" + echo " N=$T2B_N round $r: 
leftover claim" } [ -e "$GRAVESEEN" ] && { t2b_fail=1 - echo " round $r: a move-aside file (.dead.*) existed during recovery — the steal is staged through an intermediate file!" + echo " N=$T2B_N round $r: a move-aside file (.dead.*) existed during recovery — the steal is staged through an intermediate file!" } t2b_stole=$((t2b_stole + $(grep -c "STOLE-BY-CLAIM" "$WORK/recov.$r.all.log"))) t2b_old_shape=$((t2b_old_shape + $(grep -c "STOLE stale lock" "$WORK/recov.$r.all.log"))) t2b_disp=$((t2b_disp + $(grep -c "STEAL-DISPLACED" "$WORK/recov.$r.all.log"))) break done - [ "$t2b_valid" = 1 ] || { t2b_fail=1; echo " round $r: no clean round under a conclusive backdate in $T2B_TRIES attempts"; } + [ "$t2b_valid" = 1 ] || { t2b_fail=1; echo " N=$T2B_N round $r: no clean round under a conclusive backdate in $T2B_TRIES attempts"; } done -[ "$t2b_retried" = 0 ] || echo " note: $t2b_retried discarded attempt(s) — harness backdate race, not a protocol verdict" +[ "$t2b_retried" = 0 ] || echo " note: $t2b_retried discarded attempt(s) at N=$T2B_N — harness backdate race, not a protocol verdict" [ "$t2b_fail" = 0 ] && ok "$T2B_ROUNDS rounds x $T2B_N waiters on one crashed lock: all ran, clean final state, no move-aside file ever existed" \ - || bad "crash-recovery contention failure (see above)" -[ "$t2b_98" = 0 ] && ok "zero spurious 98s — the claim serialized recovery (unserialized: near-certain displacement)" \ - || bad "$t2b_98 waiter(s) exited 98 — displacement happened under the claim protocol" -[ "$t2b_stole" = "$T2B_ROUNDS" ] && ok "exactly one STOLE-BY-CLAIM per recovery (x$t2b_stole/$T2B_ROUNDS rounds)" \ - || bad "STOLE-BY-CLAIM count $t2b_stole != $T2B_ROUNDS rounds (want exactly one steal per recovery)" -[ "$t2b_old_shape" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged" \ - || bad "'STOLE stale lock' line appeared x$t2b_old_shape — an unserialized steal lane is present" -[ "$t2b_disp" = 0 ] && ok "zero STEAL-DISPLACED lines (prevention, 
not detect-and-repair)" \ - || bad "STEAL-DISPLACED fired x$t2b_disp — displacement-repair machinery present?" - -echo "== Test 3: REGRESSION — EMPTY lock file (crash between create and write) is still stolen ==" + || bad "crash-recovery contention failure$t2b_ntag (see above)" +[ "$t2b_98" = 0 ] && ok "zero spurious 98s$t2b_ntag — the claim serialized recovery (unserialized: near-certain displacement)" \ + || bad "$t2b_98 waiter(s) exited 98$t2b_ntag — displacement happened under the claim protocol" +[ "$t2b_stole" = "$T2B_ROUNDS" ] && ok "exactly one STOLE-BY-CLAIM per recovery$t2b_ntag (x$t2b_stole/$T2B_ROUNDS rounds)" \ + || bad "STOLE-BY-CLAIM count $t2b_stole != $T2B_ROUNDS rounds$t2b_ntag (want exactly one steal per recovery)" +[ "$t2b_old_shape" = 0 ] && ok "unserialized-steal line shape ('STOLE stale lock') never logged$t2b_ntag" \ + || bad "'STOLE stale lock' line appeared x$t2b_old_shape$t2b_ntag — an unserialized steal lane is present" +[ "$t2b_disp" = 0 ] && ok "zero STEAL-DISPLACED lines$t2b_ntag (prevention, not detect-and-repair)" \ + || bad "STEAL-DISPLACED fired x$t2b_disp$t2b_ntag — displacement-repair machinery present?" +done +fi + +if section "Test 3: REGRESSION — EMPTY lock file (crash between create and write) is still stolen"; then # The file-protocol descendant of the 2026-05-30 orphan bug: an acquirer that # died after the open but before (or mid-) content write leaves an empty file. # Staleness MUST come from the file mtime and the content guard MUST class an @@ -359,8 +323,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 \ bash "$LIB" run -- bash -c 'echo after > "$1"' _ "$MARKER"; rc=$? [ "$rc" = 0 ] && ok "empty-file orphan stolen (no hang)" || bad "orphan NOT stolen (rc=$rc) — regression!" 
[ "$(cat "$MARKER")" = after ] && ok "command ran after stealing orphan" || bad "command did not run" +fi -echo "== Test 4: a LIVE lock is NOT stolen (waiter logs WAITING, blocks, then proceeds) ==" +if section "Test 4: a LIVE lock is NOT stolen (waiter logs WAITING, blocks, then proceeds)"; then LOCK="$WORK/live.lock"; LOG="$WORK/live.log"; : > "$LOG"; ORDER="$WORK/order"; echo none > "$ORDER" READY="$WORK/t4.ready"; GO4="$WORK/t4.go" # Holder keeps the lock until the test has SEEN the waiter contend (the @@ -383,8 +348,9 @@ wait "$waiter"; wait "$holder" [ "$(tr '\n' ',' < "$ORDER")" = "none,holder-start,holder-end,waiter-ran," ] \ && ok "ordering correct" || bad "ordering wrong: $(tr '\n' ',' < "$ORDER")" grep -q STOLE "$LOG" && bad "waiter wrongly STOLE a live lock" || ok "no wrongful steal of live lock" +fi -echo "== Test 4b: a ROBBED slow holder detects the theft and FAILS with 98 on release ==" +if section "Test 4b: a ROBBED slow holder detects the theft and FAILS with 98 on release"; then # The fail-open ceiling: a hold longer than the stale window CAN be stolen by a # contender. The robbed holder must DETECT this at release (the lock file is # gone, or carries the thief's token) and exit EXACTLY 98 (the reserved @@ -415,8 +381,9 @@ wait "$vpid"; victim_rc=$? grep -q "WARNING: lock LOST" "$LOG" && ok "robbed holder logged a loud theft WARNING" || bad "no theft WARNING logged" [ "$thief_rc" = 0 ] && ok "thief (its own fresh hold) released cleanly (rc 0)" || bad "thief rc=$thief_rc (should be 0)" grep -q thief-work "$OUT" && ok "thief did its work" || bad "thief work missing" +fi -echo "== Test 4c: a slow but UNCONTENDED holder keeps its lock (slowness != failure) ==" +if section "Test 4c: a slow but UNCONTENDED holder keeps its lock (slowness != failure)"; then # Documents the boundary: exceeding the stale window is only dangerous when a # contender actually steals. With no waiter, the file is never moved, the token # still matches, and release succeeds. 
(If this failed, the lock would punish @@ -427,16 +394,18 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 AGENT_LOCK [ "$solo_rc" = 0 ] && ok "uncontended slow holder released cleanly (rc 0)" || bad "uncontended slow holder rc=$solo_rc (should be 0)" grep -q "WARNING: lock LOST" "$LOG" && bad "spurious theft WARNING with no contender" || ok "no spurious WARNING when uncontended" grep -q solo-done "$OUT" && ok "uncontended slow holder did its work" || bad "work missing" +fi -echo "== Test 5: run propagates the command's exit code, releases either way ==" +if section "Test 5: run propagates the command's exit code, releases either way"; then LOCK="$WORK/rc.lock"; LOG="$WORK/rc.log"; : > "$LOG" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'exit 0'; rc=$? [ "$rc" = 0 ] && ok "exit 0 propagated" || bad "exit 0 not propagated (rc=$rc)" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'exit 7'; rc=$? [ "$rc" = 7 ] && ok "exit 7 propagated" || bad "exit code not propagated (rc=$rc)" [ -e "$LOCK" ] && bad "lock left held after run" || ok "lock released after run (success and failure)" +fi -echo "== Test 6: default lock FILE and log live in the git dir ==" +if section "Test 6: default lock FILE and log live in the git dir"; then SCRATCH="$WORK/scratch"; mkdir -p "$SCRATCH" git -C "$SCRATCH" init -q; git -C "$SCRATCH" config user.email t@t; git -C "$SCRATCH" config user.name t GITDIR="$(git -C "$SCRATCH" rev-parse --absolute-git-dir)" @@ -455,8 +424,9 @@ touch "$GO6" wait "$h6" [ -e "$GITDIR/commit.lock" ] && bad "default lock file left behind after release" || ok "default lock file removed on release" [ -f "$GITDIR/git-commit-lock.log" ] && ok "lock log created in git dir ($GITDIR)" || bad "no log in git dir" +fi -echo "== Test 7: CLI usage errors exit 96 (stderr); explicit --help/-h exits 0 (stdout) ==" +if section "Test 7: CLI usage errors exit 96 (stderr); explicit --help/-h exits 0 (stdout)"; 
then bash "$LIB" >/dev/null 2>&1; [ "$?" = 96 ] && ok "no args -> 96" || bad "no args rc=$? (want 96)" bash "$LIB" frobnicate > "$WORK/t7.err.out" 2> "$WORK/t7.err.err" [ "$?" = 96 ] && ok "unknown subcommand -> 96" || bad "unknown subcommand rc=$? (want 96)" @@ -475,8 +445,9 @@ for h in --help -h; do && ok "$h -> usage on stdout, exit 0, stderr empty" \ || bad "$h rc=$rc (want 0) stdout-usage=$(grep -c '^usage:' "$WORK/t7.help.out") stderr=$(head -c 60 "$WORK/t7.help.err")" done +fi -echo "== Test 8: acquire timeout exits 97 and the command NEVER runs ==" +if section "Test 8: acquire timeout exits 97 and the command NEVER runs"; then LOCK="$WORK/tmo.lock"; LOG="$WORK/tmo.log"; : > "$LOG"; READY="$WORK/t8.ready"; DONE8="$WORK/t8.done" # Holder keeps the lock until the test says so (marker, not a fixed sleep — # under heavy load a slow-starting waiter once arrived AFTER a 4s holder had @@ -522,8 +493,9 @@ grep -q "raise AGENT_LOCK_MAX_WAIT" "$WORK/t8.warn3.err" \ || ok "explicit MAX_WAIT silences the knob-relation warning (left-default gate kept)" wait "$h8"; rc=$? 
[ "$rc" = 0 ] && ok "holder unaffected by the timed-out waiter" || bad "holder rc=$rc (want 0)" +fi -echo "== Test 9: sub-floor (pre-2000) file mtime is NOT treated as stale ==" +if section "Test 9: sub-floor (pre-2000) file mtime is NOT treated as stale"; then # The FILETIME-zero guard: a freshly created file can transiently report a 1601 # mtime to an observer on Windows (probes C/C1b); # anything before 2000-01-01 must be classed unsettled — the waiter WAITS (and @@ -539,8 +511,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ grep -q STOLE "$LOG" && bad "sub-floor lock was wrongly STOLEN" || ok "no steal of sub-floor lock" [ -f "$LOCK" ] && ok "sub-floor lock file untouched" || bad "sub-floor lock file was removed" rm -f "$LOCK" +fi -echo "== Test 10: every worktree gets its OWN lock (git-dir scoping) ==" +if section "Test 10: every worktree gets its OWN lock (git-dir scoping)"; then WTREPO="$WORK/wtrepo"; mkdir -p "$WTREPO" git -C "$WTREPO" init -q; git -C "$WTREPO" config user.email t@t; git -C "$WTREPO" config user.name t git -C "$WTREPO" commit -q --allow-empty -m init @@ -573,8 +546,9 @@ wait "$h10" [ -e "$WTGD/commit.lock" ] && bad "worktree lock left behind" || ok "worktree lock released" [ -f "$WTGD/git-commit-lock.log" ] && ok "worktree log lives in its worktree git dir" || bad "no log at $WTGD" [ -e "$MAINGD/commit.lock" ] && bad "main-repo lock left behind" || ok "main-repo lock released" +fi -echo "== Test 11: TERM mid-hold — lock released, wrapper dies with 128+15 ==" +if section "Test 11: TERM mid-hold — lock released, wrapper dies with 128+15"; then # Two discriminators: (a) the EXIT/TERM trap must actually # release the lock when the `run` wrapper is killed; (b) the wrapper must NOT # swallow the signal (a swallowing wrapper releases, keeps going, and exits 0 @@ -598,8 +572,9 @@ wait "$w11"; rc=$? 
|| bad "TERM'd run wrapper rc=$rc (want 143)" [ -e "$LOCK" ] && bad "lock left held after TERM" || ok "lock released on TERM" grep -q RELEASED "$LOG" && ok "release logged on TERM path" || bad "no RELEASED entry on TERM path" +fi -echo "== Test 12: sourced API — acquire/release, traps, strict-mode hygiene ==" +if section "Test 12: sourced API — acquire/release, traps, strict-mode hygiene"; then # 12a: sourcing must not impose errexit/nounset/pipefail; acquire/release work # across separate commands; reentrant acquire is refused (rc 1, lock kept); # release is idempotent. Distinct failure codes pinpoint the broken step. @@ -691,8 +666,9 @@ done wait "$p12"; rc=$? [ "$rc" = 143 ] && ok "post-release shell dies on TERM (143) — signal disposition restored" \ || bad "post-release shell rc=$rc on TERM (want 143; signal-immune shell?)" +fi -echo "== Test 13: garbage AGENT_LOCK_* numerics fall back to defaults with a note ==" +if section "Test 13: garbage AGENT_LOCK_* numerics fall back to defaults with a note"; then LOCK="$WORK/num.lock"; LOG="$WORK/num.log"; : > "$LOG" AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \ AGENT_LOCK_STALE_SECS=banana AGENT_LOCK_POLL_SECS=-1 AGENT_LOCK_MAX_WAIT=0 \ @@ -701,8 +677,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \ [ "$rc" = 0 ] && ok "run succeeds despite garbage numeric config" || bad "rc=$rc with garbage numerics" n="$(grep -c "ignoring invalid" "$WORK/t13.err")" [ "$n" = 4 ] && ok "all 4 garbage values noted on stderr, incl. CLAIM_STALE_SECS (got $n)" || bad "expected 4 'ignoring invalid' notes, got $n" +fi -echo "== Test 14: run outside any git repo hard-fails 96 unless AGENT_LOCK_PATH is set ==" +if section "Test 14: run outside any git repo hard-fails 96 unless AGENT_LOCK_PATH is set"; then NR="$WORK/norepo"; mkdir -p "$NR" ( cd "$NR" && env GIT_CEILING_DIRECTORIES="$WORK" bash "$LIB" run -- bash -c 'true' ) 2> "$WORK/t14.err"; rc=$? 
[ "$rc" = 96 ] && ok "run outside a repo refused with 96" || bad "run outside a repo rc=$rc (want 96)" @@ -710,8 +687,9 @@ grep -q "AGENT_LOCK_PATH" "$WORK/t14.err" && ok "refusal message mentions AGENT_ ( cd "$NR" && env GIT_CEILING_DIRECTORIES="$WORK" AGENT_LOCK_PATH="$NR/x.lock" AGENT_LOCK_LOG="$NR/x.log" \ bash "$LIB" run -- bash -c 'true' ) 2>/dev/null; rc=$? [ "$rc" = 0 ] && ok "explicit AGENT_LOCK_PATH works outside a repo" || bad "explicit AGENT_LOCK_PATH outside repo rc=$rc" +fi -echo "== Test 14b: SOURCING outside a repo warns on stderr and creates NO files ==" +if section "Test 14b: SOURCING outside a repo warns on stderr and creates NO files"; then # Sourcing keeps the CWD fallback (it must never explode), but the warning # goes to STDERR — warning via the lock log instead would, as a side # effect, CREATE ./git-commit-lock.log in whatever random directory the @@ -731,8 +709,9 @@ leftovers="$(ls -A "$NRS" 2>/dev/null)" # (There is deliberately no Test 15: the steal installs by rename-over and # never creates a move-aside (.dead.*) file, so there is no sweep to test. # An implementation must never create one; Test 2b's sampler enforces that.) +fi -echo "== Test 16: EMPTY lock file at release — unverifiable lane (2 / run:1), NOT a theft verdict ==" +if section "Test 16: EMPTY lock file at release — unverifiable lane (2 / run:1), NOT a theft verdict"; then # Truncation stands in for the probe-F window: a file that reads empty after # the retry ladder is a successor mid-create after a boundary steal, or # external truncation — it canNOT be our own failed write (acquire's @@ -760,8 +739,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \ bash "$LIB" run -- bash -c ': > "$AGENT_LOCK_PATH"; exit 7' 2>/dev/null; rc=$? 
[ "$rc" = 7 ] && ok "run keeps a failing command's own code (7) over the unverifiable 1" || bad "run empty-file+exit-7 rc=$rc (want 7)" rm -f "$LOCK" +fi -echo "== Test 16b: lock file GONE at release — definitive theft, exactly 98 ==" +if section "Test 16b: lock file GONE at release — definitive theft, exactly 98"; then # Acquire's read-back proved our # token was AT the path, so a missing file at release can only mean someone # renamed/removed it (a steal, or external interference) — report 98, loudly. @@ -780,8 +760,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" \ bash "$LIB" run -- bash -c 'rm -f "$AGENT_LOCK_PATH"' 2>/dev/null; rc=$? [ "$rc" = 98 ] && ok "run reports 98 (overrides a successful command) when the lock file is gone" \ || bad "run gone-at-release rc=$rc (want 98)" +fi -echo "== Test 16c: release rides out a TRANSIENT empty read (escalating retry ladder — ps1 parity) ==" +if section "Test 16c: release rides out a TRANSIENT empty read (escalating retry ladder — ps1 parity)"; then # A sub-second window in which the lock file reads EMPTY (stand-in for an AV # scanner's blocking handle, or a probe-F create->write gap that resolves) # must NOT produce the unverifiable verdict: the read-retry ladder (shared @@ -814,8 +795,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash -c ' grep -q "EMPTY/unreadable at release" "$WORK/t16c.err" \ && bad "spurious unverifiable warning despite the token reappearing" \ || ok "no unverifiable warning for the ridden-out transient" +fi -echo "== Test 17: NON-FILE at the lock path — never stolen, loud one-time config warning, waiters reach 97 ==" +if section "Test 17: NON-FILE at the lock path — never stolen, loud one-time config warning, waiters reach 97"; then # (a) a directory (a config typo like AGENT_LOCK_PATH=\$HOME, or a directory # lock left by an older release). 
The per-poll type guard fires regardless of # age — but only after the SAME concrete type is seen on two consecutive @@ -890,8 +872,9 @@ else rm -f "$LOCK" 2>/dev/null echo "note: mkfifo unavailable/unusable here — FIFO guard not exercised (CI POSIX legs cover it)" fi +fi -echo "== Test 17d: REGRESSION — create/delete churn at the lock path must NOT fire the non-lock warning ==" +if section "Test 17d: REGRESSION — create/delete churn at the lock path must NOT fire the non-lock warning"; then # The per-poll guard's existence (-e/-L) and classification (-f && ! -L) # checks are SEPARATE stats. A rival's release/steal unlink landing between # them — or a Windows delete-pending ghost (the unlink queues behind a rival @@ -962,26 +945,58 @@ if [ -n "$churn_pid" ]; then # never churned, so bash sees it reliably. Budget 60s: pwsh cold start on # a loaded box can take >15s. if wait_for_file "$START" 60; then - warn17d=0; got97=0 + # Per-waiter lock logs (single-writer => drop-free): a SHARED log drops lines + # under concurrent appends (cf. the per-waiter logs at Test 2B), which would make + # the WAITING anti-vacuity count below unreliable. Rebuilt into $LOG after the runs. + warn17d=0; n0=0; n1=0; n97=0; n98=0; nother=0; rc_bad="" for r in 1 2 3; do pids=() for i in 1 2 3 4; do - AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=300 \ + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/t17d.$r.$i.log" AGENT_LOCK_STALE_SECS=300 \ AGENT_LOCK_POLL_SECS=0.02 AGENT_LOCK_MAX_WAIT=2 \ bash "$LIB" run -- bash -c 'true' 2> "$WORK/t17d.$r.$i.err" & pids+=($!) done for i in 1 2 3 4; do wait "${pids[$((i-1))]}"; rc=$? - [ "$rc" = 97 ] && got97=$((got97+1)) + # A CLEAN command ('true') under this churn has exactly FOUR correct terminal + # codes — do NOT tighten this set: rc 1 is the real catch that made the old + # got97>=1 assertion flaky (see the Test 17d de-flake plan). 
+ # 0 acquired in an absent window, clean release + # 1 acquired, but release read the held lock EMPTY (the churner's + # create->write window) -> release rc 2 -> lock_run demotes the clean + # command to 1 (ownership unverifiable; correct, not a defect) + # 97 never won an absent window within MAX_WAIT -> timed out + # 98 churner overwrote the hold before release -> designed theft detection + case "$rc" in + 0) n0=$((n0+1)) ;; + 1) n1=$((n1+1)) ;; + 97) n97=$((n97+1)) ;; + 98) n98=$((n98+1)) ;; + *) nother=$((nother+1)); rc_bad="$rc_bad $r.$i=$rc" ;; + esac n="$(grep -c 'is not a lock file' "$WORK/t17d.$r.$i.err")" warn17d=$((warn17d+n)) done done + # Rebuild the consolidated churn.log artifact from the drop-free per-waiter logs. + # 'cat glob > file' is a redirect, not a pipe (no SC2002); then count WAITING from + # the single rebuilt file. + cat "$WORK"/t17d.*.log > "$LOG" 2>/dev/null || : + waited="$(grep -c 'WAITING for lock' "$LOG")" + echo "note: T17d outcomes rc0=$n0 rc1=$n1 rc97=$n97 rc98=$n98 other=$nother; WAITING=$waited" [ "$warn17d" = 0 ] && ok "12 waiters polled through churn with ZERO spurious non-lock warnings" \ || bad "churned regular file fired $warn17d non-lock warning(s) — per-poll guard TOCTOU regression!" - [ "$got97" -ge 1 ] && ok "waiters still timed out at 97 under churn ($got97/12)" \ - || bad "no waiter reached 97 under churn (got97=$got97/12) — timeout lane bypassed?" + # Replaces the old got97>=1 assertion (timeout is only ONE of the correct outcomes; + # which one occurs is machine-speed luck). Assert each waiter reached a DESIGNED + # terminal state instead — catches a real product regression (crash/139, 96, …). 
+ [ "$nother" = 0 ] && ok "all 12 waiters reached a designed terminal state (rc in {0,1,97,98})" \ + || bad "waiter(s) hit an undesigned rc under churn:$rc_bad (rc0=$n0 rc1=$n1 rc97=$n97 rc98=$n98)" + # Anti-vacuity: WAITING is logged only after a create was blocked by a PRESENT lock, + # immediately before the per-poll type guard that warn17d guards — so >=1 proves the + # churn produced real contention and the guarded path ran. 0 => dead/absent churner. + [ "$waited" -ge 1 ] && ok "churn exercised the blocked-poll type-guard lane ($waited WAITING line(s))" \ + || bad "no WAITING logged under churn — contention never happened; test ran vacuously" else bad "T17d churner never signalled its start marker" echo " diag: churner pid=$churn_pid alive=$(kill -0 "$churn_pid" 2>/dev/null && echo yes || echo no)" @@ -998,8 +1013,9 @@ if [ -n "$churn_pid" ]; then else echo "note: $churn_skip — churn-vs-guard regression not exercised here (CI legs cover it)" fi +fi -echo "== Test 18: stale NON-LOCK CONTENT at the lock path is never stolen; torn tokens split on the tok. prefix ==" +if section "Test 18: stale NON-LOCK CONTENT at the lock path is never stolen; torn tokens split on the tok. prefix"; then # The content guard (age-gated): steal only an empty file or a line 1 starting # "tok.". A real user file at a typo'd AGENT_LOCK_PATH must survive, forever. 
# (a) a user file @@ -1042,8 +1058,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 \ && ok "tok.-prefixed torn token IS stolen by staleness (crash-orphan lane)" \ || bad "tok.-prefixed torn token not stolen (rc=$rc marker=$(cat "$MARKER"))" grep -q STOLE "$LOG" && ok "steal of the torn token logged" || bad "no STOLE entry for torn token" +fi -echo "== Test 19: wire format — token on line 1 (tok.-prefixed), owner on line 2 ==" +if section "Test 19: wire format — token on line 1 (tok.-prefixed), owner on line 2"; then # Pins the on-disk format the ps1 port must match, and that token parsing # takes LINE 1 only (an owner line present must not pollute the token). LOCK="$WORK/wire.lock"; LOG="$WORK/wire.log"; : > "$LOG" @@ -1059,43 +1076,86 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash -c ' ' _ "$LIB" "$LOCK"; rc=$? [ "$rc" = 0 ] && ok "lock file carries token (line 1, tok.-prefixed) + owner (line 2); release parses line 1 with owner present" \ || bad "wire-format check failed at step code $rc" +fi -echo "== Test 20: claim contention — N concurrent stealers, ONE claim winner ($GCL_MODE: $T20_N workers) ==" +if section "Test 20: claim contention — N concurrent stealers, ONE claim winner ($GCL_MODE: $T20_N workers)"; then # N stealers race one ancient ghost: exactly one wins the O_EXCL claim and # steals (one STOLE-BY-CLAIM); the rest lose the claim create and acquire # normally in sequence after the winner releases. No displacement (zero -# LOST/98), no leftovers. STALE=5 keeps a loaded box from re-stealing the -# winner's brief hold. +# LOST/98), no leftovers. STALE keeps a loaded box from re-stealing the +# winner's brief hold — that bound only holds while STALE >> the winner's +# effective hold, which (counter-intuitively) grows with N: the WINNER is one +# of N concurrently-spawned bash processes, so under oversubscription the wall +# time between its create and its release stretches with the contention. 
So +# STALE must scale with N too (see t20_stale below), keeping "exactly one +# steal" a strict, config-independent correctness invariant at every N. +# +# Waiter count is swept (see load-testing-strategy.md). Unlike Test 2b/16, this test's floor is NOT +# 4 — it is the MODE-driven $T20_N (5 REDUCED / 10 FULL), the count CI already +# stresses. So instead of iterating the shared T_AXIS_A ("4 ...") it builds its +# own list: just $T20_N by default (byte-identical), and $T20_N plus the sweep's +# higher counts (12, 24) under GCL_TEST_SWEEP=1 — preserving today's per-PR AND +# full-mode coverage while still widening the sweep. MAX_WAIT scales 30*N (the +# workers run `true`, so this is ample headroom, never the floor's behaviour). LOCK="$WORK/contend.lock" -fabricate_lock "$LOCK" "tok.ghost.t20" "pid=888 host=ghost" +T20_FLOOR="$T20_N" +if [ "$GCL_TEST_SWEEP" = 1 ]; then + T20_AXIS="$T20_FLOOR" + for _n in 12 24; do [ "$_n" = "$T20_FLOOR" ] || T20_AXIS="$T20_AXIS $_n"; done +else + T20_AXIS="$T20_FLOOR" +fi +for T20_N in $T20_AXIS; do +# N-tag for assertion messages: empty in the default run (byte-identical), set +# only when sweeping so each N's pass/fail line is attributable. +if [ "$GCL_TEST_SWEEP" = 1 ]; then t20_ntag=" at N=$T20_N"; else t20_ntag=""; fi +# MAX_WAIT and STALE: keep today's exact values (120 / 5) in the default +# (non-sweep) run so the env passed to the library is byte-identical; only the +# sweep's wider N raise them. MAX_WAIT scales 30*N (workers run `true`, ample +# headroom). STALE floors to N so a wide fan-out's load-stretched winner hold +# can NEVER make a live lock look stale -> the "exactly one steal" invariant +# stays true at N=24 just as at the floor. The fixture ghost token likewise +# carries an N segment only when sweeping (distinct tokens per N), so the +# default lock CONTENT the library sees is unchanged too. 
+if [ "$GCL_TEST_SWEEP" = 1 ]; then + t20_maxwait=$(( 30 * T20_N )) + [ "$T20_N" -gt 5 ] && t20_stale="$T20_N" || t20_stale=5 + t20_ghost="tok.ghost.t20.$T20_N" +else + t20_maxwait=120; t20_stale=5; t20_ghost="tok.ghost.t20" +fi +rm -f "$WORK/contend".*.log "$LOCK" "$LOCK.next" +fabricate_lock "$LOCK" "$t20_ghost" "pid=888 host=ghost" backdate "$LOCK" 9999 pids=(); t20_fail=0 for i in $(seq 1 "$T20_N"); do : > "$WORK/contend.$i.log" - AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/contend.$i.log" AGENT_LOCK_STALE_SECS=5 \ - AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT=120 \ + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$WORK/contend.$i.log" AGENT_LOCK_STALE_SECS="$t20_stale" \ + AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.05 AGENT_LOCK_MAX_WAIT="$t20_maxwait" \ bash "$LIB" run -- bash -c 'true' 2>/dev/null & pids+=($!) done for i in $(seq 1 "$T20_N"); do wait "${pids[$((i-1))]}"; rc=$? - [ "$rc" = 0 ] || { t20_fail=1; echo " worker $i rc=$rc (want 0)"; } + [ "$rc" = 0 ] || { t20_fail=1; echo " N=$T20_N worker $i rc=$rc (want 0)"; } done cat "$WORK/contend."*.log > "$WORK/contend.all.log" nst="$(grep -c "STOLE-BY-CLAIM" "$WORK/contend.all.log")" nacq="$(grep -c "ACQUIRED" "$WORK/contend.all.log")" nrel="$(grep -c "RELEASED" "$WORK/contend.all.log")" nlost="$(grep -c "lock LOST" "$WORK/contend.all.log")" -[ "$t20_fail" = 0 ] && ok "$T20_N concurrent stealers all completed with rc 0" || bad "claim-contention worker failures (see above)" -[ "$nst" = 1 ] && ok "exactly ONE claim winner stole the ghost (STOLE-BY-CLAIM x$nst)" \ - || bad "STOLE-BY-CLAIM x$nst (want exactly 1 — the claim must serialize stealers)" +[ "$t20_fail" = 0 ] && ok "$T20_N concurrent stealers all completed with rc 0" || bad "claim-contention worker failures$t20_ntag (see above)" +[ "$nst" = 1 ] && ok "exactly ONE claim winner stole the ghost$t20_ntag (STOLE-BY-CLAIM x$nst)" \ + || bad "STOLE-BY-CLAIM x$nst$t20_ntag (want exactly 1 — the claim must serialize 
stealers)" [ "$nacq" = "$T20_N" ] && [ "$nrel" = "$T20_N" ] && ok "balanced ACQUIRED/RELEASED ($nacq/$nrel of $T20_N)" \ - || bad "ACQUIRED=$nacq RELEASED=$nrel (want $T20_N each)" -[ "$nlost" = 0 ] && ok "zero LOST warnings under claim contention" || bad "$nlost LOST warnings under claim contention" -[ -e "$LOCK" ] && bad "leftover lock after contention" || ok "no leftover lock" -[ -e "$LOCK.next" ] && bad "leftover claim after contention" || ok "no leftover claim" + || bad "ACQUIRED=$nacq RELEASED=$nrel$t20_ntag (want $T20_N each)" +[ "$nlost" = 0 ] && ok "zero LOST warnings under claim contention$t20_ntag" || bad "$nlost LOST warnings under claim contention$t20_ntag" +[ -e "$LOCK" ] && bad "leftover lock after contention$t20_ntag" || ok "no leftover lock$t20_ntag" +[ -e "$LOCK.next" ] && bad "leftover claim after contention$t20_ntag" || ok "no leftover claim$t20_ntag" +done +fi -echo "== Test 21: crashed-claimant and empty-claim orphans age out; steals resume ==" +if section "Test 21: crashed-claimant and empty-claim orphans age out; steals resume"; then # (a) an aged foreign claim (crashed claimant): cleared by CLAIM-STALE-CLEARED, # then the steal completes; recovery latency bounded. 
LOCK="$WORK/cc.lock"; LOG="$WORK/cc.log"; : > "$LOG" @@ -1109,7 +1169,7 @@ t21_t1=$(date +%s) [ "$rc" = 0 ] && ok "waiter recovered through a crashed claimant's claim (rc 0)" || bad "rc=$rc behind a crashed claim" grep -q "CLAIM-STALE-CLEARED" "$LOG" && ok "aged claim cleared (CLAIM-STALE-CLEARED logged, with age)" || bad "no CLAIM-STALE-CLEARED entry" grep -q "STOLE-BY-CLAIM" "$LOG" && ok "steal completed after the clear" || bad "no STOLE-BY-CLAIM after clearing the crashed claim" -[ $((t21_t1 - t21_t0)) -le 20 ] && ok "recovery latency bounded ($((t21_t1 - t21_t0))s)" || bad "recovery took $((t21_t1 - t21_t0))s (>20s)" +[ $((t21_t1 - t21_t0)) -le 20 ] && ok_envelope "recovery latency bounded ($((t21_t1 - t21_t0))s)" || bad_envelope "recovery took $((t21_t1 - t21_t0))s (>20s)" [ -e "$LOCK.next" ] && bad "claim leftover after recovery" || ok "claim path clean after recovery" # (b) an EMPTY claim file (claimant died between create and write): same lane. LOCK="$WORK/ccempty.lock"; LOG="$WORK/ccempty.log"; : > "$LOG" @@ -1120,8 +1180,9 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ bash "$LIB" run -- bash -c 'true' 2>/dev/null; rc=$? [ "$rc" = 0 ] && ok "empty claim orphan aged out and recovery completed (rc 0)" || bad "rc=$rc behind an empty claim orphan" grep -q "CLAIM-STALE-CLEARED" "$LOG" && ok "empty claim cleared via the same staleness lane" || bad "empty claim was not cleared" +fi -echo "== Test 22: NON-CLAIM objects at the claim path — never deleted, per-path warn state ==" +if section "Test 22: NON-CLAIM objects at the claim path — never deleted, per-path warn state"; then # (a) a directory at ${LOCK}.next blocks steals (waiter reaches 97), is never # deleted, and warns once naming the claim path. LOCK="$WORK/cwt.lock"; LOG="$WORK/cwt.log"; : > "$LOG" @@ -1132,10 +1193,16 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ bash "$LIB" run -- bash -c 'true' 2> "$WORK/t22a.err"; rc=$? 
[ "$rc" = 97 ] && ok "dir at claim path: steals blocked, waiter timed out (97)" || bad "dir at claim path: rc=$rc (want 97)" [ -f "$LOCK.next/sub/f" ] && ok "directory at claim path untouched" || bad "directory at claim path was damaged!" -grep -q "is not a claim file" "$WORK/t22a.err" && ok "loud claim-path config warning on stderr" || bad "no claim-path config warning" -grep -q "it is a directory" "$WORK/t22a.err" && ok "claim warning names the detected type (directory)" || bad "claim warning does not name the type" n="$(grep -c "is not a claim file" "$WORK/t22a.err")" -[ "$n" = 1 ] && ok "claim-path warning fired exactly once (got $n)" || bad "claim-path warning fired $n times (want 1)" +# "warning fired at all" is timing-dependent (the two-poll confirmation needs poll +# headroom before MAX_WAIT, which an oversubscribed runner can starve) -> envelope. +# The warn-once dedup (never >1) and the type-naming are CORRECTNESS -> strict (the +# latter only asserted when a warning actually fired). +[ "$n" -ge 1 ] && ok_envelope "claim-path config warning fired (got $n)" || bad_envelope "no claim-path config warning (n=$n)" +[ "$n" -le 1 ] && ok "claim-path warning not duplicated (n=$n)" || bad "claim-path warning fired $n times (warn-once broken)" +if [ "$n" -ge 1 ]; then + grep -q "it is a directory" "$WORK/t22a.err" && ok "claim warning names the detected type (directory)" || bad "claim warning does not name the type" +fi grep -q "STOLE-BY-CLAIM" "$LOG" && bad "stole despite a squatted claim path" || ok "no steal through a squatted claim path" [ -f "$LOCK" ] && ok "stale lock left in place (cannot be stolen safely)" || bad "lock vanished behind a squatted claim path" # (b) a free LOCK path is UNaffected by claim-path junk: normal acquire works. 
@@ -1228,8 +1295,9 @@ AGENT_LOCK_PATH="$PPD2/c1.lock" AGENT_LOCK_LOG="$PPD2/ppg2.log" AGENT_LOCK_STALE grep -q "is not a claim file" "$PPD2/ba.err" && grep -q "is not a lock file" "$PPD2/ba.err" \ && ok "claim-path warning did not suppress the lock-path warning (reverse order)" \ || bad "lock-path warning suppressed after a claim-path warning (shared warn-once state?)" +fi -echo "== Test 23: live-slow holder — re-verify under the claim sees a fresh lock, CLAIM-ABORT (fresh), no steal ==" +if section "Test 23: live-slow holder — re-verify under the claim sees a fresh lock, CLAIM-ABORT (fresh), no steal"; then # Steered deterministically: the lock's mtime is renewed (as a live-slow # holder's re-create/renewal would) at the exact step-2 re-verify position, # via a sourced shell that wraps the library's verify internal. The claimant @@ -1260,8 +1328,9 @@ wait "$w23"; rc=$? [ "$rc" = 0 ] && ok "waiter then acquired and released normally (rc 0)" || bad "waiter rc=$rc after the slow holder released" grep -q "STOLE-BY-CLAIM" "$LOG" && bad "live lock was STOLEN despite the fresh re-verify" || ok "no steal of the live-slow holder's lock" [ -e "$LOCK.next" ] && bad "claim leftover after the fresh abort" || ok "claim deleted on the fresh abort" +fi -echo "== Test 24: OVERAGED own claim — CLAIM-ABORT (contested), no rename ==" +if section "Test 24: OVERAGED own claim — CLAIM-ABORT (contested), no rename"; then # A suspended claimant's recheck must refuse to proceed on its own overaged # claim (a clearer may be acting on it). Steered: every recheck sees the # claim backdated past CLAIM_STALE. 
Mutation check: an implementation that @@ -1287,8 +1356,9 @@ l1=""; IFS= read -r l1 < "$LOCK" || true [ "$l1" = "tok.ghost.t24" ] && ok "ghost lock untouched by the contested aborts" || bad "ghost lock was modified (line1=$l1)" [ -e "$LOCK.next" ] && bad "claim leftover after contested aborts" || ok "claim deleted on each contested abort" rm -f "$LOCK" +fi -echo "== Test 25: discovery-position matrix — own-claim-installed discovered on EVERY exit ==" +if section "Test 25: discovery-position matrix — own-claim-installed discovered on EVERY exit"; then # A rival's rename can install OUR claim as the lock while we sit at any # post-claim position. Each position steers that rename to the exact spot # (wrapping a library internal or shadowing mv/rm/touch in a sourced shell) @@ -1391,8 +1461,9 @@ for pos in step2-fresh recheck-gone touch-gone lock-gone contested deletion-gone bad "position $pos: rc=$rc discovery=$(grep -c DISCOVERY-HOLD "$LOG") expect-line=$(grep -cF "$expect" "$LOG") lock-left=$([ -e "$LOCK" ] && echo yes || echo no) claim-left=$([ -e "$LOCK.next" ] && echo yes || echo no)" fi done +fi -echo "== Test 26: delayed claim still installs a FRESH lease (the pre-rename touch) ==" +if section "Test 26: delayed claim still installs a FRESH lease (the pre-rename touch)"; then # A claim aged close to CLAIM_STALE (steered: backdated 40s of 60 at the # recheck) must still install a lock whose mtime is ~now — the step-3.2 # touch resets the clock; rename preserves it (probe R2). 
A no-touch @@ -1423,8 +1494,9 @@ case "$rc" in *) bad "delayed-claim lease harness rc=$rc" ;; esac grep -q "STOLE-BY-CLAIM" "$LOG" && ok "the delayed claim still completed its steal" || bad "no STOLE-BY-CLAIM in the lease test" +fi -echo "== Test 27: lock GONE at re-verify — CLAIM-ABORT (gone), NO rename onto the absent path ==" +if section "Test 27: lock GONE at re-verify — CLAIM-ABORT (gone), NO rename onto the absent path"; then # A live-slow holder releasing under a claimant must route to the normal # create race, never a rename onto the absent path. Mutation check: a # renaming implementation would install the CLAIM token; the correct one @@ -1455,8 +1527,9 @@ else bad "claim token vs acquired token: claim='$ctok' acquired='$atok' (equal or missing => renamed onto the absent path?)" fi grep -q "DISCOVERY-HOLD" "$LOG" && bad "spurious discovery-HOLD in the gone lane" || ok "no spurious discovery-HOLD" +fi -echo "== Test 28: SUB-FLOOR claim mtime is never cleared — treated as just-created ==" +if section "Test 28: SUB-FLOOR claim mtime is never cleared — treated as just-created"; then LOCK="$WORK/cfloor.lock" LOG="$WORK/cfloor.log" : >"$LOG" @@ -1472,8 +1545,9 @@ grep -q "CLAIM-STALE-CLEARED" "$LOG" && bad "sub-floor claim was CLEARED — mti || ok "sub-floor claim never cleared (floor applies to the claim)" [ -f "$LOCK.next" ] && ok "sub-floor claim file untouched" || bad "sub-floor claim file was removed" rm -f "$LOCK" "$LOCK.next" +fi -echo "== Test 29: BLOCKED steal rename — claim deleted IMMEDIATELY, no CLAIM_STALE penalty ==" +if section "Test 29: BLOCKED steal rename — claim deleted IMMEDIATELY, no CLAIM_STALE penalty"; then # The rename is forced to fail-with-the-lock-still-present (a shadowed mv — # the no-delete-share squat, deterministically). 
The claimant must delete its # own claim at once and re-poll: with CLAIM_STALE=600, a leftover claim would @@ -1496,14 +1570,15 @@ AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ ' _ "$LIB" 2>/dev/null; rc=$? [ "$rc" = 97 ] && ok "blocked-steal waiter honoured MAX_WAIT (97)" || bad "blocked-steal rc=$rc (want 97)" nclaim="$(grep -c "] CLAIM " "$LOG")" -[ "$nclaim" -ge 2 ] && ok "claim re-created on later attempts (x$nclaim) — deleted immediately, no ageout penalty" \ - || bad "only $nclaim CLAIM line(s) — the failed steal's claim was left to age out (60s-class penalty)" +[ "$nclaim" -ge 2 ] && ok_envelope "claim re-created on later attempts (x$nclaim) — deleted immediately, no ageout penalty" \ + || bad_envelope "only $nclaim CLAIM line(s) — the failed steal's claim was left to age out (60s-class penalty)" grep -q "steal FAILED" "$LOG" && ok "blocked rename logged (damped steal FAILED)" || bad "no steal FAILED log line" [ -e "$LOCK.next" ] && bad "claim leftover after the blocked steal attempts" || ok "no claim leftover at exit" [ -f "$LOCK" ] && ok "squatted lock left in place" || bad "lock vanished in the blocked lane" rm -f "$LOCK" +fi -echo "== Test 30: static checks — the claim touch is NON-creating with an explicit existence check ==" +if section "Test 30: static checks — the claim touch is NON-creating with an explicit existence check"; then grep -q 'touch -c -- "\$_LOCK_CLAIM_PATH"' "$LIB" \ && ok "claim touch uses 'touch -c --' (non-creating)" \ || bad "no 'touch -c -- \$_LOCK_CLAIM_PATH' in the implementation" @@ -1513,11 +1588,22 @@ grep -A3 'touch -c -- "\$_LOCK_CLAIM_PATH"' "$LIB" | grep -q -- '-e "\$_LOCK_CLA bad_touch="$(grep 'touch ' "$LIB" | grep '_LOCK_CLAIM_PATH' | grep -v -- '-c')" [ -z "$bad_touch" ] && ok "no creating touch of the claim path anywhere" \ || bad "creating touch of the claim path found: $bad_touch" +fi -echo "== Test 31: LEAKED-claim discovery — the leaked-token memory closes the unverified-claim lanes ==" 
+if section "Test 31: LEAKED-claim discovery — the leaked-token memory closes the unverified-claim lanes"; then # (a) main leg: a recheck-unreadable exit leaks the claim token; a rival -# later installs that claim as the lock; the leaver's per-poll memory check -# adopts it (HOLD) and release returns 0. +# (the external mv below) then installs that claim as the lock; the leaver +# adopts it (HOLD) and release returns 0. Adoption may go through EITHER of +# the product's two discovery routes — both correct: the inline +# ownership-discovery read that is the unreadable branch's final act +# (git-commit-lock.sh:822, "DISCOVERY-HOLD: ...") if the external mv lands +# before it, or the per-poll leaked-token-memory check +# (git-commit-lock.sh:1382, "DISCOVERY-HOLD (leaked-token memory)") on a later +# poll if it lands after. Which wins is a pure scheduling race — the external +# mv vs the leaver's inline discover ONE statement later (sh:1112 leak-add -> +# sh:1114 discover) — and is load-sensitive, so this leg accepts either and +# records which fired. The memory route is pinned DETERMINISTICALLY by +# sub-leg (b) below; the direct route by Test 25's discovery-position matrix. # NB: _lock_read_tok / _lock_cur_token shadows run inside COMMAND # SUBSTITUTIONS (subshells), so their fire-once state must live in flag # FILES — a variable assignment would be lost when the subshell exits. @@ -1547,8 +1633,18 @@ else fi wait "$w31"; rc=$? [ "$rc" = 0 ] && ok "leaver discovered its installed leaked claim and released rc 0" || bad "leaked-discovery harness rc=$rc" -grep -q "DISCOVERY-HOLD (leaked-token memory)" "$LOG" && ok "adoption went through the leaked-token memory" \ - || bad "no leaked-token-memory DISCOVERY-HOLD" +# Either discovery route is correct here (see the leg comment); accept both, +# record which fired, fail only if NEITHER adopted the leaked claim. ("$LOG" +# is dedicated to this leg, so there is no cross-talk.) 
"DISCOVERY-HOLD:" +# (immediate colon) matches ONLY the direct route; the memory route reads +# "DISCOVERY-HOLD (leaked-token memory):" — disjoint, and checked first. +if grep -q "DISCOVERY-HOLD (leaked-token memory)" "$LOG"; then + ok "adoption went through the leaked-token memory (per-poll route; the mv landed after the inline discover)" +elif grep -q "DISCOVERY-HOLD:" "$LOG"; then + ok "adoption went through the inline ownership-discovery read (direct route; the mv landed first) — memory route pinned by sub-leg (b)" +else + bad "no DISCOVERY-HOLD adoption of the leaked claim by EITHER route" +fi [ -e "$LOCK" ] && bad "lock leftover after leaked-claim adoption" || ok "lock released cleanly after adoption" [ -e "$LOCK.next" ] && bad "claim leftover after leaked-claim adoption" || ok "no claim leftover" # Hmm wait: STALE=300 — the ghost is backdated 9999 so it IS stale; fine. @@ -1704,8 +1800,9 @@ case "$(uname -s 2>/dev/null)" in echo "note: the blocked-unlink feeder leg is Windows-only by construction (POSIX open handles never block unlink); the read-shadow legs above cover the memory machinery" ;; esac +fi -echo "== Test 32: per-attempt tokens — an abandoned own-token lock never aliases discovery or release ==" +if section "Test 32: per-attempt tokens — an abandoned own-token lock never aliases discovery or release"; then # Walk: the first CREATE's read-back is forced blank (and the abandoned lock # backdated stale). 
A later CLAIM attempt is steered into a recheck-gone # discovery against that abandoned lock: a reused-per-acquire-token @@ -1748,8 +1845,63 @@ grep -q "DISCOVERY-HOLD" "$LOG" && bad "FALSE discovery-HOLD on the abandoned ow || ok "no false discovery-HOLD — the abandoned token did not alias the claim attempt" grep -q "STOLE-BY-CLAIM" "$LOG" && ok "the abandoned lock was then reclaimed by a normal steal" \ || bad "no STOLE-BY-CLAIM of the abandoned lock" +fi + +if section "Test 32b: steal-path read-back FAILED — rename-over WON but the lock did not read back our token (F2)"; then +# The steal-path twin of Test 32. Here the stealer WINS the claim race AND wins +# the rename-over (STOLE-BY-CLAIM is logged, the ghost is destroyed), but the +# mandatory post-rename read-back verification (git-commit-lock.sh:1171) comes +# back wrong. The product must NOT take the hold: it clears its claim token and +# re-enters the wait loop (git-commit-lock.sh:1176-1179) — never a silent +# false-hold (which, after a STOLE-BY-CLAIM, would mean a mis-attributed hold of +# a destroyed-ghost path). We fault-inject the read-back with a one-shot +# _lock_cur_token shadow gated on the claim token being SET (the INVERSE of Test +# 32's `-z` gate), so it lands at the STEAL read-back (claim token live, not yet +# held), not the create one. On firing we also backdate the just-installed +# abandoned lock stale so the re-steal is immediate (same trick as Test 32 — +# keeps it fast and deterministic). Attempt 2 (shadow spent) reads back the real +# token and acquires normally. 
+LOCK="$WORK/stealrb.lock"; LOG="$WORK/stealrb.log"; : > "$LOG" +fabricate_lock "$LOCK" "tok.ghost.t32b" "pid=9 host=ghost"; backdate "$LOCK" 9999 +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=5 \ + AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \ + bash -c ' + source "$1" || exit 70 + clone_fn _lock_cur_token _ct_orig + SF1="$AGENT_LOCK_PATH.steer1" # flag FILE: the cur_token shadow runs in subshells + _lock_cur_token() { + if [ ! -e "$SF1" ] && [ "${_LOCK_HELD:-0}" = 0 ] && [ -n "$_LOCK_CLAIM_TOKEN" ]; then + : > "$SF1" + backdate "$AGENT_LOCK_PATH" 9999 2>/dev/null || true + printf "" + return 0 + fi + _ct_orig "$@" + } + lock_acquire || exit 72 + lock_release || exit 74 + exit 0 + ' _ "$LIB" 2>/dev/null; rc=$? +[ "$rc" = 0 ] && ok "steal read-back failure re-entered wait; a later steal acquired and released rc 0" \ + || bad "steal-readback harness rc=$rc" +grep -q "steal rename completed but read-back" "$LOG" \ + && ok "the steal-path read-back-verification failure lane ran (F2)" \ + || bad "F2 lane never ran (the read-back fault did not land at the steal read-back)" +nstole="$(grep -c "STOLE-BY-CLAIM" "$LOG")" +[ "$nstole" -ge 2 ] && ok "re-stole after the failed read-back (STOLE-BY-CLAIM x$nstole)" \ + || bad "expected >=2 STOLE-BY-CLAIM (won-rename then re-steal), got $nstole" +warn_line="$(grep -n "steal rename completed but read-back" "$LOG" | head -1 | cut -d: -f1)" +acq_line="$(grep -n "ACQUIRED " "$LOG" | tail -1 | cut -d: -f1)" +if [ -n "$warn_line" ] && [ -n "$acq_line" ] && [ "$warn_line" -lt "$acq_line" ]; then + ok "no false-hold: the read-back WARNING preceded the eventual ACQUIRED" +else + bad "ordering: expected the F2 WARNING (line $warn_line) before ACQUIRED (line $acq_line)" +fi +[ -e "$LOCK" ] && bad "lock leftover after the steal-readback walk" || ok "lock released cleanly" +[ -e "$LOCK.next" ] && bad "claim leftover after the steal-readback walk" || ok "no claim leftover" +fi -echo "== 
Test 33: TERM mid-claim — the trap deletes the claim (token-checked), no 98, no ageout penalty ==" +if section "Test 33: TERM mid-claim — the trap deletes the claim (token-checked), no 98, no ageout penalty"; then # (a) main: claimant paused inside its claim window (at the touch), TERM'd. # The trap must delete OUR claim, run the discovery read (miss: the ghost is # foreign), restore traps, re-raise (143) — and must NOT touch the lock. @@ -1880,8 +2032,9 @@ case "$(uname -s 2>/dev/null)" in echo "note: TERM-blocked-unlink leg is Windows-only by construction (POSIX open handles never block unlink)" ;; esac +fi -echo "== Test 34: TERM on a STEAL-acquired hold releases exactly like a create-acquired one ==" +if section "Test 34: TERM on a STEAL-acquired hold releases exactly like a create-acquired one"; then # All acquisition paths go through the shared claim-the-hold helper, so a # steal-acquired holder must run the same HELD/trap machinery: release on # TERM, re-raise, 143 (T11's contract, on a steal-acquired hold). @@ -1904,8 +2057,9 @@ wait "$w34"; rc=$? [ "$rc" = 143 ] && ok "TERM'd steal-acquired holder exited 143 (signal re-raised)" || bad "steal-acquired TERM rc=$rc (want 143)" [ -e "$LOCK" ] && bad "lock left held after TERM on a steal-acquired hold" || ok "steal-acquired lock released on TERM" grep -q "RELEASED" "$LOG" && ok "release logged on the steal-acquired TERM path" || bad "no RELEASED entry for the steal-acquired hold" +fi -echo "== Test 35: release-time leaked-claim cleanup — displaced hold cleans its own installed leak, 98 ==" +if section "Test 35: release-time leaked-claim cleanup — displaced hold cleans its own installed leak, 98"; then # (a) B leaks token L (recheck-unreadable; the ghost vanishes at the same # moment), acquires fresh N normally; a rival installs L over the lock, # displacing B's held N. 
B's release must return 98 AND unlink L (the lock @@ -1998,8 +2152,9 @@ esac grep -q "RELEASE-CLEANED-LEAKED-CLAIM" "$LOG" && bad "boundary variant wrongly logged a leaked-claim cleanup" \ || ok "no cleanup line when the re-read backed off" rm -f "$LOCK" "$LOCK.next" "$WORK/t35b.succ" +fi -echo "== Test 36: arc-end resolution pass — an INCONCLUSIVE lock read keeps the entry pending; conclusive ones drop it ==" +if section "Test 36: arc-end resolution pass — an INCONCLUSIVE lock read keeps the entry pending; conclusive ones drop it"; then # The pass's entry-drop is gated on one lock-path read. That read resolves # the entry ONLY when it is conclusive: a DIFFERENT readable token, or the # path definitively absent. A lock PRESENT but unreadable/empty proves @@ -2057,6 +2212,943 @@ grep -q "DISCOVERY-HOLD (leaked-token memory)" "$LOG" && ok "the surviving entry grep -q "resolved tok=tok.leak.t36.2" "$LOG" && ok "conclusive resolution logged for the dropped entry" \ || bad "no resolution log line for the conclusive drop" rm -f "$LOCK" "$LOCK.next" +fi + +if section "Test 37: rename-refused — a directory appearing at the lock path mid-steal aborts the steal, no false hold"; then +# The only acquire/steal VERDICT branch with no test: a NON-regular object (a +# directory) appears AT the lock path between the claimant's final re-verify +# (step 3.3, sees a stale FILE) and its rename-over, so the rename is refused +# with the lock path occupied by a non-file. The claimant must classify this +# as rename-refused (non-file at the lock path), delete its claim, take NO +# hold, and re-poll to MAX_WAIT. Steered deterministically by shadowing mv: +# the claim->lock rename (the `.next` move) is intercepted to swap the stale +# lock FILE for a DIRECTORY at the lock path, then the real `mv -T` runs and +# fails NATURALLY (mv refuses to overwrite a directory with a non-directory) — +# exactly the wrong-type rename lane. 
The verifies don't call mv, so the lock +# reads as a stale file through step 3.3; only the rename sees the directory. +# Mutation check: an implementation that mis-classifies the refused rename +# (e.g. treats it as blocked, or proceeds to STOLE-BY-CLAIM) fails the +# no-false-hold / rename-refused assertions below. +LOCK="$WORK/renref.lock"; LOG="$WORK/renref.log"; : > "$LOG" +fabricate_lock "$LOCK" "tok.ghost.t37" "pid=9 host=ghost"; backdate "$LOCK" 9999 +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.2 AGENT_LOCK_MAX_WAIT=3 \ + bash -c ' + source "$1" || exit 70 + # Make a DIRECTORY appear at the lock path BEFORE the real rename-over runs, + # by wrapping _lock_rename_over (NOT by shadowing mv). It is refused PORTABLY: + # GNU "mv -T" refuses to overwrite a directory with a non-directory, AND the + # no-mv-T fallback [ -d ] guard (BSD/macOS) refuses it too. A mv shadow that + # mkdirs the dir INSIDE the mv call works only on GNU: it lands AFTER the + # fallback [ -d ] check, and BSD "mv file dir" MOVES the file INTO the dir + # rather than erroring (this failed the macOS CI leg). NB: no apostrophes + # here -- this comment lives inside the bash -c single-quoted steering shell. + clone_fn _lock_rename_over _ro_orig + _lock_rename_over() { + command rm -f -- "$AGENT_LOCK_PATH" 2>/dev/null + command mkdir -- "$AGENT_LOCK_PATH" 2>/dev/null + _ro_orig + } + lock_acquire + exit $? + ' _ "$LIB" 2>/dev/null; rc=$? 
+[ "$rc" = 97 ] && ok "rename-refused waiter honoured MAX_WAIT (97), never falsely held" \ + || bad "rename-refused rc=$rc (want 97 — a false hold would exit 0)" +grep -q "CLAIM-ABORT (rename-refused)" "$LOG" \ + && ok "CLAIM-ABORT (rename-refused) logged — the wrong-type rename branch was hit" \ + || bad "no CLAIM-ABORT (rename-refused) — branch not exercised" +grep -q "non-file at the lock path" "$LOG" \ + && ok "rename refusal classified as non-file at the lock path" \ + || bad "missing 'non-file at the lock path' classification wording" +grep -q "STOLE-BY-CLAIM" "$LOG" \ + && bad "spurious STOLE-BY-CLAIM — the steal was claimed despite the refused rename" \ + || ok "no STOLE-BY-CLAIM (no false steal of the directory-occupied path)" +grep -q "DISCOVERY-HOLD" "$LOG" \ + && bad "spurious discovery-HOLD — the victim wrongly believed it acquired" \ + || ok "no spurious discovery-HOLD — ownership-discovery read found no hold" +grep -q "acquire verification FAILED" "$LOG" \ + && bad "read-back path entered — the rename was treated as having succeeded" \ + || ok "rename treated as refused, not as a completed-then-unverified steal" +[ -e "$LOCK.next" ] \ + && bad "claim leftover (\$LOCK.next) after the rename-refused abort" \ + || ok "claim file cleaned up — no leftover \$LOCK.next" +[ -d "$LOCK" ] \ + && ok "directory left in place at the lock path (never overwritten)" \ + || bad "lock path is no longer the squatting directory" +rm -rf "$LOCK" "$LOCK.next" +fi + +if section "Test 38: step-3.3 pre-rename re-verify abort — claim cleaned, discovery, no false hold"; then +# The step-2 re-verify (sh:1075) and the step-3.3 re-verify immediately before +# the rename (sh:1149) are near-identical abort lanes; Test 23/27 exercise the +# step-2 lane only, leaving 3.3 untested. 
Steered with a CALL-COUNTER on +# _lock_verify_stale: call 1 (step-2) passes through to the REAL verdict +# (stale — the ghost is backdated 9999s), so the steal proceeds PAST step-2; +# call 2 (step-3.3) freshens the lock first, so the real verify reports "fresh" +# and the abort fires SPECIFICALLY at step-3.3. The proof is the log suffix +# "(lock re-verify before rename: fresh)" — step-2's suffix is "after claim", +# so the string can only be the 3.3 lane. STALE_SECS=30 keeps the freshened +# ghost fresh long enough that the post-abort re-poll does NOT re-steal before +# the test removes the lock — so the waiter then acquires via the CREATE race +# (no second STOLE-BY-CLAIM), the same shape as Test 23. +LOCK="$WORK/pr33.lock"; LOG="$WORK/pr33.log"; : > "$LOG" +fabricate_lock "$LOCK" "tok.ghost.t38" "pid=9 host=slow"; backdate "$LOCK" 9999 +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=30 \ + AGENT_LOCK_CLAIM_STALE_SECS=60 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \ + bash -c ' + source "$1" || exit 70 + clone_fn _lock_verify_stale _vs_orig + N=0 + _lock_verify_stale() { + N=$((N+1)) + # call 1 = step-2: pass through to the real verdict (stale). call 2 = + # step-3.3: freshen the ghost lock so the real verify now sees "fresh", + # tripping the pre-rename abort at the 3.3 position. + if [ "$N" = 2 ]; then command touch -- "$AGENT_LOCK_PATH"; fi + _vs_orig "$@" + } + lock_acquire || exit 72 + lock_release || exit 74 + exit 0 + ' _ "$LIB" 2>/dev/null & +w38=$! +# Proof the 3.3 lane ran AND the steal got PAST step-2: the "before rename" +# suffix is unique to the step-3.3 position (step-2 logs "after claim"). 
+wait_for_grep "lock re-verify before rename: fresh" "$LOG" 20 \ + && ok "step-3.3 pre-rename re-verify aborted (fresh) — got past step-2 to the 3.3 lane" \ + || bad "no step-3.3 'before rename' abort — the 3.3 lane did not run" +grep -q "CLAIM-ABORT (fresh) tok=.* (lock re-verify before rename: fresh)" "$LOG" \ + && ok "CLAIM-ABORT (fresh) logged at the 3.3 position (reason map: fresh)" \ + || bad "no CLAIM-ABORT (fresh) with the 'before rename' suffix" +grep -q "lock re-verify after claim" "$LOG" \ + && bad "the abort fired at step-2 (after claim) — the call-counter let call 1 trip, not the 3.3 lane" \ + || ok "no step-2 (after claim) abort — call 1 passed; only the 3.3 lane aborted" +grep -q "STOLE-BY-CLAIM" "$LOG" \ + && bad "a rename installed the claim — the 3.3 fresh abort did not prevent the steal" \ + || ok "no STOLE-BY-CLAIM — no rename onto the lock from the aborted attempt" +grep -q "DISCOVERY-HOLD" "$LOG" \ + && bad "spurious DISCOVERY-HOLD — the victim wrongly held after the 3.3 abort" \ + || ok "no false hold — the discovery read ran and the victim did not wrongly hold" +[ -e "$LOCK.next" ] && bad "claim leftover immediately after the 3.3 fresh abort" \ + || ok "claim deleted on the 3.3 fresh abort" +rm -f "$LOCK" # the slow holder releases normally +wait "$w38"; rc=$? +[ "$rc" = 0 ] && ok "waiter re-polled past the 3.3 abort, then acquired/released (rc 0)" \ + || bad "waiter rc=$rc after the slow holder released (want 0)" +[ -e "$LOCK.next" ] && bad "claim leftover after the waiter finished" || ok "no claim leftover at exit" +rm -f "$LOCK" "$LOCK.next" +fi + + +if section "Test 39: foreign claim at recheck — left intact, discovery, no false 98"; then +# After winning its claim and passing step-2 re-verify, the claimant rechecks +# its OWN claim file before installing. 
The `gone` recheck leg is covered (Test +# 25 recheck-gone / Test 32); the `foreign` leg is NOT: a waiter judged our +# claim abandoned, cleared it, and a RIVAL re-claimed in its place, so the +# recheck reads back a FOREIGN token at the claim path. The claimant must then +# LEAVE the rival's claim alone, run the ownership-discovery read (the lock is +# still the ghost, not ours -> no hold), and back off to re-poll — never a 98 +# (a mere claim recheck carries NO stolen-lease semantics) and never a deletion +# of the rival's claim. +# +# Steering (Test 24/25 idiom): clone _lock_claim_state and, on the FIRST recheck +# only (fire-once via a flag FILE so a subshell can't lose the state), overwrite +# .next with a fresh-mtime foreign "tok.rival.*" token before delegating +# to the original — exactly what a waiter-cleared + rival-reclaimed claim path +# looks like. The original then classifies it `foreign`. CLAIM_STALE is large +# and MAX_WAIT small so the freshly-planted rival claim is never aged out: it +# survives, the create on the next poll loses to it, and the waiter times out +# 97. Mutation check: an implementation that 98'd on a foreign recheck, or that +# deleted/overwrote the rival's claim, or that false-HELD, fails the asserts. +LOCK="$WORK/foreign-recheck.lock"; LOG="$WORK/foreign-recheck.log"; : > "$LOG" +fabricate_lock "$LOCK" "tok.ghost.t39" "pid=9 host=ghost"; backdate "$LOCK" 9999 +SF="$LOCK.steered"; RIVAL="tok.rival.t39.deadbeef"; rm -f "$SF" +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=3 \ + SF="$SF" RIVAL="$RIVAL" \ + bash -c ' + source "$1" || exit 70 + clone_fn _lock_claim_state _cs_orig + _lock_claim_state() { + # Fire ONCE, at the post-win recheck of OUR claim: a waiter cleared ours + # and a rival re-claimed. Plant the rival token (fresh mtime => not stale) + # then classify via the real function. + if [ ! 
-e "$SF" ] && [ "$1" = "$_LOCK_CLAIM_TOKEN" ] \ + && [ "$_LOCK_CLAIM_PATH" -ef "$AGENT_LOCK_PATH.next" ] 2>/dev/null; then + : > "$SF" + printf "%s\n%s\n" "$RIVAL" "pid=4242 host=rival" > "$_LOCK_CLAIM_PATH" + fi + _cs_orig "$@" + } + lock_acquire + exit $? + ' _ "$LIB" 2>/dev/null; rc=$? + +# The foreign-recheck branch ran (its log line is the proof the leg executed). +grep -q "claim recheck: foreign token '$RIVAL' at the claim" "$LOG" \ + && ok "foreign-recheck branch ran (rival token left at the claim, discovery read)" \ + || bad "no foreign-recheck log line — branch not executed" +# A mere claim recheck must NEVER report a stolen-lease 98. +[ "$rc" = 98 ] && bad "false 98 on a foreign CLAIM recheck (no lease was ever held)" \ + || ok "no false 98 on the foreign claim recheck (rc=$rc)" +# No hold was ever taken: discovery saw the ghost, not our token. +grep -q "DISCOVERY-HOLD" "$LOG" && bad "false discovery-HOLD on the foreign recheck" \ + || ok "no false hold (ownership-discovery read found the ghost, not ours)" +grep -q "STOLE-BY-CLAIM" "$LOG" && bad "claimant stole despite a foreign claim at recheck" \ + || ok "no STOLE-BY-CLAIM — claimant backed off the foreign claim" +# The rival's claim file SURVIVES, unmodified (left intact, never deleted). +[ -e "$LOCK.next" ] && ok "rival's foreign claim file still present (not deleted)" \ + || bad "rival's foreign claim was deleted — must be left alone" +rl1=""; IFS= read -r rl1 < "$LOCK.next" 2>/dev/null || true +[ "$rl1" = "$RIVAL" ] && ok "rival's claim token intact (untouched: $rl1)" \ + || bad "rival's claim token modified (line1=$rl1, want $RIVAL)" +grep -q "CLAIM-STALE-CLEARED" "$LOG" && bad "claimant aged-out/cleared the rival's fresh claim" \ + || ok "rival's fresh claim never cleared as stale" +# Clean outcome: the lock was never acquired; the waiter timed out (97). 
+[ "$rc" = 97 ] && ok "waiter re-polled past the foreign claim and timed out cleanly (97)" \ + || bad "rc=$rc (want 97 — clean re-poll/timeout behind the surviving rival claim)" +# The ghost lock is untouched (never stolen). +gl1=""; IFS= read -r gl1 < "$LOCK" 2>/dev/null || true +[ "$gl1" = "tok.ghost.t39" ] && ok "ghost lock untouched by the foreign-recheck backoff" \ + || bad "ghost lock modified (line1=$gl1)" +rm -f "$LOCK" "$LOCK.next" "$SF" +fi + +if section "Test 40: exec-bypass boundary — exec in the lock-holding shell skips release (OOS-5); exec in a child does not"; then +# `lock_run` runs the wrapped command vector with `"$@"` IN THE WRAPPER SHELL +# (git-commit-lock.sh), so a command that is itself an `exec` REPLACES the +# lock-holding wrapper process: the trailing `lock_release` AND the EXIT trap +# are both skipped, and the lock is left held with no RELEASED logged. This is +# the one interleaving that can SILENTLY lose an update (guarantees.md OOS-5) — +# this test pins the exact boundary so a future change to the release/trap +# wiring can't quietly widen or close it without a red. + +# (a1) BYPASS: `run -- exec true` — the wrapped command IS an exec, so it +# replaces the wrapper. Release + EXIT trap are skipped: lock LEFT, no RELEASED +# (ACQUIRED proves the hold was taken, so "no RELEASED" means the trap really +# was bypassed, not that nothing ran). +LOCK="$WORK/t40.bypass.lock"; LOG="$WORK/t40.bypass.log"; : > "$LOG" +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- exec true; rc=$? 
+[ "$rc" = 0 ] && ok "run -- exec true exits 0 (the exec'd command's code)" \ + || bad "run -- exec true rc=$rc (want 0)" +grep -q ACQUIRED "$LOG" && ok "run -- exec true did take the lock (ACQUIRED logged)" \ + || bad "run -- exec true: no ACQUIRED — the hold never happened, test is vacuous" +[ -e "$LOCK" ] && ok "run -- exec true LEFT the lock file (release bypassed by exec)" \ + || bad "run -- exec true: lock released — exec did NOT bypass (boundary changed)" +grep -q RELEASED "$LOG" && bad "run -- exec true logged RELEASED — the EXIT trap was NOT skipped (boundary changed)" \ + || ok "run -- exec true logged NO RELEASED (EXIT trap skipped — OOS-5 boundary)" +rm -f "$LOCK" + +# (a2) CONTROL — NO bypass: `run -- bash -c 'exec true'` — the exec replaces the +# CHILD, not the wrapper, so the wrapper releases normally: lock GONE, RELEASED +# logged. The opposite outcome to (a1) is the whole point; assert both so the +# test documents the exact boundary. +LOCK="$WORK/t40.child.lock"; LOG="$WORK/t40.child.log"; : > "$LOG" +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'exec true'; rc=$? +[ "$rc" = 0 ] && ok "run -- bash -c 'exec true' exits 0" \ + || bad "run -- bash -c 'exec true' rc=$rc (want 0)" +[ -e "$LOCK" ] && bad "run -- bash -c 'exec true' LEFT the lock — exec in a child must NOT bypass" \ + || ok "run -- bash -c 'exec true' released the lock (exec in a child does not bypass)" +grep -q RELEASED "$LOG" && ok "run -- bash -c 'exec true' logged RELEASED (the control: release ran)" \ + || bad "run -- bash -c 'exec true' logged NO RELEASED — the control case did not release" +rm -f "$LOCK" + +# (a3) REALISTIC sourced bypass: `lock_acquire; exec true` in a sourcing shell +# (a subshell so it can't take the suite down) — the holder execs away before +# release, leaving the lock held. This is the shape a real caller hits if it +# execs while holding instead of calling lock_release. 
+LOCK="$WORK/t40.sourced.lock"; LOG="$WORK/t40.sourced.log"; : > "$LOG" +( AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash -c ' + source "$1" || exit 70 + lock_acquire || exit 72 + exec true + ' _ "$LIB" ); rc=$? +[ "$rc" = 0 ] && ok "sourced lock_acquire; exec true exits 0" \ + || bad "sourced lock_acquire; exec true rc=$rc (want 0)" +[ -e "$LOCK" ] && ok "sourced lock_acquire; exec true LEFT the lock held (release skipped)" \ + || bad "sourced lock_acquire; exec true released the lock — exec did not bypass" +grep -q RELEASED "$LOG" && bad "sourced exec-while-holding logged RELEASED — the trap was not skipped" \ + || ok "sourced exec-while-holding logged NO RELEASED (release + trap skipped)" +rm -f "$LOCK" + +# (b) SILENT-LOSS boundary: a DISPLACED holder that execs a 0-exit is UNWARNED. +# Build a holder H that (sourced) acquires, backdates its OWN lock ancient so a +# contender steals it (H is now displaced — a rival token sits at the path), +# then execs a 0-exit. Because the exec skips BOTH release and the EXIT trap, +# the displacement-detection in lock_release NEVER runs: H exits 0 with no +# WARNING and no 98. This is exactly the documented silent boundary (OOS-5): a +# non-unwinding exit while displaced cannot report that the hold was not +# exclusive. (backdate/epoch_to_stamp are export -f'd by the preamble, so the +# steering shell inherits them.) +LOCK="$WORK/t40.silent.lock"; LOG="$WORK/t40.silent.log"; : > "$LOG" +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 bash -c ' + source "$1" || exit 70 + lock_acquire || exit 72 # H holds the lock + backdate "$2" 9999 # H'"'"'s own lock now ancient -> instantly stealable + # A contender steals it (separate process) — H is displaced once a rival + # token lands at the path. 
+ AGENT_LOCK_PATH="$2" AGENT_LOCK_LOG="$3" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 \ + bash "$1" run -- true + exec true # H execs 0 — neither release nor trap runs + ' _ "$LIB" "$LOCK" "$LOG"; rc=$? +[ "$rc" = 0 ] && ok "displaced holder's exec-0 exits 0 (no unwinding ran)" \ + || bad "displaced holder's exec-0 rc=$rc (want 0)" +grep -q "STOLE-BY-CLAIM" "$LOG" \ + && ok "the contender genuinely displaced H (STOLE-BY-CLAIM logged) — H WAS displaced" \ + || bad "no STOLE-BY-CLAIM — H was not actually displaced, the (b) premise is gone" +grep -q "lock LOST" "$LOG" \ + && bad "H logged a 'lock LOST' displacement WARNING — the exec did NOT skip release/trap" \ + || ok "displaced holder's exec-0 emitted NO 'lock LOST' WARNING (silent boundary — OOS-5)" +grep -q "WARNING" "$LOG" \ + && bad "an unexpected WARNING was logged by the displaced exec-0 holder" \ + || ok "displaced holder's exec-0 emitted NO WARNING at all (unwarned silent loss)" +rm -f "$LOCK" +fi + +if section "Test 41: forward clock jump steals a live lock — detected as 98, never silent (E2)"; then +# Staleness is age = now - mtime (git-commit-lock.sh ~:928, ~:1409), where `now` +# is _lock_now. A process whose clock has LEAPED FORWARD computes an inflated age +# for everyone's lock, so it can judge a LIVE, fresh lock ancient and steal it. +# This is correctness-safe but liveness-degraded: it degrades into the already- +# handled robbed-holder lane (Test 4b) — the displaced holder DETECTS the theft +# at release and exits 98 with a loud WARNING; it never silently double-commits. +# +# Steering (no real sleep/backdate): holder H acquires and HOLDS a fresh lock on +# a NORMAL clock. Waiter W has _lock_now shadowed to return the real now PLUS a +# large offset (+9999s), so H's just-created lock looks ~9999s old to W and W +# steals it. 
STALE=100 means the lock is genuinely fresh under a normal clock +# (without the jump W would block, never steal — the jump is what's causal); +# CLAIM_STALE=99999 keeps W's own just-created claim (also judged ~9999s old by +# W's jumped clock) well under the claim-stale window, so W's recheck does not +# self-abort (contested) and the steal proceeds to rename. +LOCK="$WORK/fwdjump.lock"; LOG="$WORK/fwdjump.log"; : > "$LOG"; OUT="$WORK/fwdjump-out"; : > "$OUT" +READY="$WORK/t41.ready"; TDONE="$WORK/t41.thief-done" +# Holder H (sourced, NORMAL clock): create+hold a fresh lock, signal READY, hold +# until told the waiter is done, then release and exit with the release rc. +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=100 \ + AGENT_LOCK_CLAIM_STALE_SECS=99999 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=120 \ + bash -c ' + source "$1" || exit 70 + lock_acquire || exit 72 + echo h-work >> "$2" + touch "$3" + until [ -e "$4" ]; do sleep 0.05; done + lock_release + exit $? + ' _ "$LIB" "$OUT" "$READY" "$TDONE" & +hpid=$! +wait_for_file "$READY" || bad "T41 holder never signalled ready (lock not held)" +# Waiter W (sourced, clock JUMPED +9999s): _lock_now returns real now + offset, so +# every age it computes is inflated and H's fresh lock reads as ancient. W acquires +# (by stealing) then releases; run in the FOREGROUND so its rc is captured. +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=100 \ + AGENT_LOCK_CLAIM_STALE_SECS=99999 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=30 \ + bash -c ' + source "$1" || exit 70 + clone_fn _lock_now _now_orig + _lock_now() { echo $(( $(_now_orig) + 9999 )); } + lock_acquire || exit 72 + echo w-work >> "$2" + lock_release + exit $? + ' _ "$LIB" "$OUT" +wpid_rc=$? +touch "$TDONE" +wait "$hpid"; h_rc=$? +# W judged H's live, fresh lock ancient under the jumped clock and stole it. 
+grep -q "STOLE-BY-CLAIM" "$LOG" \ + && ok "forward-jumped waiter stole a LIVE fresh lock (STOLE-BY-CLAIM)" \ + || bad "no STOLE-BY-CLAIM — jumped waiter did not steal the live lock" +[ "$wpid_rc" = 0 ] && ok "thief (its own fresh hold) released cleanly (rc 0)" \ + || bad "thief rc=$wpid_rc (its own fresh hold should release 0)" +grep -q w-work "$OUT" && ok "thief did its work" || bad "thief work missing" +# The proof: the premature steal was DETECTED, not silent — H exits exactly 98. +[ "$h_rc" = 98 ] && ok "robbed holder detected the premature steal — exits exactly 98" \ + || bad "robbed holder rc=$h_rc (forward-jump steal must degrade to 98, never silent)" +grep -q "WARNING: lock LOST" "$LOG" \ + && ok "robbed holder logged a loud theft WARNING (no silent double-commit)" \ + || bad "no theft WARNING logged for the forward-jump steal" +rm -f "$LOCK" "$LOCK.next" +fi + +if section "Test 42: mtime unreadable — staleness disabled, fail-safe (no steal), warn-once, 97 (E3)"; then +# §E3: if the lock file's mtime cannot be read AT ALL (every probe fails on a +# PRESENT file), staleness detection is BROKEN. The mtime floor fails closed to +# "fresh": _lock_verify_stale returns state=fresh, so a crashed/stale holder is +# NEVER stolen — recovery is disabled and waiters block to MAX_WAIT (97). The +# tool must say so LOUDLY, exactly once per process. The concurrency canary +# (formerly Test 1, now tests/git-commit-lock.canary.test.sh) only asserts the +# NEGATIVE (the warning must NOT fire under healthy contention); this drives the +# positive lane. +# +# Steering: shadow _lock_stat_mtime — the INNER single-probe (sh:606, runs +# stat/date and prints the epoch) — to return EMPTY for the LOCK path while it +# is PRESENT. We must NOT shadow _lock_path_mtime (sh:629): that is the 3x-retry +# wrapper that EMITS the warn-once, so shadowing it would remove the very +# warning we assert. 
With the inner probe empty on a present file, +# _lock_path_mtime retries 3x, sees the file present-but-unreadable, fires the +# warn-once and sets _LOCK_MTIME="" -> _lock_verify_stale -> fresh -> no steal. +# The shadow returns empty ONLY for the lock path: _lock_stat_mtime is also used +# for the CLAIM file's mtime (sh:1120/1230), which must keep working, and other +# paths fall through to the real probe. +T42_LOCK="$WORK/t42.lock"; T42_LOG="$WORK/t42.log"; T42_ERR="$WORK/t42.err" +: > "$T42_LOG"; : > "$T42_ERR" +# A STALE ghost that WOULD normally be stolen (backdated 9999s, well past STALE): +# the whole point is that it is NOT stolen because its age can't be established. +fabricate_lock "$T42_LOCK" "tok.ghost.t42.99999" "pid=99999 host=ghost" +backdate "$T42_LOCK" 9999 +T42_INNER=' + source "$1" || exit 70 + clone_fn _lock_stat_mtime _sm_orig + # Return EMPTY for the present lock path; defer to the real probe otherwise + # (the claim-file mtime at sh:1120/1230 must stay readable). + _lock_stat_mtime() { + if [ "$1" = "$AGENT_LOCK_PATH" ]; then printf ""; return 0; fi + _sm_orig "$@" + } + lock_acquire; exit $? +' +# Tight timing: small MAX_WAIT so the blocked waiter reaches 97 in ~2-3s. +AGENT_LOCK_PATH="$T42_LOCK" AGENT_LOCK_LOG="$T42_LOG" AGENT_LOCK_STALE_SECS=2 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \ + bash -c "$T42_INNER" _ "$LIB" 2>"$T42_ERR"; t42_rc=$? + +# (1) The fail-safe lane ran: the warn-once line appears. It is logged via +# _lock_log (lock log) AND echoed to stderr; assert either surface. +if grep -q "Staleness detection is BROKEN" "$T42_LOG" "$T42_ERR" 2>/dev/null \ + || grep -q "cannot read the lock file's mtime" "$T42_ERR" 2>/dev/null; then + ok "mtime-unreadable: 'Staleness detection is BROKEN' fail-safe warning fired" +else + bad "mtime-unreadable: no broken-staleness warning (fail-safe lane did not run); err=$(cat "$T42_ERR")" +fi +# (2) NO steal: the stale ghost is NOT stolen and is left in place. 
+if grep -q "STOLE-BY-CLAIM" "$T42_LOG" 2>/dev/null || grep -q "STOLE" "$T42_LOG" 2>/dev/null; then + bad "mtime-unreadable: ghost was STOLEN — staleness should have been disabled" +else + ok "mtime-unreadable: no steal (recovery disabled, ghost not stolen)" +fi +g42="$(head -n 1 -- "$T42_LOCK" 2>/dev/null | tr -d '\r')" +[ "$g42" = "tok.ghost.t42.99999" ] \ + && ok "mtime-unreadable: stale ghost lock left in place (token unchanged)" \ + || bad "mtime-unreadable: ghost lock disturbed (line1=$g42, want tok.ghost.t42.99999)" +# (3) The waiter blocks to MAX_WAIT and exits 97 (recovery disabled). +[ "$t42_rc" = 97 ] \ + && ok "mtime-unreadable: waiter blocked to MAX_WAIT and exited 97" \ + || bad "mtime-unreadable: waiter rc=$t42_rc (want 97 — was the stale ghost stolen?)" +# (4) Warn-once: the broken-staleness warning fires EXACTLY once per process. +t42_warns="$(grep -c "Staleness detection is BROKEN" "$T42_ERR" 2>/dev/null)"; t42_warns="${t42_warns:-0}" +[ "$t42_warns" -le 1 ] \ + && ok "mtime-unreadable: broken-staleness warning fired at most once on stderr ($t42_warns)" \ + || bad "mtime-unreadable: warning repeated ($t42_warns times — warn-once broken)" +rm -f "$T42_LOCK" "$T42_LOCK.next" +fi + +if section "Test 43: malformed/unreadable lock content at the poll guard — never stolen, warned/skipped"; then +# Two sibling branches of the in-acquire steal CONTENT GUARD (git-commit-lock.sh +# ~:1419-1444), both gated on an already-stale candidate, neither of which the +# torn/empty/tok.-prefixed cases (Tests 17/18) reach: +# (a) #18 — line 1 is NON-EMPTY but BLANK (whitespace/CR only): the trim at +# :1421 reduces it to empty, but the file is NOT empty (`-s` true) and the +# read SUCCEEDED, so it lands in the final `else` -> _lock_warn_nonlock +# "its content is not lock-shaped" (the `is not a lock file` config +# warning). NO steal; waiters reach 97. 
+# (b) #17 — the content read FAILS on a present, non-empty regular file (the +# `[ "$rdrc" -ne 0 ]` lane at :1432): logs "steal skipped: stale lock +# content unreadable"; NO steal; waiters reach 97. We can't make a real +# file unreadable on every platform (a chmod-000 file still reads for its +# owner on Windows/Cygwin), so we STEER it: source the lib in-process and +# shadow the `read` builtin to fail ONLY for the inline steal-guard read, +# identified by its direct caller `lock_acquire` (FUNCNAME[1]) — the +# _lock_read_tok / _lock_verify_stale reads delegate to `builtin read`, so +# only the :1420 site is perturbed. + +# (a) #18 — whitespace-only line 1: non-empty, blank, read OK -> never stolen, warned. +LOCK="$WORK/t43blank.lock"; LOG="$WORK/t43blank.log"; : > "$LOG" +printf ' \n' > "$LOCK"; backdate "$LOCK" 9999 # one space + LF: non-empty, blank line 1 +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \ + bash "$LIB" run -- bash -c 'true' 2> "$WORK/t43a.err"; rc=$? +[ "$rc" = 97 ] && ok "#18 blank line 1: waiter timed out (97) instead of stealing" \ + || bad "#18 blank line 1: rc=$rc (want 97)" +grep -q "is not a lock file" "$WORK/t43a.err" \ + && ok "#18 config warning fired (line 1 not lock-shaped)" || bad "#18 no config warning for blank line 1" +grep -q "non-lock object at lock path (its content is not lock-shaped)" "$LOG" \ + && ok "#18 log records the non-lock-shaped classification (branch ran)" \ + || bad "#18 missing the non-lock-shaped log line (branch did not run)" +grep -q "STOLE" "$LOG" && bad "#18 blank-content file was STOLEN" || ok "#18 no steal of the blank-content file" +[ -f "$LOCK" ] && ok "#18 blank-content file left in place" || bad "#18 blank-content file was removed" +rm -f "$LOCK" + +# (b) #17 — steal-guard content read FAILS on a present, non-empty file. 
+# Steering shell: source the lib, shadow the `read` builtin to fail ONLY when +# invoked directly by lock_acquire (the inline steal read at sh:1420). The ghost +# is tok.-prefixed and ancient, so absent the shadow it WOULD be stolen — the +# 97 outcome plus the "steal skipped ... unreadable" line prove the failed-read +# lane (not some other refusal) is what blocked the steal. The steering shell +# exits with lock_acquire's OWN rc (never a hard-coded 97), so the rc assertion +# below can only pass on a genuine MAX_WAIT timeout. +LOCK="$WORK/t43unread.lock"; LOG="$WORK/t43unread.log"; : > "$LOG" +fabricate_lock "$LOCK" "tok.ghost.t43" "pid=9 host=ghost"; backdate "$LOCK" 9999 +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \ + bash -c ' + source "$1" || exit 70 + # Shadow the read builtin; reach the real one via `builtin read`. Fail only + # the steal-guard read (its direct caller is lock_acquire) so the + # _lock_read_tok / _lock_verify_stale reads stay intact. + read() { + if [ "${FUNCNAME[1]:-}" = lock_acquire ]; then return 1; fi + builtin read "$@" + } + # Propagate the real lock_acquire rc: mapping every failure to 97 would make + # the rc assertion in the outer shell vacuous. + lock_acquire || exit $? + lock_release || exit 74 + exit 0 + ' _ "$LIB" 2> "$WORK/t43b.err"; rc=$? +[ "$rc" = 97 ] && ok "#17 unreadable steal content: waiter timed out (97) instead of stealing" \ + || bad "#17 unreadable steal content: rc=$rc (want 97)" +grep -q "steal skipped: stale lock content unreadable" "$LOG" \ + && ok "#17 log records the skipped steal (unreadable branch ran)" \ + || bad "#17 missing the 'steal skipped ... 
unreadable' log line (branch did not run)" +grep -q "STOLE" "$LOG" && bad "#17 ghost was STOLEN despite the unreadable content read" \ + || ok "#17 no steal while the steal-guard read fails" +[ -f "$LOCK" ] && ok "#17 stale ghost left in place" || bad "#17 stale ghost was removed" +rm -f "$LOCK" +fi + +if section "Test 44: socket & device-node at the lock path — never stolen/deleted, refused (97)"; then +# The never-steal wrong-type guard (git-commit-lock.sh ~:1557-1567) classifies +# NON-regular objects at the lock path so they are NEVER stolen and NEVER +# deleted: a real config error (a typo'd AGENT_LOCK_PATH, a stray special file) +# must wedge waiters to 97 with a loud one-time config warning, not get +# clobbered. Test 17 covers the directory / symlink / FIFO arms of that +# classifier; this test covers the two remaining arms — the SOCKET (-S) and the +# DEVICE NODE (-b/-c) — both of which name their detected type in the warning. +# For each: rc 97, the object survives unchanged (same type), the warning fires +# naming the type, and nothing is ever stolen. + +# (a) a UNIX-DOMAIN SOCKET at the lock path. Fabricated with a backgrounded +# python3 AF_UNIX bind (the socket inode persists while the process holds it); +# skipped where a real socket can't be made AND classified -S by the running +# shell — notably default Git-Bash on Windows, whose bundled python is a native +# build with no socket.AF_UNIX (probed: bind raises AttributeError, so no inode +# appears). CI's POSIX legs exercise this arm. The listener is reaped by its +# EXACT pid at the end (never by name). +LOCK="$WORK/sock.lock"; LOG="$WORK/sock.log"; : > "$LOG" +SOCKERR="$WORK/sock.py.err"; sock_pid=""; sock_ok=0 +if command -v python3 >/dev/null 2>&1; then + rm -f "$LOCK" + python3 -c 'import socket,sys,time +s=socket.socket(socket.AF_UNIX) +s.bind(sys.argv[1]) +sys.stderr.write("bound\n"); sys.stderr.flush() +time.sleep(30)' "$LOCK" 2> "$SOCKERR" & + sock_pid=$! 
+ # Gate on the socket actually existing AND classifying -S (not just the pid + # being alive): on a no-AF_UNIX build the process exits immediately with no + # inode, so we must positively confirm the object before relying on it. + for _ in $(seq 1 100); do + [ -S "$LOCK" ] && { sock_ok=1; break; } + kill -0 "$sock_pid" 2>/dev/null || break + sleep 0.05 + done +fi +if [ "$sock_ok" = 1 ]; then + AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=3 \ + bash "$LIB" run -- bash -c 'true' 2> "$WORK/t44a.err"; rc=$? + [ "$rc" = 97 ] && ok "socket at lock path: waiter timed out (97), command never ran" \ + || bad "socket at lock path: rc=$rc (want 97)" + [ -S "$LOCK" ] && ok "socket untouched (never stolen/deleted, still a socket)" \ + || bad "socket at lock path was removed/replaced!" + grep -q "is not a lock file" "$WORK/t44a.err" && ok "loud config warning on stderr (socket)" \ + || bad "no config warning for socket at lock path" + grep -q "it is a socket" "$WORK/t44a.err" && ok "warning names the detected type (socket)" \ + || bad "warning does not name the socket type" + n="$(grep -c "is not a lock file" "$WORK/t44a.err")" + [ "$n" = 1 ] && ok "socket config warning fired exactly once per process (got $n)" \ + || bad "socket config warning fired $n times (want 1)" + grep -q STOLE "$LOG" && bad "socket was STOLEN" || ok "no steal attempted on a socket" +else + echo "note: cannot create a unix-domain socket here (no socket.AF_UNIX / not classified -S) — socket guard not exercised (CI POSIX legs cover it)" +fi +# Reap the listener by ITS exact pid only (bounded wait, then hard-kill of the +# same pid as a last resort) — never by name. Harmless if it already exited. 
+if [ -n "$sock_pid" ]; then + kill "$sock_pid" 2>/dev/null + for _ in $(seq 1 40); do kill -0 "$sock_pid" 2>/dev/null || break; sleep 0.05; done + kill -0 "$sock_pid" 2>/dev/null && kill -9 "$sock_pid" 2>/dev/null + wait "$sock_pid" 2>/dev/null +fi +rm -f "$LOCK" + +# (b) a DEVICE NODE at the lock path. mknod needs root, but /dev/null is a +# character device that always exists, so we point AGENT_LOCK_PATH straight at +# it: the -c arm of the classifier must refuse it. This is SAFE precisely +# because the guard refuses — it is never opened-for-write, stolen, or deleted — +# which the post-run assertion below proves (/dev/null is still a char device). +# Skipped only if /dev/null somehow isn't a char device on this platform. +if [ -c /dev/null ]; then + LOG="$WORK/dev.log"; : > "$LOG" + AGENT_LOCK_PATH="/dev/null" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=3 \ + bash "$LIB" run -- bash -c 'true' 2> "$WORK/t44b.err"; rc=$? + [ "$rc" = 97 ] && ok "device node (/dev/null) at lock path: waiter timed out (97), command never ran" \ + || bad "device node at lock path: rc=$rc (want 97)" + [ -c /dev/null ] && ok "/dev/null untouched (never stolen/deleted, still a char device)" \ + || bad "/dev/null was damaged — the guard must NEVER touch a device node!" 
+ grep -q "is not a lock file" "$WORK/t44b.err" && ok "loud config warning on stderr (device node)" \ + || bad "no config warning for device node at lock path" + grep -q "it is a device node" "$WORK/t44b.err" && ok "warning names the detected type (device node)" \ + || bad "warning does not name the device-node type" + n="$(grep -c "is not a lock file" "$WORK/t44b.err")" + [ "$n" = 1 ] && ok "device-node config warning fired exactly once per process (got $n)" \ + || bad "device-node config warning fired $n times (want 1)" + grep -q STOLE "$LOG" && bad "device node was STOLEN" || ok "no steal attempted on a device node" +else + echo "note: /dev/null is not a char device here — device-node guard not exercised (CI POSIX legs cover it)" +fi +fi + + +if section "Test 45: log self-truncates past ~1 MB (rotation, not unbounded growth)"; then +# _lock_log starts the log over (not rotate) once it grows past ~1MB: the size +# check at the top of _lock_log truncates the file to empty before the write, +# so a normal log-producing op on an oversized log leaves a small, well-formed +# log carrying only the fresh protocol lines. Pre-fill > 1MB, run one clean +# acquire+release, assert the log SHRANK and the lock still worked. +LOCK="$WORK/t45.lock"; LOG="$WORK/t45.log" +# Pre-fill comfortably above the 1048576-byte (1MB) threshold (~1.2MB of 'x'). +head -c 1200000 /dev/zero | tr '\0' 'x' > "$LOG" +before=$(wc -c < "$LOG") +[ "$before" -gt 1048576 ] && ok "pre-fill exceeds the 1MB threshold (${before} bytes)" \ + || bad "pre-fill not over threshold (${before} bytes)" +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" bash "$LIB" run -- bash -c 'true'; rc=$? +[ "$rc" = 0 ] && ok "lock op succeeded over an oversized log (rc=0)" \ + || bad "lock op rc=$rc over oversized log (want 0)" +after=$(wc -c < "$LOG") +# Truncation fired iff the log is now far below the threshold (it holds only a +# handful of fresh lines). 
Use 1MB as the boundary: any non-truncation leaves +# it at/above the 1.2MB pre-fill. +[ "$after" -lt 1048576 ] && ok "log shrank below threshold after the op (${before} -> ${after} bytes — rotation fired)" \ + || bad "log did NOT shrink (${before} -> ${after} bytes — truncation never fired)" +# Well-formed: the new log carries the fresh protocol lines, not the old giant +# 'x' content, and records the truncation. +grep -q 'log exceeded 1MB; truncated' "$LOG" && ok "log records the self-truncation notice" \ + || bad "no truncation notice in the restarted log" +grep -q 'ACQUIRED' "$LOG" && grep -q 'RELEASED' "$LOG" \ + && ok "restarted log carries fresh ACQUIRED + RELEASED protocol lines" \ + || bad "restarted log missing fresh protocol lines (ACQUIRED/RELEASED)" +grep -q 'xxxx' "$LOG" && bad "old oversized 'x' content survived into the restarted log" \ + || ok "old oversized content is gone (clean restart, not appended)" +[ -e "$LOCK" ] && bad "lock left held after run" || ok "lock released after the over-threshold run" +rm -f "$LOCK" "$LOG" +fi + +if section "Test 46: EXIT while waiting (no hold) — no-hold trap arc, no spurious release"; then +# Covers _lock_on_exit's no-hold arc-end (sh:1009,1017-1018). +# A sourced waiter, blocked in the wait loop against a LIVE held lock, exits 0 +# while still parked — the EXIT trap is STILL '_lock_on_exit' (the timeout's +# trap-restore has NOT run, because we never time out), so EXIT fires the +# handler on the NO-HOLD path: claim-trap cleanup (no token => no-op), +# leaked-resolve, restore traps. NO release semantics may run (we never held). +# +# Why interposition and not "lock_acquire times out 97 then exit": the 97 +# timeout path itself runs _lock_restore_traps BEFORE returning, so by the time +# the caller exits the EXIT trap is already gone and _lock_on_exit never fires +# (verified: post-97 `trap -p EXIT` is empty). 
To exercise the EXIT-while- +# WAITING arc the process must leave the loop via `exit` with the trap still +# armed — so W shadows `sleep` (called once per poll inside the wait loop) to +# park on a marker, then `exit 0` from inside that first poll-sleep. At that +# point _LOCK_HELD=0 and no claim is in flight (the live lock is never stale, so +# no steal/claim was attempted), which is exactly the no-hold arc. +T46_INNER=' + source "$1" || exit 70 + F46=0 + sleep() { + if [ "$F46" = 0 ]; then + F46=1 + command touch "$T46R" # signal: parked in the wait loop + until [ -e "$T46G" ]; do command sleep 0.05; done + # Record the live EXIT trap so the assertions can prove _lock_on_exit + # (not a bare/restored trap) is what fires on the exit below. + trap -p EXIT > "$T46T" + exit 0 # EXIT while waiting, no hold held + fi + command sleep "$@" + } + lock_acquire + echo "REACHED-UNEXPECTED rc=$?" >&2 # the shadowed sleep must exit first +' +LOCK="$WORK/exitwait.lock"; LOG="$WORK/exitwait.log"; : > "$LOG" +HLOG="$WORK/exitwait.h.log"; : > "$HLOG" +T46R="$WORK/t46.ready"; T46G="$WORK/t46.go"; T46T="$WORK/t46.trap" +rm -f "$T46R" "$T46G" "$T46T" "$LOCK" "$LOCK.next" +# H: holder — sourced, takes a FRESH live lock and parks until released. STALE is +# huge so the lock is never judged stealable; W therefore stays a pure waiter. +HR="$WORK/t46.hready"; HG="$WORK/t46.hgo"; rm -f "$HR" "$HG" +HR="$HR" HG="$HG" \ +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$HLOG" AGENT_LOCK_STALE_SECS=600 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \ + bash -c ' + source "$1" || exit 70 + lock_acquire || exit 72 + touch "$HR" + until [ -e "$HG" ]; do sleep 0.05; done + lock_release + ' _ "$LIB" 2>/dev/null & +h46=$! +wait_for_file "$HR" 30 || bad "T46 holder never acquired the lock" +htok=""; IFS= read -r htok < "$LOCK" || true # the live holder's token +# W: the waiter that will exit while parked in the wait loop (no hold). 
+T46R="$T46R" T46G="$T46G" T46T="$T46T" \ +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=600 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=60 \ + bash -c "$T46_INNER" _ "$LIB" 2>/dev/null & +w46=$! +# Gate on W proving it reached the wait-loop poll (its WAITING line is logged, +# and its shadowed sleep touched the ready marker) before releasing it to exit. +wait_for_grep "WAITING for lock" "$LOG" 30 || bad "T46 waiter never logged WAITING" +wait_for_file "$T46R" 30 || bad "T46 waiter never reached its wait-loop poll" +touch "$T46G" +wait "$w46"; rc=$? +# Core assertion: W exited cleanly via the EXIT no-hold arc, with NO release +# semantics — it never held the lock, so a RELEASED or a 98/'lock LOST' would +# mean the handler wrongly ran the holding branch. +[ "$rc" = 0 ] && ok "waiter exited 0 via the EXIT-while-waiting no-hold arc" \ + || bad "T46 waiter rc=$rc (want 0; EXIT trap mishandled the no-hold arc?)" +grep -q RELEASED "$LOG" && bad "spurious RELEASED on the no-hold EXIT arc (release ran without a hold)" \ + || ok "no RELEASED on the no-hold EXIT arc (no release semantics)" +grep -q "lock LOST" "$LOG" && bad "98-classification ran on the no-hold EXIT arc" \ + || ok "no 98 classification on the no-hold EXIT arc" +# The trap that fired was our handler, not a bare/restored one — this is the +# discriminator that the EXIT-WHILE-WAITING arc ran (vs a post-97 exit, where +# the trap is already empty). Mirrors Test 12d's trap-restoration idiom. +grep -q "_lock_on_exit" "$T46T" && ok "EXIT trap still armed as _lock_on_exit at exit (no-hold arc, not post-97)" \ + || bad "EXIT trap was not _lock_on_exit at exit (got: $(cat "$T46T" 2>/dev/null))" +# The waiter left no claim behind (it never claimed — the live lock is not stale). 
+[ -e "$LOCK.next" ] && bad "waiter left a claim file behind on the no-hold EXIT arc" \ + || ok "no leftover claim from the no-hold EXIT waiter" +# H's lock is untouched — still the holder's original token, still held. +l1=""; IFS= read -r l1 < "$LOCK" 2>/dev/null || true +[ -n "$htok" ] && [ "$l1" = "$htok" ] && ok "holder's lock untouched by the dying waiter (token intact)" \ + || bad "holder's lock changed by the dying waiter (was=$htok now=$l1)" +# Release H and confirm it shut down cleanly (no fallout from W's exit). +touch "$HG"; wait "$h46" 2>/dev/null +grep -q "lock LOST" "$HLOG" && bad "holder saw a stolen lease (98) — the waiter's exit disturbed the hold" \ + || ok "holder released its still-held lock cleanly (no 98)" +rm -f "$LOCK" "$LOCK.next" "$T46R" "$T46G" "$T46T" "$HR" "$HG" +fi + +if section "Test 47: no-mv-T rename-over fallback (BSD/macOS lane) forced via _LOCK_MVT=0 — steal still installs"; then +# _lock_rename_over (git-commit-lock.sh ~:961-979) probes once for GNU `mv -T` +# and caches the verdict in _LOCK_MVT (""=unprobed, 1=supported, 0=not). On +# Linux/MINGW the probe ALWAYS picks `mv -T`, so the no-`-T` fallback lane +# (~:976-977: a last-instant `[ -d "$dst" ]` guard + a bare `mv`) is NEVER +# executed in CI except on a real BSD/macOS runner. Pre-seeding _LOCK_MVT=0 in +# the sourced steal shell BEFORE any acquire makes the `[ -z "$_LOCK_MVT" ]` +# probe short-circuit (the var is already non-empty), forcing the fallback on +# the common leg. Two scenarios: +# (a) a normal steal of a stale ghost under _LOCK_MVT=0 installs the lock via +# the unlink-free bare-`mv` fallback (STOLE-BY-CLAIM, the steal acquires); +# (b) a DIRECTORY squatting the lock path under _LOCK_MVT=0 is refused by the +# fallback's `[ -d ]` last-instant guard (no clobber) — the fallback-path +# analogue of Test 37's `mv -T` natural refusal. 
+# Determinism proof that the fallback truly ran (not GNU `mv -T`): scenario (a) +# shadows `mv` to record, per invocation touching ".next", whether `-T` was +# passed; under _LOCK_MVT=0 the steal's claim->lock rename MUST be a bare `mv` +# (no `-T`). A control run WITHOUT the override is asserted to still steal, so a +# pass cannot come from the override having silently broken the steal entirely. + +# ---- (a) forced-fallback steal of a stale ghost: STOLE-BY-CLAIM via bare mv ---- +LOCK="$WORK/mvt0.lock"; LOG="$WORK/mvt0.log"; : > "$LOG" +MVTRACE="$WORK/mvt0.mvtrace"; : > "$MVTRACE" +fabricate_lock "$LOCK" "tok.ghost.t47" "pid=9 host=ghost"; backdate "$LOCK" 9999 +# Sourced steal shell: pre-seed _LOCK_MVT=0, shadow `mv` to log the flags it was +# called with on the ".next" (claim->lock) rename, then call the real `mv`. +AGENT_LOCK_PATH="$LOCK" AGENT_LOCK_LOG="$LOG" AGENT_LOCK_STALE_SECS=2 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 \ + bash -c ' + source "$1" || exit 70 + _LOCK_MVT=0 # force the no-mv-T fallback lane + export MVTRACE_PATH="$2" # pass the trace path into mv() via env + mv() { + case "$*" in + *".next"*) printf "%s\n" "$*" >> "$MVTRACE_PATH" ;; # record claim->lock rename flags + esac + command mv "$@" + } + lock_acquire || exit 72 + lock_release || exit 74 + exit 0 + ' _ "$LIB" "$MVTRACE" 2>/dev/null; rc=$? 
+[ "$rc" = 0 ] && ok "T47(a): forced-fallback steal acquired+released rc 0 (_LOCK_MVT=0)" \ + || bad "T47(a): forced-fallback steal rc=$rc (want 0)" +grep -q "STOLE-BY-CLAIM" "$LOG" \ + && ok "T47(a): stale ghost stolen via the no-mv-T fallback (STOLE-BY-CLAIM logged)" \ + || bad "T47(a): no STOLE-BY-CLAIM under _LOCK_MVT=0 — fallback did not install the lock" +grep -q "ACQUIRED" "$LOG" && grep -q "RELEASED" "$LOG" \ + && ok "T47(a): fallback steal produced a clean ACQUIRED/RELEASED pair" \ + || bad "T47(a): missing ACQUIRED/RELEASED after the fallback steal" +# The mv trace proves the fallback lane (bare mv, no -T) actually carried the +# claim->lock rename — the whole point of forcing _LOCK_MVT=0. +[ -s "$MVTRACE" ] \ + && ok "T47(a): claim->lock rename went through the shadowed mv (trace non-empty)" \ + || bad "T47(a): no .next rename recorded — the steal did not rename-over as expected" +# Match -T only as a STANDALONE argument: each trace line is the whole "$*" +# (flags AND paths), so a bare substring match would also fire on a lock path +# that merely contains "-T" and misreport the fallback as not taken. +if grep -Eq -- '(^|[[:space:]])-T([[:space:]]|$)' "$MVTRACE"; then + bad "T47(a): claim->lock rename used 'mv -T' — the GNU fast path ran, fallback NOT forced" +else + ok "T47(a): claim->lock rename used a BARE mv (no -T) — the BSD/macOS fallback lane was taken" +fi +{ [ -e "$LOCK" ] || [ -e "$LOCK.next" ]; } \ + && bad "T47(a): leftover lock/claim after the fallback steal+release" \ + || ok "T47(a): clean final state (no lock, no claim) after fallback steal+release" + +# ---- (a-control) same steal WITHOUT the override still succeeds ---- +# Guards against a false pass where _LOCK_MVT=0 silently broke the steal: the +# unmodified library must steal the identical ghost too (here via mv -T). 
+LOCKC="$WORK/mvt0c.lock"; LOGC="$WORK/mvt0c.log"; : > "$LOGC" +fabricate_lock "$LOCKC" "tok.ghost.t47c" "pid=9 host=ghost"; backdate "$LOCKC" 9999 +AGENT_LOCK_PATH="$LOCKC" AGENT_LOCK_LOG="$LOGC" AGENT_LOCK_STALE_SECS=2 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=10 \ + bash -c 'source "$1" || exit 70; lock_acquire || exit 72; lock_release || exit 74; exit 0' \ + _ "$LIB" 2>/dev/null; rcc=$? +[ "$rcc" = 0 ] && grep -q "STOLE-BY-CLAIM" "$LOGC" \ + && ok "T47(a-control): unmodified steal of the same ghost also succeeds (override didn't trivially break it)" \ + || bad "T47(a-control): control steal rc=$rcc / no STOLE-BY-CLAIM (the (a) pass may be vacuous)" + +# ---- (b) directory at the lock path under _LOCK_MVT=0: [ -d ] guard refuses ---- +# The fallback's last-instant `[ -d "$dst" ]` guard (sh:976) must refuse to +# rename a file over a directory — Test 37's no-clobber outcome, reached via the +# fallback rather than `mv -T`'s natural directory refusal. Test 37 shadows `mv` +# so the directory appears just before the real `mv -T` refuses it; that timing +# does NOT exercise the fallback's `[ -d ]` because the swap lands AFTER the +# library has already passed line 976. To hit the fallback guard itself we wrap +# `_lock_rename_over`: the wrapper installs the directory and pins _LOCK_MVT=0, +# THEN calls the unmodified original — whose own `[ -d "$dst" ]` check (line 976) +# now sees the directory and returns 1, with NO library `mv`/`mv -T` ever run. +# The verifies (step 3.3) ran before the wrapper, so they saw a stale FILE; the +# directory exists only from the wrapper's first line onward. This is the +# fallback-lane analogue of Test 37's wrong-type refusal. 
+LOCKB="$WORK/mvt0dir.lock"; LOGB="$WORK/mvt0dir.log"; : > "$LOGB" +fabricate_lock "$LOCKB" "tok.ghost.t47b" "pid=9 host=ghost"; backdate "$LOCKB" 9999 +AGENT_LOCK_PATH="$LOCKB" AGENT_LOCK_LOG="$LOGB" AGENT_LOCK_STALE_SECS=1 \ + AGENT_LOCK_CLAIM_STALE_SECS=600 AGENT_LOCK_POLL_SECS=0.2 AGENT_LOCK_MAX_WAIT=3 \ + bash -c ' + source "$1" || exit 70 + clone_fn _lock_rename_over _ro_orig + _lock_rename_over() { + # Land a DIRECTORY at the lock path, then force the fallback lane and run + # the REAL rename-over: its own `[ -d ]` guard (sh:976) must refuse (rc 1). + command rm -f -- "$AGENT_LOCK_PATH" 2>/dev/null + command mkdir -- "$AGENT_LOCK_PATH" 2>/dev/null + _LOCK_MVT=0 + _ro_orig + } + lock_acquire + exit $? + ' _ "$LIB" 2>/dev/null; rcb=$? +[ "$rcb" = 97 ] && ok "T47(b): fallback [ -d ] guard refused; waiter honoured MAX_WAIT (97), no false hold" \ + || bad "T47(b): rc=$rcb (want 97 — a clobber/false hold would differ)" +grep -q "CLAIM-ABORT (rename-refused)" "$LOGB" \ + && ok "T47(b): CLAIM-ABORT (rename-refused) logged — fallback guard hit the wrong-type lane" \ + || bad "T47(b): no CLAIM-ABORT (rename-refused) — fallback guard branch not exercised" +grep -q "non-file at the lock path" "$LOGB" \ + && ok "T47(b): refusal classified as non-file at the lock path" \ + || bad "T47(b): missing 'non-file at the lock path' classification" +grep -q "STOLE-BY-CLAIM" "$LOGB" \ + && bad "T47(b): spurious STOLE-BY-CLAIM — the directory-occupied path was falsely stolen" \ + || ok "T47(b): no STOLE-BY-CLAIM (the [ -d ] guard prevented a false steal)" +[ -d "$LOCKB" ] \ + && ok "T47(b): directory left in place at the lock path (never clobbered by the fallback mv)" \ + || bad "T47(b): lock path no longer the squatting directory — the guard failed to protect it" +[ -e "$LOCKB.next" ] \ + && bad "T47(b): claim leftover (\$LOCK.next) after the fallback rename-refused abort" \ + || ok "T47(b): claim file cleaned up — no leftover \$LOCK.next" +rm -rf "$LOCK" "$LOCK.next" "$LOCKC" 
"$LOCKC.next" "$LOCKB" "$LOCKB.next" +fi + + +if section "Test 48: unwritable lock dir -> clean 97, command never runs, no false hold (F4)"; then +# F4 (failure-modes.md §4 item 5): a read-only / unwritable lock-dir parent makes the +# O_EXCL create fail every poll, so the waiter times out at 97 — no corruption, no +# false hold, and the wrapped command never runs. POSIX-only: chmod 0555 is a no-op +# for writes on Git-Bash/NTFS (the create would wrongly succeed), so skip-with-note +# on Windows; the Linux/macOS CI legs exercise it. +case "$(uname -s)" in + MINGW*|MSYS*|CYGWIN*) + echo "note: Test 48 skipped on Windows — chmod 0555 does not deny writes on NTFS; the POSIX CI legs cover it" ;; + *) + T48DIR="$WORK/t48.nowrite"; T48LOG="$WORK/t48.log"; mkdir -p "$T48DIR"; : > "$T48LOG" + T48MARK="$WORK/t48.ran"; rm -f "$T48MARK" + chmod 0555 "$T48DIR" + AGENT_LOCK_PATH="$T48DIR/commit.lock" AGENT_LOCK_LOG="$T48LOG" \ + AGENT_LOCK_STALE_SECS=1 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \ + bash "$LIB" run -- bash -c "touch '$T48MARK'" 2> "$WORK/t48.err"; rc=$? + [ "$rc" = 97 ] && ok "F4 unwritable lock dir: waiter timed out (97)" \ + || bad "F4 unwritable lock dir: rc=$rc (want 97)" + [ ! -e "$T48MARK" ] && ok "F4: the wrapped command never ran" \ + || bad "F4: the wrapped command ran despite no lock" + [ ! -e "$T48DIR/commit.lock" ] && ok "F4: no lock file created in the unwritable dir" \ + || bad "F4: a lock file appeared in an unwritable dir" + grep -q "WAITING for lock" "$T48LOG" && ok "F4: logged WAITING (the create kept failing)" \ + || bad "F4: no WAITING log" + grep -q "TIMEOUT after" "$T48LOG" && ok "F4: logged the TIMEOUT" || bad "F4: no TIMEOUT log" + chmod 0755 "$T48DIR" 2>/dev/null; rm -rf "$T48DIR" # restore so cleanup() can rm -rf $WORK + ;; +esac +fi + +if section "Test 49: failing log path -> lock still works, the log write is swallowed (F2/J1)"; then +# F2/J1 (failure-modes.md §4 item 5): logging is best-effort (every write ends || true). 
+# Point AGENT_LOCK_LOG under a REGULAR FILE so every append/open fails ENOTDIR — the +# lock must still acquire+release cleanly (rc 0) with the log write swallowed. +# Portable (no chmod/perms). NOTE: bash's redirection-OPEN failure leaks to stderr +# (the ||true is on the write, not the open), so do NOT assert clean stderr; and do +# NOT grep the log (nothing is ever written to it). +T49P="$WORK/t49.notadir"; : > "$T49P" # a regular FILE; using it as a dir -> ENOTDIR +T49LOG="$T49P/x.log" # every open/append under it fails ENOTDIR +T49MARK="$WORK/t49.ran"; rm -f "$T49MARK" +AGENT_LOCK_PATH="$WORK/t49.lock" AGENT_LOCK_LOG="$T49LOG" \ + bash "$LIB" run -- bash -c "touch '$T49MARK'" 2>/dev/null; rc=$? +[ "$rc" = 0 ] && ok "F2/J1 failing log: lock acquired+released, command ran (rc 0)" \ + || bad "F2/J1 failing log: rc=$rc (want 0 — a bad log must not fail the lock)" +[ -e "$T49MARK" ] && ok "F2/J1: the wrapped command ran" \ + || bad "F2/J1: the wrapped command did not run" +[ ! -e "$WORK/t49.lock" ] && ok "F2/J1: lock released/cleaned up despite the failing log" \ + || bad "F2/J1: lock left behind" +[ ! -e "$T49LOG" ] && ok "F2/J1: the log write was swallowed (no log file under the non-dir)" \ + || bad "F2/J1: a log file was created under a non-dir" +rm -f "$T49P" "$WORK/t49.lock" +fi + +if section "Test 50: ENOSPC on lock create/write -> wait then 97, no false hold (F1)"; then +# F1 (failure-modes.md §4 item 5): a full filesystem makes the create's write fail +# (ENOSPC); the created-but-write-failed file is an empty orphan and the waiter +# times out at 97 — no corruption, no false hold. Real ENOSPC needs a full FS, which +# needs root (a small tmpfs); `ulimit -f` is NOT usable (it raises SIGXFSZ and kills +# the wrapper, the wrong lane). So: Linux + passwordless sudo only; skip-with-note +# otherwise. The Linux CI leg (ubuntu runners have passwordless sudo) exercises it. 
+if [ "$(uname -s)" = Linux ] && sudo -n true 2>/dev/null; then + T50MNT="$WORK/t50.full"; T50LOG="$WORK/t50.log"; mkdir -p "$T50MNT"; : > "$T50LOG" + T50MARK="$WORK/t50.ran"; rm -f "$T50MARK" + if sudo mount -t tmpfs -o size=64k tmpfs "$T50MNT" 2>/dev/null; then + dd if=/dev/zero of="$T50MNT/fill" bs=1k count=256 2>/dev/null || true # fill to ENOSPC + AGENT_LOCK_PATH="$T50MNT/commit.lock" AGENT_LOCK_LOG="$T50LOG" \ + AGENT_LOCK_STALE_SECS=1 AGENT_LOCK_POLL_SECS=0.1 AGENT_LOCK_MAX_WAIT=2 \ + bash "$LIB" run -- bash -c "touch '$T50MARK'" 2> "$WORK/t50.err"; rc=$? + [ "$rc" = 97 ] && ok "F1 ENOSPC: waiter timed out (97)" \ + || bad "F1 ENOSPC: rc=$rc (want 97)" + [ ! -e "$T50MARK" ] && ok "F1: the wrapped command never ran under ENOSPC" \ + || bad "F1: the wrapped command ran despite ENOSPC" + sudo umount "$T50MNT" 2>/dev/null + else + echo "note: Test 50 skipped — could not mount a tmpfs (sudo mount failed); covered where mountable" + fi + rmdir "$T50MNT" 2>/dev/null || true +else + echo "note: Test 50 skipped — ENOSPC injection needs Linux + passwordless sudo (a small tmpfs); the Linux CI leg covers it" +fi +fi # NOTES (deliberately untested here): # * lock_release's LEFTOVER lane (the unlink blocked persistently) needs a @@ -2064,10 +3156,23 @@ rm -f "$LOCK" "$LOCK.next" # blocker is most naturally a pwsh FileShare.Read holder, so the interop # suite owns that test (on POSIX, unlink never blocks on open handles and # the lane is unreachable). -# * lock_acquire's read-back-verification failure lane needs fault injection -# to make a winning create read back wrong; it is defence in depth (see the -# ACQUIRE VERIFICATION header section), not suite-covered. 
+# * lock_acquire's read-back-verification failure lanes (defence in depth; see +# the ACQUIRE VERIFICATION header section) are covered via _lock_cur_token +# fault injection: the create-path lane (create won, read-back wrong) by +# Test 32, the steal-path lane (F2 — rename-over won, read-back wrong) by +# Test 32b. + +# Zero-match guard + selector-report line (shared helper in _harness.sh): a +# set-but-non-matching GCL_TEST_ONLY ran NO test block, which without the guard +# would fall through to a vacuous PASS=0 FAIL=0 "green" — a typo'd selector regex +# would silently look like success; bail loudly instead. (The finish EXIT trap +# also fires there since DONE is still 0; that exit is non-zero regardless.) When +# the selector matched, it reports how many blocks ran. Both are gated on +# GCL_TEST_ONLY being non-empty, so a default run stays byte-identical. +selector_report +DONE=1 echo -echo "==== RESULT: $PASS passed, $FAIL failed (fan-out: $GCL_MODE) ====" +echo "==== RESULT: $PASS passed, $FAIL failed, $ENV_WARN envelope warning(s) (fan-out: $GCL_MODE) ====" +[ "$GCL_TAP" = 1 ] && echo "1..$TAPN" [ "$FAIL" = 0 ] diff --git a/tests/with-load.sh b/tests/with-load.sh new file mode 100644 index 0000000..077511f --- /dev/null +++ b/tests/with-load.sh @@ -0,0 +1,308 @@ +#!/usr/bin/env bash +# with-load.sh — run a command under a calibrated, reproducible background load. +# +# Usage: bash tests/with-load.sh [args...] +# Example: bash tests/with-load.sh bash tests/git-commit-lock.test.sh +# +# Wraps "$@", applies artificial background load for the command's lifetime, then +# tears the load down (by EXACT spawned PIDs — never by name, so it is safe on a +# shared dev box and doubly safe on an ephemeral CI runner) and exits with the +# wrapped command's exit code. 
+# +# WHY load exists here (see docs/load-testing-strategy.md §1): this protocol's +# *correctness* is load-independent (O_EXCL + atomic rename + per-attempt tokens +# never consult the clock for a correctness decision), so load cannot break +# exclusion. Load's only jobs are (J1) perturb scheduling so the protocol's +# multi-syscall sequences get preempted at adversarial points, and (J2) stretch +# the few genuinely timing-derived decisions. Magnitude past ~2x CPU +# oversubscription mostly manufactures harness wall-clock flakes, not bugs — which +# is why load is expressed as an oversubscription RATIO and the total ratio is +# CAPPED. +# +# ── Calibrated interface (the contract nightly/deep-sweep CI calls against) ────── +# +# GCL_STRESS_KIND none | cpu | disk | both (default: none) +# none/unset => CLEAN PASS-THROUGH: zero added load, the +# command's exit code is propagated verbatim. +# +# GCL_STRESS_RATIO Oversubscription ratio R = stressors / nproc, PER KIND. +# (default: 1) Stressors-per-kind = round(R * nproc), +# floored at 1 when a kind is selected. Runner-independent: +# "R=2" means the same pressure on a 2-core and a 32-core box, +# whereas a raw hog count does not. +# +# GCL_STRESS_RATIO_MAX Cap on the TOTAL oversubscription ratio across all kinds +# (default: 2). `both` runs cpu + disk, so its total ratio is +# 2*R; this cap scales each kind's stressor count down +# proportionally so the runner is never wedged. Set the +# deep-sweep flake-hunt higher deliberately. +# +# GCL_STRESS_LOAD BACK-COMPAT raw-count override. If set to a positive +# integer it REPLACES the ratio computation: exactly N +# stressors per selected kind (still capped by RATIO_MAX +# unless GCL_STRESS_RATIO_MAX is also raised). Empty/unset => +# use the ratio. Kept so the existing deep-sweep +# `stress_load=N` dispatch input keeps working. +# +# GCL_STRESS_CGROUP 1 => on Linux with a writable cgroup v2 cpu controller, +# PROBE the calibrated cgroup CPU-quota path (envelope leg). 
#                        The probe is recorded in the manifest. cgroup IO throttling
#                        is experimental and intentionally NOT attempted here.
#                        (default: 0) Absent/unwritable => fall back to spinners.
#
#   GCL_LOAD_MANIFEST    Path for the per-run load-manifest JSON
#                        (default: test-output/load-manifest.$$.json, where $$ is
#                        this wrapper's PID; created under a known dir so CI can
#                        upload it). One file per run, capturing {kind, R, nproc,
#                        stressor counts, achieved slowdown, tool versions,
#                        os/arch, git sha} so any flake is reproducible. Written
#                        on success too.
#
# CPU stressor:  `stress-ng --cpu` when available (calibrated, measurable), else a
#                portable bash spin loop (one busy core each).
# Disk stressor: a tight create / write+fsync / delete loop over a small file on the
#                same volume as the test scratch dir — metadata + write-back pressure
#                that contends with the lock-file create/delete the suite itself does.
#                (Always the portable shell hog; cross-platform, low-fidelity but real
#                metadata-op pressure — see strategy §4.)
set -uo pipefail

# ── Inputs ───────────────────────────────────────────────────────────────────
# Every numeric knob is (1) validated as digits-only, falling back to its default
# otherwise, then (2) canonicalised with bash's explicit-base form 10#N. Step 2
# matters: a digits-only value with a leading zero would otherwise be read as OCTAL
# by the arithmetic further down — "08"/"09" abort with "value too great for base"
# and "010" silently means 8. After canonicalisation "08" is 8 and "010" is 10, and
# the values are safe to print as JSON numbers in the manifest.
kind="${GCL_STRESS_KIND:-none}"

nproc_count="$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"
case "$nproc_count" in ''|*[!0-9]*) nproc_count=4 ;; esac
nproc_count=$(( 10#$nproc_count ))
[ "$nproc_count" -lt 1 ] && nproc_count=1

ratio="${GCL_STRESS_RATIO:-1}"
case "$ratio" in ''|*[!0-9]*) ratio=1 ;; esac   # integer ratios only (R in {0,1,2,…})
ratio=$(( 10#$ratio ))

ratio_max="${GCL_STRESS_RATIO_MAX:-2}"
case "$ratio_max" in ''|*[!0-9]*) ratio_max=2 ;; esac
ratio_max=$(( 10#$ratio_max ))

raw_load="${GCL_STRESS_LOAD:-}"
case "$raw_load" in *[!0-9]*) raw_load="" ;; esac   # non-numeric => ignore, use ratio
# Empty must stay empty (it means "no override"); only canonicalise a real number.
[ -n "$raw_load" ] && raw_load=$(( 10#$raw_load ))

manifest="${GCL_LOAD_MANIFEST:-test-output/load-manifest.$$.json}"

# ── Stressor-count calibration ─────────────────────────────────────────────────
# Per-kind count: raw-count override wins, else round(R * nproc) floored at 1.
# No raw override => derive the per-kind count from the ratio; a positive ratio
# never yields fewer than one stressor. A raw override is taken verbatim.
if [ -z "$raw_load" ]; then
  per_kind=$(( ratio * nproc_count ))
  if [ "$ratio" -gt 0 ] && [ "$per_kind" -lt 1 ]; then
    per_kind=1
  fi
else
  per_kind="$raw_load"
fi

# Number of stressor kinds that will actually be spawned.
case "$kind" in
  both)     n_kinds=2 ;;
  cpu|disk) n_kinds=1 ;;
  *)        n_kinds=0 ;;
esac

# Total-ratio cap: all kinds together may not exceed ratio_max * nproc stressors.
# `both` asks for 2 * per_kind, so when the request breaches the budget each kind is
# shrunk to an equal share of it (never below one stressor per active kind).
cpu_count=0
disk_count=0
capped="no"
if [ "$n_kinds" -gt 0 ] && [ "$per_kind" -gt 0 ]; then
  budget=$(( ratio_max * nproc_count ))
  if [ "$budget" -lt "$n_kinds" ]; then
    budget="$n_kinds"                       # always allow >=1 per active kind
  fi
  if [ $(( per_kind * n_kinds )) -gt "$budget" ]; then
    per_kind=$(( budget / n_kinds ))
    if [ "$per_kind" -lt 1 ]; then
      per_kind=1
    fi
    capped="yes"
  fi
  # Only cpu/disk/both reach this point (n_kinds > 0), so "not disk" means the cpu
  # kind is active and "not cpu" means the disk kind is active.
  if [ "$kind" != disk ]; then cpu_count="$per_kind"; fi
  if [ "$kind" != cpu ]; then disk_count="$per_kind"; fi
fi

# ── Tool discovery ─────────────────────────────────────────────────────────────
# Versions / platform facts recorded in the manifest; each probe degrades to a
# placeholder rather than failing the run.
stress_ng_ver="none"
stress_ng_bin="$(command -v stress-ng 2>/dev/null || true)"
if [ -n "$stress_ng_bin" ]; then
  stress_ng_ver="$("$stress_ng_bin" --version 2>/dev/null | head -n 1 | tr -d '\r')"
fi
bash_ver="$(bash --version 2>/dev/null | head -n 1 | tr -d '\r')"
os_uname="$(uname -srm 2>/dev/null | tr -d '\r' || echo unknown)"
git_sha="$(git rev-parse --short HEAD 2>/dev/null || echo unknown)"

# CPU mechanism actually used: stress-ng when installed, else the bash spinner;
# "none" when no CPU stressors are requested.
cpu_mech="none"
if [ "$cpu_count" -gt 0 ]; then
  if [ -n "$stress_ng_bin" ]; then
    cpu_mech="stress-ng"
  else
    cpu_mech="spinner"
  fi
fi

# ── cgroup v2 CPU-quota probe (Linux envelope leg only; probe-gated) ───────────
# We only PROBE writability + record it; we do not create scopes here (that needs a
# usable systemd manager — see strategy §3). IO throttling is experimental: skipped.
cgroup_probe="not-requested"
if [ "${GCL_STRESS_CGROUP:-0}" = 1 ]; then
  cgroup_probe="unavailable"
  if [ "$(uname -s 2>/dev/null)" = "Linux" ] && [ -r /sys/fs/cgroup/cgroup.controllers ]; then
    if grep -qw cpu /sys/fs/cgroup/cgroup.controllers 2>/dev/null; then
      # cpu controller present at the v2 root; is a cpu.max writable in our subtree?
      if [ -w /sys/fs/cgroup/cgroup.subtree_control ] 2>/dev/null; then
        cgroup_probe="writable"   # the calibrated quota path is reachable on this leg
      else
        cgroup_probe="present-not-delegated"
      fi
    else
      cgroup_probe="no-cpu-controller"
    fi
  else
    cgroup_probe="no-cgroup-v2"
  fi
fi

# ── Stressor scratch dir (same volume as the test scratch) ─────────────────────
# mktemp -d gives an unpredictable, freshly-created private directory (a fixed
# "gcl-stress.<pid>" name in a shared /tmp can be pre-created by someone else).
# hogdir_owned records whether teardown may remove the directory wholesale: when
# mktemp fails we fall back to the current directory, which must NEVER be rm -rf'd.
hogdir_owned="no"
if hogdir="$(mktemp -d "${TMPDIR:-/tmp}/gcl-stress.XXXXXX" 2>/dev/null)" && [ -n "$hogdir" ]; then
  hogdir_owned="yes"
else
  hogdir="."
fi

# ── Spawn / teardown (track EXACT PIDs; kill only those) ───────────────────────
hogs=()

# spawn_cpu — start the CPU stressors and record their PIDs in hogs[].
# Reads: cpu_mech, cpu_count, stress_ng_bin.
spawn_cpu() {
  local i
  if [ "$cpu_mech" = "stress-ng" ]; then
    # One stress-ng manager spawning $cpu_count workers; reap the manager's PID.
    "$stress_ng_bin" --cpu "$cpu_count" --cpu-load 100 >/dev/null 2>&1 &
    hogs+=("$!")
  else
    for ((i = 0; i < cpu_count; i++)); do
      bash -c 'while :; do :; done' &
      hogs+=("$!")
    done
  fi
}

# spawn_disk — start disk_count create/write+fsync/delete loops inside hogdir and
# record their PIDs in hogs[]. Each hog names its files dh.<its own pid>.<0..23>,
# so its leftovers can be identified by the PID we recorded.
spawn_disk() {
  local i
  for ((i = 0; i < disk_count; i++)); do
    bash -c '
      d="$1"; j=0
      while :; do
        f="$d/dh.$$.$((j % 24))"
        dd if=/dev/zero of="$f" bs=32k count=8 conv=fsync 2>/dev/null
        rm -f "$f"
        j=$((j + 1))
      done' _ "$hogdir" &
    hogs+=("$!")
  done
}

# cleanup — stop every stressor we started (by exact PID), wait for them to be gone,
# then remove the scratch files. Safe to call more than once (explicitly before
# exit and again from the EXIT trap): with hogs[] emptied it only re-runs the
# scratch removal.
cleanup() {
  local p
  for p in "${hogs[@]:-}"; do
    [ -n "$p" ] && kill "$p" 2>/dev/null
  done
  # stress-ng forks workers under its manager; kill the worker group too (only the
  # manager PIDs we spawned are used as the group leader — never a name match).
  # This is a no-op (error suppressed) when the PID does not lead a process group.
  if [ "$cpu_mech" = "stress-ng" ]; then
    for p in "${hogs[@]:-}"; do
      [ -n "$p" ] && kill -- "-$p" 2>/dev/null   # negative PID = the manager's process group
    done
  fi
  # Reap: the caller announces "hogs reaped", and a disk hog must be gone before its
  # scratch files are removed, so wait for each PID we signalled.
  for p in "${hogs[@]:-}"; do
    [ -n "$p" ] && wait "$p" 2>/dev/null
  done
  if [ "$hogdir_owned" = yes ]; then
    rm -rf -- "${hogdir:?}" 2>/dev/null
  else
    # Fallback dir is not ours: remove only the files our own disk hogs created.
    for p in "${hogs[@]:-}"; do
      [ -n "$p" ] && rm -f -- "$hogdir/dh.$p."* 2>/dev/null
    done
  fi
}
trap cleanup EXIT INT TERM

# ── Achieved-slowdown micro-benchmark (cheap fixed busy-loop, baseline vs loaded) ─
# A small fixed integer loop timed once unloaded (baseline) and once mid-load gives a
# coarse, reproducible "how much did this load slow a CPU-bound task" figure for the
# manifest. Pure bash, no deps. Only run when load is actually applied — on the
# none/pass-through path it would be pure overhead.

# now_ns — print the current time as an all-digits nanosecond count.
# `date +%s%N` is a GNU extension: other date implementations can print the "%N"
# unexpanded (a non-numeric string) while still exiting 0, so the output is
# validated instead of trusting the exit status. Fallbacks: bash's EPOCHREALTIME
# (microseconds; decimal separator is locale-dependent) and finally whole seconds.
now_ns() {
  local t
  t="$(date +%s%N 2>/dev/null)"
  case "$t" in
    ''|*[!0-9]*) ;;                          # unusable: try the fallbacks
    *) printf '%s\n' "$t"; return 0 ;;
  esac
  t="${EPOCHREALTIME:-}"
  t="${t/[.,]/}"
  case "$t" in
    ''|*[!0-9]*) printf '%s000000000\n' "$(date +%s 2>/dev/null || echo 0)" ;;
    *)           printf '%s000\n' "$t" ;;
  esac
}

# micro_bench — time a fixed 50000-iteration loop; prints elapsed milliseconds.
micro_bench() {
  local start end k=0
  start="$(now_ns)"
  while [ "$k" -lt 50000 ]; do k=$((k + 1)); done
  end="$(now_ns)"
  echo $(( (end - start) / 1000000 ))   # ms
}

# Will any stressors spawn? (kind selected AND a positive per-kind count.)
will_load="no"
case "$kind" in
  cpu)  [ "$cpu_count" -gt 0 ] && will_load="yes" ;;
  disk) [ "$disk_count" -gt 0 ] && will_load="yes" ;;
  both) { [ "$cpu_count" -gt 0 ] || [ "$disk_count" -gt 0 ]; } && will_load="yes" ;;
esac

base_ms=0
loaded_ms=0
slowdown="1.00"
[ "$will_load" = yes ] && base_ms="$(micro_bench)"
# The value is compared numerically and printed as a bare JSON number, so anything
# that is not a non-negative integer (e.g. a clock step backwards) becomes 0, which
# simply disables the slowdown computation below.
case "$base_ms" in ''|*[!0-9]*) base_ms=0 ;; esac

# ── Apply load ─────────────────────────────────────────────────────────────────
case "$kind" in
  cpu)  spawn_cpu ;;
  disk) spawn_disk ;;
  both) spawn_cpu; spawn_disk ;;
  none) : ;;
  *)    echo "with-load: unknown GCL_STRESS_KIND='$kind' — running with NO load" >&2; kind="none" ;;
esac

if [ "${#hogs[@]}" -gt 0 ] && [ "$base_ms" -gt 0 ]; then
  loaded_ms="$(micro_bench)"
  case "$loaded_ms" in ''|*[!0-9]*) loaded_ms=0 ;; esac
  # slowdown = loaded/base to 2 dp, integer-only arithmetic. Pad the centi-value to
  # >=3 digits so the integer part is always whatever precedes the last 2 digits
  # (handles slowdown <1.00 from timing noise, e.g. 80 -> "0.80").
  centi="$(( loaded_ms * 100 / base_ms ))"
  while [ "${#centi}" -lt 3 ]; do centi="0$centi"; done
  slowdown="${centi%??}.${centi: -2}"
fi

# ── Write the load-manifest (best-effort; never fails the run) ──────────────────
# write_manifest CMD [ARGS...] — write the per-run JSON manifest to $manifest; the
# arguments are the wrapped command line, recorded under "command". Any failure
# (unwritable dir/file) is swallowed: the manifest must never fail the run.
write_manifest() {
  local dir
  dir="$(dirname "$manifest")"
  mkdir -p "$dir" 2>/dev/null || return 0
  # Hand-rolled JSON (no jq/python dependency on the runner). Escape the JSON-special
  # chars in string values: backslash, double-quote, and the control chars that the
  # wrapped command line can legitimately contain (newline/tab/CR) — a raw newline in
  # a value is invalid JSON. awk keeps this robust where sed's newline handling is not.
  # The backslash is doubled with "&&" (& = the matched text) rather than a literal
  # backslash replacement: how many backslashes survive a "\\\\" replacement differs
  # between awk implementations (under the POSIX rules it collapses to ONE, i.e. a
  # no-op), whereas "&&" doubles it everywhere. It must run first so the backslashes
  # added by the later substitutions are not doubled again.
  esc() {
    printf '%s' "$1" | awk '
      BEGIN { ORS = "" }
      {
        if (NR > 1) printf "\\n"          # join input lines with an escaped newline
        gsub(/\\/, "&&"); gsub(/"/, "\\\""); gsub(/\t/, "\\t"); gsub(/\r/, "\\r")
        print
      }'
  }
  {
    printf '{\n'
    printf '  "kind": "%s",\n'                "$(esc "$kind")"
    printf '  "ratio_R": %s,\n'               "$ratio"
    printf '  "ratio_max": %s,\n'             "$ratio_max"
    printf '  "raw_load_override": "%s",\n'   "$(esc "${raw_load:-}")"
    printf '  "nproc": %s,\n'                 "$nproc_count"
    printf '  "cpu_stressors": %s,\n'         "$cpu_count"
    printf '  "disk_stressors": %s,\n'        "$disk_count"
    printf '  "total_stressors": %s,\n'       "${#hogs[@]}"
    printf '  "ratio_total_capped": "%s",\n'  "$capped"
    printf '  "cpu_mechanism": "%s",\n'       "$(esc "$cpu_mech")"
    printf '  "cgroup_cpu_probe": "%s",\n'    "$(esc "$cgroup_probe")"
    printf '  "baseline_ms": %s,\n'           "$base_ms"
    printf '  "loaded_ms": %s,\n'             "$loaded_ms"
    printf '  "achieved_slowdown": %s,\n'     "$slowdown"
    printf '  "stress_ng_version": "%s",\n'   "$(esc "$stress_ng_ver")"
    printf '  "bash_version": "%s",\n'        "$(esc "$bash_ver")"
    printf '  "os_arch": "%s",\n'             "$(esc "$os_uname")"
    printf '  "git_sha": "%s",\n'             "$(esc "$git_sha")"
    printf '  "command": "%s"\n'              "$(esc "$*")"
    printf '}\n'
  } > "$manifest" 2>/dev/null || true
}
write_manifest "$@"

echo "stress: kind=$kind R=$ratio nproc=$nproc_count cpu=$cpu_count disk=$disk_count" \
     "mech=$cpu_mech capped=$capped slowdown=${slowdown}x manifest=$manifest :: $*"

# ── Run the wrapped command, tear down, propagate its exit code ─────────────────
"$@"
rc=$?

cleanup
hogs=()   # the EXIT trap runs cleanup again; leave it nothing to signal
echo "stress: hogs reaped; wrapped command rc=$rc"
exit "$rc"