From bd4b0c6d07a7ce784562907e1c8e57c0991ae9bc Mon Sep 17 00:00:00 2001 From: Jim Schaff Date: Wed, 24 Jun 2026 08:03:09 -0400 Subject: [PATCH] Stop release jobs hanging 6h on failure (gate tmate, add timeouts) The release-main workflow ran a `Setup tmate` step on any failure (`if: failure()`), which opens an interactive debug session that blocks the job until the 6h runner limit. During the 0.0.16 release the PyPI publish step failed (project size > 10GB), and tmate then held 7 jobs open for 6h each until the whole run was cancelled. - Gate the tmate step behind a `workflow_dispatch` boolean input `debug_tmate` (default false), so release events never open a session and failures fail fast; the session is still available for manual debug runs. - Bound the tmate step with `timeout-minutes: 30`. - Add a job-level `timeout-minutes: 180` as defense-in-depth (successful builds take ~35-60 min). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/on-release-main.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/on-release-main.yml b/.github/workflows/on-release-main.yml index 9c7ecf0..6be48a2 100644 --- a/.github/workflows/on-release-main.yml +++ b/.github/workflows/on-release-main.yml @@ -2,6 +2,11 @@ name: release-main on: workflow_dispatch: + inputs: + debug_tmate: + description: "Open an interactive tmate debug session if a job fails" + type: boolean + default: false release: types: [published] branches: [main] @@ -22,6 +27,9 @@ jobs: fail-fast: false runs-on: ${{ matrix.os }} + # Successful builds take ~35-60 min; cap well under the 6h default so a stuck + # job (e.g. a hung publish) fails fast instead of burning a runner for 6 hours. + timeout-minutes: 180 defaults: run: shell: bash @@ -109,8 +117,13 @@ jobs: poetry publish --skip-existing if: github.event_name == 'release' + # Only open a tmate session when explicitly requested via workflow_dispatch. 
+ # On a release event `inputs.debug_tmate` is unset (and therefore falsy), so a failed job fails fast + # instead of holding the runner open until the 6h job timeout. The step is also + # bounded by timeout-minutes as a safety net. - name: Setup tmate - if: failure() + if: ${{ failure() && inputs.debug_tmate }} + timeout-minutes: 30 uses: mxschmitt/action-tmate@v3 with: limit-access-to-actor: true