From 6b8bdaf1153d9be7bc2ec5a4820f1be856bc7e20 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 25 Jun 2026 18:55:07 +0000 Subject: [PATCH 1/2] Add code-hygiene checks (large-file, folder-size, AI-writing) across prek + CI + cloud Port four hygiene checks wired at three layers kept in sync: - check_large_files.sh / check_folder_sizes.sh: line-count and folder-fanout guards (SOURCE_EXTS=rs), shared by prek and GitHub Actions. - check_ai_writing.sh: fail on em dashes (U+2014) and contrastive-parallelism constructions; pure ripgrep with a git-grep fallback, no build needed. - prek.toml: local hooks plus upstream check-added-large-files. - large-files.yaml / folder-size.yaml workflows and an AI-writing step in ci.yml. - .claude SessionStart hook installs prek so cloud commits run the same checks. Replace existing em dashes in the docs with hyphens so the AI-writing check passes on a clean tree. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_017wmfUGTZR6HFL78eSjYxBU --- .claude/hooks/session-start.sh | 20 ++++++++ .claude/settings.json | 15 ++++++ .github/workflows/ci.yml | 2 + .github/workflows/folder-size.yaml | 22 ++++++++ .github/workflows/large-files.yaml | 22 ++++++++ ARCHITECTURE.md | 42 ++++++++-------- CONTRIBUTING.md | 2 +- README.md | 6 +-- SECURITY.md | 4 +- prek.toml | 30 +++++++++++ scripts/check_ai_writing.sh | 34 +++++++++++++ scripts/check_folder_sizes.sh | 81 ++++++++++++++++++++++++++++++ scripts/check_large_files.sh | 81 ++++++++++++++++++++++++++++++ 13 files changed, 334 insertions(+), 27 deletions(-) create mode 100755 .claude/hooks/session-start.sh create mode 100644 .claude/settings.json create mode 100644 .github/workflows/folder-size.yaml create mode 100644 .github/workflows/large-files.yaml create mode 100644 prek.toml create mode 100755 scripts/check_ai_writing.sh create mode 100755 scripts/check_folder_sizes.sh create mode 100755 scripts/check_large_files.sh diff --git a/.claude/hooks/session-start.sh 
b/.claude/hooks/session-start.sh new file mode 100755 index 0000000..d556b53 --- /dev/null +++ b/.claude/hooks/session-start.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Install deps + wire prek git hooks so cloud commits run the same checks as local. +# Scoped to remote (web/cloud) sessions; remove the guard to run locally too. +set -euo pipefail +[ "${CLAUDE_CODE_REMOTE:-}" != "true" ] && exit 0 +cd "${CLAUDE_PROJECT_DIR:-.}" + +export PATH="$HOME/.local/bin:$PATH" +line='export PATH="$HOME/.local/bin:$PATH"' +if [ -n "${CLAUDE_ENV_FILE:-}" ] && ! grep -qF "$line" "$CLAUDE_ENV_FILE" 2>/dev/null; then + echo "$line" >> "$CLAUDE_ENV_FILE" +fi + +# Install deps (Rust toolchain + fetch crates). +command -v cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; cargo fetch + +# Install prek (Rust binary, language-agnostic), then wire the git hooks. +command -v prek >/dev/null 2>&1 || curl -LsSf https://prek.j178.dev/install.sh | sh +prek install +exit 0 diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..ea39e04 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,15 @@ +{ + "hooks": { + "SessionStart": [ + { + "matcher": "startup|resume", + "hooks": [ + { + "type": "command", + "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/session-start.sh" + } + ] + } + ] + } +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 018591d..a5208d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,8 @@ jobs: with: components: rustfmt, clippy - uses: Swatinem/rust-cache@v2 + - name: AI writing check + run: scripts/check_ai_writing.sh - name: Format run: cargo fmt --all --check - name: Clippy diff --git a/.github/workflows/folder-size.yaml b/.github/workflows/folder-size.yaml new file mode 100644 index 0000000..8e4f30a --- /dev/null +++ b/.github/workflows/folder-size.yaml @@ -0,0 +1,22 @@ +name: Folder Size Check +on: + workflow_dispatch: + pull_request: + paths: + 
- '**.rs' +jobs: + check-folder-sizes: + name: Folder File Count Limit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v7 + with: { fetch-depth: 0 } + - name: Check for oversized folders + run: | + if [ -n "${{ github.event.pull_request.base.sha }}" ]; then + mapfile -t files < <(git diff --name-only --diff-filter=d "${{ github.event.pull_request.base.sha }}...HEAD") + [ "${#files[@]}" -eq 0 ] && { echo "No files changed."; exit 0; } + scripts/check_folder_sizes.sh "${files[@]}" + else + scripts/check_folder_sizes.sh --all + fi diff --git a/.github/workflows/large-files.yaml b/.github/workflows/large-files.yaml new file mode 100644 index 0000000..1fd2a80 --- /dev/null +++ b/.github/workflows/large-files.yaml @@ -0,0 +1,22 @@ +name: Large File Check +on: + workflow_dispatch: + pull_request: + paths: + - '**.rs' +jobs: + check-file-sizes: + name: Source File Line Limit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v7 + with: { fetch-depth: 0 } + - name: Check for large source files + run: | + if [ -n "${{ github.event.pull_request.base.sha }}" ]; then + mapfile -t files < <(git diff --name-only --diff-filter=d "${{ github.event.pull_request.base.sha }}...HEAD") + [ "${#files[@]}" -eq 0 ] && { echo "No files changed."; exit 0; } + scripts/check_large_files.sh "${files[@]}" + else + scripts/check_large_files.sh --all + fi diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index f19acfb..e7d6967 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,4 +1,4 @@ -# stdiod — Architecture +# stdiod - Architecture `edison-stdiod` is a small, long-lived daemon that runs on a user's machine. It maintains a single authenticated outbound connection to a backend, supervises a @@ -16,7 +16,7 @@ contract is described here. - **Subprocesses run locally.** Every MCP stdio server the daemon manages is spawned as a child process on the user's machine. Nothing is spawned remotely. 
- **The backend is the source of truth.** The daemon stores almost no durable - state of its own — it connects, fetches the desired set of servers, and + state of its own - it connects, fetches the desired set of servers, and reconciles its running children against it. ## Workspace layout @@ -40,7 +40,7 @@ crates/ platform/ macOS / Linux / Windows service integration tunnel-protocol/ generated Rust types for the wire protocol schema/ - tunnel-protocol.json JSON Schema — single source of truth for the protocol + tunnel-protocol.json JSON Schema - single source of truth for the protocol dev/ spike/ throwaway v0 prototype that informed the design ``` @@ -65,7 +65,7 @@ reverse tunnel because: - One authentication check, one stateful connection, lowest latency. - Server-initiated frames (desired-state pushes, credential invalidations) are - natural — the backend can talk to the daemon at any time. + natural - the backend can talk to the daemon at any time. - It reuses the same outbound TLS:443 posture that already traverses corporate firewalls, with no third-party tunnelling dependency. @@ -80,13 +80,13 @@ Defined as JSON Schema at `schema/tunnel-protocol.json`. Frames are JSON with a `hostname`, `label`, `os`, `client_version`, `currently_running: [server_id]`. Sent immediately after the socket is established. - `server_hello` (backend → daemon): `protocol_version` plus a **full - desired-state snapshot** — + desired-state snapshot** - `servers: [{server_id, name, command, args, env, working_dir, enabled}]`. If the daemon's `protocol_version` is below the minimum the backend supports, the upgrade is refused with a `needs_upgrade` close code; the daemon records `needs_upgrade=true` in `state.json` and stops retrying until the binary is updated. -- `desired_state_update` (backend → daemon): steady-state delta — +- `desired_state_update` (backend → daemon): steady-state delta - `added` / `updated` / `removed` server lists. 
- `device_status` (daemon → backend): periodic snapshot of which children are running and their last health timestamp. @@ -99,7 +99,7 @@ Defined as JSON Schema at `schema/tunnel-protocol.json`. Frames are JSON with a - `fetch_logs_request` / `fetch_logs_response`: an operator-initiated, bounded (default 200 lines) pull of a child's recent `stdout`/`stderr`. Never streamed continuously, to keep bandwidth predictable. -- `ping` / `pong` (both directions): heartbeat — see +- `ping` / `pong` (both directions): heartbeat - see [Disconnect handling](#disconnect-handling). The `request_id` on `fetch_logs_*` is a control-layer correlation id, distinct @@ -109,7 +109,7 @@ from the JSON-RPC `id` carried inside MCP frames. - `mcp_frame` (both directions): a JSON-RPC frame addressed to or originating from a specific child. Fields: `server_id` and `frame` (the JSON-RPC body - verbatim — request, response, or notification). + verbatim - request, response, or notification). - `tunnel_error` (both directions): a structured, non-JSON-RPC error (subprocess crashed, unknown server, transport fault). Carries the inner JSON-RPC `id` it relates to when applicable, so the receiver can fail the @@ -118,14 +118,14 @@ from the JSON-RPC `id` carried inside MCP frames. A single symmetric frame type captures every MCP interaction because JSON-RPC's own envelope already distinguishes requests (`id` + `method`), responses (`id` + `result`/`error`), and notifications (`method`, no `id`). JSON-RPC `id`s are -scoped to the originator, so the inner `id` is the correlation key — no outer +scoped to the originator, so the inner `id` is the correlation key - no outer `request_id` is needed for MCP traffic. ### MCP-agnostic by design The transport is **MCP-agnostic**: the daemon's `tunnel` module treats every `frame` field as opaque bytes and never inspects its contents. 
This is a -load-bearing invariant — any temptation to sniff a method name or peek at +load-bearing invariant - any temptation to sniff a method name or peek at `params` inside the daemon is a smell; that logic belongs above the transport, on the backend. @@ -137,7 +137,7 @@ Concrete consequences: - **Bidirectional notifications** (e.g. `notifications/cancelled`, `notifications/progress`) are just notification-shaped `mcp_frame`s. - **MCP version bumps and new methods** require no changes anywhere in the - daemon — `initialize` negotiation happens between the backend and the stdio + daemon - `initialize` negotiation happens between the backend and the stdio server, both outside the transport. ## Child-process supervision @@ -164,7 +164,7 @@ waiting for a response that never arrives. The WebSocket itself stays open and other children on the same device are unaffected; the supervisor then decides whether and when to respawn the dead child per the latest desired state. This was the one behaviour the early spike could not derive from "treat MCP frames as -opaque" alone — it is a deliberate active signal the daemon must produce. +opaque" alone - it is a deliberate active signal the daemon must produce. ## Persistence and survival @@ -191,7 +191,7 @@ The daemon keeps almost nothing durable; the backend is the source of truth. ~/.config/edison-stdiod/ config.toml backend URL, device_id, api_key, secret state.json atomic writes; consumed by the desktop tray UI -~/Library/Logs/edison-stdiod/ (macOS — platform-equivalent paths elsewhere) +~/Library/Logs/edison-stdiod/ (macOS - platform-equivalent paths elsewhere) daemon.log rotated daily child-.log per-child stdout/stderr capture ``` @@ -240,7 +240,7 @@ The daemon keeps almost nothing durable; the backend is the source of truth. Every (re)connect runs the same protocol: 1. Daemon sends `client_hello { device_id, currently_running: [...] }`. -2. Backend replies `server_hello { servers: [...] }` — a full desired-state +2. 
Backend replies `server_hello { servers: [...] }` - a full desired-state snapshot for this device. 3. Daemon diffs: - Start any enabled server not currently running. @@ -254,15 +254,15 @@ Every (re)connect runs the same protocol: Every outbound `mcp_frame` carries a JSON-RPC `id` used as the correlation key. On socket close, all outstanding calls are failed cleanly (the backend surfaces a `device_offline`-style JSON-RPC error to the caller); there are no automatic -retries — the calling agent decides whether to retry. +retries - the calling agent decides whether to retry. ## CLI The same binary is the daemon and the control CLI: -- `edison-stdiod login --backend --api-key ` — store credentials. -- `edison-stdiod install` / `uninstall` — manage the OS service unit. -- `edison-stdiod run` — run the daemon (normally invoked by the service unit). -- `edison-stdiod server …` — add / list / remove locally-defined servers. -- `edison-stdiod status` — show connection and per-child state. -- `edison-stdiod logs` — tail daemon / child logs. +- `edison-stdiod login --backend --api-key ` - store credentials. +- `edison-stdiod install` / `uninstall` - manage the OS service unit. +- `edison-stdiod run` - run the daemon (normally invoked by the service unit). +- `edison-stdiod server …` - add / list / remove locally-defined servers. +- `edison-stdiod status` - show connection and per-child state. +- `edison-stdiod logs` - tail daemon / child logs. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d46b4c..2953edc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -51,7 +51,7 @@ cargo fmt --all --check && \ - For ordinary bugs and feature requests, open a [GitHub issue](https://github.com/Edison-Watch/stdiod/issues). -- For **security vulnerabilities**, do **not** open a public issue — follow +- For **security vulnerabilities**, do **not** open a public issue - follow [`SECURITY.md`](./SECURITY.md) instead. 
## License diff --git a/README.md b/README.md index fc8ca5a..9eff077 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ **stdiod** is a small daemon that bridges local [stdio MCP servers](https://modelcontextprotocol.io/) to the Edison Watch backend over a single outbound WebSocket tunnel. -It runs on a user's machine, dials out to the backend (no inbound ports), and lets the backend drive locally-spawned MCP server subprocesses — forwarding MCP frames in both directions. An AI client talking to the backend's gateway reaches these local servers as if they were hosted remotely, while the processes (and their filesystem/credentials) stay on the user's device. +It runs on a user's machine, dials out to the backend (no inbound ports), and lets the backend drive locally-spawned MCP server subprocesses - forwarding MCP frames in both directions. An AI client talking to the backend's gateway reaches these local servers as if they were hosted remotely, while the processes (and their filesystem/credentials) stay on the user's device. ``` AI client ──▶ Edison backend gateway ──▶ WebSocket tunnel ──▶ stdiod ──▶ local MCP server subprocess @@ -81,8 +81,8 @@ edison-stdiod server remove filesystem Settings resolve in two layers, highest precedence first: -1. **CLI flags / environment variables** — handy for development overrides. -2. **`~/.config/edison-stdiod/config.toml`** — written by `edison-stdiod login`; this is what the OS supervisor unit reads (service units don't carry secrets in their environment). +1. **CLI flags / environment variables** - handy for development overrides. +2. **`~/.config/edison-stdiod/config.toml`** - written by `edison-stdiod login`; this is what the OS supervisor unit reads (service units don't carry secrets in their environment). 
| Field (`config.toml`) | Env var | Description | | --- | --- | --- | diff --git a/SECURITY.md b/SECURITY.md index c7d9c9c..6f76394 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -17,9 +17,9 @@ requests, or discussions.** Instead, report privately through either channel: -- **GitHub private advisory** — open the repository's **Security** tab and click +- **GitHub private advisory** - open the repository's **Security** tab and click **"Report a vulnerability"** ([private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability)). -- **Email** — . +- **Email** - . Please include enough detail to reproduce: affected version/commit, platform, configuration, and a description of the impact. We aim to acknowledge reports diff --git a/prek.toml b/prek.toml new file mode 100644 index 0000000..a096e43 --- /dev/null +++ b/prek.toml @@ -0,0 +1,30 @@ +[[repos]] +repo = "https://github.com/pre-commit/pre-commit-hooks" +rev = "v4.6.0" +hooks = [ + { id = "check-added-large-files" }, +] + +[[repos]] +repo = "local" +hooks = [ + { id = "ai-writing-check", name = "AI writing check", entry = "scripts/check_ai_writing.sh", language = "system", pass_filenames = false, always_run = true }, +] + +# ── Source-size guardrails (mirror GitHub Actions) ──────────────── +[[repos]] +repo = "local" + +[[repos.hooks]] +id = "check-large-files" +name = "fail if any source file exceeds the line-count error threshold" +language = "system" +entry = "scripts/check_large_files.sh" +files = "\\.(rs)$" + +[[repos.hooks]] +id = "check-folder-sizes" +name = "fail if any source folder exceeds the file-count error threshold" +language = "system" +entry = "scripts/check_folder_sizes.sh" +files = "\\.(rs)$" diff --git a/scripts/check_ai_writing.sh b/scripts/check_ai_writing.sh new file mode 100755 index 0000000..db6fd2a --- /dev/null +++ b/scripts/check_ai_writing.sh @@ 
-0,0 +1,34 @@
+#!/usr/bin/env bash
+# AI writing check: fail on em dashes or contrastive-parallelism constructions.
+# Pure ripgrep/git-grep - no language runtime or build needed.
+# POSIX-ERE-safe patterns (no lazy quantifiers, no (?:); apostrophes written as
+# `.` so they match '/’ and dodge shell quoting) work in both rg and git grep -E.
+set -uo pipefail
+cd "$(dirname "$0")/.." || exit 2
+
+EM_DASH=$'\xe2\x80\x94' # U+2014
+SELF="scripts/check_ai_writing.sh"
+
+CONTRA='not (just|only|merely|simply)[^.?!]{0,60} but'
+CONTRA="$CONTRA"'|(it.s|that.s|this is) not [^.?!]{0,60}(it.s|that.s|they.re)'
+CONTRA="$CONTRA"'|(isn.t|aren.t|wasn.t|weren.t) (just|only|merely|simply)'
+CONTRA="$CONTRA"'|(isn.t|aren.t) (just )?about[^.?!]{0,60}it.s about'
+CONTRA="$CONTRA"'|more than just'
+CONTRA="$CONTRA"'|less about[^.?!]{0,60}more about'
+CONTRA="$CONTRA"'|not [^.?!]{0,40}so much as'
+CONTRA="$CONTRA"'|goes? beyond' # noisiest; drop this line if it over-flags
+# search CMD...: run a scan; exit 1 means "no match" (fine), any other failure is a tool error.
+search() { "$@" || { local rc=$?; [ "$rc" -eq 1 ] || { echo "AI writing check error: $1 exited with status $rc" >&2; return 2; }; }; }
+if command -v rg >/dev/null 2>&1; then
+  em=$(search rg -n --hidden --glob '!.git' --glob "!$SELF" -e "$EM_DASH" .) || exit 2
+  contra=$(search rg -ni --hidden --glob '!.git' --glob "!$SELF" -e "$CONTRA" .) || exit 2
+else
+  # git grep scans tracked files only - target/ and other build output excluded for free.
+  em=$(search git grep -n -e "$EM_DASH" -- . ":(exclude)$SELF") || exit 2
+  contra=$(search git grep -niE -e "$CONTRA" -- . ":(exclude)$SELF") || exit 2
+fi
+fail=0
+if [ -n "$em" ]; then echo "AI writing check failed: em dash (U+2014) detected"; echo "$em"; fail=1; fi
+if [ -n "$contra" ]; then echo "AI writing check failed: contrastive parallelism ('not just X, but Y') detected"; echo "$contra"; fail=1; fi
+[ "$fail" -eq 0 ] && echo "AI writing check passed."
+exit "$fail"
diff --git a/scripts/check_folder_sizes.sh b/scripts/check_folder_sizes.sh
new file mode 100755
index 0000000..358f1c4
--- /dev/null
+++ b/scripts/check_folder_sizes.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+#
+# Enforce a file-count limit on source folders (non-recursive).
Shared by
+# .github/workflows/folder-size.yaml and prek.toml.
+#   check_folder_sizes.sh [file ...]   # check folders containing the given files
+#   check_folder_sizes.sh --all        # scan every folder
+# Thresholds via FOLDER_WARN_THRESHOLD / FOLDER_ERROR_THRESHOLD.
+set -euo pipefail
+
+WARN_THRESHOLD="${FOLDER_WARN_THRESHOLD:-20}"
+ERROR_THRESHOLD="${FOLDER_ERROR_THRESHOLD:-35}"
+
+SOURCE_EXTS=(rs)
+
+GRANDFATHERED=() # folders allowed to exceed (warn instead of error)
+
+EXCLUDE_PATH_RE='(^|/)(vendor|target|\.git)(/|$)'
+GENERATED_RE='(^|/)(alembic[^/]*/versions|migrations)(/|$)'
+# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array, so `set -u` on bash < 4.4 does not abort.
+is_grandfathered() { local t="$1" g; for g in ${GRANDFATHERED[@]+"${GRANDFATHERED[@]}"}; do [ "$t" = "$g" ] && return 0; done; return 1; }
+should_skip() { # succeeds for an empty path, ".", and anything matching the exclude/generated patterns
+  local f="$1"; [ -z "$f" ] && return 0; [ "$f" = "." ] && return 0
+  echo "$f" | grep -qE "$EXCLUDE_PATH_RE" && return 0
+  echo "$f" | grep -qE "$GENERATED_RE" && return 0
+  return 1
+}
+# count_folder DIR: print how many source files (per SOURCE_EXTS, test-style names excluded) sit directly in DIR.
+count_folder() {
+  local fa=() first=1 e
+  for e in "${SOURCE_EXTS[@]}"; do
+    if [ "$first" = 1 ]; then fa+=( -name "*.$e" ); first=0; else fa+=( -o -name "*.$e" ); fi
+  done
+  find "$1" -mindepth 1 -maxdepth 1 -type f \( "${fa[@]}" \) \
+    -not -name 'test_*' -not -name '*_test.*' -not -name '*.test.*' \
+    -not -name '*.spec.*' -not -name '*.d.ts' -not -name 'conftest.py' | wc -l
+}
+# collect_all_folders: print every directory below the current one, minus the contents of VCS, cache and build dirs.
+collect_all_folders() {
+  find . -type d -not -path './.git/*' -not -path '*/node_modules/*' \
+    -not -path '*/__pycache__/*' -not -path '*/.venv/*' -not -path '*/target/*' | sed 's|^\./||'
+}
+
+folder_list=$(mktemp); trap 'rm -f "$folder_list"' EXIT
+if [ "${1:-}" = "--all" ]; then collect_all_folders > "$folder_list"
+else for f in "$@"; do [ -z "$f" ] && continue; dirname "$f"; done | sort -u > "$folder_list"; fi
+
+warnings=0; errors=0; warn_list=""; error_list=""
+while IFS= read -r folder; do
+  folder="${folder#./}"
+  should_skip "$folder" && continue
+  [ ! -d "$folder" ] && continue
+  count=$(count_folder "$folder")
+  if [ "$count" -gt "$ERROR_THRESHOLD" ]; then
+    if is_grandfathered "$folder"; then
+      warnings=$((warnings + 1)); warn_list="${warn_list}| \`${folder}/\` | ${count} | :warning: exceeds ${ERROR_THRESHOLD} (grandfathered) |\n"
+    else
+      errors=$((errors + 1)); error_list="${error_list}| \`${folder}/\` | ${count} | :x: exceeds ${ERROR_THRESHOLD} |\n"
+    fi
+  elif [ "$count" -gt "$WARN_THRESHOLD" ]; then
+    warnings=$((warnings + 1)); warn_list="${warn_list}| \`${folder}/\` | ${count} | :warning: exceeds ${WARN_THRESHOLD} |\n"
+  fi
+done < "$folder_list"
+
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && { [ "$errors" -gt 0 ] || [ "$warnings" -gt 0 ]; }; then
+  {
+    echo "## Folder Size Report"; echo ""
+    echo "| Folder | Files | Status |"; echo "|--------|-------|--------|"
+    [ "$errors" -gt 0 ] && printf '%b' "$error_list"
+    [ "$warnings" -gt 0 ] && printf '%b' "$warn_list"
+    echo ""; echo "**Thresholds:** warn at ${WARN_THRESHOLD} files, error at ${ERROR_THRESHOLD} files. Immediate source children only - subfolders are the fix."
+  } >> "$GITHUB_STEP_SUMMARY"
+fi
+
+format_list() { if command -v column >/dev/null 2>&1; then printf '%b' "$1" | column -t -s '|'; else printf '%b' "$1"; fi; }
+
+if [ "$errors" -gt 0 ]; then echo "::error::${errors} folder(s) exceed the ${ERROR_THRESHOLD}-file error threshold" >&2; format_list "$error_list" >&2; fi
+if [ "$warnings" -gt 0 ]; then echo "::warning::${warnings} folder(s) exceed the ${WARN_THRESHOLD}-file warning threshold" >&2; format_list "$warn_list" >&2; fi
+if [ "$errors" -eq 0 ] && [ "$warnings" -eq 0 ]; then echo "All folders are within the ${WARN_THRESHOLD}-file limit."; fi
+
+[ "$errors" -gt 0 ] && exit 1
+exit 0
diff --git a/scripts/check_large_files.sh b/scripts/check_large_files.sh
new file mode 100755
index 0000000..ef758bb
--- /dev/null
+++ b/scripts/check_large_files.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+#
+# Enforce a line-count limit on source files.
Shared by
+# .github/workflows/large-files.yaml and prek.toml.
+#   check_large_files.sh [file ...]   # check the given files
+#   check_large_files.sh --all        # scan the whole tree
+# Thresholds via LARGE_FILE_WARN_THRESHOLD / LARGE_FILE_ERROR_THRESHOLD.
+# Exit 1 on errors, 0 on warnings-only or clean.
+set -euo pipefail
+
+WARN_THRESHOLD="${LARGE_FILE_WARN_THRESHOLD:-500}"
+ERROR_THRESHOLD="${LARGE_FILE_ERROR_THRESHOLD:-800}"
+
+SOURCE_EXTS=(rs)
+
+# Extension matching is a plain loop over SOURCE_EXTS (see is_source_file); avoiding
+# `declare -A` and `mapfile` lets the hook run on bash 3.2 as well as bash 4+.
+
+EXCLUDE_PATH_RE='(^|/)(vendor|target|\.git)(/|$)'
+GENERATED_RE='(^|/)(alembic[^/]*/versions|migrations)(/|$)'
+EXCLUDE_NAME_RE='(.+_test\..+)$'
+
+is_source_file() { local ext="${1##*.}" e; for e in "${SOURCE_EXTS[@]}"; do [ "$ext" = "$e" ] && return 0; done; return 1; }
+
+is_excluded() { # succeeds when the path or its basename matches one of the exclude patterns
+  local f="$1" base
+  echo "$f" | grep -qE "$EXCLUDE_PATH_RE" && return 0
+  echo "$f" | grep -qE "$GENERATED_RE" && return 0
+  base=$(basename "$f")
+  echo "$base" | grep -qE "$EXCLUDE_NAME_RE" && return 0
+  return 1
+}
+
+collect_all() { # print every source file below the current directory, minus VCS, cache and build dirs
+  local fa=() first=1 e
+  for e in "${SOURCE_EXTS[@]}"; do
+    if [ "$first" = 1 ]; then fa+=( -name "*.$e" ); first=0; else fa+=( -o -name "*.$e" ); fi
+  done
+  find . -type f \( "${fa[@]}" \) \
+    -not -path './.git/*' -not -path '*/node_modules/*' \
+    -not -path '*/__pycache__/*' -not -path '*/.venv/*' -not -path '*/target/*' \
+    | sed 's|^\./||'
+}
+
+files=()
+if [ "${1:-}" = "--all" ]; then while IFS= read -r f; do files+=("$f"); done < <(collect_all); else files=("$@"); fi
+
+warnings=0; errors=0; warn_list=""; error_list=""
+for file in ${files[@]+"${files[@]}"}; do # the +"..." form keeps an empty list from tripping `set -u` on bash < 4.4
+  [ -z "$file" ] && continue
+  [ ! -f "$file" ] && continue
+  is_source_file "$file" || continue
+  is_excluded "$file" && continue
+  lines=$(wc -l < "$file")
+  if [ "$lines" -gt "$ERROR_THRESHOLD" ]; then
+    errors=$((errors + 1))
+    error_list="${error_list}| \`${file}\` | ${lines} | :x: exceeds ${ERROR_THRESHOLD} |\n"
+  elif [ "$lines" -gt "$WARN_THRESHOLD" ]; then
+    warnings=$((warnings + 1))
+    warn_list="${warn_list}| \`${file}\` | ${lines} | :warning: exceeds ${WARN_THRESHOLD} |\n"
+  fi
+done
+
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && { [ "$errors" -gt 0 ] || [ "$warnings" -gt 0 ]; }; then
+  {
+    echo "## Large File Report"; echo ""
+    echo "| File | Lines | Status |"; echo "|------|-------|--------|"
+    [ "$errors" -gt 0 ] && printf '%b' "$error_list"
+    [ "$warnings" -gt 0 ] && printf '%b' "$warn_list"
+    echo ""; echo "**Thresholds:** warn at ${WARN_THRESHOLD} lines, error at ${ERROR_THRESHOLD} lines"
+  } >> "$GITHUB_STEP_SUMMARY"
+fi
+
+format_list() { if command -v column >/dev/null 2>&1; then printf '%b' "$1" | column -t -s '|'; else printf '%b' "$1"; fi; }
+
+if [ "$errors" -gt 0 ]; then echo "::error::${errors} file(s) exceed the ${ERROR_THRESHOLD}-line error threshold" >&2; format_list "$error_list" >&2; fi
+if [ "$warnings" -gt 0 ]; then echo "::warning::${warnings} file(s) exceed the ${WARN_THRESHOLD}-line warning threshold" >&2; format_list "$warn_list" >&2; fi
+if [ "$errors" -eq 0 ] && [ "$warnings" -eq 0 ]; then echo "All source files are within the ${WARN_THRESHOLD}-line limit."; fi
+
+[ "$errors" -gt 0 ] && exit 1
+exit 0
From 0fb034a86d6c57e62afa3079bac460487fc6a1c0 Mon Sep 17 00:00:00 2001
From: Claude
Date: Thu, 25 Jun 2026 23:19:53 +0000
Subject: [PATCH 2/2] Fix cargo PATH in session-start hook after fresh rustup install

rustup installs cargo to ~/.cargo/bin, which the hook never added to PATH,
so `cargo fetch` would fail on a clean machine.
Add ~/.cargo/bin to PATH (persisted to CLAUDE_ENV_FILE), source ~/.cargo/env after a fresh install, and split the install/fetch off the `||` one-liner so fetch only runs once cargo is resolvable. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_017wmfUGTZR6HFL78eSjYxBU --- .claude/hooks/session-start.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.claude/hooks/session-start.sh b/.claude/hooks/session-start.sh index d556b53..1b9a3f2 100755 --- a/.claude/hooks/session-start.sh +++ b/.claude/hooks/session-start.sh @@ -5,14 +5,20 @@ set -euo pipefail [ "${CLAUDE_CODE_REMOTE:-}" != "true" ] && exit 0 cd "${CLAUDE_PROJECT_DIR:-.}" -export PATH="$HOME/.local/bin:$PATH" -line='export PATH="$HOME/.local/bin:$PATH"' +# rustup installs cargo under ~/.cargo/bin; prek installs under ~/.local/bin. +export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" +line='export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"' if [ -n "${CLAUDE_ENV_FILE:-}" ] && ! grep -qF "$line" "$CLAUDE_ENV_FILE" 2>/dev/null; then echo "$line" >> "$CLAUDE_ENV_FILE" fi -# Install deps (Rust toolchain + fetch crates). -command -v cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; cargo fetch +# Install deps (Rust toolchain + fetch crates). Source cargo env after a fresh +# rustup install so `cargo` is on PATH for this run. +if ! command -v cargo >/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + [ -f "$HOME/.cargo/env" ] && . "$HOME/.cargo/env" +fi +cargo fetch # Install prek (Rust binary, language-agnostic), then wire the git hooks. command -v prek >/dev/null 2>&1 || curl -LsSf https://prek.j178.dev/install.sh | sh