diff --git a/.claude/hooks/session-start.sh b/.claude/hooks/session-start.sh new file mode 100755 index 0000000..1b9a3f2 --- /dev/null +++ b/.claude/hooks/session-start.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Install deps + wire prek git hooks so cloud commits run the same checks as local. +# Scoped to remote (web/cloud) sessions; remove the guard to run locally too. +set -euo pipefail +[ "${CLAUDE_CODE_REMOTE:-}" != "true" ] && exit 0 +cd "${CLAUDE_PROJECT_DIR:-.}" + +# rustup installs cargo under ~/.cargo/bin; prek installs under ~/.local/bin. +export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" +line='export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"' +if [ -n "${CLAUDE_ENV_FILE:-}" ] && ! grep -qF "$line" "$CLAUDE_ENV_FILE" 2>/dev/null; then + echo "$line" >> "$CLAUDE_ENV_FILE" +fi + +# Install deps (Rust toolchain + fetch crates). Source cargo env after a fresh +# rustup install so `cargo` is on PATH for this run. +if ! command -v cargo >/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + [ -f "$HOME/.cargo/env" ] && . "$HOME/.cargo/env" +fi +cargo fetch + +# Install prek (Rust binary, language-agnostic), then wire the git hooks. 
+command -v prek >/dev/null 2>&1 || curl -LsSf https://prek.j178.dev/install.sh | sh +prek install +exit 0 diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..ea39e04 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,15 @@ +{ + "hooks": { + "SessionStart": [ + { + "matcher": "startup|resume", + "hooks": [ + { + "type": "command", + "command": "$CLAUDE_PROJECT_DIR/.claude/hooks/session-start.sh" + } + ] + } + ] + } +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 018591d..a5208d2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,6 +25,8 @@ jobs: with: components: rustfmt, clippy - uses: Swatinem/rust-cache@v2 + - name: AI writing check + run: scripts/check_ai_writing.sh - name: Format run: cargo fmt --all --check - name: Clippy diff --git a/.github/workflows/folder-size.yaml b/.github/workflows/folder-size.yaml new file mode 100644 index 0000000..8e4f30a --- /dev/null +++ b/.github/workflows/folder-size.yaml @@ -0,0 +1,22 @@ +name: Folder Size Check +on: + workflow_dispatch: + pull_request: + paths: + - '**.rs' +jobs: + check-folder-sizes: + name: Folder File Count Limit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v7 + with: { fetch-depth: 0 } + - name: Check for oversized folders + run: | + if [ -n "${{ github.event.pull_request.base.sha }}" ]; then + mapfile -t files < <(git diff --name-only --diff-filter=d "${{ github.event.pull_request.base.sha }}...HEAD") + [ "${#files[@]}" -eq 0 ] && { echo "No files changed."; exit 0; } + scripts/check_folder_sizes.sh "${files[@]}" + else + scripts/check_folder_sizes.sh --all + fi diff --git a/.github/workflows/large-files.yaml b/.github/workflows/large-files.yaml new file mode 100644 index 0000000..1fd2a80 --- /dev/null +++ b/.github/workflows/large-files.yaml @@ -0,0 +1,22 @@ +name: Large File Check +on: + workflow_dispatch: + pull_request: + paths: + - '**.rs' +jobs: + check-file-sizes: + name: Source File Line 
Limit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v7 + with: { fetch-depth: 0 } + - name: Check for large source files + run: | + if [ -n "${{ github.event.pull_request.base.sha }}" ]; then + mapfile -t files < <(git diff --name-only --diff-filter=d "${{ github.event.pull_request.base.sha }}...HEAD") + [ "${#files[@]}" -eq 0 ] && { echo "No files changed."; exit 0; } + scripts/check_large_files.sh "${files[@]}" + else + scripts/check_large_files.sh --all + fi diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index e18b9e0..bf64f18 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,4 +1,4 @@ -# stdiod — Architecture +# stdiod - Architecture `edison-stdiod` is a small, long-lived daemon that runs on a user's machine. It maintains a single authenticated outbound connection to a backend, supervises a @@ -16,14 +16,14 @@ contract is described here. - **Subprocesses run locally.** Every MCP stdio server the daemon manages is spawned as a child process on the user's machine. Nothing is spawned remotely. - **The backend is the source of truth.** The daemon stores almost no durable - state of its own — it connects, fetches the desired set of servers, and + state of its own - it connects, fetches the desired set of servers, and reconciles its running children against it. ## Components `edison-stdiod` is a single binary that is both the long-lived service and the -control CLI. Its responsibilities — described here by role, independent of how -the source happens to be arranged on disk — are: +control CLI. 
Its responsibilities - described here by role, independent of how +the source happens to be arranged on disk - are: ``` control commands ┌──────────────────────────────┐ @@ -43,15 +43,15 @@ the source happens to be arranged on disk — are: Edison backend local MCP servers ``` -- **Control surface** — the CLI subcommands a user runs to authenticate, +- **Control surface** - the CLI subcommands a user runs to authenticate, register the OS service, manage servers, and inspect state. They persist configuration; they do not carry MCP traffic. -- **Supervisor** — the long-lived run loop: connect, fetch desired state, +- **Supervisor** - the long-lived run loop: connect, fetch desired state, reconcile running children against it, and supervise. -- **Tunnel transport** — the single outbound WebSocket and its framing. It is +- **Tunnel transport** - the single outbound WebSocket and its framing. It is MCP-agnostic: MCP frames are forwarded as opaque bytes (see [MCP-agnostic by design](#mcp-agnostic-by-design)). -- **Child supervision** — spawning each desired server as a subprocess and +- **Child supervision** - spawning each desired server as a subprocess and pumping its stdio to and from the tunnel. Cross-cutting concerns sit beneath all of the above: the thin HTTP client for @@ -79,7 +79,7 @@ reverse tunnel because: - One authentication check, one stateful connection, lowest latency. - Server-initiated frames (desired-state pushes, credential invalidations) are - natural — the backend can talk to the daemon at any time. + natural - the backend can talk to the daemon at any time. - It reuses the same outbound TLS:443 posture that already traverses corporate firewalls, with no third-party tunnelling dependency. @@ -94,13 +94,13 @@ Defined as JSON Schema at `schema/tunnel-protocol.json`. Frames are JSON with a `hostname`, `label`, `os`, `client_version`, `currently_running: [server_id]`. Sent immediately after the socket is established. 
- `server_hello` (backend → daemon): `protocol_version` plus a **full - desired-state snapshot** — + desired-state snapshot** - `servers: [{server_id, name, command, args, env, working_dir, enabled}]`. If the daemon's `protocol_version` is below the minimum the backend supports, the upgrade is refused with a `needs_upgrade` close code; the daemon records `needs_upgrade=true` in `state.json` and stops retrying until the binary is updated. -- `desired_state_update` (backend → daemon): steady-state delta — +- `desired_state_update` (backend → daemon): steady-state delta - `added` / `updated` / `removed` server lists. - `device_status` (daemon → backend): periodic snapshot of which children are running and their last health timestamp. @@ -113,7 +113,7 @@ Defined as JSON Schema at `schema/tunnel-protocol.json`. Frames are JSON with a - `fetch_logs_request` / `fetch_logs_response`: an operator-initiated, bounded (default 200 lines) pull of a child's recent `stdout`/`stderr`. Never streamed continuously, to keep bandwidth predictable. -- `ping` / `pong` (both directions): heartbeat — see +- `ping` / `pong` (both directions): heartbeat - see [Disconnect handling](#disconnect-handling). The `request_id` on `fetch_logs_*` is a control-layer correlation id, distinct @@ -123,7 +123,7 @@ from the JSON-RPC `id` carried inside MCP frames. - `mcp_frame` (both directions): a JSON-RPC frame addressed to or originating from a specific child. Fields: `server_id` and `frame` (the JSON-RPC body - verbatim — request, response, or notification). + verbatim - request, response, or notification). - `tunnel_error` (both directions): a structured, non-JSON-RPC error (subprocess crashed, unknown server, transport fault). Carries the inner JSON-RPC `id` it relates to when applicable, so the receiver can fail the @@ -132,14 +132,14 @@ from the JSON-RPC `id` carried inside MCP frames. 
A single symmetric frame type captures every MCP interaction because JSON-RPC's own envelope already distinguishes requests (`id` + `method`), responses (`id` + `result`/`error`), and notifications (`method`, no `id`). JSON-RPC `id`s are -scoped to the originator, so the inner `id` is the correlation key — no outer +scoped to the originator, so the inner `id` is the correlation key - no outer `request_id` is needed for MCP traffic. ### MCP-agnostic by design The transport is **MCP-agnostic**: the daemon's `tunnel` module treats every `frame` field as opaque bytes and never inspects its contents. This is a -load-bearing invariant — any temptation to sniff a method name or peek at +load-bearing invariant - any temptation to sniff a method name or peek at `params` inside the daemon is a smell; that logic belongs above the transport, on the backend. @@ -151,7 +151,7 @@ Concrete consequences: - **Bidirectional notifications** (e.g. `notifications/cancelled`, `notifications/progress`) are just notification-shaped `mcp_frame`s. - **MCP version bumps and new methods** require no changes anywhere in the - daemon — `initialize` negotiation happens between the backend and the stdio + daemon - `initialize` negotiation happens between the backend and the stdio server, both outside the transport. ## Child-process supervision @@ -178,7 +178,7 @@ waiting for a response that never arrives. The WebSocket itself stays open and other children on the same device are unaffected; the supervisor then decides whether and when to respawn the dead child per the latest desired state. This was the one behaviour the early spike could not derive from "treat MCP frames as -opaque" alone — it is a deliberate active signal the daemon must produce. +opaque" alone - it is a deliberate active signal the daemon must produce. ## Persistence and survival @@ -205,7 +205,7 @@ The daemon keeps almost nothing durable; the backend is the source of truth. 
~/.config/edison-stdiod/ config.toml backend URL, device_id, api_key, secret state.json atomic writes; consumed by the desktop tray UI -~/Library/Logs/edison-stdiod/ (macOS — platform-equivalent paths elsewhere) +~/Library/Logs/edison-stdiod/ (macOS - platform-equivalent paths elsewhere) daemon.log rotated daily child-.log per-child stdout/stderr capture ``` @@ -254,7 +254,7 @@ The daemon keeps almost nothing durable; the backend is the source of truth. Every (re)connect runs the same protocol: 1. Daemon sends `client_hello { device_id, currently_running: [...] }`. -2. Backend replies `server_hello { servers: [...] }` — a full desired-state +2. Backend replies `server_hello { servers: [...] }` - a full desired-state snapshot for this device. 3. Daemon diffs: - Start any enabled server not currently running. @@ -268,15 +268,15 @@ Every (re)connect runs the same protocol: Every outbound `mcp_frame` carries a JSON-RPC `id` used as the correlation key. On socket close, all outstanding calls are failed cleanly (the backend surfaces a `device_offline`-style JSON-RPC error to the caller); there are no automatic -retries — the calling agent decides whether to retry. +retries - the calling agent decides whether to retry. ## CLI The same binary is the daemon and the control CLI: -- `edison-stdiod login --backend --api-key ` — store credentials. -- `edison-stdiod install` / `uninstall` — manage the OS service unit. -- `edison-stdiod run` — run the daemon (normally invoked by the service unit). -- `edison-stdiod server …` — add / list / remove locally-defined servers. -- `edison-stdiod status` — show connection and per-child state. -- `edison-stdiod logs` — tail daemon / child logs. +- `edison-stdiod login --backend --api-key ` - store credentials. +- `edison-stdiod install` / `uninstall` - manage the OS service unit. +- `edison-stdiod run` - run the daemon (normally invoked by the service unit). +- `edison-stdiod server …` - add / list / remove locally-defined servers. 
+- `edison-stdiod status` - show connection and per-child state. +- `edison-stdiod logs` - tail daemon / child logs. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d46b4c..2953edc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -51,7 +51,7 @@ cargo fmt --all --check && \ - For ordinary bugs and feature requests, open a [GitHub issue](https://github.com/Edison-Watch/stdiod/issues). -- For **security vulnerabilities**, do **not** open a public issue — follow +- For **security vulnerabilities**, do **not** open a public issue - follow [`SECURITY.md`](./SECURITY.md) instead. ## License diff --git a/README.md b/README.md index 879968e..6a0088f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

stdiod

-Bridge local stdio MCP servers to a remote backend over a single outbound WebSocket — no inbound ports, and your processes, files, and credentials never leave the machine. +Bridge local stdio MCP servers to a remote backend over a single outbound WebSocket - no inbound ports, and your processes, files, and credentials never leave the machine.

@@ -23,14 +23,14 @@ --- -**stdiod** is a small Rust daemon that bridges local [stdio MCP servers](https://modelcontextprotocol.io/) to the Edison Watch backend over one outbound WebSocket tunnel. It runs on a user's machine, dials out to the backend (no inbound ports), and lets the backend drive locally-spawned MCP server subprocesses — forwarding MCP frames in both directions. An AI client talking to the backend's gateway reaches these local servers as if they were hosted remotely, while the processes (and their filesystem and credentials) stay on the user's device. +**stdiod** is a small Rust daemon that bridges local [stdio MCP servers](https://modelcontextprotocol.io/) to the Edison Watch backend over one outbound WebSocket tunnel. It runs on a user's machine, dials out to the backend (no inbound ports), and lets the backend drive locally-spawned MCP server subprocesses - forwarding MCP frames in both directions. An AI client talking to the backend's gateway reaches these local servers as if they were hosted remotely, while the processes (and their filesystem and credentials) stay on the user's device.

An AI client reaches the Edison backend gateway, which drives the stdiod daemon over a single outbound WebSocket tunnel; the daemon spawns and supervises local stdio MCP servers on the user's machine.

> [!WARNING] -> **Experimental (v0.0.1).** Early software under active development; expect bugs. It has **not** had an independent security audit. The wire protocol, CLI surface, and on-disk formats may change without notice before a 1.0 release. Today the daemon runs as a supervised service on **macOS only** — Linux and Windows support is on the roadmap, and the CLI will tell you when a step is unsupported on your platform. +> **Experimental (v0.0.1).** Early software under active development; expect bugs. It has **not** had an independent security audit. The wire protocol, CLI surface, and on-disk formats may change without notice before a 1.0 release. Today the daemon runs as a supervised service on **macOS only** - Linux and Windows support is on the roadmap, and the CLI will tell you when a step is unsupported on your platform. ## How it works @@ -39,7 +39,7 @@ - **Child supervision.** The backend pushes a desired set of servers; the daemon spawns/stops the matching subprocesses and pumps their stdio. - **Survival.** It reconnects with backoff across network blips and machine sleep/resume, and reconciles desired state on every (re)connect. -See [`ARCHITECTURE.md`](./ARCHITECTURE.md) for the full design and [`schema/tunnel-protocol.json`](./schema/tunnel-protocol.json) for the wire protocol — the single source of truth for the frame types. +See [`ARCHITECTURE.md`](./ARCHITECTURE.md) for the full design and [`schema/tunnel-protocol.json`](./schema/tunnel-protocol.json) for the wire protocol - the single source of truth for the frame types. ## Install @@ -119,7 +119,7 @@ TLDR: `edison-stdiod --help` (and `edison-stdiod --help` for any subco | `logs` | Print the daemon log. `-f`/`--follow` to tail in real time; `-n`/`--lines N` to set the backscroll (default 200). | | `server add ` | Register a stdio_tunnel server. `--command `, repeatable `--arg `, optional `--working-dir` and `--display-name`. The prefix name must be alphanumeric (plus hyphens). 
| | `server list` | List stdio_tunnel servers registered for this device. `--json` for raw output. | -| `server remove ` | Delete a server by name. Idempotent — a missing name is reported as a no-op. | +| `server remove ` | Delete a server by name. Idempotent - a missing name is reported as a no-op. | @@ -132,8 +132,8 @@ TLDR: `edison-stdiod login` writes everything to `~/.config/edison-stdiod/config Settings resolve in two layers, highest precedence first: -1. **CLI flags / environment variables** — handy for development overrides. -2. **`~/.config/edison-stdiod/config.toml`** — written by `edison-stdiod login`; this is what the OS supervisor unit reads (service units don't carry secrets in their environment). +1. **CLI flags / environment variables** - handy for development overrides. +2. **`~/.config/edison-stdiod/config.toml`** - written by `edison-stdiod login`; this is what the OS supervisor unit reads (service units don't carry secrets in their environment). ```toml # ~/.config/edison-stdiod/config.toml (mode 0600) @@ -158,7 +158,7 @@ Rotate the API key by re-running `edison-stdiod login --api-key …`. To remove ## Files on disk -TLDR: the daemon keeps almost nothing durable — the backend is the source of truth. +TLDR: the daemon keeps almost nothing durable - the backend is the source of truth.
Expand @@ -167,7 +167,7 @@ TLDR: the daemon keeps almost nothing durable — the backend is the source of t ~/.config/edison-stdiod/ config.toml # backend URL, device_id, api_key, secret (mode 0600) state.json # atomic writes; snapshot consumed by the desktop tray UI -~/Library/Logs/edison-stdiod/ # macOS — platform-equivalent paths elsewhere +~/Library/Logs/edison-stdiod/ # macOS - platform-equivalent paths elsewhere daemon.log # rotated daily child-.log # per-child stdout/stderr capture ``` @@ -204,7 +204,7 @@ TLDR: one outbound WebSocket carries a symmetric, MCP-agnostic frame protocol; t ``` - **Outbound-only & reverse RPC.** The daemon dials out; the backend drives it. Server-initiated frames (desired-state pushes, sampling requests, credential invalidations) are natural over the single long-lived connection. -- **MCP-agnostic transport.** The `tunnel` module treats each `frame` field as opaque bytes — MCP version bumps and new methods need no daemon changes. +- **MCP-agnostic transport.** The `tunnel` module treats each `frame` field as opaque bytes - MCP version bumps and new methods need no daemon changes. - **Reconcile on (re)connect.** `client_hello` → `server_hello` (full desired-state snapshot) → diff and start/stop/restart children; steady-state changes arrive as `desired_state_update` deltas.
@@ -223,7 +223,7 @@ cargo fmt --all --check # formatting cargo clippy --workspace --all-targets -- -D warnings # lints ``` -The `tunnel-protocol` crate's Rust types are generated from [`schema/tunnel-protocol.json`](./schema/tunnel-protocol.json) — keep the schema and the generated types in lock-step. +The `tunnel-protocol` crate's Rust types are generated from [`schema/tunnel-protocol.json`](./schema/tunnel-protocol.json) - keep the schema and the generated types in lock-step. [`dev/spike/`](./dev/spike/) holds a throwaway v0 Python prototype that validated the wire protocol before the Rust daemon was written; it is kept as a historical record and is not part of the build. @@ -235,12 +235,12 @@ See [`CONTRIBUTING.md`](./CONTRIBUTING.md) for the contribution workflow and [`S This software is built with: -- [Tokio](https://tokio.rs/) — async runtime -- [tokio-tungstenite](https://github.com/snapview/tokio-tungstenite) — WebSocket transport -- [reqwest](https://github.com/seanmonstar/reqwest) — HTTP client for the backend REST surface -- [clap](https://github.com/clap-rs/clap) — CLI parsing -- [serde](https://serde.rs/) + [serde_json](https://github.com/serde-rs/json) — serialization -- [tracing](https://github.com/tokio-rs/tracing) — structured logging +- [Tokio](https://tokio.rs/) - async runtime +- [tokio-tungstenite](https://github.com/snapview/tokio-tungstenite) - WebSocket transport +- [reqwest](https://github.com/seanmonstar/reqwest) - HTTP client for the backend REST surface +- [clap](https://github.com/clap-rs/clap) - CLI parsing +- [serde](https://serde.rs/) + [serde_json](https://github.com/serde-rs/json) - serialization +- [tracing](https://github.com/tokio-rs/tracing) - structured logging ## License diff --git a/SECURITY.md b/SECURITY.md index c7d9c9c..6f76394 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -17,9 +17,9 @@ requests, or discussions.** Instead, report privately through either channel: -- **GitHub private advisory** — open the repository's 
**Security** tab and click +- **GitHub private advisory** - open the repository's **Security** tab and click **"Report a vulnerability"** ([private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability)). -- **Email** — . +- **Email** - . Please include enough detail to reproduce: affected version/commit, platform, configuration, and a description of the impact. We aim to acknowledge reports diff --git a/docs/architecture.svg b/docs/architecture.svg index 708275c..b3285bb 100644 --- a/docs/architecture.svg +++ b/docs/architecture.svg @@ -39,7 +39,7 @@ source of truth - + @@ -80,12 +80,12 @@ ONE WEBSOCKET TUNNEL outbound · TLS 443 · Bearer auth - no inbound ports — daemon dials out + no inbound ports - daemon dials out stdio - Processes, files, and credentials stay on the user's device — the daemon only dials out. + Processes, files, and credentials stay on the user's device - the daemon only dials out. 
diff --git a/prek.toml b/prek.toml new file mode 100644 index 0000000..a096e43 --- /dev/null +++ b/prek.toml @@ -0,0 +1,30 @@ +[[repos]] +repo = "https://github.com/pre-commit/pre-commit-hooks" +rev = "v4.6.0" +hooks = [ + { id = "check-added-large-files" }, +] + +[[repos]] +repo = "local" +hooks = [ + { id = "ai-writing-check", name = "AI writing check", entry = "scripts/check_ai_writing.sh", language = "system", pass_filenames = false, always_run = true }, +] + +# ── Source-size guardrails (mirror GitHub Actions) ──────────────── +[[repos]] +repo = "local" + +[[repos.hooks]] +id = "check-large-files" +name = "fail if any source file exceeds the line-count error threshold" +language = "system" +entry = "scripts/check_large_files.sh" +files = "\\.(rs)$" + +[[repos.hooks]] +id = "check-folder-sizes" +name = "fail if any source folder exceeds the file-count error threshold" +language = "system" +entry = "scripts/check_folder_sizes.sh" +files = "\\.(rs)$" diff --git a/scripts/check_ai_writing.sh b/scripts/check_ai_writing.sh new file mode 100755 index 0000000..db6fd2a --- /dev/null +++ b/scripts/check_ai_writing.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# AI writing check: fail on em dashes or contrastive-parallelism constructions. +# Pure ripgrep/git-grep - no language runtime or build needed. +# POSIX-ERE-safe patterns (no lazy quantifiers, no (?:); apostrophes written as +# `.` so they match '/’ and dodge shell quoting) work in both rg and git grep -E. +set -uo pipefail +cd "$(dirname "$0")/.." || exit 1 # no set -e above, so stop here instead of scanning the wrong tree 
+ +EM_DASH=$'\xe2\x80\x94' # U+2014 +SELF="scripts/check_ai_writing.sh" + +CONTRA='not (just|only|merely|simply)[^.?!]{0,60} but' +CONTRA="$CONTRA"'|(it.s|that.s|this is) not [^.?!]{0,60}(it.s|that.s|they.re)' +CONTRA="$CONTRA"'|(isn.t|aren.t|wasn.t|weren.t) (just|only|merely|simply)' +CONTRA="$CONTRA"'|(isn.t|aren.t) (just )?about[^.?!]{0,60}it.s about' +CONTRA="$CONTRA"'|more than just' +CONTRA="$CONTRA"'|less about[^.?!]{0,60}more about' +CONTRA="$CONTRA"'|not [^.?!]{0,40}so much as' +CONTRA="$CONTRA"'|goes? beyond' # noisiest; drop this line if it over-flags + +if command -v rg >/dev/null 2>&1; then + em=$(rg -n --hidden --glob '!.git' --glob "!$SELF" -e "$EM_DASH" . || true) + contra=$(rg -ni --hidden --glob '!.git' --glob "!$SELF" -e "$CONTRA" . || true) +else + # git grep scans tracked files only - target/ and other build output excluded for free. + em=$(git grep -n -e "$EM_DASH" -- . ":(exclude)$SELF" || true) + contra=$(git grep -niE -e "$CONTRA" -- . ":(exclude)$SELF" || true) +fi + +fail=0 +if [ -n "$em" ]; then echo "AI writing check failed: em dash (U+2014) detected"; echo "$em"; fail=1; fi +if [ -n "$contra" ]; then echo "AI writing check failed: contrastive parallelism ('not just X, but Y') detected"; echo "$contra"; fail=1; fi +[ "$fail" -eq 0 ] && echo "AI writing check passed." +exit "$fail" diff --git a/scripts/check_folder_sizes.sh b/scripts/check_folder_sizes.sh new file mode 100755 index 0000000..358f1c4 --- /dev/null +++ b/scripts/check_folder_sizes.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +# +# Enforce a file-count limit on source folders (non-recursive). Shared by +# .github/workflows/folder-size.yaml and prek.toml. +# check_folder_sizes.sh [file ...] # check folders containing the given files +# check_folder_sizes.sh --all # scan every folder +# Thresholds via FOLDER_WARN_THRESHOLD / FOLDER_ERROR_THRESHOLD. 
+set -euo pipefail + +WARN_THRESHOLD="${FOLDER_WARN_THRESHOLD:-20}" +ERROR_THRESHOLD="${FOLDER_ERROR_THRESHOLD:-35}" + +SOURCE_EXTS=(rs) + +GRANDFATHERED=() # folders allowed to exceed (warn instead of error) + +EXCLUDE_PATH_RE='(^|/)(vendor|target|\.git)(/|$)' +GENERATED_RE='(^|/)(alembic[^/]*/versions|migrations)(/|$)' + +is_grandfathered() { local t="$1" g; for g in ${GRANDFATHERED[@]+"${GRANDFATHERED[@]}"}; do [ "$t" = "$g" ] && return 0; done; return 1; } # ${arr[@]+...} guard: a bare empty-array expansion aborts under set -u on bash < 4.4 +should_skip() { # returns 0 (skip) for an empty name, ".", an excluded path, or a generated path + local f="$1"; [ -z "$f" ] && return 0; [ "$f" = "." ] && return 0 + echo "$f" | grep -qE "$EXCLUDE_PATH_RE" && return 0 + echo "$f" | grep -qE "$GENERATED_RE" && return 0 + return 1 +} + +count_folder() { # prints how many non-test source files sit directly inside $1 (no recursion) + local fa=() first=1 e + for e in "${SOURCE_EXTS[@]}"; do + if [ "$first" = 1 ]; then fa+=( -name "*.$e" ); first=0; else fa+=( -o -name "*.$e" ); fi + done + find "$1" -mindepth 1 -maxdepth 1 -type f \( "${fa[@]}" \) \ + -not -name 'test_*' -not -name '*_test.*' -not -name '*.test.*' \ + -not -name '*.spec.*' -not -name '*.d.ts' -not -name 'conftest.py' | wc -l +} + +collect_all_folders() { # prints every directory under . with the leading ./ stripped, minus paths inside .git, node_modules, __pycache__, .venv and target + find . -type d -not -path './.git/*' -not -path '*/node_modules/*' \ + -not -path '*/__pycache__/*' -not -path '*/.venv/*' -not -path '*/target/*' | sed 's|^\./||' +} + +folder_list=$(mktemp); trap 'rm -f "$folder_list"' EXIT +if [ "${1:-}" = "--all" ]; then collect_all_folders > "$folder_list" +else for f in "$@"; do [ -z "$f" ] && continue; dirname "$f"; done | sort -u > "$folder_list"; fi + +warnings=0; errors=0; warn_list=""; error_list="" +while IFS= read -r folder; do + folder="${folder#./}" + should_skip "$folder" && continue + [ ! 
-d "$folder" ] && continue + count=$(count_folder "$folder") + if [ "$count" -gt "$ERROR_THRESHOLD" ]; then + if is_grandfathered "$folder"; then + warnings=$((warnings + 1)); warn_list="${warn_list}| \`${folder}/\` | ${count} | :warning: exceeds ${ERROR_THRESHOLD} (grandfathered) |\n" + else + errors=$((errors + 1)); error_list="${error_list}| \`${folder}/\` | ${count} | :x: exceeds ${ERROR_THRESHOLD} |\n" + fi + elif [ "$count" -gt "$WARN_THRESHOLD" ]; then + warnings=$((warnings + 1)); warn_list="${warn_list}| \`${folder}/\` | ${count} | :warning: exceeds ${WARN_THRESHOLD} |\n" + fi +done < "$folder_list" + +if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && { [ "$errors" -gt 0 ] || [ "$warnings" -gt 0 ]; }; then + { + echo "## Folder Size Report"; echo "" + echo "| Folder | Files | Status |"; echo "|--------|-------|--------|" + [ "$errors" -gt 0 ] && printf '%b' "$error_list" + [ "$warnings" -gt 0 ] && printf '%b' "$warn_list" + echo ""; echo "**Thresholds:** warn at ${WARN_THRESHOLD} files, error at ${ERROR_THRESHOLD} files. Immediate source children only - subfolders are the fix." + } >> "$GITHUB_STEP_SUMMARY" +fi + +format_list() { if command -v column >/dev/null 2>&1; then printf '%b' "$1" | column -t -s '|'; else printf '%b' "$1"; fi; } + +if [ "$errors" -gt 0 ]; then echo "::error::${errors} folder(s) exceed the ${ERROR_THRESHOLD}-file error threshold" >&2; format_list "$error_list" >&2; fi +if [ "$warnings" -gt 0 ]; then echo "::warning::${warnings} folder(s) exceed the ${WARN_THRESHOLD}-file warning threshold" >&2; format_list "$warn_list" >&2; fi +if [ "$errors" -eq 0 ] && [ "$warnings" -eq 0 ]; then echo "All folders are within the ${WARN_THRESHOLD}-file limit."; fi + +[ "$errors" -gt 0 ] && exit 1 +exit 0 diff --git a/scripts/check_large_files.sh b/scripts/check_large_files.sh new file mode 100755 index 0000000..ef758bb --- /dev/null +++ b/scripts/check_large_files.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +# +# Enforce a line-count limit on source files. 
Shared by +# .github/workflows/large-files.yaml and prek.toml. +# check_large_files.sh [file ...] # check the given files +# check_large_files.sh --all # scan the whole tree +# Thresholds via LARGE_FILE_WARN_THRESHOLD / LARGE_FILE_ERROR_THRESHOLD. +# Exit 1 on errors, 0 on warnings-only or clean. +set -euo pipefail + +WARN_THRESHOLD="${LARGE_FILE_WARN_THRESHOLD:-500}" +ERROR_THRESHOLD="${LARGE_FILE_ERROR_THRESHOLD:-800}" + +SOURCE_EXTS=(rs) + +# Extension matching walks SOURCE_EXTS directly: associative arrays and mapfile +# need bash >= 4, and the bash that ships with macOS is 3.2. + +EXCLUDE_PATH_RE='(^|/)(vendor|target|\.git)(/|$)' +GENERATED_RE='(^|/)(alembic[^/]*/versions|migrations)(/|$)' +EXCLUDE_NAME_RE='(.+_test\..+)$' + +is_source_file() { local e; for e in "${SOURCE_EXTS[@]}"; do case "$1" in *".$e") return 0 ;; esac; done; return 1; } + +is_excluded() { # returns 0 (skip) for excluded paths, generated paths, and *_test.* basenames + local f="$1" base + echo "$f" | grep -qE "$EXCLUDE_PATH_RE" && return 0 + echo "$f" | grep -qE "$GENERATED_RE" && return 0 + base=$(basename "$f") + echo "$base" | grep -qE "$EXCLUDE_NAME_RE" && return 0 + return 1 +} + +collect_all() { # prints every source file under . with the leading ./ stripped, minus dependency and build trees + local fa=() first=1 e + for e in "${SOURCE_EXTS[@]}"; do + if [ "$first" = 1 ]; then fa+=( -name "*.$e" ); first=0; else fa+=( -o -name "*.$e" ); fi + done + find . -type f \( "${fa[@]}" \) \ + -not -path './.git/*' -not -path '*/node_modules/*' \ + -not -path '*/__pycache__/*' -not -path '*/.venv/*' -not -path '*/target/*' \ + | sed 's|^\./||' +} + +files=() +if [ "${1:-}" = "--all" ]; then while IFS= read -r file; do files+=("$file"); done < <(collect_all); else files=("$@"); fi + +warnings=0; errors=0; warn_list=""; error_list="" +for file in ${files[@]+"${files[@]}"}; do + [ -z "$file" ] && continue + [ ! 
-f "$file" ] && continue + is_source_file "$file" || continue + is_excluded "$file" && continue + lines=$(wc -l < "$file") + if [ "$lines" -gt "$ERROR_THRESHOLD" ]; then + errors=$((errors + 1)) + error_list="${error_list}| \`${file}\` | ${lines} | :x: exceeds ${ERROR_THRESHOLD} |\n" + elif [ "$lines" -gt "$WARN_THRESHOLD" ]; then + warnings=$((warnings + 1)) + warn_list="${warn_list}| \`${file}\` | ${lines} | :warning: exceeds ${WARN_THRESHOLD} |\n" + fi +done + +if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && { [ "$errors" -gt 0 ] || [ "$warnings" -gt 0 ]; }; then + { + echo "## Large File Report"; echo "" + echo "| File | Lines | Status |"; echo "|------|-------|--------|" + [ "$errors" -gt 0 ] && printf '%b' "$error_list" + [ "$warnings" -gt 0 ] && printf '%b' "$warn_list" + echo ""; echo "**Thresholds:** warn at ${WARN_THRESHOLD} lines, error at ${ERROR_THRESHOLD} lines" + } >> "$GITHUB_STEP_SUMMARY" +fi + +format_list() { if command -v column >/dev/null 2>&1; then printf '%b' "$1" | column -t -s '|'; else printf '%b' "$1"; fi; } + +if [ "$errors" -gt 0 ]; then echo "::error::${errors} file(s) exceed the ${ERROR_THRESHOLD}-line error threshold" >&2; format_list "$error_list" >&2; fi +if [ "$warnings" -gt 0 ]; then echo "::warning::${warnings} file(s) exceed the ${WARN_THRESHOLD}-line warning threshold" >&2; format_list "$warn_list" >&2; fi +if [ "$errors" -eq 0 ] && [ "$warnings" -eq 0 ]; then echo "All source files are within the ${WARN_THRESHOLD}-line limit."; fi + +[ "$errors" -gt 0 ] && exit 1 +exit 0