diff --git a/CLAUDE.md b/CLAUDE.md index c27c626..311b536 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,9 +29,14 @@ it first; everything else is a stage it calls, in order: JSDoc, with precise source spans. 3. **call graph** (`src/semantic_analysis`) — `selectProvider()` picks tsc / jelly / union; each provider returns edges + external (phantom) symbols. -4. **cache** (`src/utils/cache.ts`) — content-hash cache under `.codeanalyzer/`, so - re-analysis only touches what changed. -5. **output** (`src/build`, `src/build/neo4j`) — `analysis.json`, a self-contained +4. **program graphs** (`src/dataflow`) — level 3 only (`-a 3`): CFG → post-dominance/CDG → + access-path def-use → PDG → SCC-condensed bottom-up summaries → SDG, emitted as the + `program_graphs` section keyed by `(signature, node_id)`. Decisions: the "Level 3" + half of `.claude/SCHEMA_DECISIONS.md`; contract + staged follow-ups: issue #2. +5. **cache** (`src/utils/cache.ts`) — content-hash cache under `.codeanalyzer/`, so + re-analysis only touches what changed (level 3 also records summaries + + dependency edges in `graphs_summaries.json`). +6. **output** (`src/build`, `src/build/neo4j`) — `analysis.json`, a self-contained `graph.cypher` snapshot, or an incremental Bolt push to a live database. The shape of everything is the **schema** in `src/schema` (`TSApplication` is the top @@ -47,10 +52,11 @@ a contract. 
| `src/options` | Parsed CLI options / `AnalysisOptions` | | `src/syntactic_analysis` | Symbol table (ts-morph traversal) | | `src/semantic_analysis` | Call-graph providers (tsc, jelly, union), phantoms | -| `src/schema` | `TSApplication` types + signatures (the output contract) | +| `src/dataflow` | Level-3 program graphs: CFG, dominance/CDG, def-use, summaries, SDG, slicing | +| `src/schema` | `TSApplication` types + signatures + `program_graphs` (the output contract) | | `src/build` | Dep materialization + output; `build/neo4j` = graph projection | | `src/utils` | fs, caching, logging, serialization, version | -| `test` | Bun tests + `fixtures/sample-app` | +| `test` | Bun tests + `fixtures/sample-app` (levels 1–2) + `fixtures/dataflow-app` (level-3 gates) | ## Commands diff --git a/README.md b/README.md index 5646c55..62918f0 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,17 @@ Options: (default: "neo4j", env: NEO4J_PASSWORD) --neo4j-database Neo4j database name (env: NEO4J_DATABASE) -a, --analysis-level analysis depth: 1 = symbol table + tsc resolver - call graph + RTA (default); 2 = call graph - (default: "1") + call graph + RTA (default); 2 = call graph; 3 = + + program graphs (CFG/PDG/SDG) (default: "1") + --graphs level-3 graph sections to emit, + comma-separated: cfg | dfg | pdg | sdg + (default: all; requires -a 3) + --graph-field-depth access-path depth bound (k-limit) for level-3 + dataflow (default: "3") + -j, --jobs worker parallelism for level-3 graphs (default: + sequential; opt in with N ≥ 2 on large projects + — each worker loads its own copy of the + program) -t, --target-files restrict analysis to specific files (incremental) --skip-tests skip test trees (default) @@ -213,6 +222,12 @@ Options: cants --input ./my-ts-project --eager --cache-dir /path/to/custom-cache ``` +6. 
**Program graphs (level 3): CFG/PDG/SDG in `analysis.json`:** + ```sh + cants --input ./my-ts-project -a 3 # full program_graphs section + cants --input ./my-ts-project -a 3 --graphs cfg,pdg # scope the emitted graphs + ``` + ## Output targets `cants` builds one analysis in memory and can emit it three ways (`--emit`): @@ -234,6 +249,56 @@ A `TSApplication` document — the canonical CLDK contract the Python SDK parses Caller- and callee-side identifiers come from a single signature canonicalizer, so call-graph `source`/`target` values byte-match the corresponding `symbol_table` / `external_symbols` keys. +### Program graphs (`-a 3`) + +`--analysis-level 3` adds a `program_graphs` section to `analysis.json` — native, whole-program +dependence graphs built in-process from the same ts-morph AST (no external engine), per the CLDK +level-3 dataflow contract: + +```jsonc +{ + "program_graphs": { + "schema_version": "1.0.0", + "k_limit": 3, // access-path depth bound (--graph-field-depth) + "functions": { + "<signature>": { + "cfg": { "nodes": [...], "edges": [...] }, // exceptional control-flow graph + "pdg": { "edges": [...] } // CDG (control) + DDG (data) dependence + } + }, + "sdg_edges": [ /* cross-function CALL / PARAM_IN / PARAM_OUT / SUMMARY edges */ ] + } +} +``` + +Every graph node is keyed by `(signature, node_id)` — the same signature canonicalizer as the +symbol table and call graph — so graphs, call edges, and callables all join. `--graphs +cfg,dfg,pdg,sdg` scopes the emitted sections (default: all). + +**Substrate (locked in [issue #2](https://github.com/codellm-devkit/codeanalyzer-typescript/issues/2)):** +the CFG and reaching-definitions are hand-built from the ts-morph AST; the call-graph oracle is +the existing provenance-merged tsc ∪ Jelly graph; aliasing is a flow-insensitive copy-alias MVP +(Jelly points-to-backed propagation is a staged upgrade). 
Function summaries are composed +bottom-up over the SCC condensation of the call graph, with k-limited access paths; module +globals ride the SDG as extra parameters. The analysis is deliberately sound-leaning and +over-approximate; known unsoundness (dynamic `eval`, reflection/monkey-patching, npm-internal +effects) is recorded in `.claude/SCHEMA_DECISIONS.md`. Backward slicing and taint run as queries +over the SDG (slicing ships now inside the analyzer; the configurable taint pack is staged). + +**Parallelism (`-j/--jobs`).** The pipeline implements the level-3 parallel execution model: +stage-1–4 extraction fans out per callable over a Bun worker pool (partitioned by file) and is +posted *before* the call-graph solve so the two overlap; summary composition runs as a +Kahn-style ready-queue wavefront over the SCC condensation DAG (the SCC is the atomic unit). +`--jobs N` output is **byte-identical** to `--jobs 1` (node ids are span-ordered, all edge lists +are collect-then-sorted, and the SCC fixpoint is a pure function of its inputs) — enforced by a +differential test. It is off by default and worth opting into only on large codebases: ts-morph +ASTs cannot cross the worker boundary, so each extraction worker loads its own copy of the +program, which dominates the parallelizable graph math on small/mid repos (self-analysis runs +2.5× slower at `-j 14`). Worker failure at any stage degrades to the sequential path with a +warning — never to wrong or missing output. + +Levels 1/2 are unaffected: nothing in level 3 runs unless `-a 3` is requested. 
+ ### Neo4j graph `--emit neo4j` projects the same analysis into a labeled property graph (declarations keyed by diff --git a/package.json b/package.json index b3b15e2..7a2d193 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,7 @@ }, "scripts": { "start": "bun run src/index.ts", - "build": "bun build ./src/main.ts --compile --external @babel/preset-typescript --outfile dist/cants", + "build": "bun build ./src/main.ts ./src/dataflow/worker.ts --compile --external @babel/preset-typescript --outfile dist/cants", "gen:schema": "bun run src/index.ts --emit schema > schema.neo4j.json", "gen:readme": "bun run scripts/update-readme.ts", "test:container": "RUN_CONTAINER_TESTS=1 bun test test/neo4j-bolt.test.ts", diff --git a/src/cli.ts b/src/cli.ts index fba5dd4..261049f 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1,6 +1,7 @@ import * as path from "node:path"; import { Command, Option } from "commander"; import type { AnalysisOptions, CallGraphProviderName, EmitTarget } from "./options"; +import { ALL_GRAPHS, type GraphSelector } from "./schema"; /** * Build the commander program. 
Shared by parseArgs and by the README generator @@ -35,7 +36,20 @@ export function buildProgram(): Command { .default("neo4j"), ) .addOption(new Option("--neo4j-database <name>", "Neo4j database name").env("NEO4J_DATABASE")) - .option("-a, --analysis-level <level>", "analysis depth: 1 = symbol table + tsc resolver call graph + RTA (default); 2 = call graph", "1") + .option( + "-a, --analysis-level <level>", + "analysis depth: 1 = symbol table + tsc resolver call graph + RTA (default); 2 = call graph; 3 = + program graphs (CFG/PDG/SDG)", + "1", + ) + .option( + "--graphs <list>", + "level-3 graph sections to emit, comma-separated: cfg | dfg | pdg | sdg (default: all; requires -a 3)", + ) + .option("--graph-field-depth <k>", "access-path depth bound (k-limit) for level-3 dataflow", "3") + .option( + "-j, --jobs <n>", + "worker parallelism for level-3 graphs (default: sequential; opt in with N ≥ 2 on large projects — each worker loads its own copy of the program)", + ) .option("-t, --target-files ", "restrict analysis to specific files (incremental)") .option("--skip-tests", "skip test trees (default)") .option("--include-tests", "include test trees") @@ -61,7 +75,45 @@ export function parseArgs(argv: string[]): AnalysisOptions { program.parse(argv, { from: "user" }); const o = program.opts(); - const level = String(o.analysisLevel) === "2" ? 2 : 1; + const levelStr = String(o.analysisLevel); + if (!["1", "2", "3"].includes(levelStr)) { + program.error(`error: invalid --analysis-level '${levelStr}' (expected 1, 2, or 3)`); + } + const level = Number(levelStr) as 1 | 2 | 3; + + // --graphs: strict validation (never a silent fallback), and only meaningful at -a 3. 
+ let graphs: GraphSelector[] = [...ALL_GRAPHS]; + if (o.graphs !== undefined) { + if (level !== 3) program.error("error: --graphs requires --analysis-level 3"); + const requested = String(o.graphs) + .split(",") + .map((g) => g.trim()) + .filter((g) => g.length > 0); + if (!requested.length) program.error("error: --graphs requires at least one of: cfg, dfg, pdg, sdg"); + for (const g of requested) { + if (!(ALL_GRAPHS as string[]).includes(g)) { + program.error(`error: unknown --graphs value '${g}' (expected: cfg, dfg, pdg, sdg)`); + } + } + graphs = [...new Set(requested)] as GraphSelector[]; + } + + const kStr = String(o.graphFieldDepth); + const k = Number(kStr); + if (!Number.isInteger(k) || k < 1) { + program.error(`error: invalid --graph-field-depth '${kStr}' (expected a positive integer)`); + } + + // -j/--jobs: explicit value must be a positive integer; omitted ⇒ 0 = auto, resolved against + // the project size at extraction time (see startExtraction). + let jobs = 0; + if (o.jobs !== undefined) { + const j = Number(String(o.jobs)); + if (!Number.isInteger(j) || j < 1) { + program.error(`error: invalid --jobs '${String(o.jobs)}' (expected a positive integer)`); + } + jobs = j; + } const emit: EmitTarget = o.emit === "neo4j" ? "neo4j" : o.emit === "schema" ? "schema" : "json"; // --emit schema is a static artifact and needs no project; every other target requires -i. if (emit !== "schema" && !o.input) program.error("required option '-i, --input ' not specified"); @@ -94,6 +146,9 @@ export function parseArgs(argv: string[]): AnalysisOptions { neo4jPassword: String(o.neo4jPassword), neo4jDatabase: o.neo4jDatabase ? String(o.neo4jDatabase) : null, analysisLevel: level, + graphs, + graphFieldDepth: k, + jobs, targetFiles: targets, skipTests: o.includeTests ? 
false : true, eager: Boolean(o.eager), diff --git a/src/core.ts b/src/core.ts index 85e768a..f29861f 100644 --- a/src/core.ts +++ b/src/core.ts @@ -1,4 +1,5 @@ import * as path from "node:path"; +import { buildProgramGraphs, startExtraction } from "./dataflow"; import { selectProvider } from "./semantic_analysis"; import { loadCache, saveCache } from "./utils"; import { materialize } from "./build"; @@ -11,7 +12,7 @@ import { Logger } from "./utils"; * The orchestrator. Order mirrors the reference analyzers: materialize deps → build the symbol * table → build the resolver call graph → cache the base → return the Application. */ -export function analyze(opts: AnalysisOptions): TSApplication { +export async function analyze(opts: AnalysisOptions): Promise<TSApplication> { const log = new Logger(opts.verbosity); log.info(`analyzing ${opts.input} (level ${opts.analysisLevel})`); const cacheDir = opts.cacheDir ?? path.join(opts.input, ".codeanalyzer"); @@ -22,6 +23,11 @@ export function analyze(opts: AnalysisOptions): TSApplication { const cached = opts.eager ? null : loadCache(cacheDir); const { project, symbol_table } = buildSymbolTable(opts, mat, cached?.symbol_table ?? null, log); + // Level 3: post stage-1–4 graph extraction to the worker pool BEFORE the call-graph solve — + // extraction doesn't need callee resolution, so the two run concurrently (the contract's + // "points-to solve runs concurrently with stages 1–4") and join in buildProgramGraphs. + const extraction = opts.analysisLevel === 3 ? startExtraction(project, symbol_table, mat.tsConfigFilePath, opts, log) : null; + // Call graph via the selected provider (union of tsc+jelly by default; --tsc-only / jelly opt-in). 
const provider = selectProvider(opts.callGraphProvider); log.info(`call graph provider: ${provider.name}`); @@ -34,6 +40,13 @@ export function analyze(opts: AnalysisOptions): TSApplication { external_symbols: cg.external_symbols, synthesized_callables: cg.synthesized_callables, }; + + // Level 3 join: stages 5–7 (summary wavefront + SDG) consume the extraction AND the + // provider-backfilled callee signatures. Strictly flag-gated so -a 1/-a 2 cost nothing. + if (extraction) { + app.program_graphs = await buildProgramGraphs(extraction, symbol_table, opts, log); + } + saveCache(cacheDir, { symbol_table, call_graph }); return app; } diff --git a/src/dataflow/cfg.ts b/src/dataflow/cfg.ts new file mode 100644 index 0000000..a8200d0 --- /dev/null +++ b/src/dataflow/cfg.ts @@ -0,0 +1,482 @@ +/** + * Stage 1 — exceptional CFG per callable, lowered directly from the ts-morph AST. + * + * Shape: statement-level nodes plus one `param` node per formal, between a synthetic ENTRY (id 0) + * and a synthetic EXIT (last id). Node ids are the source-span order of the owning AST nodes + * within the callable, so they are stable across runs on identical content. + * + * TS lowering rules (each asserted by the fixture gate tests): + * - if/loops/switch: the control statement itself is the condition-carrying node; branches get + * `true`/`false` edges, the loop back edge is `loop_back`, switch dispatch is `switch_case` + * (clause fallthrough between non-terminated clauses is `fallthrough`). + * - A `for` node carries its init/condition/incrementor as one node; `for-of`/`for-in` nodes + * bind the iteration variable. + * - Multi-exit is normalized: `return` → EXIT (`return` edge), fall-off-end → EXIT + * (`fallthrough`), `throw` → nearest enclosing handler or EXIT (`exception`). 
+ * - Exceptional edges are over-approximate: every node whose expression contains a call / `new` / + * `await` / tagged template — and every `throw` — gets an `exception` edge to the nearest + * enclosing catch node, else the enclosing finally block, else EXIT. Bare property reads are + * NOT treated as throwing (documented unsoundness for TypeError-on-undefined). + * - try/catch/finally: region splicing, not duplication — try-body throwers edge to the catch + * node (which binds the exception variable), catch-body throwers to the finally entry or + * outward; a finally block's exits additionally edge to the outer handler (`exception`), + * over-approximating the rethrow continuation. + * - `await` / `yield`: the suspending statement's outgoing normal edge is kind `await_resume` / + * `yield` (resumption), per the shared vocabulary. + * - Short-circuit (`&&`/`||`/`??`), ternaries, and optional chaining stay intra-statement: they + * do not split nodes; their reads are attributed to the containing statement + * (over-approximation, recorded in SCHEMA_DECISIONS.md). + * - Infinite loops (`while (true)` / `for (;;)`): the dead loop-exit `false` edge is still + * emitted from the loop header, which is exactly the synthetic edge post-dominance needs. + * - Nested callables (arrows, function expressions, declarations): the declaring statement is a + * single node in the enclosing CFG; the nested body gets its own CFG (closure capture is a + * def-use concern, stage 3). + */ +import { Node, SyntaxKind, type SourceFile } from "ts-morph"; +import type { CfgEdge, CfgEdgeKind } from "../schema"; +import type { DfNode, FunctionCfgBuild } from "./model"; + +/** A dangling forward edge: source node emitted, target not yet known. */ +interface Dangling { + from: number; + kind: CfgEdgeKind; +} + +/** Result of lowering a statement (list): its entry node and the dangling normal exits. 
*/ +interface Lowered { + entry: number | null; // null ⇒ the region is empty (control passes straight through) + exits: Dangling[]; +} + +interface LoopLabel { + breaks: Dangling[]; + continueHeader: number | null; +} + +interface LowerCtx { + /** Nearest enclosing handler node (catch node / finally entry) or EXIT. */ + exceptionTarget: number; + /** Break/continue sinks of the nearest enclosing loop/switch. */ + nearestBreaks: Dangling[] | null; + nearestContinueHeader: number | null; + /** Labeled loops currently being lowered, keyed by label text. */ + labels: Map<string, LoopLabel>; +} + +/** Lowers one callable to its CFG build record (nodes, edges, entry/exit/param ids); returns null when the callable has no body. */ +export function buildCfg(signature: string, fn: Node): FunctionCfgBuild | null { + const body = getBodyNode(fn); + if (!body) return null; // ambient / abstract / overload signature / implicit ctor — no graph + const sf = fn.getSourceFile(); + + // ---- pass 1: collect the node universe and assign span-ordered ids ---- + const astNodes: Node[] = []; + const params = getParameters(fn); + for (const p of params) astNodes.push(p); + collectStatementNodes(body, astNodes); + astNodes.sort((a, b) => a.getStart() - b.getStart() || b.getEnd() - a.getEnd()); + + const nodes: DfNode[] = [{ id: 0, kind: "entry", ast: null }]; + const idOf = new Map<Node, number>(); + for (const [i, n] of astNodes.entries()) { + const id = i + 1; + idOf.set(n, id); + nodes.push({ id, kind: Node.isParameterDeclaration(n) ? "param" : "statement", ast: n }); + } + const exitId = astNodes.length + 1; + nodes.push({ id: exitId, kind: "exit", ast: null }); + + // ---- pass 2: lower to edges ---- + const edgeSet = new Set<string>(); // "source>target>kind" keys, so each edge is emitted once + const edges: CfgEdge[] = []; + const addEdge = (source: number, target: number, kind: CfgEdgeKind): void => { + const k = `${source}>${target}>${kind}`; + if (edgeSet.has(k)) return; + edgeSet.add(k); + edges.push({ source, target, kind }); + }; + + const lower = new Lowerer(idOf, addEdge, exitId); + const ctx: LowerCtx = { + exceptionTarget: exitId, + nearestBreaks: null, + nearestContinueHeader: null, + labels: new Map(), + }; + + // ENTRY → params (in order) → body. 
+ let cursor: Dangling[] = [{ from: 0, kind: "fallthrough" }]; + const paramIds: number[] = []; + for (const p of params) { + const pid = idOf.get(p) as number; + paramIds.push(pid); + for (const d of cursor) addEdge(d.from, pid, d.kind); + cursor = [{ from: pid, kind: "fallthrough" }]; + } + + let bodyLowered: Lowered; + if (Node.isBlock(body)) { + bodyLowered = lower.statements(body.getStatements(), ctx); + } else { + // Arrow expression body: one statement node, implicit return. + const id = idOf.get(body) as number; + lower.exceptionEdgeIfThrows(body, id, ctx); + addEdge(id, exitId, "return"); + bodyLowered = { entry: id, exits: [] }; + } + + if (bodyLowered.entry !== null) { + for (const d of cursor) addEdge(d.from, bodyLowered.entry, d.kind); + cursor = bodyLowered.exits; + } + // Fall-off-end → EXIT. + for (const d of cursor) addEdge(d.from, exitId, d.kind); + + return { signature, fn, sf, nodes, edges, entryId: 0, exitId, paramIds }; +} + +// ------------------------------------------------------------------------------------------------ +// Node collection (pass 1) +// ------------------------------------------------------------------------------------------------ + +/** Statements (and catch clauses) that become CFG nodes, recursing only through control structure. */ +function collectStatementNodes(node: Node, out: Node[]): void { + const stmts = Node.isBlock(node) ? 
node.getStatements() : [node]; + for (const s of stmts) { + if (Node.isBlock(s)) { + collectStatementNodes(s, out); + } else if (Node.isLabeledStatement(s)) { + collectStatementNodes(s.getStatement(), out); + } else if (Node.isIfStatement(s)) { + out.push(s); + collectStatementNodes(s.getThenStatement(), out); + const els = s.getElseStatement(); + if (els) collectStatementNodes(els, out); + } else if ( + Node.isWhileStatement(s) || + Node.isDoStatement(s) || + Node.isForStatement(s) || + Node.isForOfStatement(s) || + Node.isForInStatement(s) + ) { + out.push(s); + collectStatementNodes(s.getStatement(), out); + } else if (Node.isSwitchStatement(s)) { + out.push(s); + for (const clause of s.getClauses()) for (const cs of clause.getStatements()) collectStatementNodes(cs, out); + } else if (Node.isTryStatement(s)) { + collectStatementNodes(s.getTryBlock(), out); + const cc = s.getCatchClause(); + if (cc) { + out.push(cc); + collectStatementNodes(cc.getBlock(), out); + } + const fin = s.getFinallyBlock(); + if (fin) collectStatementNodes(fin, out); + } else { + // Leaf statement: expression / variable / return / throw / break / continue / nested + // function or class declaration / debugger / empty. One node; nested bodies excluded. + out.push(s); + } + } +} + +// ------------------------------------------------------------------------------------------------ +// Lowering (pass 2) +// ------------------------------------------------------------------------------------------------ + +/** Pass-2 lowering: walks statements and emits CFG edges through the injected addEdge callback. */ +class Lowerer { + constructor( + private idOf: Map<Node, number>, + private addEdge: (s: number, t: number, k: CfgEdgeKind) => void, + private exitId: number, + ) {} + + statements(stmts: readonly Node[], ctx: LowerCtx): Lowered { + let entry: number | null = null; + let cursor: Dangling[] | null = null; // null before the first statement + for (const s of stmts) { + const low = this.statement(s, ctx); + if (low.entry === null) continue; // empty region (e.g. 
bare block with nothing in it) + if (entry === null) entry = low.entry; + if (cursor) for (const d of cursor) this.addEdge(d.from, low.entry, d.kind); + cursor = low.exits; + } + return { entry, exits: cursor ?? [] }; + } + + statement(s: Node, ctx: LowerCtx, label?: string): Lowered { + if (Node.isBlock(s)) return this.statements(s.getStatements(), ctx); + if (Node.isLabeledStatement(s)) return this.statement(s.getStatement(), ctx, s.getLabel().getText()); + if (Node.isIfStatement(s)) return this.ifStatement(s, ctx); + if (Node.isWhileStatement(s) || Node.isForStatement(s) || Node.isForOfStatement(s) || Node.isForInStatement(s)) + return this.loop(s, ctx, label); + if (Node.isDoStatement(s)) return this.doLoop(s, ctx, label); + if (Node.isSwitchStatement(s)) return this.switchStatement(s, ctx); + if (Node.isTryStatement(s)) return this.tryStatement(s, ctx); + return this.leaf(s, ctx); + } + + private leaf(s: Node, ctx: LowerCtx): Lowered { + const id = this.idOf.get(s) as number; + this.exceptionEdgeIfThrows(s, id, ctx); + + if (Node.isReturnStatement(s)) { + this.addEdge(id, this.exitId, "return"); + return { entry: id, exits: [] }; + } + if (Node.isThrowStatement(s)) { + this.addEdge(id, ctx.exceptionTarget, "exception"); + return { entry: id, exits: [] }; + } + if (Node.isBreakStatement(s)) { + const lbl = s.getLabel()?.getText(); + const sink = lbl ? ctx.labels.get(lbl)?.breaks : ctx.nearestBreaks; + sink?.push({ from: id, kind: "break" }); + return { entry: id, exits: [] }; + } + if (Node.isContinueStatement(s)) { + const lbl = s.getLabel()?.getText(); + const header = lbl ? (ctx.labels.get(lbl)?.continueHeader ?? null) : ctx.nearestContinueHeader; + if (header !== null) this.addEdge(id, header, "continue"); + return { entry: id, exits: [] }; + } + // Plain statement: the outgoing normal edge carries the suspend/resume kind when the + // statement awaits or yields (stopping at nested function boundaries). 
+ const kind: CfgEdgeKind = containsKind(s, SyntaxKind.AwaitExpression) + ? "await_resume" + : containsKind(s, SyntaxKind.YieldExpression) + ? "yield" + : "fallthrough"; + return { entry: id, exits: [{ from: id, kind }] }; + } + + private ifStatement(s: Node, ctx: LowerCtx): Lowered { + if (!Node.isIfStatement(s)) throw new Error("unreachable"); + const id = this.idOf.get(s) as number; + this.exceptionEdgeIfThrows(s.getExpression(), id, ctx); + const then = this.statement(s.getThenStatement(), ctx); + const exits: Dangling[] = []; + if (then.entry !== null) { + this.addEdge(id, then.entry, "true"); + exits.push(...then.exits); + } else { + exits.push({ from: id, kind: "true" }); + } + const elseStmt = s.getElseStatement(); + if (elseStmt) { + const els = this.statement(elseStmt, ctx); + if (els.entry !== null) { + this.addEdge(id, els.entry, "false"); + exits.push(...els.exits); + } else { + exits.push({ from: id, kind: "false" }); + } + } else { + exits.push({ from: id, kind: "false" }); + } + return { entry: id, exits }; + } + + /** while / for / for-of / for-in: the statement node is the header (condition/binding). */ + private loop(s: Node, ctx: LowerCtx, label?: string): Lowered { + const id = this.idOf.get(s) as number; + const cond = Node.isWhileStatement(s) + ? s.getExpression() + : Node.isForStatement(s) + ? (s.getCondition() ?? null) + : Node.isForOfStatement(s) || Node.isForInStatement(s) + ? 
s.getExpression() + : null; + if (Node.isForStatement(s)) { + const init = s.getInitializer(); + if (init) this.exceptionEdgeIfThrows(init, id, ctx); + const incr = s.getIncrementor(); + if (incr) this.exceptionEdgeIfThrows(incr, id, ctx); + } + if (cond) this.exceptionEdgeIfThrows(cond, id, ctx); + + const breaks: Dangling[] = []; + const labelEntry: LoopLabel = { breaks, continueHeader: id }; + if (label) ctx.labels.set(label, labelEntry); + const bodyCtx: LowerCtx = { ...ctx, nearestBreaks: breaks, nearestContinueHeader: id }; + const body = this.statement(getLoopBody(s), bodyCtx); + if (label) ctx.labels.delete(label); + + if (body.entry !== null) { + this.addEdge(id, body.entry, "true"); + for (const d of body.exits) this.addEdge(d.from, id, "loop_back"); + } else { + this.addEdge(id, id, "loop_back"); // empty body: the header loops on itself + } + // The loop-exit edge is emitted even for `while (true)` / `for (;;)` — that dead `false` + // edge is the synthetic edge that keeps EXIT the unique post-dominance root. 
+ return { entry: id, exits: [{ from: id, kind: "false" }, ...breaks] }; + } + + private doLoop(s: Node, ctx: LowerCtx, label?: string): Lowered { + if (!Node.isDoStatement(s)) throw new Error("unreachable"); + const id = this.idOf.get(s) as number; // the do-while node carries the condition + this.exceptionEdgeIfThrows(s.getExpression(), id, ctx); + const breaks: Dangling[] = []; + if (label) ctx.labels.set(label, { breaks, continueHeader: id }); + const bodyCtx: LowerCtx = { ...ctx, nearestBreaks: breaks, nearestContinueHeader: id }; + const body = this.statement(s.getStatement(), bodyCtx); + if (label) ctx.labels.delete(label); + + if (body.entry !== null) { + for (const d of body.exits) this.addEdge(d.from, id, d.kind); + this.addEdge(id, body.entry, "loop_back"); // condition true → run the body again + return { entry: body.entry, exits: [{ from: id, kind: "false" }, ...breaks] }; + } + this.addEdge(id, id, "loop_back"); + return { entry: id, exits: [{ from: id, kind: "false" }, ...breaks] }; + } + + private switchStatement(s: Node, ctx: LowerCtx): Lowered { + if (!Node.isSwitchStatement(s)) throw new Error("unreachable"); + const id = this.idOf.get(s) as number; + this.exceptionEdgeIfThrows(s.getExpression(), id, ctx); + const breaks: Dangling[] = []; + const clauseCtx: LowerCtx = { ...ctx, nearestBreaks: breaks }; + + const clauses = s.getClauses(); + const lowered = clauses.map((c) => this.statements(c.getStatements(), clauseCtx)); + const exits: Dangling[] = []; + let nonEmptyDefault = false; + let pendingFallthrough: Dangling[] = []; + for (const [i, clause] of clauses.entries()) { + const low = lowered[i] as Lowered; + if (low.entry === null) continue; // empty clause: dispatch/fallthrough slides to the next + if (Node.isDefaultClause(clause)) nonEmptyDefault = true; + this.addEdge(id, low.entry, "switch_case"); + for (const d of pendingFallthrough) this.addEdge(d.from, low.entry, d.kind); + pendingFallthrough = low.exits; + } + 
exits.push(...pendingFallthrough, ...breaks); + // Without a (non-empty) default arm, dispatch may skip the switch entirely. + if (!nonEmptyDefault) exits.push({ from: id, kind: "fallthrough" }); + return { entry: id, exits }; + } + + /** + * Lowers try/catch/finally by splicing regions: the finally block is lowered first, the catch + * clause (if any) becomes the try body's exception target, and the normal exits of the try and + * catch bodies are routed through the finally block when one exists. + */ + private tryStatement(s: Node, ctx: LowerCtx): Lowered { + if (!Node.isTryStatement(s)) throw new Error("unreachable"); + const cc = s.getCatchClause(); + const fin = s.getFinallyBlock(); + + // Lower the finally region first so try/catch know their exceptional continuation. + let finLowered: Lowered | null = null; + if (fin) { + finLowered = this.statements(fin.getStatements(), ctx); + // A finally region may re-raise (it runs on the exceptional path too): over-approximate by + // edging every finally exit to the outer handler as well. + if (finLowered.entry !== null) { + for (const d of finLowered.exits) this.addEdge(d.from, ctx.exceptionTarget, "exception"); + } + } + const afterCatchTarget = finLowered?.entry ?? ctx.exceptionTarget; + + const exits: Dangling[] = []; + let catchEntry: number | null = null; + if (cc) { + const catchId = this.idOf.get(cc) as number; // binds the exception variable (a def, stage 3) + catchEntry = catchId; + const catchCtx: LowerCtx = { ...ctx, exceptionTarget: afterCatchTarget }; + const catchBody = this.statements(cc.getBlock().getStatements(), catchCtx); + if (catchBody.entry !== null) { + this.addEdge(catchId, catchBody.entry, "fallthrough"); + this.routeThroughFinally(catchBody.exits, finLowered, exits); + } else { + this.routeThroughFinally([{ from: catchId, kind: "fallthrough" }], finLowered, exits); + } + } + + const tryCtx: LowerCtx = { ...ctx, exceptionTarget: catchEntry ?? afterCatchTarget }; + const tryBody = this.statements(s.getTryBlock().getStatements(), tryCtx); + if (tryBody.entry !== null) { + this.routeThroughFinally(tryBody.exits, finLowered, exits); + } else if (finLowered !== null && finLowered.entry !== null) { + // Empty try block: control passes straight to finally. routeThroughFinally adds the finally + // exits only once (it skips exits already collected via the catch path). + this.routeThroughFinally([], finLowered, exits); + } + + // An empty try body contains nothing that can throw, so its catch clause is unreachable and + // must not become the region's normal-path entry: fall back to the finally entry, or report + // an empty region (entry null, no exits) so callers pass control straight through. + const entry = tryBody.entry ?? finLowered?.entry ?? null; + return { entry, exits: entry === null ? [] : exits }; + } + + /** Route a region's normal exits through the finally block (if any) or straight out. */ + private routeThroughFinally(regionExits: Dangling[], fin: Lowered | null, outExits: Dangling[]): void { + if (fin && fin.entry !== null) { + for (const d of regionExits) this.addEdge(d.from, fin.entry, d.kind); + for (const d of fin.exits) if (!outExits.includes(d)) outExits.push(d); + } else { + outExits.push(...regionExits); + } + } + + /** Over-approximate exceptional flow: calls / new / await / tagged templates may throw. */ + exceptionEdgeIfThrows(expr: Node, nodeId: number, ctx: LowerCtx): void { + if (mayThrow(expr)) this.addEdge(nodeId, ctx.exceptionTarget, "exception"); + } +} + +// ------------------------------------------------------------------------------------------------ +// AST helpers +// ------------------------------------------------------------------------------------------------ + +function getBodyNode(fn: Node): Node | undefined { + const f = fn as unknown as { getBody?: () => Node | undefined }; + return f.getBody?.(); +} + +function getParameters(fn: Node): Node[] { + const f = fn as unknown as { getParameters?: () => Node[] }; + return f.getParameters?.() ?? 
[]; +} + +function getLoopBody(s: Node): Node { + return (s as unknown as { getStatement: () => Node }).getStatement(); +} + +export function isFunctionBoundary(n: Node): boolean { + return ( + Node.isFunctionDeclaration(n) || + Node.isFunctionExpression(n) || + Node.isArrowFunction(n) || + Node.isMethodDeclaration(n) || + Node.isConstructorDeclaration(n) || + Node.isGetAccessorDeclaration(n) || + Node.isSetAccessorDeclaration(n) || + Node.isClassDeclaration(n) || + Node.isClassExpression(n) + ); +} + +/** Does this subtree (stopping at nested function boundaries) contain a node of `kind`? */ +export function containsKind(root: Node, kind: SyntaxKind): boolean { + if (root.getKind() === kind) return true; + let found = false; + root.forEachDescendant((n, traversal) => { + if (found) { + traversal.stop(); + return; + } + if (isFunctionBoundary(n)) { + traversal.skip(); + return; + } + if (n.getKind() === kind) { + found = true; + traversal.stop(); + } + }); + return found; +} + +/** May evaluating this subtree throw? Over-approximate: any call-like or await counts. */ +export function mayThrow(root: Node): boolean { + return ( + containsKind(root, SyntaxKind.CallExpression) || + containsKind(root, SyntaxKind.NewExpression) || + containsKind(root, SyntaxKind.AwaitExpression) || + containsKind(root, SyntaxKind.TaggedTemplateExpression) + ); +} diff --git a/src/dataflow/defuse.ts b/src/dataflow/defuse.ts new file mode 100644 index 0000000..2fc4a84 --- /dev/null +++ b/src/dataflow/defuse.ts @@ -0,0 +1,575 @@ +/** + * Stage 3 — variable identity (k-limited access paths) and local def-use (the DDG), split in two + * along the AST/data boundary: + * + * - `extractFunctionFacts` (AST-bound, ONCE per callable): walks the ts-morph AST and records + * each node's defs/uses, the copy-alias pairs, and the return-value nodes. This is the only + * half that touches the AST, so it can run inside extraction workers. 
+ * - `solveDefUse` (pure data, re-run freely): reaching definitions over the serialized CFG with + * callee global effects overlaid at callsite nodes → labeled DDG edges. The summary fixpoint + * (stage 6) re-runs ONLY this half per iteration — never re-extraction. + * + * Access-path model: `base(.field | [*])*`, where the base is a local, parameter, `this`, + * captured variable, or module binding — identified by its *declaration node* (so shadowed names + * in nested scopes never share a base) and labeled by its name. Module bindings are canonical + * `.` (the same prefix as signatures), which is what lets globals ride the + * SDG across functions and files. Depth is k-limited (`--graph-field-depth`): `x.f.g.h` with k=3 + * becomes `x.f.g.*`, which conservatively aliases every deeper path. + * + * Aliasing (MVP substrate, per issue #2 / SCHEMA_DECISIONS.md): flow-insensitive union-find over + * bases connected by direct copies (`const q = p`); a write through one name weakly updates the + * other. Points-to-backed aliasing via Jelly's solved state is the staged upgrade (PR F). + * + * Def-use: classic forward may reaching-definitions. Strong (killing) defs are whole-base writes + * to locals/params; every field write is weak. Captured/module/this bases get a synthetic def at + * ENTRY (their value on function entry) — the same convention the SDG uses when it targets ENTRY + * with global PARAM_IN edges. Reads inside nested callables of variables they capture are + * attributed to the declaring statement node (capture-at-declaration). EXIT doubles as the HRB + * formal-out node: return-value nodes and module-global writes get synthetic DDG edges into EXIT. 
+ */ +import { Node, SyntaxKind } from "ts-morph"; +import type { PdgEdge } from "../schema"; +import { fileKeyOf } from "../schema"; +import { isFunctionBoundary } from "./cfg"; +import { + dataAdjacency, + renderPath, + fieldsMayAlias, + type BaseKind, + type CallEffects, + type CallableGraphData, + type DefFact, + type DfNode, + type FunctionCfgBuild, + type NodeFacts, + type PathRef, +} from "./model"; + +// ------------------------------------------------------------------------------------------------ +// Extraction (AST-bound, once per callable) +// ------------------------------------------------------------------------------------------------ + +export interface FunctionFacts { + facts: Array<[number, NodeFacts]>; + aliasPairs: Array<[string, string]>; + returnValueNodes: number[]; +} + +export function extractFunctionFacts(build: FunctionCfgBuild, root: string, k: number): FunctionFacts { + const aliasPairs: Array<[string, string]> = []; + const union = (a: string, b: string): void => { + aliasPairs.push([a, b]); + }; + + const facts: Array<[number, NodeFacts]> = []; + for (const n of build.nodes) { + if (n.kind === "entry" || n.kind === "exit") { + facts.push([n.id, { defs: [], uses: [] }]); + continue; + } + facts.push([n.id, extractFacts(n, build, root, k, union)]); + } + + const returnValueNodes: number[] = []; + const body = getBody(build.fn); + const isExprBody = body !== undefined && !Node.isBlock(body); + for (const n of build.nodes) { + if (!n.ast) continue; + if (Node.isReturnStatement(n.ast) && n.ast.getExpression()) returnValueNodes.push(n.id); + else if (isExprBody && n.ast === body) returnValueNodes.push(n.id); // arrow expression body + } + + return { facts, aliasPairs, returnValueNodes }; +} + +function kLimit(fields: string[], k: number): string[] { + return fields.length > k ? 
[...fields.slice(0, k), "*"] : fields; +} + +function enclosingCallable(node: Node): Node | undefined { + let cur: Node | undefined = node.getParent(); + while (cur) { + if ( + Node.isFunctionDeclaration(cur) || + Node.isFunctionExpression(cur) || + Node.isArrowFunction(cur) || + Node.isMethodDeclaration(cur) || + Node.isConstructorDeclaration(cur) || + Node.isGetAccessorDeclaration(cur) || + Node.isSetAccessorDeclaration(cur) + ) + return cur; + cur = cur.getParent(); + } + return undefined; +} + +/** Resolve an identifier (in value position) to a trackable base, relative to callable `fn`. */ +function resolveBase(id: Node, fn: Node, root: string): PathRef | null { + let sym = id.getSymbol(); + if (!sym) return null; + const aliased = sym.getAliasedSymbol(); + if (aliased) sym = aliased; + const decls = sym.getDeclarations(); + const decl = decls && decls.length ? decls[0] : undefined; + if (!decl) return null; + + // Only variable-like declarations are dataflow bases. Callables, classes, enums, interfaces, + // namespaces and type aliases are code/type identities, not mutable values we track. + const isVarLike = + Node.isVariableDeclaration(decl) || Node.isBindingElement(decl) || Node.isParameterDeclaration(decl); + if (!isVarLike) return null; + + const name = (decl as unknown as { getName?: () => string }).getName?.() ?? id.getText(); + const declFn = enclosingCallable(decl); + if (declFn === fn) { + const kind: BaseKind = Node.isParameterDeclaration(decl) ? "param" : "local"; + return { key: `${kind}:${decl.getStart()}`, label: name, baseKind: kind, fields: [] }; + } + if (declFn === undefined) { + // Module-level binding. Only project-internal modules become canonical global paths. 
+ const sf = decl.getSourceFile(); + const fp = sf.getFilePath(); + if (sf.isDeclarationFile() || fp.includes("/node_modules/")) return null; + const { modulePrefix } = fileKeyOf(fp, root); + const canonical = `${modulePrefix}.${name}`; + return { key: canonical, label: canonical, baseKind: "module", fields: [] }; + } + // Declared in some other (enclosing) callable: captured. + return { key: `cap:${decl.getStart()}`, label: name, baseKind: "captured", fields: [] }; +} + +class ExprWalker { + defs: DefFact[] = []; + uses: PathRef[] = []; + + constructor( + private fn: Node, + private root: string, + private k: number, + private union: (a: string, b: string) => void, + ) {} + + walk(node: Node): void { + if (Node.isIdentifier(node)) { + const b = resolveBase(node, this.fn, this.root); + if (b) this.uses.push(b); + return; + } + if (node.getKind() === SyntaxKind.ThisKeyword) { + this.uses.push({ key: "this", label: "this", baseKind: "this", fields: [] }); + return; + } + if (Node.isPropertyAccessExpression(node) || Node.isElementAccessExpression(node)) { + const p = this.pathOf(node); + if (p) this.uses.push(p); + else for (const c of node.forEachChildAsArray()) this.walk(c); + if (Node.isElementAccessExpression(node)) { + const arg = node.getArgumentExpression(); + if (arg) this.walk(arg); // the index expression is read even when the path resolves + } + return; + } + if (Node.isBinaryExpression(node)) { + const opKind = node.getOperatorToken().getKind(); + if (isAssignmentOperator(opKind)) { + this.assignTarget(node.getLeft(), opKind !== SyntaxKind.EqualsToken); + this.walk(node.getRight()); + if (opKind === SyntaxKind.EqualsToken) this.copyUnion(node.getLeft(), node.getRight()); + return; + } + this.walk(node.getLeft()); + this.walk(node.getRight()); + return; + } + if (Node.isPrefixUnaryExpression(node) || Node.isPostfixUnaryExpression(node)) { + const op = node.getOperatorToken(); + if (op === SyntaxKind.PlusPlusToken || op === SyntaxKind.MinusMinusToken) { + 
this.assignTarget(node.getOperand(), true); + return; + } + this.walk(node.getOperand()); + return; + } + if (Node.isVariableDeclaration(node)) { + const nameNode = node.getNameNode(); + this.bindingDefs(nameNode); + const init = node.getInitializer(); + if (init) { + this.walk(init); + if (Node.isIdentifier(nameNode)) this.copyUnion(nameNode, init); + } + return; + } + if (Node.isShorthandPropertyAssignment(node)) { + const b = resolveBase(node.getNameNode(), this.fn, this.root); + if (b) this.uses.push(b); + return; + } + if (Node.isPropertyAssignment(node)) { + const init = node.getInitializer(); + if (init) this.walk(init); + const nm = node.getNameNode(); + if (Node.isComputedPropertyName(nm)) this.walk(nm); + return; + } + if (isFunctionBoundary(node)) { + this.captureScan(node); + return; + } + // Generic recursion. Type-position identifiers resolve to type declarations, which + // resolveBase rejects, so descending everywhere else is safe. + for (const c of node.forEachChildAsArray()) this.walk(c); + } + + /** LHS of an assignment / operand of ++ --: a def (plus a use for compound operators). */ + assignTarget(lhs: Node, alsoUses: boolean): void { + const target = unwrapExpr(lhs); + const p = this.pathOf(target) ?? (Node.isIdentifier(target) ? resolveBase(target, this.fn, this.root) : null); + if (target.getKind() === SyntaxKind.ThisKeyword) return; // `this` is not assignable + if (p) { + const strong = p.fields.length === 0 && (p.baseKind === "local" || p.baseKind === "param"); + this.defs.push({ ref: p, strong }); + if (alsoUses) this.uses.push(p); + if (Node.isElementAccessExpression(target)) { + const arg = target.getArgumentExpression(); + if (arg) this.walk(arg); + } + return; + } + if (Node.isObjectLiteralExpression(target) || Node.isArrayLiteralExpression(target)) { + // Destructuring assignment pattern: each element identifier is a def. 
+ for (const el of target.forEachChildAsArray()) this.assignTarget(el, false); + return; + } + // Untrackable target (e.g. `f().x = 1`): its subexpressions are still reads. + this.walk(target); + } + + /** Declaration name / binding pattern: strong defs for every bound name. */ + bindingDefs(nameNode: Node): void { + if (Node.isIdentifier(nameNode)) { + const b = resolveBase(nameNode, this.fn, this.root); + if (b) this.defs.push({ ref: b, strong: b.baseKind === "local" || b.baseKind === "param" }); + return; + } + // Object/array binding pattern: defs for each element name, uses for defaults/computed keys. + for (const el of nameNode.getDescendantsOfKind(SyntaxKind.BindingElement)) { + const n = el.getNameNode(); + if (Node.isIdentifier(n)) { + const b = resolveBase(n, this.fn, this.root); + if (b) this.defs.push({ ref: b, strong: b.baseKind === "local" || b.baseKind === "param" }); + } + const init = el.getInitializer(); + if (init) this.walk(init); + } + } + + /** Build a k-limited access path from a property/element access chain, if the root is trackable. */ + private pathOf(node: Node): PathRef | null { + const fields: string[] = []; + let cur: Node = node; + for (;;) { + cur = unwrapExpr(cur); + if (Node.isPropertyAccessExpression(cur)) { + fields.unshift(cur.getNameNode().getText()); + cur = cur.getExpression(); + } else if (Node.isElementAccessExpression(cur)) { + fields.unshift("[*]"); + cur = cur.getExpression(); + } else { + break; + } + } + let base: PathRef | null = null; + if (Node.isIdentifier(cur)) base = resolveBase(cur, this.fn, this.root); + else if (cur.getKind() === SyntaxKind.ThisKeyword) base = { key: "this", label: "this", baseKind: "this", fields: [] }; + if (!base) return null; + return { ...base, fields: kLimit(fields, this.k) }; + } + + /** `q = p` on bare bases: q and p may alias from here on (flow-insensitive, weak). 
*/ + private copyUnion(lhs: Node, rhs: Node): void { + const l = unwrapExpr(lhs); + const r = unwrapExpr(rhs); + if (!Node.isIdentifier(l) || !Node.isIdentifier(r)) return; + const lb = resolveBase(l, this.fn, this.root); + const rb = resolveBase(r, this.fn, this.root); + if (lb && rb) this.union(lb.key, rb.key); + } + + /** + * A nested callable: don't descend normally — attribute its reads of variables declared + * OUTSIDE it (captured locals, module bindings, `this`) to the declaring node (capture edges). + */ + captureScan(fnNode: Node): void { + fnNode.forEachDescendant((n) => { + if (Node.isIdentifier(n)) { + // Skip property-name positions; the receiver carries the read. + const parent = n.getParent(); + if (parent && Node.isPropertyAccessExpression(parent) && parent.getNameNode() === n) return; + if (parent && Node.isPropertyAssignment(parent) && parent.getNameNode() === n) return; + const b = resolveBase(n, this.fn, this.root); + if (!b) return; + // Only reads that escape the nested callable count: filter decls physically inside it. + const decl = declOf(n); + if (decl && decl.getStart() >= fnNode.getStart() && decl.getEnd() <= fnNode.getEnd()) return; + this.uses.push(b); + } else if (n.getKind() === SyntaxKind.ThisKeyword) { + this.uses.push({ key: "this", label: "this", baseKind: "this", fields: [] }); + } + }); + } +} + +function declOf(id: Node): Node | undefined { + let sym = id.getSymbol(); + if (!sym) return undefined; + const aliased = sym.getAliasedSymbol(); + if (aliased) sym = aliased; + const decls = sym.getDeclarations(); + return decls && decls.length ? 
decls[0] : undefined; +} + +function unwrapExpr(n: Node): Node { + let cur = n; + for (;;) { + if ( + Node.isParenthesizedExpression(cur) || + Node.isAsExpression(cur) || + Node.isNonNullExpression(cur) || + Node.isSatisfiesExpression(cur) + ) { + cur = cur.getExpression(); + } else { + return cur; + } + } +} + +function isAssignmentOperator(k: SyntaxKind): boolean { + return k >= SyntaxKind.FirstAssignment && k <= SyntaxKind.LastAssignment; +} + +function extractFacts( + n: DfNode, + build: FunctionCfgBuild, + root: string, + k: number, + union: (a: string, b: string) => void, +): NodeFacts { + const w = new ExprWalker(build.fn, root, k, union); + const ast = n.ast; + if (!ast) return { defs: [], uses: [] }; + + if (n.kind === "param") { + if (Node.isParameterDeclaration(ast)) { + w.bindingDefs(ast.getNameNode()); + const init = ast.getInitializer(); + if (init) w.walk(init); + } + return { defs: w.defs, uses: w.uses }; + } + + if (Node.isCatchClause(ast)) { + const v = ast.getVariableDeclaration(); + if (v) w.bindingDefs(v.getNameNode()); + return { defs: w.defs, uses: w.uses }; + } + if (Node.isIfStatement(ast) || Node.isWhileStatement(ast) || Node.isDoStatement(ast) || Node.isSwitchStatement(ast)) { + w.walk(ast.getExpression()); + return { defs: w.defs, uses: w.uses }; + } + if (Node.isForStatement(ast)) { + const init = ast.getInitializer(); + if (init) w.walk(init); + const cond = ast.getCondition(); + if (cond) w.walk(cond); + const incr = ast.getIncrementor(); + if (incr) w.walk(incr); + return { defs: w.defs, uses: w.uses }; + } + if (Node.isForOfStatement(ast) || Node.isForInStatement(ast)) { + const init = ast.getInitializer(); + if (init) { + if (Node.isVariableDeclarationList(init)) { + for (const d of init.getDeclarations()) w.bindingDefs(d.getNameNode()); + } else { + w.assignTarget(init, false); + } + } + w.walk(ast.getExpression()); + return { defs: w.defs, uses: w.uses }; + } + if (Node.isReturnStatement(ast) || Node.isThrowStatement(ast)) { + 
const e = (ast as unknown as { getExpression: () => Node | undefined }).getExpression(); + if (e) w.walk(e); + return { defs: w.defs, uses: w.uses }; + } + if (Node.isFunctionDeclaration(ast) || Node.isClassDeclaration(ast)) { + // Nested declaration statement: binds its name; body reads of outer state are capture uses. + const nm = (ast as unknown as { getNameNode?: () => Node | undefined }).getNameNode?.(); + if (nm && Node.isIdentifier(nm)) { + const b = resolveBase(nm, build.fn, root); + if (b) w.defs.push({ ref: b, strong: true }); + } + w.captureScan(ast); + return { defs: w.defs, uses: w.uses }; + } + if (Node.isBreakStatement(ast) || Node.isContinueStatement(ast) || Node.isDebuggerStatement(ast)) { + return { defs: [], uses: [] }; + } + // Expression statement, variable statement, arrow expression body, or any other leaf. + w.walk(ast); + return { defs: w.defs, uses: w.uses }; +} + +function getBody(fn: Node): Node | undefined { + return (fn as unknown as { getBody?: () => Node | undefined }).getBody?.(); +} + +// ------------------------------------------------------------------------------------------------ +// Solve (pure data — safe to re-run every fixpoint iteration, on any thread) +// ------------------------------------------------------------------------------------------------ + +export interface SolveResult { + ddg: PdgEdge[]; + /** The facts with callee effects and ENTRY ambient defs overlaid (what summaries read). 
*/ + effective: Map; +} + +export function solveDefUse(data: CallableGraphData, callEffects: Map): SolveResult { + // --- union-find over base keys (copy aliases, replayed from extraction) --- + const parent = new Map(); + const find = (x: string): string => { + let r = x; + while (parent.has(r) && parent.get(r) !== r) r = parent.get(r) as string; + return r; + }; + for (const [a, b] of data.aliasPairs) { + const ra = find(a); + const rb = find(b); + if (ra !== rb) parent.set(ra, rb); + } + + // --- overlay: base facts + callee global effects at callsite nodes (never mutate the base) --- + const effective = new Map(); + for (const [id, f] of data.facts) effective.set(id, { defs: [...f.defs], uses: [...f.uses] }); + for (const [nodeId, eff] of callEffects) { + const f = effective.get(nodeId); + if (!f) continue; + for (const g of eff.reads) f.uses.push({ key: g.key, label: g.key, baseKind: "module", fields: g.fields }); + for (const g of eff.writes) + f.defs.push({ ref: { key: g.key, label: g.key, baseKind: "module", fields: g.fields }, strong: false }); + } + + // --- ENTRY defs for ambient bases (module / captured / this): their value on entry --- + const entryFacts = effective.get(data.entryId) as NodeFacts; + const ambient = new Map(); + for (const f of effective.values()) { + for (const u of f.uses) if (u.baseKind !== "local" && u.baseKind !== "param") ambient.set(u.key, u); + for (const d of f.defs) if (d.ref.baseKind !== "local" && d.ref.baseKind !== "param") ambient.set(d.ref.key, d.ref); + } + for (const [key, ref] of ambient) { + entryFacts.defs.push({ ref: { key, label: ref.label, baseKind: ref.baseKind, fields: [] }, strong: false }); + } + + // --- def universe + GEN/KILL --- + interface DefEntry { + node: number; + ref: PathRef; + strong: boolean; + } + const universe: DefEntry[] = []; + const genOf = new Map>(); + for (const n of data.nodes) { + const gen = new Set(); + for (const d of effective.get(n.id)?.defs ?? 
[]) { + gen.add(universe.length); + universe.push({ node: n.id, ref: d.ref, strong: d.strong }); + } + genOf.set(n.id, gen); + } + const killOf = new Map>(); + for (const n of data.nodes) { + const kill = new Set(); + for (const d of effective.get(n.id)?.defs ?? []) { + if (!d.strong) continue; + for (const [i, u] of universe.entries()) { + if (u.node !== n.id && find(u.ref.key) === find(d.ref.key)) kill.add(i); + } + } + killOf.set(n.id, kill); + } + + // --- worklist --- + const { succ, pred } = dataAdjacency(data); + const inOf = new Map>(); + const outOf = new Map>(); + for (const n of data.nodes) { + inOf.set(n.id, new Set()); + outOf.set(n.id, new Set()); + } + const work: number[] = data.nodes.map((n) => n.id); + while (work.length) { + const id = work.shift() as number; + const inSet = new Set(); + for (const p of pred.get(id) ?? []) for (const d of outOf.get(p) ?? []) inSet.add(d); + const outSet = new Set(genOf.get(id)); + const kill = killOf.get(id) as Set; + for (const d of inSet) if (!kill.has(d)) outSet.add(d); + inOf.set(id, inSet); + const prev = outOf.get(id) as Set; + if (outSet.size !== prev.size || [...outSet].some((d) => !prev.has(d))) { + outOf.set(id, outSet); + for (const s of succ.get(id) ?? 
[]) if (!work.includes(s)) work.push(s); + } + } + + // --- DDG edges: def → use of a may-aliasing path --- + const mayAlias = (a: PathRef, b: PathRef): boolean => + find(a.key) === find(b.key) && fieldsMayAlias(a.fields, b.fields); + const ddg: PdgEdge[] = []; + const seen = new Set(); + for (const n of data.nodes) { + const f = effective.get(n.id) as NodeFacts; + if (!f.uses.length) continue; + const reaching = inOf.get(n.id) as Set; + for (const u of f.uses) { + for (const di of reaching) { + const d = universe[di] as DefEntry; + if (!mayAlias(d.ref, u)) continue; + const k2 = `${d.node}>${n.id}>${renderPath(u)}`; + if (seen.has(k2)) continue; + seen.add(k2); + ddg.push({ source: d.node, target: n.id, type: "DDG", var: renderPath(u) }); + } + } + } + + // --- formal-out routing: EXIT doubles as the SDG formal-out node --- + // PARAM_OUT edges source at the callee's EXIT, so the value that leaves the function must flow + // INTO it: return-value nodes carry the return value, module-global writes are live-out state. + // Without these edges a slice descending a PARAM_OUT would dead-end at EXIT. + for (const r of [...data.returnValueNodes].sort((a, b) => a - b)) { + const k2 = `${r}>${data.exitId}>return`; + if (!seen.has(k2)) { + seen.add(k2); + ddg.push({ source: r, target: data.exitId, type: "DDG", var: "return" }); + } + } + for (const n of data.nodes) { + if (n.id === data.entryId) continue; + for (const d of effective.get(n.id)?.defs ?? 
[]) { + if (d.ref.baseKind !== "module") continue; + const rendered = renderPath(d.ref); + const k2 = `${n.id}>${data.exitId}>${rendered}`; + if (!seen.has(k2)) { + seen.add(k2); + ddg.push({ source: n.id, target: data.exitId, type: "DDG", var: rendered }); + } + } + } + + return { ddg, effective }; +} diff --git a/src/dataflow/dominance.ts b/src/dataflow/dominance.ts new file mode 100644 index 0000000..9851e19 --- /dev/null +++ b/src/dataflow/dominance.ts @@ -0,0 +1,140 @@ +/** + * Stage 2 — post-dominators and control dependence. + * + * Post-dominators via the Cooper–Harper–Kennedy iterative algorithm run on the REVERSE CFG + * rooted at EXIT. Control dependence via Ferrante–Ottenstein–Warren: for each CFG edge (a, b) + * where b does not post-dominate a, every node from b up the post-dominator tree to (exclusive) + * ipdom(a) is control-dependent on a. + * + * The CFG is augmented with the standard ENTRY → EXIT edge for this computation only, so + * straight-line statements come out control-dependent on ENTRY (the function's outermost + * control region). The augmented edge is never emitted. + * + * Infinite loops need no special-casing here: stage 1 always emits the loop-exit `false` edge + * (even when the condition is literally `true`), which keeps EXIT the unique post-dominance root. + */ +import type { PdgEdge } from "../schema"; +import type { FunctionCfgBuild } from "./model"; + +/** Immediate post-dominator per node id (EXIT maps to itself). */ +export function postDominators(build: FunctionCfgBuild): Map { + const { exitId } = build; + const edges = augmentedEdges(build); + // Reverse-CFG adjacency: from EXIT we walk against the CFG edges. + const predOfReverse = new Map(); // reverse-graph successors = CFG predecessors + for (const n of build.nodes) predOfReverse.set(n.id, []); + for (const [a, b] of edges) predOfReverse.get(b)?.push(a); // in the reverse graph, b → a + + // Postorder of the reverse graph from EXIT (iterative DFS). 
+ const postorder: number[] = []; + const poNum = new Map(); + const visited = new Set([exitId]); + const stack: Array<{ node: number; next: number }> = [{ node: exitId, next: 0 }]; + while (stack.length) { + const top = stack[stack.length - 1] as { node: number; next: number }; + const kids = predOfReverse.get(top.node) as number[]; + if (top.next < kids.length) { + const k = kids[top.next++] as number; + if (!visited.has(k)) { + visited.add(k); + stack.push({ node: k, next: 0 }); + } + } else { + poNum.set(top.node, postorder.length); + postorder.push(top.node); + stack.pop(); + } + } + + const ipdom = new Map(); + ipdom.set(exitId, exitId); + const intersect = (u: number, v: number): number => { + let a = u; + let b = v; + while (a !== b) { + while ((poNum.get(a) as number) < (poNum.get(b) as number)) a = ipdom.get(a) as number; + while ((poNum.get(b) as number) < (poNum.get(a) as number)) b = ipdom.get(b) as number; + } + return a; + }; + + // Successor adjacency (CFG direction), for the intersect step. + const cfgSucc = new Map(); + for (const n of build.nodes) cfgSucc.set(n.id, []); + for (const [a, b] of edges) cfgSucc.get(a)?.push(b); + + // Iterate to fixpoint in reverse postorder (of the reverse graph). + let changed = true; + while (changed) { + changed = false; + for (let i = postorder.length - 1; i >= 0; i--) { + const n = postorder[i] as number; + if (n === exitId) continue; + // "Predecessors" in the reverse graph = CFG successors that already have an ipdom. + let candidate = -1; + for (const s of cfgSucc.get(n) ?? []) { + if (!ipdom.has(s) || !poNum.has(s)) continue; + candidate = candidate === -1 ? s : intersect(candidate, s); + } + if (candidate !== -1 && ipdom.get(n) !== candidate) { + ipdom.set(n, candidate); + changed = true; + } + } + } + return ipdom; +} + +/** Does `b` strictly post-dominate `a`? 
(walks a's ipdom chain) */ +function strictlyPostDominates(b: number, a: number, ipdom: Map, exitId: number): boolean { + if (a === b) return false; + let cur = ipdom.get(a); + const seen = new Set(); + while (cur !== undefined && !seen.has(cur)) { + if (cur === b) return true; + if (cur === exitId) return b === exitId; + seen.add(cur); + cur = ipdom.get(cur); + } + return false; +} + +/** Ferrante–Ottenstein–Warren control-dependence edges (branch node → dependent node). */ +export function controlDependence(build: FunctionCfgBuild, ipdom: Map): PdgEdge[] { + const out: PdgEdge[] = []; + const seen = new Set(); + for (const [a, b] of augmentedEdges(build)) { + const ia = ipdom.get(a); + if (ia === undefined || !ipdom.has(b)) continue; // node can't reach EXIT — skip (gate-checked) + if (b === ia || strictlyPostDominates(b, a, ipdom, build.exitId)) continue; + let runner = b; + let guard = build.nodes.length + 1; + while (runner !== ia && guard-- > 0) { + const k = `${a}>${runner}`; + if (!seen.has(k) && a !== runner) { + seen.add(k); + out.push({ source: a, target: runner, type: "CDG" }); + } + runner = ipdom.get(runner) as number; + } + } + return out; +} + +// --- helpers --- + +/** CFG edges as (source, target) pairs, deduped, plus the augmented ENTRY → EXIT edge. 
*/ +function augmentedEdges(build: FunctionCfgBuild): Array<[number, number]> { + const seen = new Set(); + const out: Array<[number, number]> = []; + const push = (a: number, b: number): void => { + const k = `${a}>${b}`; + if (seen.has(k)) return; + seen.add(k); + out.push([a, b]); + }; + for (const e of build.edges) push(e.source, e.target); + push(build.entryId, build.exitId); + return out; +} + diff --git a/src/dataflow/extract.ts b/src/dataflow/extract.ts new file mode 100644 index 0000000..751ad53 --- /dev/null +++ b/src/dataflow/extract.ts @@ -0,0 +1,91 @@ +/** + * Stages 1–4 extraction — the AST-bound half of the level-3 pipeline, per callable: + * CFG (stage 1) → post-dominance + control dependence (stage 2) → def/use facts (stage 3's + * extraction half) → the serializable CallableGraphData projection. + * + * This is the unit that fans out over the worker pool: it has zero cross-function dependencies + * (embarrassingly parallel), and its output is plain data, so everything downstream — the + * reaching-defs solve, the summary wavefront, SDG assembly, emission — never touches an AST + * again. The same function serves the sequential (--jobs 1) path against the main-thread + * project, which is what makes N-vs-1 differential testing meaningful. + */ +import { Node, type Project } from "ts-morph"; +import { computeSignatureForDecl, type GraphNode } from "../schema"; +import { buildCfg } from "./cfg"; +import { extractFunctionFacts } from "./defuse"; +import { controlDependence, postDominators } from "./dominance"; +import type { CallableGraphData, FunctionCfgBuild } from "./model"; + +/** Extract the full per-callable data product, or null when the callable has no body. 
*/ +export function extractCallableData(signature: string, fn: Node, path: string, root: string, k: number): CallableGraphData | null { + const build = buildCfg(signature, fn); + if (!build) return null; + + const cdg = controlDependence(build, postDominators(build)).sort( + (a, b) => a.source - b.source || a.target - b.target, + ); + const { facts, aliasPairs, returnValueNodes } = extractFunctionFacts(build, root, k); + + return { + signature, + path, + nodes: build.nodes.map((n) => emitNode(n.id, n.kind, n.ast, build)), + edges: [...build.edges].sort((a, b) => a.source - b.source || a.target - b.target || a.kind.localeCompare(b.kind)), + cdg, + entryId: build.entryId, + exitId: build.exitId, + paramIds: build.paramIds, + hasRestParam: hasRestParam(build), + facts, + aliasPairs, + returnValueNodes, + }; +} + +/** + * Walk source files and index callable declarations by canonical signature — the same + * computeSignatureForDecl the symbol table and call graph use, so keys byte-match. For + * `const f = () => {}` the signature keys the VariableDeclaration but the CFG is built from the + * initializer (the node that owns parameters and body). Restrict with `onlyFiles` (absolute + * paths) when a worker owns just a partition of the project. 
+ */ +export function indexCallableDecls(project: Project, root: string, onlyFiles?: Set): Map { + const idx = new Map(); + for (const sf of project.getSourceFiles()) { + const fp = sf.getFilePath(); + if (sf.isDeclarationFile() || fp.includes("/node_modules/")) continue; + if (onlyFiles && !onlyFiles.has(fp)) continue; + sf.forEachDescendant((n) => { + if ( + Node.isFunctionDeclaration(n) || + Node.isMethodDeclaration(n) || + Node.isConstructorDeclaration(n) || + Node.isGetAccessorDeclaration(n) || + Node.isSetAccessorDeclaration(n) + ) { + const sig = computeSignatureForDecl(n, root); + if (sig && !idx.has(sig)) idx.set(sig, n); + } else if (Node.isVariableDeclaration(n)) { + const init = n.getInitializer(); + if (init && (Node.isArrowFunction(init) || Node.isFunctionExpression(init))) { + const sig = computeSignatureForDecl(n, root); + if (sig && !idx.has(sig)) idx.set(sig, init); + } + } + }); + } + return idx; +} + +function emitNode(id: number, kind: GraphNode["kind"], ast: Node | null, build: FunctionCfgBuild): GraphNode { + const target = ast ?? build.fn; // ENTRY/EXIT carry the whole callable's span + const s = build.sf.getLineAndColumnAtPos(target.getStart()); + const e = build.sf.getLineAndColumnAtPos(target.getEnd()); + return { id, kind, start_line: s.line, start_column: s.column, end_line: e.line, end_column: e.column }; +} + +function hasRestParam(build: FunctionCfgBuild): boolean { + const params = (build.fn as unknown as { getParameters?: () => Node[] }).getParameters?.() ?? 
[]; + const last = params[params.length - 1]; + return last !== undefined && Node.isParameterDeclaration(last) && last.isRestParameter(); +} diff --git a/src/dataflow/index.ts b/src/dataflow/index.ts new file mode 100644 index 0000000..8993537 --- /dev/null +++ b/src/dataflow/index.ts @@ -0,0 +1,437 @@ +/** + * Level 3 — the program-graphs pipeline (stages 1–7 of the dataflow contract), run only at + * `-a 3` after the symbol table exists, structured for the contract's parallel execution model: + * + * startExtraction (stages 1–4) — embarrassingly parallel per callable: fanned out over a + * Bun worker pool, partitioned by file. Kicked off BEFORE the + * call-graph solve on the main thread (core.ts), so the two + * run concurrently and join before summaries. + * buildProgramGraphs (stages 5–7) — joins the extraction, maps call sites onto nodes (needs + * the provider-backfilled callee signatures), then composes + * summaries as a Kahn-style ready-queue WAVEFRONT over the + * Tarjan SCC condensation DAG: an SCC dispatches the moment + * its callee SCCs are done; the SCC (its internal fixpoint) + * is the atomic unit, one worker each. SDG assembly and + * emission close it out. + * + * `--jobs 1` is the fully sequential debug mode (no workers, main-thread project reused, SCCs + * processed in Tarjan order) and the differential oracle: `--jobs N` must emit byte-identical + * output, which holds because ids are span-ordered (never discovery-ordered), every edge list is + * collect-then-sorted, and sccFixpoint is a pure function of its inputs. Worker failure at any + * point degrades to the sequential path — parallelism is an optimization, not a dependency. + * + * Summaries (with their callee dependency edges and the owning module's content hash) are + * persisted to `/graphs_summaries.json` — recorded from day one so incremental + * re-analysis can later consume them; nothing reads them yet. 
+ */ +import * as fs from "node:fs"; +import * as path from "node:path"; +import type { Project } from "ts-morph"; +import type { AnalysisOptions } from "../options"; +import { + PROGRAM_GRAPHS_SCHEMA_VERSION, + fileKeyOf, + type FunctionGraphs, + type GraphNode, + type PdgEdge, + type ProgramGraphs, + type TSCallable, + type TSCallsite, + type TSClass, + type TSModule, + type TSNamespace, +} from "../schema"; +import type { Logger } from "../utils"; +import { extractCallableData, indexCallableDecls } from "./extract"; +import type { CallableGraphData } from "./model"; +import { WorkerPool } from "./pool"; +import { assembleSdg } from "./sdg"; +import { + callAdjacency, + sccFixpoint, + tarjanSccs, + type CallSiteRef, + type FunctionSummary, +} from "./summaries"; +import type { ExtractTask, SolveTask, SolveTaskResult } from "./worker"; + +export { backwardSlice, type SliceCriterion } from "./slice"; + +// ------------------------------------------------------------------------------------------------ +// Stage 1–4 extraction (started early so it overlaps the call-graph solve) +// ------------------------------------------------------------------------------------------------ + +export interface ExtractionHandle { + promise: Promise>; + pool: WorkerPool | null; +} + +export function startExtraction( + project: Project, + symbol_table: Record, + tsConfigFilePath: string | null, + opts: AnalysisOptions, + log: Logger, +): ExtractionHandle { + const callables = collectCallables(symbol_table); + + // Partition callables by owning file (round-robin over the sorted file list) so each worker + // deeply visits only its share of the program. TSCallable.path is the declaration's ABSOLUTE + // file path; the graph data carries the project-relative file key. + const byFile = new Map>(); + for (const [sig, c] of [...callables.entries()].sort(([a], [b]) => a.localeCompare(b))) { + const absPath = c.path; + const arr = byFile.get(absPath) ?? 
[]; + arr.push({ signature: sig, path: fileKeyOf(absPath, opts.input).fileKey, absPath }); + byFile.set(absPath, arr); + } + const files = [...byFile.keys()].sort(); + + // Auto (0) resolves to sequential: each extraction worker must materialize its own + // whole-program ts-morph project (ASTs can't cross the clone boundary), and measurement shows + // that project load dominates the parallelizable graph math well past mid-sized repos — + // e.g. self-analysis (36 files, 211 callables) runs 2.5× SLOWER at -j 14. An explicit -j N is + // therefore an opt-in for large codebases (and how the differential test forces the worker + // path); correctness is guarded either way by the byte-identical N-vs-1 gate. + const jobs = opts.jobs === 0 ? 1 : opts.jobs; + + if (jobs <= 1) { + return { promise: Promise.resolve(extractSequential(project, callables, opts)), pool: null }; + } + const workerCount = Math.max(1, Math.min(jobs, files.length)); + + let pool: WorkerPool; + try { + pool = new WorkerPool(workerCount); + } catch (e) { + log.warn(`graph workers unavailable (${(e as Error).message}); extracting sequentially`); + return { promise: Promise.resolve(extractSequential(project, callables, opts)), pool: null }; + } + + const partitions: Array> = Array.from( + { length: workerCount }, + () => [], + ); + files.forEach((f, i) => partitions[i % workerCount]?.push(...(byFile.get(f) ?? 
[]))); + + const handle: ExtractionHandle = { promise: Promise.resolve(new Map()), pool }; + handle.promise = Promise.all( + partitions + .filter((p) => p.length) + .map((sigs) => { + const task: ExtractTask = { + type: "extract", + root: opts.input, + tsConfigFilePath, + skipTests: opts.skipTests, + k: opts.graphFieldDepth, + sigs, + }; + return pool.exec(task); + }), + ) + .then((chunks) => { + const out = new Map(); + for (const chunk of chunks) for (const d of chunk) out.set(d.signature, d); + if (out.size === 0 && callables.size > 0) { + // Workers "succeeding" with nothing means their view of the project diverged from the + // main thread's — treat it as a failure, never as an empty program. + throw new Error("workers returned no callables"); + } + return out; + }) + .catch((e: Error) => { + // Degrade, never fail: retire the pool (so the wavefront goes sequential too — a pool + // that failed extraction must not be trusted with, or dangle, further tasks) and + // recompute on the main-thread project. 
+ log.warn(`graph extraction workers failed (${e.message}); falling back to sequential`); + handle.pool?.close(); + handle.pool = null; + return extractSequential(project, callables, opts); + }); + + return handle; +} + +function extractSequential( + project: Project, + callables: Map, + opts: AnalysisOptions, +): Map { + const astIndex = indexCallableDecls(project, opts.input); + const out = new Map(); + for (const [sig, c] of [...callables.entries()].sort(([a], [b]) => a.localeCompare(b))) { + const fn = astIndex.get(sig); + if (!fn) continue; // bodiless (interface/abstract/ambient/implicit) or unmatchable + const data = extractCallableData(sig, fn, fileKeyOf(c.path, opts.input).fileKey, opts.input, opts.graphFieldDepth); + if (data) out.set(sig, data); + } + return out; +} + +// ------------------------------------------------------------------------------------------------ +// Stages 5–7 + emission +// ------------------------------------------------------------------------------------------------ + +export async function buildProgramGraphs( + extraction: ExtractionHandle, + symbol_table: Record, + opts: AnalysisOptions, + log: Logger, +): Promise { + try { + const datas = await extraction.promise; + const callables = collectCallables(symbol_table); + log.info( + `program graphs: ${datas.size} callables (of ${callables.size} in the symbol table), ` + + `workers=${extraction.pool ? extraction.pool.size : 1}`, + ); + + // Map recorded call sites onto CFG statement nodes (pure span containment — no AST). The + // callee signatures were backfilled by the call-graph provider while extraction ran. 
+ const callSites = new Map(); + for (const [sig, data] of datas) { + const refs: CallSiteRef[] = []; + for (const site of (callables.get(sig) as TSCallable).call_sites) { + const nodeId = containingNode(data, site); + if (nodeId === null) continue; + refs.push({ nodeId, callee: site.callee_signature, argCount: site.argument_types.length }); + } + refs.sort((a, b) => a.nodeId - b.nodeId || (a.callee ?? "").localeCompare(b.callee ?? "")); + callSites.set(sig, refs); + } + + // Stages 5–6: SCC condensation + the summary wavefront. + const { summaries, ddg, sccCount, largest } = await composeWavefront(datas, callSites, extraction.pool, log); + log.debug(`program graphs: ${sccCount} SCCs, largest ${largest}`); + + // Emission per --graphs selector. + const wantCfg = opts.graphs.includes("cfg"); + const wantPdg = opts.graphs.includes("pdg"); + const wantDfg = opts.graphs.includes("dfg"); + const wantSdg = opts.graphs.includes("sdg"); + + const functions: Record = {}; + for (const [sig, data] of [...datas.entries()].sort(([a], [b]) => a.localeCompare(b))) { + const fg: FunctionGraphs = {}; + if (wantCfg) fg.cfg = { nodes: data.nodes, edges: data.edges }; + if (wantPdg || wantDfg) { + const edges: PdgEdge[] = []; + if (wantPdg) edges.push(...data.cdg); + edges.push(...(ddg.get(sig) ?? [])); + fg.pdg = { + edges: edges.sort( + (a, b) => + a.source - b.source || + a.target - b.target || + a.type.localeCompare(b.type) || + (a.var ?? "").localeCompare(b.var ?? ""), + ), + }; + } + functions[sig] = fg; + } + + const sdg_edges = wantSdg ? 
assembleSdg(datas, callSites, summaries) : []; + + persistSummaries(opts, symbol_table, callables, summaries, log); + + return { schema_version: PROGRAM_GRAPHS_SCHEMA_VERSION, k_limit: opts.graphFieldDepth, functions, sdg_edges }; + } finally { + extraction.pool?.close(); + } +} + +// ------------------------------------------------------------------------------------------------ +// The summary wavefront: Kahn-style ready queue over the SCC condensation DAG +// ------------------------------------------------------------------------------------------------ + +async function composeWavefront( + datas: Map, + callSites: Map, + pool: WorkerPool | null, + log: Logger, +): Promise<{ summaries: Map; ddg: Map; sccCount: number; largest: number }> { + const sigs = [...datas.keys()].sort(); + const adj = callAdjacency(sigs, callSites, (s) => datas.has(s)); + const sccs = tarjanSccs(sigs, adj); // emitted callees-first + const largest = Math.max(0, ...sccs.map((s) => s.length)); + + const summaries = new Map(); + const ddg = new Map(); + + const solveInline = (members: string[]): void => { + const res = sccFixpoint( + members.map((m) => datas.get(m) as CallableGraphData), + callSites, + summaries, + ); + for (const [k, v] of res.summaries) summaries.set(k, v); + for (const [k, v] of res.ddg) ddg.set(k, v); + }; + + if (!pool) { + // Sequential debug mode: Tarjan order IS a valid wavefront linearization. + for (const scc of sccs) solveInline(scc); + return { summaries, ddg, sccCount: sccs.length, largest }; + } + + // Condensation DAG: per-SCC dependency counters (callee SCCs must finish first) + reverse + // index (callee SCC → dependent caller SCCs) so completions decrement exactly their waiters. 
+ const sccOf = new Map(); + sccs.forEach((scc, i) => scc.forEach((sig) => sccOf.set(sig, i))); + const pendingDeps: number[] = sccs.map(() => 0); + const dependents: number[][] = sccs.map(() => []); + sccs.forEach((scc, i) => { + const calleeSccs = new Set(); + for (const sig of scc) { + for (const callee of adj.get(sig) ?? []) { + const j = sccOf.get(callee); + if (j !== undefined && j !== i) calleeSccs.add(j); + } + } + pendingDeps[i] = calleeSccs.size; + for (const j of calleeSccs) dependents[j]?.push(i); + }); + + const ready: number[] = []; + sccs.forEach((_, i) => { + if (pendingDeps[i] === 0) ready.push(i); + }); + + const dispatch = (i: number): Promise<{ i: number; res: SolveTaskResult }> => { + const members = sccs[i] as string[]; + // Ship only what the SCC needs: its members' data/call sites + its callees' summaries. + const calleeSummaries: Array<[string, FunctionSummary]> = []; + for (const sig of members) { + for (const callee of adj.get(sig) ?? []) { + const s = summaries.get(callee); + if (s) calleeSummaries.push([callee, s]); + } + } + const task: SolveTask = { + type: "solve", + members: members.map((m) => datas.get(m) as CallableGraphData), + callSites: members.map((m) => [m, callSites.get(m) ?? []]), + calleeSummaries, + }; + return pool.exec(task).then((res) => ({ i, res })); + }; + + try { + const inflight = new Map>(); + while (ready.length || inflight.size) { + while (ready.length && inflight.size < pool.size) { + const i = ready.shift() as number; + inflight.set(i, dispatch(i)); + } + const { i, res } = await Promise.race(inflight.values()); + inflight.delete(i); + for (const [k, v] of res.summaries) summaries.set(k, v); + for (const [k, v] of res.ddg) ddg.set(k, v); + for (const dep of dependents[i] ?? []) { + pendingDeps[dep] = (pendingDeps[dep] as number) - 1; + if (pendingDeps[dep] === 0) ready.push(dep); + } + } + } catch (e) { + // Degrade, never fail: redo the whole composition sequentially (pure functions — cheap-ish). 
+ log.warn(`summary wavefront workers failed (${(e as Error).message}); recomposing sequentially`); + summaries.clear(); + ddg.clear(); + for (const scc of sccs) solveInline(scc); + } + + return { summaries, ddg, sccCount: sccs.length, largest }; +} + +// ------------------------------------------------------------------------------------------------ +// Call-site → CFG-node mapping (span containment on serialized nodes) +// ------------------------------------------------------------------------------------------------ + +/** The innermost non-synthetic CFG node whose span contains the recorded call site, or null. */ +function containingNode(data: CallableGraphData, site: TSCallsite): number | null { + let best: { id: number; span: number } | null = null; + for (const n of data.nodes) { + if (n.kind === "entry" || n.kind === "exit") continue; + if (!containsPos(n, site.start_line, site.start_column)) continue; + const span = spanSize(n); + if (!best || span < best.span) best = { id: n.id, span }; + } + return best?.id ?? null; +} + +function containsPos(n: GraphNode, line: number, column: number): boolean { + if (line < n.start_line || line > n.end_line) return false; + if (line === n.start_line && column < n.start_column) return false; + if (line === n.end_line && column >= n.end_column) return false; + return true; +} + +function spanSize(n: GraphNode): number { + return (n.end_line - n.start_line) * 10_000 + (n.end_column - n.start_column); +} + +// ------------------------------------------------------------------------------------------------ +// Summary persistence (dependency-recorded, for later incrementality; write-only today) +// ------------------------------------------------------------------------------------------------ + +function persistSummaries( + opts: AnalysisOptions, + symbol_table: Record, + callables: Map, + summaries: Map, + log: Logger, +): void { + try { + const cacheDir = opts.cacheDir ?? 
path.join(opts.input, ".codeanalyzer"); + fs.mkdirSync(cacheDir, { recursive: true }); + const entries: Record = {}; + for (const sig of [...summaries.keys()].sort()) { + const c = callables.get(sig); + // TSCallable.path is absolute; the symbol table is keyed by the project-relative file key. + const fileKey = c ? fileKeyOf(c.path, opts.input).fileKey : null; + entries[sig] = { + ...summaries.get(sig), + content_hash: (fileKey && symbol_table[fileKey]?.content_hash) ?? null, + }; + } + const payload = { schema_version: PROGRAM_GRAPHS_SCHEMA_VERSION, k_limit: opts.graphFieldDepth, summaries: entries }; + fs.writeFileSync(path.join(cacheDir, "graphs_summaries.json"), JSON.stringify(payload, null, 2)); + } catch (e) { + log.warn(`could not persist graph summaries: ${(e as Error).message}`); + } +} + +// ------------------------------------------------------------------------------------------------ +// Symbol-table collection (signature → callable), recursing through every container kind +// ------------------------------------------------------------------------------------------------ + +function collectCallables(symbol_table: Record): Map { + const out = new Map(); + for (const mod of Object.values(symbol_table)) collectModule(mod, out); + return out; +} + +function collectModule(mod: TSModule, out: Map): void { + for (const f of Object.values(mod.functions)) collectCallable(f, out); + for (const c of Object.values(mod.classes)) collectClass(c, out); + for (const ns of Object.values(mod.namespaces)) collectNamespace(ns, out); +} + +function collectNamespace(ns: TSNamespace, out: Map): void { + for (const f of Object.values(ns.functions)) collectCallable(f, out); + for (const c of Object.values(ns.classes)) collectClass(c, out); + for (const n of Object.values(ns.namespaces)) collectNamespace(n, out); +} + +function collectClass(c: TSClass, out: Map): void { + for (const m of Object.values(c.methods)) collectCallable(m, out); + for (const ic of 
Object.values(c.inner_classes)) collectClass(ic, out); +} + +function collectCallable(c: TSCallable, out: Map): void { + out.set(c.signature, c); + for (const ic of Object.values(c.inner_callables)) collectCallable(ic, out); + for (const cl of Object.values(c.inner_classes)) collectClass(cl, out); +} diff --git a/src/dataflow/model.ts b/src/dataflow/model.ts new file mode 100644 index 0000000..8663f6f --- /dev/null +++ b/src/dataflow/model.ts @@ -0,0 +1,149 @@ +/** + * Internal working model shared by the level-3 dataflow stages. The emitted shapes live in + * schema/graphs.ts. + * + * Two tiers, split along the worker boundary: + * - `FunctionCfgBuild` is AST-linked (ts-morph nodes) and never leaves the thread that parsed. + * - `CallableGraphData` is the plain-data projection of everything downstream stages need — + * structured-clone/JSON-safe, so stage-1–4 extraction can fan out over a worker pool and the + * summary wavefront can run on data alone (no AST access after extraction). + */ +import type { Node, SourceFile } from "ts-morph"; +import type { CfgEdge, GraphNode, GraphNodeKind, PdgEdge } from "../schema"; + +// ------------------------------------------------------------------------------------------------ +// AST-linked build product (stage 1, thread-local) +// ------------------------------------------------------------------------------------------------ + +/** A CFG node with its AST link. `ast` is null only for the synthetic ENTRY/EXIT pair. */ +export interface DfNode { + id: number; + kind: GraphNodeKind; + ast: Node | null; +} + +/** The per-callable CFG build product (stage 1), input to fact extraction. */ +export interface FunctionCfgBuild { + signature: string; + /** The function-like AST node (FunctionDeclaration / Method / Ctor / accessor / arrow / fn-expr). */ + fn: Node; + sf: SourceFile; + /** Ordered by id; nodes[0] is ENTRY, nodes[nodes.length - 1] is EXIT. 
*/ + nodes: DfNode[]; + edges: CfgEdge[]; + entryId: number; + exitId: number; + /** node ids of the `param` nodes, in declaration order (the SDG formal-in nodes). */ + paramIds: number[]; +} + +// ------------------------------------------------------------------------------------------------ +// Access paths (plain data — labels every DDG edge and summary entry) +// ------------------------------------------------------------------------------------------------ + +export type BaseKind = "local" | "param" | "this" | "captured" | "module"; + +export interface PathRef { + /** Unique base identity: decl-position for locals/params/captured, canonical path for module, "this". */ + key: string; + /** Human label for the base (the variable name / canonical module path). */ + label: string; + baseKind: BaseKind; + fields: string[]; // "f" | "[*]" | "*" (trailing truncation star) +} + +export function renderPath(p: PathRef): string { + let s = p.label; + for (const f of p.fields) s += f === "[*]" ? "[*]" : `.${f}`; + return s; +} + +/** A global (module-binding) path as carried by summaries: canonical base + fields. */ +export interface GlobalPath { + key: string; // == the canonical `.` label + fields: string[]; +} + +export function renderGlobal(g: GlobalPath): string { + return renderPath({ key: g.key, label: g.key, baseKind: "module", fields: g.fields }); +} + +/** May two field lists overlap? "*" (truncation) matches any tail; "[*]" matches any one step. 
/**
 * May two field lists overlap? `"*"` (truncation) matches any tail; `"[*]"` matches any one step.
 *
 * Only the positions both lists share are compared, left to right:
 *  - a `"*"` on either side ends the comparison immediately with `true`;
 *  - a `"[*]"` on either side is a dynamic index that may hit any member, so that step matches;
 *  - two different concrete names at the same position prove the paths are disjoint.
 * When every shared position matches, one path is a prefix of the other (whole object vs.
 * member), which also counts as an overlap.
 *
 * @param a - Field steps of the first access path.
 * @param b - Field steps of the second access path.
 * @returns `true` unless a shared position holds two distinct concrete field names.
 */
export function fieldsMayAlias(a: string[], b: string[]): boolean {
  const shared = Math.min(a.length, b.length);
  let step = 0;
  while (step < shared) {
    const left = a[step] as string;
    const right = b[step] as string;
    step += 1;
    if (left === "*" || right === "*") return true;
    const dynamicIndex = left === "[*]" || right === "[*]";
    if (!dynamicIndex && left !== right) return false;
  }
  return true;
}

// ------------------------------------------------------------------------------------------------
// Per-node dataflow facts (plain data)
// ------------------------------------------------------------------------------------------------

/** One definition recorded at a CFG node. */
export interface DefFact {
  ref: PathRef;
  /** Strong defs kill; only whole-base writes of locals/params qualify (field writes are weak). */
  strong: boolean;
}

/** The defs and uses extracted for a single CFG node. */
export interface NodeFacts {
  defs: DefFact[];
  uses: PathRef[];
}

/** Transitive global effects of a call, applied at its callsite node during the solve. */
export interface CallEffects {
  reads: GlobalPath[];
  writes: GlobalPath[];
}

// ------------------------------------------------------------------------------------------------
// The serializable per-callable projection (crosses the worker boundary)
// ------------------------------------------------------------------------------------------------

/**
 * Everything stages 5–8 and emission need, as plain data: the emitted CFG (nodes carry source
 * spans, not AST links), the CDG, the extracted per-node facts, the copy-alias pairs, and the
 * structural metadata. Producing this once per callable is the AST-bound work; every fixpoint
 * iteration after it is pure data.
 */
export interface CallableGraphData {
  signature: string;
  /** Owning module file key (project-relative POSIX path with extension). */
  path: string;
  /** Emitted-shape nodes, index == id (ENTRY first, EXIT last). */
  nodes: GraphNode[];
  edges: CfgEdge[];
  /** Control-dependence edges (stage 2), computed at extraction time. */
  cdg: PdgEdge[];
  entryId: number;
  exitId: number;
  paramIds: number[];
  hasRestParam: boolean;
  /** Extracted defs/uses per node id (pairs, for clone-safety). */
  facts: Array<[number, NodeFacts]>;
  /** Copy-alias unions discovered during extraction (`const q = p`). */
  aliasPairs: Array<[string, string]>;
  /** Nodes that produce the function's return value. */
  returnValueNodes: number[];
}

/**
 * Successor/predecessor adjacency over serialized CFG edges.
 *
 * Every node id in `data.nodes` gets an (initially empty) list in both maps. An edge endpoint
 * that is not a known node id is skipped on that side only, so it never creates a new key.
 *
 * @param data - The serialized per-callable graph.
 * @returns `succ` (node id → targets) and `pred` (node id → sources), both in edge order.
 */
export function dataAdjacency(data: CallableGraphData): { succ: Map<number, number[]>; pred: Map<number, number[]> } {
  const emptyLists = (): Map<number, number[]> =>
    new Map(data.nodes.map((node): [number, number[]] => [node.id, []]));
  const succ = emptyLists();
  const pred = emptyLists();
  data.edges.forEach(({ source, target }) => {
    succ.get(source)?.push(target);
    pred.get(target)?.push(source);
  });
  return { succ, pred };
}

// diff --git a/src/dataflow/pool.ts b/src/dataflow/pool.ts new file mode 100644 index 0000000..1046247 --- /dev/null +++ b/src/dataflow/pool.ts @@ -0,0 +1,104 @@
/**
 * A minimal fixed-size worker pool over Bun's Worker (one in-flight task per worker, FIFO
 * queue). Used by the level-3 pipeline for the stage-1–4 extraction fan-out and the SCC
 * wavefront. Construction or task failure is surfaced to the caller, which falls back to the
 * sequential (--jobs 1) path — parallelism is an optimization, never a correctness dependency.
 *
 * Failure discipline: a worker that errors is retired (never re-idled), and when the last live
 * worker dies every queued task is rejected — a task must never sit in the queue with nothing
 * left to run it, or the awaiting pipeline would dangle unresolved and the process could exit
 * without emitting output.
 */

/** The two settle callbacks of one submitted task. */
interface Pending {
  resolve: (v: unknown) => void;
  reject: (e: Error) => void;
}

/**
 * The worker entry URL differs between dev and the compiled binary.
 */
/**
 * Resolve the URL of the worker entry script, which differs between dev and the compiled binary.
 *
 * Dev/test runs load the TypeScript source relative to this module. `bun build --compile` embeds
 * the extra entrypoint under its BUILT name and its path relative to the entries' common root
 * (src/) inside the `$bunfs` virtual filesystem — `src/dataflow/worker.ts` →
 * `/$bunfs/root/dataflow/worker.js`, while `import.meta.url` here is the bundled main
 * (`/$bunfs/root/`). Coupled to the `build` script's entry list in package.json; a mismatch
 * degrades to the sequential path via the pool's failure fallback (with a warning), never to
 * wrong output.
 *
 * @returns The absolute URL (as a string) to hand to `new Worker(...)`.
 */
function workerUrl(): string {
  const compiled = import.meta.url.includes("$bunfs");
  return new URL(compiled ? "./dataflow/worker.js" : "./worker.ts", import.meta.url).href;
}

/**
 * Fixed-size worker pool: one in-flight task per worker, FIFO queue for the overflow.
 *
 * Invariant maintained by every method: a promise returned by `exec` always settles. A task is
 * either queued, in flight on exactly one live worker, or already resolved/rejected — it is never
 * left attached to a worker that has been terminated or retired.
 */
export class WorkerPool {
  private workers = new Set<Worker>();
  private idle: Worker[] = [];
  private pending = new Map<Worker, Pending>();
  private queue: Array<{ msg: unknown; p: Pending }> = [];
  private closed = false;

  /**
   * Spawn `size` workers.
   *
   * @param size - Number of workers to create.
   * @throws Whatever `new Worker(...)` throws; workers spawned before the failure are terminated
   *   first so a half-built pool never leaks threads.
   */
  constructor(size: number) {
    const url = workerUrl();
    try {
      for (let i = 0; i < size; i++) {
        const w = new Worker(url);
        w.onmessage = (ev: MessageEvent) => this.settle(w, ev.data as { ok: boolean; result?: unknown; error?: string });
        w.onerror = (ev: ErrorEvent) => this.retire(w, new Error(`worker error: ${ev.message ?? "unknown"}`));
        this.workers.add(w);
        this.idle.push(w);
      }
    } catch (e) {
      this.close();
      throw e;
    }
  }

  /** Number of live (not retired, not closed) workers. */
  get size(): number {
    return this.workers.size;
  }

  /**
   * Run one task on the next free worker, queueing it when all workers are busy.
   *
   * @param msg - The task message; posted to the worker as-is.
   * @returns A promise for the worker's `result`. It rejects when the worker replies with
   *   `ok: false`, when the worker dies, when the message cannot be posted, or when the pool is
   *   closed before the task finishes.
   */
  exec<T>(msg: unknown): Promise<T> {
    if (this.closed) return Promise.reject(new Error("worker pool is closed"));
    if (this.workers.size === 0) return Promise.reject(new Error("worker pool has no live workers"));
    return new Promise<T>((resolve, reject) => {
      const p: Pending = { resolve: resolve as (v: unknown) => void, reject };
      const w = this.idle.pop();
      if (!w) this.queue.push({ msg, p });
      else if (!this.dispatch(w, msg, p)) this.idle.push(w); // post failed: the worker is still usable
    });
  }

  /**
   * Shut the pool down: reject everything still queued AND everything in flight, then terminate
   * the workers. Safe to call more than once.
   */
  close(): void {
    this.closed = true;
    const err = new Error("worker pool closed");
    this.rejectQueue(err);
    // In-flight tasks can never be answered once their worker is terminated, so fail them now
    // instead of leaving their promises pending forever.
    for (const p of this.pending.values()) p.reject(err);
    this.pending.clear();
    for (const w of this.workers) w.terminate();
    this.workers.clear();
    this.idle = [];
  }

  /**
   * Post `msg` to `w` and record `p` as its in-flight task.
   *
   * @returns `false` when the message could not be posted; `p` has then already been rejected
   *   and `w` holds no task.
   */
  private dispatch(w: Worker, msg: unknown, p: Pending): boolean {
    this.pending.set(w, p);
    try {
      w.postMessage(msg);
      return true;
    } catch (e) {
      this.pending.delete(w);
      p.reject(e instanceof Error ? e : new Error(String(e)));
      return false;
    }
  }

  /** Hand `w` the next queued task that can actually be posted, or park it as idle. */
  private feed(w: Worker): void {
    for (let next = this.queue.shift(); next; next = this.queue.shift()) {
      if (this.dispatch(w, next.msg, next.p)) return;
    }
    this.idle.push(w);
  }

  /** A worker replied: free it for the next task, then settle the task it was running. */
  private settle(w: Worker, reply: { ok: boolean; result?: unknown; error?: string }): void {
    // Late message from a worker that was closed or retired: there is nothing left to settle.
    if (this.closed || !this.workers.has(w)) return;
    const p = this.pending.get(w);
    // Unsolicited message: the worker is already idle. Re-idling it would list it twice, and the
    // second task dispatched to it would overwrite (and strand) the first.
    if (!p) return;
    this.pending.delete(w);
    this.feed(w);
    if (reply.ok) p.resolve(reply.result);
    else p.reject(new Error(reply.error ?? "worker task failed"));
  }

  /** A worker died: fail its task, drop it from the pool, and never strand the queue. */
  private retire(w: Worker, err: Error): void {
    if (!this.workers.has(w)) return; // already retired, or the pool was closed
    const p = this.pending.get(w);
    this.pending.delete(w);
    this.workers.delete(w);
    this.idle = this.idle.filter((x) => x !== w);
    w.terminate();
    p?.reject(err);
    if (this.workers.size === 0) this.rejectQueue(err);
  }

  /** Reject and drop every task that has not been dispatched yet. */
  private rejectQueue(err: Error): void {
    for (const q of this.queue) q.p.reject(err);
    this.queue = [];
  }
}

// diff --git a/src/dataflow/sdg.ts b/src/dataflow/sdg.ts new file mode 100644 index 0000000..600877f --- /dev/null +++ b/src/dataflow/sdg.ts @@ -0,0 +1,112 @@
/**
 * Stage 7 — SDG assembly: stitch the per-function PDGs with interprocedural edges
 * (Horwitz–Reps–Binkley), all keyed by canonical `(signature, node_id)`. Operates purely on the
 * serialized CallableGraphData (no AST access).
 *
 * Call sites are collapsed onto their containing statement node (the node is both actual-in and
 * actual-out), so:
 *  - CALL: callsite statement → callee ENTRY (node 0).
 *  - PARAM_IN: callsite statement → callee `param` node, var "argN"; module globals the callee
 *    transitively reads ride the same mechanism targeting the callee ENTRY (where
 *    stage 3 places their initial defs), var = the global path.
 *  - PARAM_OUT: callee EXIT → callsite statement, var "return" (always: over-approximate) or the
 *    written global path.
 *  - SUMMARY: self-edge on the callsite node, var = the input ("argN" or a global path) whose
 *    value may transitively flow to the call's result — composed from stage 6.
 *
 * External / unresolved callees have no graphs to reference (no dangling endpoints — the
 * call-graph rule), so they contribute only conservative pass-through SUMMARY self-edges.
 */
/**
 * sdg.ts dependencies: the emitted edge shape comes from the schema; the working model and the
 * summary / call-site shapes are plain data from the sibling dataflow modules.
 */
import type { SdgEdge } from "../schema";
import { renderGlobal, type CallableGraphData } from "./model";
import type { CallSiteRef, FunctionSummary } from "./summaries";

/**
 * Formal-parameter position that call argument `argIdx` binds to.
 *
 * Arguments bind positionally; arguments past the last formal bind to a trailing rest parameter
 * when the callee has one, and to nothing otherwise.
 *
 * @returns The index into `callee.paramIds`, or `null` when the argument binds to no formal.
 */
function boundParamIndex(callee: CallableGraphData, argIdx: number): number | null {
  const arity = callee.paramIds.length;
  if (argIdx < arity) return argIdx;
  return callee.hasRestParam && arity > 0 ? arity - 1 : null;
}

/**
 * Stage 7: stitch the per-function graphs together with interprocedural edges.
 *
 * For every recorded call site (already mapped onto its containing statement node):
 *  - external / unresolved callee → one pass-through SUMMARY self-edge per argument;
 *  - resolved callee → CALL to its ENTRY, positional PARAM_IN edges, a PARAM_OUT "return" edge,
 *    and — when a summary exists — PARAM_IN / PARAM_OUT edges for the globals it reads / writes
 *    plus SUMMARY self-edges for the inputs that may reach its return value.
 *
 * @param datas - Serialized graph data per callable signature.
 * @param callSites - Call sites per caller signature.
 * @param summaries - Function summaries per callee signature.
 * @returns The de-duplicated edges, sorted by source, target, type, then var.
 */
export function assembleSdg(
  datas: Map<string, CallableGraphData>,
  callSites: Map<string, CallSiteRef[]>,
  summaries: Map<string, FunctionSummary>,
): SdgEdge[] {
  const out: SdgEdge[] = [];
  const seen = new Set<string>();
  const add = (e: SdgEdge): void => {
    // Signatures are free-form text, so a separator-joined key can make two different edges look
    // identical; a JSON-encoded tuple cannot.
    const k = JSON.stringify([
      e.source.signature,
      e.source.node,
      e.target.signature,
      e.target.node,
      e.type,
      e.var ?? "",
    ]);
    if (seen.has(k)) return;
    seen.add(k);
    out.push(e);
  };

  for (const caller of [...datas.keys()].sort()) {
    for (const cs of callSites.get(caller) ?? []) {
      const at = (node: number): { signature: string; node: number } => ({ signature: caller, node });
      const calleeSig = cs.callee;
      const callee = calleeSig ? datas.get(calleeSig) : undefined;

      if (!callee || !calleeSig) {
        // External / unresolved: conservative pass-through — every argument may flow to the result.
        for (let i = 0; i < cs.argCount; i++) {
          add({ source: at(cs.nodeId), target: at(cs.nodeId), type: "SUMMARY", var: `arg${i}` });
        }
        continue;
      }

      add({ source: at(cs.nodeId), target: { signature: calleeSig, node: callee.entryId }, type: "CALL" });

      // Positional PARAM_IN edges; extra arguments bind to a trailing rest parameter if there is one.
      for (let i = 0; i < cs.argCount; i++) {
        const pIdx = boundParamIndex(callee, i);
        if (pIdx === null) continue;
        add({
          source: at(cs.nodeId),
          target: { signature: calleeSig, node: callee.paramIds[pIdx] as number },
          type: "PARAM_IN",
          var: `arg${i}`,
        });
      }
      add({
        source: { signature: calleeSig, node: callee.exitId },
        target: at(cs.nodeId),
        type: "PARAM_OUT",
        var: "return",
      });

      const sum = summaries.get(calleeSig);
      if (!sum) continue;

      for (const g of sum.global_reads) {
        add({
          source: at(cs.nodeId),
          target: { signature: calleeSig, node: callee.entryId },
          type: "PARAM_IN",
          var: renderGlobal(g),
        });
      }
      for (const g of sum.global_writes) {
        add({
          source: { signature: calleeSig, node: callee.exitId },
          target: at(cs.nodeId),
          type: "PARAM_OUT",
          var: renderGlobal(g),
        });
      }

      // An argument reaches the result when its own position is flagged, or when the formal it
      // binds to is — the latter covers extra arguments collected by a rest parameter, mirroring
      // the PARAM_IN binding above. This only ever adds edges (over-approximate).
      // NOTE(review): assumes `param_flows` holds formal positions — confirm against `summarize`.
      const flows = new Set(sum.param_flows);
      for (let i = 0; i < cs.argCount; i++) {
        const pIdx = boundParamIndex(callee, i);
        if (flows.has(i) || (pIdx !== null && flows.has(pIdx))) {
          add({ source: at(cs.nodeId), target: at(cs.nodeId), type: "SUMMARY", var: `arg${i}` });
        }
      }
      for (const g of sum.globals_to_return) {
        add({ source: at(cs.nodeId), target: at(cs.nodeId), type: "SUMMARY", var: g });
      }
    }
  }

  return out.sort(
    (a, b) =>
      a.source.signature.localeCompare(b.source.signature) ||
      a.source.node - b.source.node ||
      a.target.signature.localeCompare(b.target.signature) ||
      a.target.node - b.target.node ||
      a.type.localeCompare(b.type) ||
      (a.var ?? "").localeCompare(b.var ?? ""),
  );
}

// diff --git a/src/dataflow/slice.ts b/src/dataflow/slice.ts new file mode 100644 index 0000000..2e4b1ff --- /dev/null +++ b/src/dataflow/slice.ts @@ -0,0 +1,69 @@
/**
 * Stage 8 client — context-sensitive backward slicing as a query over the emitted SDG
 * (the two-phase Horwitz–Reps–Binkley traversal).
 */
/**
 * Phase 1 ("up"): from the criterion, reverse-traverse intra-function dependence (CDG, DDG),
 * SUMMARY edges, and the ascending interprocedural edges (PARAM_IN, CALL) — but never PARAM_OUT,
 * so the walk does not descend into callees (their transitive effects are covered by SUMMARY).
 * Phase 2 ("down"): from everything phase 1 reached, additionally reverse-traverse PARAM_OUT
 * (descending into callees), but no longer PARAM_IN/CALL (no re-ascending — that is what keeps
 * the slice context-sensitive). The slice is the union.
 */
import type { ProgramGraphs } from "../schema";

/** The node a slice is taken from: a callable signature plus a node id inside it. */
export interface SliceCriterion {
  signature: string;
  node: number;
}

/** Canonical string key of a `(signature, node)` pair; the slice is a set of these keys. */
const keyOf = (sig: string, node: number): string => `${sig}#${node}`;

/** Reversed edge indexes (target key → source keys), bucketed by how the two phases use them. */
interface ReverseEdges {
  intra: Map<string, string[]>; // CDG ∪ DDG ∪ SUMMARY, reversed
  ascend: Map<string, string[]>; // PARAM_IN ∪ CALL, reversed
  descend: Map<string, string[]>; // PARAM_OUT, reversed
}

/**
 * Index every emitted edge by its TARGET so the slicer can walk dependences backwards.
 *
 * Per-function PDG edges and SUMMARY edges land in `intra`, PARAM_IN and CALL in `ascend`, and
 * every remaining SDG edge type in `descend`.
 */
function reverseEdges(pg: ProgramGraphs): ReverseEdges {
  const rev: ReverseEdges = { intra: new Map(), ascend: new Map(), descend: new Map() };
  const link = (bucket: Map<string, string[]>, from: string, to: string): void => {
    const list = bucket.get(from);
    if (list) list.push(to);
    else bucket.set(from, [to]);
  };

  for (const [sig, graphs] of Object.entries(pg.functions)) {
    for (const edge of graphs.pdg?.edges ?? []) {
      link(rev.intra, keyOf(sig, edge.target), keyOf(sig, edge.source));
    }
  }

  for (const edge of pg.sdg_edges) {
    const from = keyOf(edge.target.signature, edge.target.node);
    const to = keyOf(edge.source.signature, edge.source.node);
    switch (edge.type) {
      case "SUMMARY":
        link(rev.intra, from, to);
        break;
      case "PARAM_IN":
      case "CALL":
        link(rev.ascend, from, to);
        break;
      default:
        link(rev.descend, from, to);
    }
  }
  return rev;
}

/**
 * Compute the backward slice of `criterion` over the emitted program graphs.
 *
 * @param pg - The emitted `program_graphs` section (per-function PDGs plus SDG edges).
 * @param criterion - The node to slice from.
 * @returns The keys (`signature#node`) of every node in the slice, criterion included.
 */
export function backwardSlice(pg: ProgramGraphs, criterion: SliceCriterion): Set<string> {
  const rev = reverseEdges(pg);

  // Depth-first reachability from `seeds` over the given reversed-edge layers.
  const traverse = (seeds: Iterable<string>, layers: Array<Map<string, string[]>>): Set<string> => {
    const visited = new Set<string>(seeds);
    const work = Array.from(visited);
    for (let current = work.pop(); current !== undefined; current = work.pop()) {
      for (const layer of layers) {
        const sources = layer.get(current);
        if (!sources) continue;
        for (const source of sources) {
          if (visited.has(source)) continue;
          visited.add(source);
          work.push(source);
        }
      }
    }
    return visited;
  };

  const up = traverse([keyOf(criterion.signature, criterion.node)], [rev.intra, rev.ascend]);
  return traverse(up, [rev.intra, rev.descend]);
}

// diff --git a/src/dataflow/summaries.ts b/src/dataflow/summaries.ts new file mode 100644 index 0000000..2bb791d --- /dev/null +++ b/src/dataflow/summaries.ts @@ -0,0 +1,257 @@
/**
 * Stages 5–6 — the interprocedural half: SCC condensation of the (frozen, provenance-merged)
 * call graph, then bottom-up relational function summaries composed over the condensation DAG.
 *
 * A summary answers, per callable: which argument positions flow to the return value, which
 * module-level globals it (transitively) reads and writes, and which globals flow to its return.
 * Summaries are node-granular — dependence is tracked between CFG nodes, not sub-expressions —
 * which keeps them sound-leaning and over-approximate (the contract's precision posture).
 *
 * Everything here is pure data (CallableGraphData + prior summaries): fact extraction happened
 * once at stage 1–4 time, so an SCC's fixpoint re-runs only the reaching-defs solve.
 */
/**
 * That is what lets `sccFixpoint` be the atomic unit of the wavefront — dispatched to a worker or
 * run inline, byte-identically (it is a pure function of its inputs).
 *
 * Within an SCC (mutual recursion), member summaries are co-defined, iterating to a monotone
 * fixpoint. Termination: summary domains are finite — argument indices are bounded by arity and
 * global paths are k-limited — and grow monotonically.
 *
 * External / unresolved callees: conservative pass-through — every argument may flow to the
 * result (applied at SDG/SUMMARY emission); their global effects are unmodeled (documented
 * unsoundness: npm internals are not analyzed).
 */
import type { PdgEdge } from "../schema";
import { solveDefUse } from "./defuse";
import {
  renderGlobal,
  type CallEffects,
  type CallableGraphData,
  type GlobalPath,
  type NodeFacts,
} from "./model";

/** A call site inside a callable, mapped onto its CFG statement node. */
export interface CallSiteRef {
  nodeId: number;
  /** Callee signature (symbol-table / external / synthesized key), or null when unresolved. */
  callee: string | null;
  argCount: number;
}

export interface FunctionSummary {
  /** Argument indices whose value may flow to the return value. */
  param_flows: number[];
  global_reads: GlobalPath[];
  global_writes: GlobalPath[];
  /** Rendered global paths that may flow to the return value. */
  globals_to_return: string[];
  /** Callee signatures this summary was composed from (recorded for later incrementality). */
  deps: string[];
}

export interface SccResult {
  /** Summary per member, keyed by the member's signature. */
  summaries: Map<string, FunctionSummary>;
  /** Each member's fixpoint DDG (already reflecting callee global effects) — what the PDG emits. */
  ddg: Map<string, PdgEdge[]>;
}

/**
 * Solve one SCC to its co-defined fixpoint, given the summaries of every callee SCC (which the
 * wavefront guarantees are complete). Pure: same inputs ⇒ same outputs, on any thread.
 */
/**
 * Solve one SCC's summaries to a fixpoint.
 *
 * @param members - the callables of this SCC; they are processed in signature order
 * @param callSites - call sites per caller signature
 * @param calleeSummaries - summaries of callees outside this SCC; copied into the working
 *   state, never mutated
 * @returns the members' summaries (callee entries are dropped) and each member's DDG from the
 *   last solve
 */
export function sccFixpoint(
  members: CallableGraphData[],
  callSites: Map<string, CallSiteRef[]>,
  calleeSummaries: Map<string, FunctionSummary>,
): SccResult {
  const sorted = [...members].sort((a, b) => a.signature.localeCompare(b.signature));
  const memberSigs = new Set<string>(sorted.map((m) => m.signature));
  const summaries = new Map<string, FunctionSummary>(calleeSummaries);
  const ddg = new Map<string, PdgEdge[]>();

  // Only a multi-member SCC or a directly self-calling singleton needs more than one pass.
  const selfReferential =
    sorted.length > 1 ||
    (callSites.get(sorted[0]?.signature ?? "") ?? []).some((cs) => cs.callee === sorted[0]?.signature);

  let iterations = 0;
  for (;;) {
    let changed = false;
    for (const data of sorted) {
      const sites = callSites.get(data.signature) ?? [];
      const effects = effectsFor(sites, summaries);
      const solved = solveDefUse(data, effects);
      const next = summarize(data, solved.ddg, solved.effective, sites);
      const prev = summaries.get(data.signature);
      if (!prev || !sameSummary(prev, next)) changed = true;
      summaries.set(data.signature, next);
      ddg.set(data.signature, solved.ddg);
    }
    iterations++;
    if (!changed || !selfReferential) break;
    if (iterations > 100) break; // k-limited domains make this unreachable; hard backstop anyway
  }

  // Return only this SCC's summaries (callee entries were working state).
  const own = new Map<string, FunctionSummary>();
  for (const sig of memberSigs) own.set(sig, summaries.get(sig) as FunctionSummary);
  return { summaries: own, ddg };
}

/**
 * Project the current callee summaries onto a function's call sites as global read/write effects.
 * Unresolved callees and callees without a summary contribute nothing; several call sites on the
 * same node accumulate into one entry.
 */
function effectsFor(sites: CallSiteRef[], summaries: Map<string, FunctionSummary>): Map<number, CallEffects> {
  const out = new Map<number, CallEffects>();
  for (const cs of sites) {
    if (!cs.callee) continue;
    const s = summaries.get(cs.callee);
    if (!s) continue;
    const cur = out.get(cs.nodeId) ?? { reads: [], writes: [] };
    cur.reads.push(...s.global_reads);
    cur.writes.push(...s.global_writes);
    out.set(cs.nodeId, cur);
  }
  return out;
}

/**
 * Derive one callable's summary from its solved DDG and effective per-node facts.
 *
 * @param data - the callable's graph data (parameter ids, ENTRY id, return-value nodes)
 * @param ddg - the callable's data-dependence edges
 * @param effective - per-node facts as returned by the def-use solve
 * @param sites - the callable's call sites; their resolved callees become `deps`
 */
function summarize(
  data: CallableGraphData,
  ddg: PdgEdge[],
  effective: Map<number, NodeFacts>,
  sites: CallSiteRef[],
): FunctionSummary {
  // Forward DDG adjacency (def-node → use-node = "use-node depends on def-node").
  const fwd = new Map<number, Set<number>>();
  for (const e of ddg) {
    let targets = fwd.get(e.source);
    if (!targets) {
      targets = new Set<number>();
      fwd.set(e.source, targets);
    }
    targets.add(e.target);
  }
  const returnValueNodes = new Set<number>(data.returnValueNodes);
  const reaches = (starts: number[]): Set<number> => {
    const seen = new Set<number>(starts);
    const stack = [...starts];
    for (;;) {
      const n = stack.pop();
      if (n === undefined) break;
      for (const s of fwd.get(n) ?? []) {
        if (!seen.has(s)) {
          seen.add(s);
          stack.push(s);
        }
      }
    }
    return seen;
  };
  const touchesReturn = (starts: number[]): boolean => {
    for (const n of reaches(starts)) if (returnValueNodes.has(n)) return true;
    return false;
  };

  const param_flows: number[] = [];
  for (const [i, pid] of data.paramIds.entries()) {
    if (touchesReturn([pid])) param_flows.push(i);
  }

  // Global reads/writes: module-kind uses/defs anywhere in the function. Callee effects were
  // already overlaid by solveDefUse, so transitive effects fall out for free. ENTRY's synthetic
  // ambient defs are initializations, not writes — exclude them.
  const readsByKey = new Map<string, GlobalPath>();
  const writesByKey = new Map<string, GlobalPath>();
  // Use sites are indexed by the global's base key, not the rendered path, so every field path
  // of one global shares the same use nodes below.
  const usedAt = new Map<string, number[]>();
  for (const [nodeId, f] of effective) {
    for (const u of f.uses) {
      if (u.baseKind !== "module") continue;
      const g: GlobalPath = { key: u.key, fields: u.fields };
      readsByKey.set(renderGlobal(g), g);
      const arr = usedAt.get(u.key) ?? [];
      arr.push(nodeId);
      usedAt.set(u.key, arr);
    }
    if (nodeId === data.entryId) continue;
    for (const d of f.defs) {
      if (d.ref.baseKind !== "module") continue;
      const g: GlobalPath = { key: d.ref.key, fields: d.ref.fields };
      writesByKey.set(renderGlobal(g), g);
    }
  }

  const globals_to_return: string[] = [];
  for (const [rendered, g] of readsByKey) {
    const nodes = usedAt.get(g.key) ?? [];
    if (nodes.length && touchesReturn(nodes)) globals_to_return.push(rendered);
  }

  const deps = new Set<string>();
  for (const cs of sites) if (cs.callee) deps.add(cs.callee);

  return {
    param_flows: param_flows.sort((a, b) => a - b),
    global_reads: sortGlobals([...readsByKey.values()]),
    global_writes: sortGlobals([...writesByKey.values()]),
    globals_to_return: globals_to_return.sort(),
    deps: [...deps].sort(),
  };
}

/** Sort global paths by their rendered form. Sorts `gs` in place and returns it. */
function sortGlobals(gs: GlobalPath[]): GlobalPath[] {
  return gs.sort((a, b) => renderGlobal(a).localeCompare(renderGlobal(b)));
}

/** Structural equality of two summaries, compared through their JSON serialization. */
function sameSummary(a: FunctionSummary, b: FunctionSummary): boolean {
  return JSON.stringify(a) === JSON.stringify(b);
}

// ------------------------------------------------------------------------------------------------
// Tarjan SCC — emission order is reverse-topological (an SCC is emitted after every SCC it calls).
// ------------------------------------------------------------------------------------------------

/**
 * Strongly connected components of the graph `adj`, visiting roots in `nodes` order.
 *
 * The DFS uses an explicit frame stack instead of recursion, so a very long call chain cannot
 * overflow the JS call stack. Visit order, low-link updates, and SCC emission order match the
 * textbook recursive formulation.
 *
 * @param nodes - DFS roots, tried in order; successors absent from `nodes` are still visited
 * @param adj - successor lists; a node without an entry has no successors
 * @returns the SCCs in emission order, each listed in the order it pops off the Tarjan stack
 */
export function tarjanSccs(nodes: string[], adj: Map<string, string[]>): string[][] {
  const index = new Map<string, number>();
  const low = new Map<string, number>();
  const onStack = new Set<string>();
  const stack: string[] = [];
  const out: string[][] = [];
  let counter = 0;

  type Frame = { v: string; succ: string[]; next: number };

  // First visit of `v`: number it, put it on the Tarjan stack, and open its DFS frame.
  const open = (v: string): Frame => {
    index.set(v, counter);
    low.set(v, counter);
    counter++;
    stack.push(v);
    onStack.add(v);
    return { v, succ: adj.get(v) ?? [], next: 0 };
  };

  for (const root of nodes) {
    if (index.has(root)) continue;
    const frames: Frame[] = [open(root)];
    while (frames.length > 0) {
      const frame = frames[frames.length - 1] as Frame;
      if (frame.next < frame.succ.length) {
        const w = frame.succ[frame.next] as string;
        frame.next++;
        if (!index.has(w)) {
          frames.push(open(w)); // descend; the low-link is folded in when w's frame closes
        } else if (onStack.has(w)) {
          low.set(frame.v, Math.min(low.get(frame.v) as number, index.get(w) as number));
        }
        continue;
      }
      // Every successor is done: close the frame, emitting an SCC if this node is its root.
      frames.pop();
      if (low.get(frame.v) === index.get(frame.v)) {
        const scc: string[] = [];
        for (;;) {
          const w = stack.pop() as string;
          onStack.delete(w);
          scc.push(w);
          if (w === frame.v) break;
        }
        out.push(scc);
      }
      const parent = frames[frames.length - 1];
      if (parent) {
        low.set(parent.v, Math.min(low.get(parent.v) as number, low.get(frame.v) as number));
      }
    }
  }
  return out;
}

/**
 * Internal-call adjacency (caller → callees with graphs), the condensation input.
 * Self-calls and unresolved callees are dropped; each target list is de-duplicated and sorted.
 */
export function callAdjacency(
  sigs: string[],
  callSites: Map<string, CallSiteRef[]>,
  hasGraph: (sig: string) => boolean,
): Map<string, string[]> {
  const adj = new Map<string, string[]>();
  for (const s of sigs) {
    const targets = new Set<string>();
    for (const cs of callSites.get(s) ?? []) {
      if (cs.callee && hasGraph(cs.callee) && cs.callee !== s) targets.add(cs.callee);
    }
    adj.set(s, [...targets].sort());
  }
  return adj;
}

// diff --git a/src/dataflow/worker.ts b/src/dataflow/worker.ts
// new file mode 100644
// index 0000000..90d377a
// --- /dev/null
// +++ b/src/dataflow/worker.ts
// @@ -0,0 +1,96 @@
/**
 * Level-3 worker entry — runs on a Bun Worker thread. Two task kinds:
 *
 *  - "extract": stages 1–4 for a partition of the project's files. ts-morph ASTs cannot cross
 *    the structured-clone boundary, so each worker materializes its own whole-program Project
 *    (the checker needs every file for cross-file resolution; only the partition's files are
 *    deeply visited) and returns plain CallableGraphData.
 *  - "solve": one SCC's summary fixpoint (the wavefront's atomic unit) — pure data in, pure
 *    data out, via the same sccFixpoint the sequential path uses.
 *
 * The pool treats any failure here as "fall back to sequential", so this file must never be a
 * correctness dependency.
 */
/* Level-3 worker entry: task types, per-worker project cache, and the message handler. */
import { Project } from "ts-morph";
import type { PdgEdge } from "../schema";
import { defaultCompilerOptions, discoverSourceFiles } from "../syntactic_analysis";
import { extractCallableData, indexCallableDecls } from "./extract";
import type { CallableGraphData } from "./model";
import { sccFixpoint, type CallSiteRef, type FunctionSummary } from "./summaries";

export interface ExtractTask {
  type: "extract";
  root: string;
  tsConfigFilePath: string | null;
  skipTests: boolean;
  k: number;
  /** The callables this worker owns: signature + module file key + absolute file path. */
  sigs: Array<{ signature: string; path: string; absPath: string }>;
}

/** One SCC to solve; maps travel as entry arrays and are rebuilt into `Map`s by `runSolve`. */
export interface SolveTask {
  type: "solve";
  members: CallableGraphData[];
  callSites: Array<[string, CallSiteRef[]]>;
  calleeSummaries: Array<[string, FunctionSummary]>;
}

/** The `sccFixpoint` result with its maps flattened to entry arrays. */
export interface SolveTaskResult {
  summaries: Array<[string, FunctionSummary]>;
  ddg: Array<[string, PdgEdge[]]>;
}

export type WorkerTask = ExtractTask | SolveTask;

// One Project per worker lifetime (keyed in case tasks ever mix targets).
const projects = new Map<string, Project>();

/**
 * Return the cached Project for `(root, tsConfigFilePath, skipTests)`, building it on first use
 * from every file `discoverSourceFiles` reports.
 */
function projectFor(root: string, tsConfigFilePath: string | null, skipTests: boolean): Project {
  const key = `${root}|${tsConfigFilePath ?? ""}|${skipTests}`;
  const cached = projects.get(key);
  if (cached) return cached;
  const project = tsConfigFilePath
    ? new Project({ tsConfigFilePath, skipAddingFilesFromTsConfig: true })
    : new Project({ compilerOptions: defaultCompilerOptions() });
  for (const f of discoverSourceFiles(root, skipTests)) {
    try {
      project.addSourceFileAtPath(f.absPath);
    } catch {
      // Same tolerance as the symbol-table builder: an unloadable file degrades, never crashes.
    }
  }
  projects.set(key, project);
  return project;
}

/**
 * Extract graph data for the callables this task owns. Signatures with no indexed declaration,
 * and callables for which extraction yields nothing, are skipped.
 */
function runExtract(task: ExtractTask): CallableGraphData[] {
  const project = projectFor(task.root, task.tsConfigFilePath, task.skipTests);
  const onlyFiles = new Set<string>(task.sigs.map((s) => s.absPath));
  const idx = indexCallableDecls(project, task.root, onlyFiles);
  const out: CallableGraphData[] = [];
  for (const s of task.sigs) {
    const fn = idx.get(s.signature);
    if (!fn) continue;
    const data = extractCallableData(s.signature, fn, s.path, task.root, task.k);
    if (data) out.push(data);
  }
  return out;
}

/** Solve one SCC through `sccFixpoint` and flatten the result maps back to entry arrays. */
function runSolve(task: SolveTask): SolveTaskResult {
  const res = sccFixpoint(task.members, new Map(task.callSites), new Map(task.calleeSummaries));
  return { summaries: [...res.summaries.entries()], ddg: [...res.ddg.entries()] };
}

declare var self: Worker;

// Every task is answered with exactly one message: `{ ok: true, result }` or `{ ok: false, error }`.
self.onmessage = (ev: MessageEvent) => {
  const task = ev.data as WorkerTask;
  try {
    const result = task.type === "extract" ? runExtract(task) : runSolve(task);
    self.postMessage({ ok: true, result });
  } catch (e) {
    // A thrown value need not be an Error (a string, a plain object): narrow before reading
    // `stack`/`message`, so `error` is always a string and never undefined.
    const error = e instanceof Error ? (e.stack ?? e.message) : String(e);
    self.postMessage({ ok: false, error });
  }
};

// diff --git a/src/index.ts b/src/index.ts
// index e5ef590..0d4a836 100644
// --- a/src/index.ts
// +++ b/src/index.ts
// @@ -11,7 +11,7 @@ async function main(): Promise {
//        emitSchema(opts);
//        return;
//      }
// -    const app = analyze(opts);
// +    const app = await analyze(opts);
//      await emit(app, opts);
//    } catch (e) {
//      const err = e as Error;
// diff --git a/src/options/options.ts b/src/options/options.ts
// index db83fbb..fe909fd 100644
// --- a/src/options/options.ts
// +++ b/src/options/options.ts
// @@ -1,3 +1,5 @@
// +import type { GraphSelector } from "../schema";
// +
//  export type EmitTarget = "json" | "neo4j" | "schema";
//  export type CallGraphProviderName = "union" | "tsc" | "jelly";
//
// @@ -16,8 +18,23 @@ export interface AnalysisOptions {
//    neo4jUser: string;
//    neo4jPassword: string;
//    neo4jDatabase: string | null;
// -  /** Analysis depth requested by the caller (1 = symbol table + call graph [default]; 2 = call graph). */
// -  analysisLevel: 1 | 2;
// +  /**
// +   * Analysis depth requested by the caller (1 = symbol table + call graph [default];
// +   * 2 = call graph; 3 = + native program graphs — CFG/PDG/SDG in `program_graphs`).
// +   */
// +  analysisLevel: 1 | 2 | 3;
// +  /** Which level-3 graph sections to emit (`--graphs`); only consulted at level 3. */
// +  graphs: GraphSelector[];
// +  /** k-limit for access-path depth in the level-3 dataflow (`--graph-field-depth`, default 3). */
// +  graphFieldDepth: number;
// +  /**
// +   * Worker parallelism for the level-3 pipeline (`-j/--jobs`). 0 = auto (currently sequential:
// +   * each extraction worker must materialize its own whole-program ts-morph project, which
// +   * measurably dominates the parallelizable graph math on small/mid repos); an explicit N ≥ 2
// +   * opts in for large codebases; 1 = fully sequential (the debug mode and differential oracle —
// +   * `--jobs N` output is byte-identical).
// +   */
// +  jobs: number;
//    /** Restrict analysis to these files (project-relative or absolute). null ⇒ whole project. */
//    targetFiles: string[] | null;
//    /** Skip test trees (default true).
// */  (end of the options.ts context line that closes the preceding hunk)
// diff --git a/src/schema/graphs.ts b/src/schema/graphs.ts
// new file mode 100644
// index 0000000..801eb5c
// --- /dev/null
// +++ b/src/schema/graphs.ts
// @@ -0,0 +1,131 @@
/**
 * The level-3 `program_graphs` contract — CFG / PDG (CDG+DDG) / SDG, per the cross-language
 * dataflow-graphs spec. Emitted as an optional top-level section of analysis.json, only at
 * `-a 3`, scoped by `--graphs`.
 *
 * Node identity is the invariant that makes everything joinable: every node is keyed by
 * `(signature, node_id)` where `signature` is the SAME signatureOf() key used by symbol_table
 * and call_graph, and `node_id` is the index of the owning AST node in source-span order within
 * the callable (synthetic ENTRY = 0, EXIT = last). Cross-function edges reference both endpoints
 * that way, and — as with the call graph — may never dangle.
 *
 * The node-kind / edge-kind vocabulary below is shared across the CLDK analyzers (parity
 * clause); TS-specific members (`await_resume`, `yield`) are additive and recorded in
 * .claude/SCHEMA_DECISIONS.md.
 */

// Bumped independently of the top-level schema; additive changes only.
export const PROGRAM_GRAPHS_SCHEMA_VERSION = "1.0.0";

// ----------------------------------------------------------------------------------------------
// Nodes
// ----------------------------------------------------------------------------------------------

/**
 * CFG node kinds. `entry`/`exit` are synthetic (span = the whole callable); `param` nodes are the
 * formal-in nodes of the SDG (span = the parameter declaration); everything else is `statement`.
 */
export type GraphNodeKind = "entry" | "exit" | "param" | "statement";

/** One CFG node: its per-callable id, its kind, and the source span it covers. */
export interface GraphNode {
  id: number;
  kind: GraphNodeKind;
  start_line: number;
  start_column: number;
  end_line: number;
  end_column: number;
}

// ----------------------------------------------------------------------------------------------
// Intra-function edges
// ----------------------------------------------------------------------------------------------

/** CFG_NEXT edge kinds — the shared vocabulary plus the TS-native `await_resume` / `yield`. */
export type CfgEdgeKind =
  | "fallthrough"
  | "true"
  | "false"
  | "switch_case"
  | "loop_back"
  | "exception"
  | "return"
  | "break"
  | "continue"
  | "yield"
  | "await_resume";

export interface CfgEdge {
  source: number;
  target: number;
  kind: CfgEdgeKind;
}

/** PDG edge: control dependence (CDG) or data dependence (DDG, labeled with the access path). */
export interface PdgEdge {
  source: number;
  target: number;
  type: "CDG" | "DDG";
  /** The k-limited access path the dependence carries (DDG only). */
  var?: string;
}

export interface FunctionCfg {
  nodes: GraphNode[];
  edges: CfgEdge[];
}

export interface FunctionPdg {
  edges: PdgEdge[];
}

/** The per-callable graphs, keyed by the callable's canonical signature in `functions`. */
export interface FunctionGraphs {
  cfg?: FunctionCfg;
  pdg?: FunctionPdg;
}

// ----------------------------------------------------------------------------------------------
// Cross-function (SDG) edges
// ----------------------------------------------------------------------------------------------

export interface SdgEndpoint {
  signature: string;
  node: number;
}

/**
 * SDG edge types (Horwitz–Reps–Binkley):
 *  - CALL       callsite statement → callee ENTRY.
 *  - PARAM_IN   callsite statement → callee `param` node (`var` = "argN"), or → callee ENTRY for
 *               a module/global binding the callee transitively reads (`var` = the global path).
 *  - PARAM_OUT  callee EXIT → callsite statement (`var` = "return", or the global path written).
 *  - SUMMARY    actual-in → actual-out at the same call site. Call sites are collapsed onto their
 *               containing statement node, so SUMMARY edges are self-edges on the callsite node;
 *               `var` names the input ("argN" or a global path) that transitively flows to the
 *               call's result.
 */
export type SdgEdgeType = "CALL" | "PARAM_IN" | "PARAM_OUT" | "SUMMARY";

export interface SdgEdge {
  source: SdgEndpoint;
  target: SdgEndpoint;
  type: SdgEdgeType;
  var?: string;
}

// ----------------------------------------------------------------------------------------------
// Root section
// ----------------------------------------------------------------------------------------------

export interface ProgramGraphs {
  schema_version: string;
  /** The access-path depth bound (--graph-field-depth) the graphs were built with. */
  k_limit: number;
  /** Per-callable graphs, keyed by the callable's canonical signature. */
  functions: Record<string, FunctionGraphs>;
  /** Cross-function edges only; intra-function edges live in each function's pdg. */
  sdg_edges: SdgEdge[];
}

/** The `--graphs` selector values. `dfg` = the DDG subset of the PDG (no separate section). */
export type GraphSelector = "cfg" | "dfg" | "pdg" | "sdg";
export const ALL_GRAPHS: GraphSelector[] = ["cfg", "dfg", "pdg", "sdg"];

// diff --git a/src/schema/index.ts b/src/schema/index.ts
// index 9de570f..fde1c87 100644
// --- a/src/schema/index.ts
// +++ b/src/schema/index.ts
// @@ -2,3 +2,4 @@
//  // syntactic and semantic phases).
export * from "./schema"; export * from "./signatures"; +export * from "./graphs"; diff --git a/src/schema/schema.ts b/src/schema/schema.ts index 056e950..4c75710 100644 --- a/src/schema/schema.ts +++ b/src/schema/schema.ts @@ -395,6 +395,8 @@ export interface TSApplication { call_graph: TSCallEdge[]; external_symbols: Record; synthesized_callables: Record; + /** Level-3 CFG/PDG/SDG section — present only at `-a 3` (see schema/graphs.ts). */ + program_graphs?: import("./graphs").ProgramGraphs; } // ============================================================================================== diff --git a/src/syntactic_analysis/symbolTable.ts b/src/syntactic_analysis/symbolTable.ts index 62f16a6..b7cdbc6 100644 --- a/src/syntactic_analysis/symbolTable.ts +++ b/src/syntactic_analysis/symbolTable.ts @@ -61,7 +61,8 @@ export function buildSymbolTable( return { project, symbol_table, files: buildFiles }; } -function defaultCompilerOptions(): ts.CompilerOptions { +/** The fallback compiler options when the target has no tsconfig (shared with graph workers). */ +export function defaultCompilerOptions(): ts.CompilerOptions { return { target: ts.ScriptTarget.ES2022, module: ts.ModuleKind.ESNext, diff --git a/test/dataflow.test.ts b/test/dataflow.test.ts new file mode 100644 index 0000000..155c23a --- /dev/null +++ b/test/dataflow.test.ts @@ -0,0 +1,425 @@ +/** + * Level-3 gate tests (issue #2): every verification gate from the dataflow contract, asserted + * with exact expected sets on the dataflow-app fixture — CFG, dominance/CDG, DFG, the + * PDG backward-slice gate, summaries (SCC fixpoint), SDG (no dangling endpoints), the + * interprocedural slice, determinism, and the -a 1/-a 2 gating. 
+ */ +import { describe, expect, test } from "bun:test"; +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import { analyze } from "../src/core"; +import { backwardSlice } from "../src/dataflow"; +import type { AnalysisOptions } from "../src/options"; +import type { CfgEdge, FunctionCfg, ProgramGraphs, SdgEdge } from "../src/schema"; + +const FIXTURE = path.resolve(import.meta.dir, "fixtures/dataflow-app"); + +function options(level: 1 | 2 | 3, cacheDir: string, jobs: number): AnalysisOptions { + return { + input: FIXTURE, + output: null, + emit: "json", + appName: null, + neo4jUri: null, + neo4jUser: "neo4j", + neo4jPassword: "", + neo4jDatabase: null, + analysisLevel: level, + graphs: ["cfg", "dfg", "pdg", "sdg"], + graphFieldDepth: 3, + jobs, + targetFiles: null, + skipTests: true, + eager: true, + noBuild: true, + phantoms: true, + callGraphProvider: "tsc", + cacheDir, + verbosity: 0, + }; +} + +async function run(level: 1 | 2 | 3, jobs = 1): Promise>> { + const cacheDir = fs.mkdtempSync(path.join(os.tmpdir(), "cants-dataflow-test-")); + try { + return await analyze(options(level, cacheDir, jobs)); + } finally { + fs.rmSync(cacheDir, { recursive: true, force: true }); + } +} + +const app = await run(3); +const pg = app.program_graphs as ProgramGraphs; + +const cfgOf = (sig: string): FunctionCfg => { + const g = pg.functions[sig]?.cfg; + if (!g) throw new Error(`no cfg for ${sig}`); + return g; +}; +const pdgOf = (sig: string) => { + const g = pg.functions[sig]?.pdg; + if (!g) throw new Error(`no pdg for ${sig}`); + return g.edges; +}; +const cdgSet = (sig: string): string[] => + pdgOf(sig) + .filter((e) => e.type === "CDG") + .map((e) => `${e.source}>${e.target}`) + .sort(); +const ddgHas = (sig: string, source: number, target: number, v: string): boolean => + pdgOf(sig).some((e) => e.type === "DDG" && e.source === source && e.target === target && e.var === v); +const kinds = (sig: string, filter?: (e: CfgEdge) 
=> boolean): CfgEdge[] => + cfgOf(sig).edges.filter(filter ?? (() => true)); +const sdg = (filter: (e: SdgEdge) => boolean): SdgEdge[] => pg.sdg_edges.filter(filter); + +// ------------------------------------------------------------------------------------------------ +// CFG gate +// ------------------------------------------------------------------------------------------------ + +describe("CFG gate", () => { + test("every function: single ENTRY (0) / single EXIT (last), contiguous span-ordered ids", () => { + for (const [sig, g] of Object.entries(pg.functions)) { + const cfg = g.cfg as FunctionCfg; + expect(cfg.nodes.length, sig).toBeGreaterThanOrEqual(2); + expect(cfg.nodes[0]?.kind, sig).toBe("entry"); + expect(cfg.nodes[cfg.nodes.length - 1]?.kind, sig).toBe("exit"); + cfg.nodes.forEach((n, i) => expect(n.id, sig).toBe(i)); + expect(cfg.nodes.filter((n) => n.kind === "entry" || n.kind === "exit"), sig).toHaveLength(2); + } + }); + + test("every node maps to a real source span", () => { + for (const [sig, g] of Object.entries(pg.functions)) { + for (const n of (g.cfg as FunctionCfg).nodes) { + expect(n.start_line, sig).toBeGreaterThanOrEqual(1); + expect(n.end_line, sig).toBeGreaterThanOrEqual(n.start_line); + } + } + }); + + test("every node is reachable from ENTRY and reaches EXIT", () => { + for (const [sig, g] of Object.entries(pg.functions)) { + const cfg = g.cfg as FunctionCfg; + const fwd = new Map(); + const rev = new Map(); + for (const e of cfg.edges) { + fwd.set(e.source, [...(fwd.get(e.source) ?? []), e.target]); + rev.set(e.target, [...(rev.get(e.target) ?? []), e.source]); + } + const bfs = (start: number, adj: Map): Set => { + const seen = new Set([start]); + const q = [start]; + while (q.length) for (const nx of adj.get(q.shift() as number) ?? 
[]) if (!seen.has(nx)) (seen.add(nx), q.push(nx)); + return seen; + }; + const fromEntry = bfs(0, fwd); + const toExit = bfs(cfg.nodes.length - 1, rev); + for (const n of cfg.nodes) { + expect(fromEntry.has(n.id), `${sig}#${n.id} unreachable from ENTRY`).toBe(true); + expect(toExit.has(n.id), `${sig}#${n.id} cannot reach EXIT`).toBe(true); + } + } + }); + + test("if/else lowers to true/false branch edges (classify)", () => { + const e = kinds("src/flow.classify"); + expect(e).toContainEqual({ source: 3, target: 4, kind: "true" }); + expect(e).toContainEqual({ source: 3, target: 5, kind: "false" }); + }); + + test("loops lower with a loop_back edge (sumTo)", () => { + expect(kinds("src/flow.sumTo", (e) => e.kind === "loop_back")).not.toHaveLength(0); + }); + + test("a throwing call inside try edges to the catch node; finally may re-raise (guarded)", () => { + const e = kinds("src/flow.guarded"); + expect(e).toContainEqual({ source: 3, target: 4, kind: "exception" }); // out = parse(s) → catch + expect(e).toContainEqual({ source: 6, target: 8, kind: "exception" }); // finally → outward (EXIT) + }); + + test("throw with no handler edges to EXIT (parse)", () => { + const cfg = cfgOf("src/flow.parse"); + const exit = cfg.nodes.length - 1; + expect(cfg.edges.filter((e) => e.kind === "exception" && e.target === exit)).not.toHaveLength(0); + }); + + test("while (true) still emits the loop-exit false edge — the synthetic post-dominance edge (spin)", () => { + expect(kinds("src/flow.spin")).toContainEqual({ source: 2, target: 6, kind: "false" }); + expect(kinds("src/flow.spin", (e) => e.kind === "break")).toHaveLength(1); + }); + + test("switch lowers to switch_case dispatch plus break edges (pickDay)", () => { + expect(kinds("src/flow.pickDay", (e) => e.kind === "switch_case")).toHaveLength(3); + expect(kinds("src/flow.pickDay", (e) => e.kind === "break")).toHaveLength(2); + }); + + test("await suspends via an await_resume edge (fetchTotal)", () => { + 
expect(kinds("src/susp.fetchTotal")).toContainEqual({ source: 2, target: 3, kind: "await_resume" }); + }); + + test("yield suspends via a yield edge (numbers)", () => { + expect(kinds("src/susp.numbers")).toContainEqual({ source: 4, target: 5, kind: "yield" }); + }); + + test("short-circuit / optional chaining stay intra-statement (shortCircuit has no branch edges)", () => { + expect(kinds("src/susp.shortCircuit", (e) => e.kind === "true" || e.kind === "false")).toHaveLength(0); + }); +}); + +// ------------------------------------------------------------------------------------------------ +// Dominance gate (control dependence, hand-computed) +// ------------------------------------------------------------------------------------------------ + +describe("dominance gate", () => { + test("classify: exact hand-computed control dependences", () => { + expect(cdgSet("src/flow.classify")).toEqual(["0>1", "0>2", "0>3", "0>6", "3>4", "3>5"]); + }); + + test("early (early return): everything after the guard depends on it", () => { + expect(cdgSet("src/flow.early")).toEqual(["0>1", "0>2", "2>3", "2>4", "2>5"]); + }); +}); + +// ------------------------------------------------------------------------------------------------ +// DFG gate +// ------------------------------------------------------------------------------------------------ + +describe("DFG gate", () => { + test("loop-carried dependency: acc feeds itself around the back edge (sumTo)", () => { + expect(ddgHas("src/flow.sumTo", 4, 4, "acc")).toBe(true); + expect(ddgHas("src/flow.sumTo", 3, 3, "i")).toBe(true); + }); + + test("shadowed variables do not leak edges across scopes (shadow)", () => { + // outer x (node 1) → return (node 4); inner x (node 2) → touch(x) (node 3); never 2 → 4. 
+ expect(ddgHas("src/flow.shadow", 1, 4, "x")).toBe(true); + expect(ddgHas("src/flow.shadow", 2, 3, "x")).toBe(true); + expect(ddgHas("src/flow.shadow", 2, 4, "x")).toBe(false); + }); + + test("copy aliasing: a write through q reaches the read through p (useAlias)", () => { + expect(ddgHas("src/closures.useAlias", 3, 4, "p.f")).toBe(true); + }); + + test("closure capture: the captured local flows into the declaring statement (makeCounter)", () => { + expect(ddgHas("src/closures.makeCounter", 2, 3, "count")).toBe(true); + }); + + test("the closure body has its own graph, with the captured base defined at ENTRY", () => { + const inc = pdgOf("src/closures.makeCounter.inc"); + expect(inc.some((e) => e.type === "DDG" && e.var === "count")).toBe(true); + }); +}); + +// ------------------------------------------------------------------------------------------------ +// PDG gate — the intraprocedural backward slice, exact +// ------------------------------------------------------------------------------------------------ + +describe("PDG slice gate", () => { + test("backward slice of classify's return equals the hand-computed node set", () => { + // Node 2 (`let label = "none"`) is strongly killed on BOTH branches, so it is NOT in the slice. 
+ const slice = backwardSlice(pg, { signature: "src/flow.classify", node: 6 }); + const inClassify = [...slice].filter((k) => k.startsWith("src/flow.classify#")).sort(); + expect(inClassify).toEqual([ + "src/flow.classify#0", + "src/flow.classify#1", + "src/flow.classify#3", + "src/flow.classify#4", + "src/flow.classify#5", + "src/flow.classify#6", + ]); + expect([...slice].every((k) => k.startsWith("src/flow.classify#"))).toBe(true); // no callers → intra only + }); +}); + +// ------------------------------------------------------------------------------------------------ +// Summary + SDG gates +// ------------------------------------------------------------------------------------------------ + +describe("summary and SDG gates", () => { + test("no dangling (signature, node_id) endpoints anywhere in the SDG", () => { + const valid = new Map(); + for (const [sig, g] of Object.entries(pg.functions)) valid.set(sig, (g.cfg as FunctionCfg).nodes.length); + for (const e of pg.sdg_edges) { + for (const end of [e.source, e.target]) { + const n = valid.get(end.signature); + expect(n, `dangling signature ${end.signature}`).toBeDefined(); + expect(end.node, `dangling node ${end.signature}#${end.node}`).toBeLessThan(n as number); + expect(end.node).toBeGreaterThanOrEqual(0); + } + } + }); + + test("CALL edges target the callee ENTRY; positional PARAM_IN edges target `param` nodes", () => { + for (const e of pg.sdg_edges) { + if (e.type === "CALL") expect(e.target.node).toBe(0); + if (e.type === "PARAM_IN" && e.var?.startsWith("arg")) { + const callee = pg.functions[e.target.signature]?.cfg as FunctionCfg; + expect(callee.nodes[e.target.node]?.kind, `${e.target.signature}#${e.target.node}`).toBe("param"); + } + } + }); + + test("PARAM_IN arity: argN edges never exceed the callee's declared parameters", () => { + for (const e of pg.sdg_edges) { + if (e.type !== "PARAM_IN" || !e.var?.startsWith("arg")) continue; + const callee = pg.functions[e.target.signature]?.cfg as 
FunctionCfg; + const params = callee.nodes.filter((n) => n.kind === "param").length; + expect(params, `${e.target.signature} has no params but receives ${e.var}`).toBeGreaterThan(0); + } + }); + + test("the a → b → c chain: CALL / PARAM_IN / PARAM_OUT stitched at each hop", () => { + expect(pg.sdg_edges).toContainEqual({ + source: { signature: "src/chain.a", node: 2 }, + target: { signature: "src/chain.b", node: 0 }, + type: "CALL", + }); + expect(pg.sdg_edges).toContainEqual({ + source: { signature: "src/chain.a", node: 2 }, + target: { signature: "src/chain.b", node: 1 }, + type: "PARAM_IN", + var: "arg0", + }); + expect(pg.sdg_edges).toContainEqual({ + source: { signature: "src/chain.b", node: 4 }, + target: { signature: "src/chain.a", node: 2 }, + type: "PARAM_OUT", + var: "return", + }); + }); + + test("SUMMARY gate: the composed transitive flow arg0 → return exists at a's callsite", () => { + expect(pg.sdg_edges).toContainEqual({ + source: { signature: "src/chain.a", node: 2 }, + target: { signature: "src/chain.a", node: 2 }, + type: "SUMMARY", + var: "arg0", + }); + }); + + test("cross-module (multi-file) SDG edges exist (viaOtherFile → util.increment)", () => { + expect(pg.sdg_edges).toContainEqual({ + source: { signature: "src/chain.viaOtherFile", node: 2 }, + target: { signature: "src/util.increment", node: 1 }, + type: "PARAM_IN", + var: "arg0", + }); + }); + + test("globals ride the SDG: transitive write and read of src/state.counter at churn's callsites", () => { + expect( + sdg( + (e) => + e.type === "PARAM_OUT" && + e.var === "src/state.counter" && + e.source.signature === "src/state.bump" && + e.target.signature === "src/state.churn", + ), + ).toHaveLength(1); + expect( + sdg( + (e) => + e.type === "PARAM_IN" && + e.var === "src/state.counter" && + e.source.signature === "src/state.churn" && + e.target.signature === "src/state.readCounter" && + e.target.node === 0, + ), + ).toHaveLength(1); + }); + + test("a global written by one callee then read by 
the next shows up as caller-local DDG (main)", () => { + // bump(3) (node 1) transitively writes counter; a(readCounter()) (node 2) transitively reads it. + expect(ddgHas("src/main.main", 1, 2, "src/state.counter")).toBe(true); + }); + + test("mutual recursion (isEven/isOdd) reaches a fixpoint and stitches both directions", () => { + expect(sdg((e) => e.type === "CALL" && e.source.signature === "src/chain.isEven")).toHaveLength(1); + expect(sdg((e) => e.type === "CALL" && e.source.signature === "src/chain.isOdd")).toHaveLength(1); + }); + + test("interprocedural backward slice of main's return: exact, context-sensitive", () => { + const slice = backwardSlice(pg, { signature: "src/main.main", node: 3 }); + const bySig = new Map(); + for (const k of slice) { + const [sig, n] = k.split("#") as [string, string]; + bySig.set(sig, [...(bySig.get(sig) ?? []), Number(n)].sort((a, b) => a - b)); + } + expect([...bySig.keys()].sort()).toEqual([ + "src/chain.a", + "src/chain.b", + "src/chain.c", + "src/main.main", + "src/state.bump", + "src/state.readCounter", + ]); + expect(bySig.get("src/main.main")).toEqual([0, 1, 2, 3]); + expect(bySig.get("src/chain.a")).toEqual([0, 1, 2, 3, 4]); + expect(bySig.get("src/chain.c")).toEqual([0, 1, 2, 3]); + expect(bySig.get("src/state.bump")).toEqual([0, 1, 2, 3]); + }); +}); + +// ------------------------------------------------------------------------------------------------ +// Determinism + level gating +// ------------------------------------------------------------------------------------------------ + +describe("determinism and gating", () => { + test("two runs on identical content emit byte-identical program_graphs", async () => { + const second = await run(3); + expect(JSON.stringify(second.program_graphs)).toBe(JSON.stringify(pg)); + }); + + test("--jobs N (workers + wavefront) is byte-identical to --jobs 1 (the differential oracle)", async () => { + const parallel = await run(3, 4); + 
expect(JSON.stringify(parallel.program_graphs)).toBe(JSON.stringify(pg)); + }, 60_000); + + test("-a 1 emits no program_graphs section", async () => { + const level1 = await run(1); + expect(level1.program_graphs).toBeUndefined(); + expect(JSON.stringify(level1)).not.toContain("program_graphs"); + }); + + test("schema_version and k_limit are stamped", () => { + expect(pg.schema_version).toBe("1.0.0"); + expect(pg.k_limit).toBe(3); + }); +}); + +// ------------------------------------------------------------------------------------------------ +// CLI flag validation (strict, non-zero exit — cli-contract.md) +// ------------------------------------------------------------------------------------------------ + +describe("--graphs flag validation", () => { + const cli = (...args: string[]) => + Bun.spawnSync(["bun", "run", path.resolve(import.meta.dir, "../src/index.ts"), ...args], { + cwd: path.resolve(import.meta.dir, ".."), + }); + + test("unknown --graphs value fails with a clear error, never a silent fallback", () => { + const r = cli("-i", FIXTURE, "-a", "3", "--graphs", "cfg,bogus", "--no-build"); + expect(r.exitCode).not.toBe(0); + expect(r.stderr.toString()).toContain("unknown --graphs value 'bogus'"); + }); + + test("--graphs without -a 3 is rejected", () => { + const r = cli("-i", FIXTURE, "--graphs", "cfg", "--no-build"); + expect(r.exitCode).not.toBe(0); + expect(r.stderr.toString()).toContain("--graphs requires --analysis-level 3"); + }); + + test("--graph-field-depth must be a positive integer", () => { + const r = cli("-i", FIXTURE, "-a", "3", "--graph-field-depth", "zero", "--no-build"); + expect(r.exitCode).not.toBe(0); + expect(r.stderr.toString()).toContain("invalid --graph-field-depth"); + }); + + test("--jobs must be a positive integer", () => { + const r = cli("-i", FIXTURE, "-a", "3", "--jobs", "0", "--no-build"); + expect(r.exitCode).not.toBe(0); + expect(r.stderr.toString()).toContain("invalid --jobs"); + }); +}); diff --git 
a/test/fixtures/dataflow-app/package.json b/test/fixtures/dataflow-app/package.json new file mode 100644 index 0000000..0d9095b --- /dev/null +++ b/test/fixtures/dataflow-app/package.json @@ -0,0 +1,6 @@ +{ + "name": "dataflow-app", + "version": "0.0.0", + "private": true, + "description": "Level-3 dataflow fixture: exercises every stage-1 lowering construct and the shared fixture minimums (dataflow-construction.md)." +} diff --git a/test/fixtures/dataflow-app/src/chain.ts b/test/fixtures/dataflow-app/src/chain.ts new file mode 100644 index 0000000..9dac5dc --- /dev/null +++ b/test/fixtures/dataflow-app/src/chain.ts @@ -0,0 +1,30 @@ +/** Interprocedural chains: a → b → c value flow, cross-file flow, mutual recursion. */ +import { increment } from "./util"; + +export function c(z: number): number { + return z + 1; // the parameter flows to the return value +} + +export function b(y: number): number { + const fromC = c(y); + return fromC; +} + +export function a(x: number): number { + const fromB = b(x); // SUMMARY edge here: arg0 flows through b (and c) to the result + return fromB; +} + +export function viaOtherFile(x: number): number { + return increment(x); // cross-module CALL / PARAM_IN / PARAM_OUT edges +} + +export function isEven(n: number): boolean { + if (n === 0) return true; + return isOdd(n - 1); // mutual recursion: {isEven, isOdd} form an SCC +} + +export function isOdd(n: number): boolean { + if (n === 0) return false; + return isEven(n - 1); +} diff --git a/test/fixtures/dataflow-app/src/closures.ts b/test/fixtures/dataflow-app/src/closures.ts new file mode 100644 index 0000000..522839d --- /dev/null +++ b/test/fixtures/dataflow-app/src/closures.ts @@ -0,0 +1,17 @@ +/** Closure capture and copy aliasing. 
*/ + +export function makeCounter(start: number): () => number { + let count = start; + const inc = (): number => { + count = count + 1; // writes the captured local + return count; + }; + return inc; +} + +export function useAlias(): number { + const p = { f: 1 }; + const q = p; // q and p alias the same object + q.f = 42; // write through q ... + return p.f; // ... must reach this read through p +} diff --git a/test/fixtures/dataflow-app/src/flow.ts b/test/fixtures/dataflow-app/src/flow.ts new file mode 100644 index 0000000..b57b19b --- /dev/null +++ b/test/fixtures/dataflow-app/src/flow.ts @@ -0,0 +1,86 @@ +/** Intraprocedural constructs: branches, loops, early return, exceptions, switch, shadowing. */ + +export function classify(n: number): string { + let label = "none"; + if (n > 0) { + label = "pos"; + } else { + label = "neg"; + } + return label; +} + +export function sumTo(n: number): number { + let acc = 0; + for (let i = 0; i < n; i++) { + acc = acc + i; // loop-carried dependency: acc feeds itself around the back edge + } + return acc; +} + +export function early(n: number): number { + if (n < 0) { + return -1; // early return (multi-exit normalization) + } + const r = n * 2; + return r; +} + +export function parse(s: string): number { + const v = Number(s); + if (Number.isNaN(v)) { + throw new Error(`bad input: ${s}`); + } + return v; +} + +export function guarded(s: string): number { + let out = 0; + try { + out = parse(s); // may throw → exception edge into the catch node + } catch (e) { + out = -1; + } finally { + touch(out); + } + return out; +} + +export function touch(x: number): void { + void x; +} + +export function pickDay(d: number): string { + let name = ""; + switch (d) { + case 0: + name = "sun"; + break; + case 6: + name = "sat"; + break; + default: + name = "weekday"; + } + return name; +} + +export function spin(): number { + let ticks = 0; + while (true) { + ticks = ticks + 1; // infinite loop: the dead loop-exit edge keeps EXIT 
reachable + if (ticks > 3) { + break; + } + } + return ticks; +} + +export function shadow(): number { + const x = 1; + { + const x = 2; // shadows the outer x — must NOT leak DDG edges across scopes + touch(x); + } + return x; +} diff --git a/test/fixtures/dataflow-app/src/main.ts b/test/fixtures/dataflow-app/src/main.ts new file mode 100644 index 0000000..f37b2da --- /dev/null +++ b/test/fixtures/dataflow-app/src/main.ts @@ -0,0 +1,9 @@ +/** Multi-file flow: state and chain wired together (cross-module SDG edges). */ +import { a } from "./chain"; +import { bump, readCounter } from "./state"; + +export function main(): number { + bump(3); + const r = a(readCounter()); + return r; +} diff --git a/test/fixtures/dataflow-app/src/state.ts b/test/fixtures/dataflow-app/src/state.ts new file mode 100644 index 0000000..d6dc769 --- /dev/null +++ b/test/fixtures/dataflow-app/src/state.ts @@ -0,0 +1,16 @@ +/** Module-level global state: written in one function, read in another. */ + +export let counter = 0; + +export function bump(by: number): void { + counter = counter + by; // global write +} + +export function readCounter(): number { + return counter; // global read that flows to the return value +} + +export function churn(by: number): number { + bump(by); // transitive global write lands on this callsite node + return readCounter(); // transitive global read +} diff --git a/test/fixtures/dataflow-app/src/susp.ts b/test/fixtures/dataflow-app/src/susp.ts new file mode 100644 index 0000000..fcafd7e --- /dev/null +++ b/test/fixtures/dataflow-app/src/susp.ts @@ -0,0 +1,25 @@ +/** Suspension points (await/yield) and intra-statement expression control flow. 
*/ + +export async function compute(x: number): Promise<number> { + return x * 2; +} + +export async function fetchTotal(x: number): Promise<number> { + const a = await compute(x); // await_resume edge out of this statement + return a + 1; +} + +export function* numbers(n: number): Generator { + let i = 0; + while (i < n) { + yield i; // yield (suspend/resume) edge + i = i + 1; + } + return n; +} + +export function shortCircuit(a: { f?: number } | null, b: number): number { + const v = (a && a.f) || b; // short-circuit stays intra-statement (documented rule) + const w = a?.f ?? b; // optional chaining likewise + return v + w; +} diff --git a/test/fixtures/dataflow-app/src/taint.ts b/test/fixtures/dataflow-app/src/taint.ts new file mode 100644 index 0000000..0e71659 --- /dev/null +++ b/test/fixtures/dataflow-app/src/taint.ts @@ -0,0 +1,26 @@ +/** + * Source → sink pair (one raw, one sanitized). The taint client is a staged follow-up + * (issue #2 PR E); the fixture carries the flows now so that PR only adds the query. 
+ */ + +export function source(): string { + return "user-input"; +} + +export function sanitize(s: string): string { + return s.replace(/dangerous/g, ""); +} + +export function sink(s: string): void { + void s; +} + +export function unsafeFlow(): void { + const s = source(); + sink(s); +} + +export function safeFlow(): void { + const s = sanitize(source()); + sink(s); +} diff --git a/test/fixtures/dataflow-app/src/util.ts b/test/fixtures/dataflow-app/src/util.ts new file mode 100644 index 0000000..27ec861 --- /dev/null +++ b/test/fixtures/dataflow-app/src/util.ts @@ -0,0 +1,3 @@ +export function increment(v: number): number { + return v + 1; +} diff --git a/test/fixtures/dataflow-app/tsconfig.json b/test/fixtures/dataflow-app/tsconfig.json new file mode 100644 index 0000000..f0b0c56 --- /dev/null +++ b/test/fixtures/dataflow-app/tsconfig.json @@ -0,0 +1,10 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "strict": false, + "noEmit": true + }, + "include": ["src"] +} diff --git a/test/neo4j-bolt.test.ts b/test/neo4j-bolt.test.ts index 65c8eca..c82e71b 100644 --- a/test/neo4j-bolt.test.ts +++ b/test/neo4j-bolt.test.ts @@ -37,6 +37,9 @@ function optsFor(overrides: Partial<AnalysisOptions> = {}): AnalysisOptions { neo4jPassword: "", neo4jDatabase: null, analysisLevel: 1, + graphs: ["cfg", "dfg", "pdg", "sdg"], + graphFieldDepth: 3, + jobs: 1, targetFiles: null, skipTests: true, eager: true, @@ -86,7 +89,7 @@ containerSuite("neo4j bolt writer", () => { test( "full push materializes the whole graph + schema", async () => { - const rows = project(analyze(optsFor()), "sample-app"); + const rows = project(await analyze(optsFor()), "sample-app"); await boltWriter(rows, cfg, log, true); // Every projected node/edge lands (the fixture has no library deps, so endpoints all resolve). 
@@ -119,7 +122,7 @@ containerSuite("neo4j bolt writer", () => { test( "re-pushing identical analysis is idempotent", async () => { - const rows = project(analyze(optsFor()), "sample-app"); + const rows = project(await analyze(optsFor()), "sample-app"); await boltWriter(rows, cfg, log, true); expect(await num("MATCH (n) RETURN count(n)")).toBe(rows.nodes.length); expect(await num("MATCH ()-[r]->() RETURN count(r)")).toBe(rows.edges.length); @@ -130,7 +133,7 @@ containerSuite("neo4j bolt writer", () => { test( "a full run prunes a module whose source vanished", async () => { - const app = analyze(optsFor()); + const app = await analyze(optsFor()); const victim = Object.keys(app.symbol_table).sort()[0]; delete app.symbol_table[victim]; diff --git a/test/neo4j-schema.test.ts b/test/neo4j-schema.test.ts index 6cbf4c1..ef99b11 100644 --- a/test/neo4j-schema.test.ts +++ b/test/neo4j-schema.test.ts @@ -21,16 +21,17 @@ import type { AnalysisOptions } from "../src/options"; const FIXTURE = path.resolve(import.meta.dir, "fixtures/sample-app"); -function fixtureRows() { +async function fixtureRows() { const cacheDir = fs.mkdtempSync(path.join(os.tmpdir(), "cants-schema-test-")); const opts: AnalysisOptions = { input: FIXTURE, output: null, emit: "json", appName: "sample-app", neo4jUri: null, neo4jUser: "neo4j", neo4jPassword: "", neo4jDatabase: null, - analysisLevel: 1, targetFiles: null, skipTests: true, eager: true, + analysisLevel: 1, graphs: ["cfg", "dfg", "pdg", "sdg"], graphFieldDepth: 3, jobs: 1, + targetFiles: null, skipTests: true, eager: true, noBuild: true, phantoms: true, callGraphProvider: "tsc", cacheDir, verbosity: 0, }; try { - return project(analyze(opts), "sample-app"); + return project(await analyze(opts), "sample-app"); } finally { fs.rmSync(cacheDir, { recursive: true, force: true }); } @@ -49,8 +50,9 @@ function specificLabel(labels: string[]): string { return labels.find((l) => l !== "Symbol" && !markers.has(l)) ?? 
"Symbol"; } +const rows = await fixtureRows(); + describe("neo4j schema conformance", () => { - const rows = fixtureRows(); test("every emitted node label + property is declared in the schema", () => { for (const node of rows.nodes) { diff --git a/tsconfig.json b/tsconfig.json index 69927ed..4c42630 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -4,7 +4,7 @@ "module": "ESNext", "moduleResolution": "bundler", "lib": ["ES2022"], - "types": ["node"], + "types": ["node", "bun"], "strict": true, "esModuleInterop": true, "skipLibCheck": true,