From 2c08649627f5ab81e63582af6c108b83df1b82c4 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 21:34:21 -0400 Subject: [PATCH 01/11] =?UTF-8?q?feat(dataflow):=20stage=201=20=E2=80=94?= =?UTF-8?q?=20exceptional=20statement-level=20CFG=20per=20callable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Level-3 groundwork (#67): hand-built CFG from the stdlib ast with the shared node/edge vocabulary, Python lowering rules (try/except/else/ finally, with, yield/await resume kinds, break/continue, synthetic escape edge for infinite loops), dead-code pruning, and source-span- ordered node ids (ENTRY=0, EXIT=last). Dataflow fixture project and CFG gate tests included. --- codeanalyzer/dataflow/__init__.py | 35 + codeanalyzer/dataflow/cfg.py | 605 ++++++++++++++++++ .../single_functionalities/dataflow/main.py | 95 +++ .../dataflow/pipeline.py | 61 ++ .../single_functionalities/dataflow/state.py | 12 + test/test_dataflow_cfg.py | 193 ++++++ 6 files changed, 1001 insertions(+) create mode 100644 codeanalyzer/dataflow/__init__.py create mode 100644 codeanalyzer/dataflow/cfg.py create mode 100644 test/fixtures/single_functionalities/dataflow/main.py create mode 100644 test/fixtures/single_functionalities/dataflow/pipeline.py create mode 100644 test/fixtures/single_functionalities/dataflow/state.py create mode 100644 test/test_dataflow_cfg.py diff --git a/codeanalyzer/dataflow/__init__.py b/codeanalyzer/dataflow/__init__.py new file mode 100644 index 0000000..8253995 --- /dev/null +++ b/codeanalyzer/dataflow/__init__.py @@ -0,0 +1,35 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Level-3 native dataflow graphs: CFG, PDG (CDG + DDG), and the SDG. + +One pass per module, mirroring the construction ladder: + +- :mod:`cfg` — stage 1, exceptional statement-level CFG per callable; +- :mod:`dominance` — stage 2, post-dominators and control dependence; +- :mod:`access_paths` — stage 3a, the k-limited access-path variable model; +- :mod:`defuse` — stage 3b, reaching definitions → DDG edges; +- :mod:`alias` — stage 5, the type-based may-alias oracle (MVP stub); +- :mod:`scc` — stage 5, Tarjan SCC condensation of the call graph; +- :mod:`summaries` — stage 6, bottom-up formal-in → formal-out summaries; +- :mod:`sdg` — stage 7, parameter nodes and CALL/PARAM_IN/PARAM_OUT/SUMMARY + edges; +- :mod:`slicing` — stage 8, the two-phase context-sensitive backward slice; +- :mod:`builder` — the orchestrator ``build_program_graphs`` wired into + ``Codeanalyzer.analyze`` at ``-a 3``. +""" + +from codeanalyzer.dataflow.cfg import build_cfg # noqa: F401 diff --git a/codeanalyzer/dataflow/cfg.py b/codeanalyzer/dataflow/cfg.py new file mode 100644 index 0000000..0b8a7f1 --- /dev/null +++ b/codeanalyzer/dataflow/cfg.py @@ -0,0 +1,605 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Stage 1 of the level-3 dataflow ladder: the exceptional, statement-level CFG. + +One CFG per callable, lowered from the stdlib ``ast`` tree — the same parse the +symbol-table builder uses, so node spans and callable signatures line up with +the rest of ``analysis.json``. + +Lowering rules (the Python checklist from the CLDK dataflow contract): + +- One synthetic ``ENTRY`` (node id 0) and one synthetic ``EXIT`` (last CFG id). + Multi-exit is normalized: every ``return``/``raise``/fall-off-end gets an + edge to ``EXIT`` with the appropriate kind. +- ``if``/``while``/``for`` headers are their own nodes (kinds ``branch`` / + ``loop``) with ``true``/``false`` out-edges; loop back edges carry + ``loop_back``; ``break``/``continue`` carry their own kinds. +- ``try/except/else/finally``: the try body is lowered in sequence; each + statement that can raise gets an ``exception`` edge to the innermost + enclosing handler chain (or ``EXIT`` when there is none). ``except`` match + clauses are ``handler`` nodes chained by ``false`` edges; an unmatched + exception propagates outward. ``finally`` bodies are lowered once, on the + normal path; abrupt entries (return / unhandled raise / break / continue + observed in the protected region) add corresponding out-edges from the + finally's end. 
Exceptions raised inside nested ``finally``-protected regions + connect straight to the enclosing handler chain — a documented + over-approximation (the finally body still executes on every normal path, + so its definitions are never lost, only their ordering on pure-exception + paths). +- ``with``/``async with``: the header is a ``statement`` node that defines the + ``as`` targets; the implicit ``__exit__`` try/finally is *not* materialized + (documented over-approximation); body statements keep their exception edges. +- Generators: a statement containing ``yield``/``yield from`` gets its + fall-through successor edge with kind ``yield`` (the resume path) plus a + ``yield`` edge to ``EXIT`` (the generator may never be resumed). + ``await`` marks the successor edge ``await_resume``. +- ``raise`` → ``exception`` edge to the handler chain / EXIT, no fall-through. + ``assert`` gets a fall-through plus an ``exception`` edge. +- Expression-level short-circuit (``and``/``or``/ternary) stays atomic inside + its statement node — the CFG is statement-level by contract. +- Comprehensions are atomic expressions of their statement (their implicit + loop and scope are handled by the access-path model, not the CFG). +- Nested ``def``/``class`` statements are single ``statement`` nodes (the + binding); their bodies get their own CFGs keyed by their own signatures. + Decorators are call-site facts, not CFG nodes. +- Infinite loops (``while True:`` with no break) get a synthetic ``exception`` + edge from the loop header to ``EXIT`` so post-dominance stays well-formed + (in Python any loop can exit via an async signal such as KeyboardInterrupt, + so the edge is semantically honest). +- Statements unreachable from ``ENTRY`` (dead code after a return/raise) are + pruned: they cannot carry dependence. 
from __future__ import annotations

import ast
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set, Tuple

# The shared, cross-language node-kind and edge-kind vocabulary. Python adds no
# renamed/repurposed kinds; `yield` / `await_resume` are the contract's own.
NODE_KINDS = (
    "entry",
    "exit",
    "statement",
    "branch",
    "loop",
    "return",
    "raise",
    "handler",
)

EDGE_KINDS = (
    "fallthrough",
    "true",
    "false",
    "switch_case",
    "loop_back",
    "exception",
    "return",
    "break",
    "continue",
    "yield",
    "await_resume",
)

# Statement classes that only exist on newer interpreters are looked up
# defensively so the module still imports on older ones.
# `try ... except* ...` (3.11+) has the same fields as `ast.Try`.
_TRY_TYPES = tuple(
    t for t in (ast.Try, getattr(ast, "TryStar", None)) if t is not None
)
# `match` statement (3.10+); None when the interpreter predates it.
_MATCH_TYPE = getattr(ast, "Match", None)

_FUNCTION_DEFS = (ast.FunctionDef, ast.AsyncFunctionDef)


@dataclass
class CFGNode:
    """A statement-level CFG node. ``id`` is assigned in source-span order
    after construction (ENTRY = 0, EXIT = last CFG id)."""

    id: int
    kind: str
    start_line: int = -1
    end_line: int = -1
    start_column: int = -1
    end_column: int = -1
    # The owning AST statement/expression (None for ENTRY/EXIT). Not emitted;
    # used by later stages to compute def/use sets.
    ast_node: Optional[ast.AST] = field(default=None, repr=False, compare=False)


@dataclass(frozen=True)
class CFGEdge:
    """A directed, kind-labelled edge between two node ids."""

    source: int
    target: int
    kind: str


@dataclass
class ControlFlowGraph:
    """CFG of a single callable, keyed externally by the callable signature."""

    nodes: List[CFGNode]
    edges: List[CFGEdge]
    entry_id: int
    exit_id: int

    def successors(self) -> Dict[int, List[Tuple[int, str]]]:
        """Map every node id to its outgoing ``(target id, edge kind)`` pairs."""
        succ: Dict[int, List[Tuple[int, str]]] = {n.id: [] for n in self.nodes}
        for e in self.edges:
            succ[e.source].append((e.target, e.kind))
        return succ

    def predecessors(self) -> Dict[int, List[Tuple[int, str]]]:
        """Map every node id to its incoming ``(source id, edge kind)`` pairs."""
        pred: Dict[int, List[Tuple[int, str]]] = {n.id: [] for n in self.nodes}
        for e in self.edges:
            pred[e.target].append((e.source, e.kind))
        return pred

    def node_by_id(self, node_id: int) -> CFGNode:
        """Return the node whose ``id`` equals ``node_id``.

        Raises:
            KeyError: no node carries that id. (A bare ``next()`` would leak
                ``StopIteration``, which turns into a confusing
                ``RuntimeError`` when the caller is a generator.)
        """
        # Builder output stores nodes at the index equal to their id.
        if 0 <= node_id < len(self.nodes) and self.nodes[node_id].id == node_id:
            return self.nodes[node_id]
        for node in self.nodes:
            if node.id == node_id:
                return node
        raise KeyError(f"no CFG node with id {node_id!r}")


class _TempNode:
    """Mutable node used during lowering, renumbered at finalize time."""

    __slots__ = ("kind", "ast_node", "span", "seq")

    def __init__(self, kind: str, ast_node: Optional[ast.AST], span, seq: int):
        self.kind = kind
        self.ast_node = ast_node
        self.span = span  # (start_line, start_col, end_line, end_col)
        self.seq = seq


def _span_of(node: ast.AST) -> Tuple[int, int, int, int]:
    """``(start_line, start_col, end_line, end_col)`` of ``node``; -1 for any
    position attribute the node lacks."""
    return (
        getattr(node, "lineno", -1),
        getattr(node, "col_offset", -1),
        getattr(node, "end_lineno", getattr(node, "lineno", -1)),
        getattr(node, "end_col_offset", -1),
    )


def _contains(node: ast.AST, types: tuple, *, into_nested_defs: bool = False) -> bool:
    """True if ``node`` contains an AST node of one of ``types``, without
    descending into nested function/class definitions (their bodies belong to
    other CFGs) unless requested.

    When ``node`` itself is a ``def`` statement only the parts evaluated at
    binding time are inspected (decorators, argument defaults/annotations and
    the return annotation): the body runs when the function is *called*, so a
    ``yield`` or call inside it says nothing about the enclosing callable.
    """
    stop = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Lambda)
    if not into_nested_defs and isinstance(node, _FUNCTION_DEFS):
        children = [*node.decorator_list, node.args]
        if node.returns is not None:
            children.append(node.returns)
    else:
        children = ast.iter_child_nodes(node)
    for child in children:
        if isinstance(child, types):
            return True
        if not into_nested_defs and isinstance(child, stop):
            continue
        if _contains(child, types, into_nested_defs=into_nested_defs):
            return True
    return False


def _can_raise(stmt: ast.stmt) -> bool:
    """Over-approximate: if we can't prove the statement doesn't throw, it
    gets the exception edge (contract rule)."""
    if isinstance(stmt, (ast.Raise, ast.Assert, ast.With, ast.AsyncWith, ast.For, ast.AsyncFor)):
        return True
    return _contains(stmt, (ast.Call, ast.Attribute, ast.Subscript, ast.Await))


def _stmt_kind(stmt: ast.stmt) -> str:
    """Node kind (one of ``NODE_KINDS``) for a statement."""
    if isinstance(stmt, ast.Return):
        return "return"
    if isinstance(stmt, ast.Raise):
        return "raise"
    if isinstance(stmt, ast.If):
        return "branch"
    if isinstance(stmt, (ast.While, ast.For, ast.AsyncFor)):
        return "loop"
    return "statement"


def _resume_kind(stmt: ast.stmt) -> str:
    """Edge kind of the statement's normal successor edge: generators resume
    after a yield, coroutines after an await."""
    if _contains(stmt, (ast.Yield, ast.YieldFrom)):
        return "yield"
    if _contains(stmt, (ast.Await,)):
        return "await_resume"
    return "fallthrough"


def _is_irrefutable(pattern: ast.AST) -> bool:
    """True for a ``match`` pattern that always succeeds: the wildcard ``_``,
    a bare capture name, or an or-pattern with such an alternative. Only
    called when the interpreter has ``ast.Match``."""
    if isinstance(pattern, ast.MatchAs):
        return pattern.pattern is None or _is_irrefutable(pattern.pattern)
    if isinstance(pattern, ast.MatchOr):
        return any(_is_irrefutable(p) for p in pattern.patterns)
    return False


class _LoopFrame:
    """Bookkeeping for the loop currently being lowered."""

    __slots__ = ("header", "break_fringe")

    def __init__(self, header: _TempNode):
        self.header = header
        # (node, kind) dangling edges produced by `break` — connected to the
        # loop's successor once the loop is fully lowered.
        self.break_fringe: List[Tuple[_TempNode, str]] = []


class _FinallyFrame:
    """Tracks a try/finally protected region while its body is lowered.

    ``entry_fringe`` collects the abrupt-exit nodes (return / raise / break /
    continue) observed inside the protected region — they become incoming
    edges of the finally body, which is how the finally stays reachable when
    the try body never completes normally. ``abrupt`` records which exit kinds
    were seen so the finally's end re-emits a matching out-edge for each.
    ``loop_depth`` is the loop-stack depth at which the ``try`` was entered: a
    ``break``/``continue`` leaves the protected region only when it targets a
    loop that encloses the ``try``, i.e. when it occurs at that same depth.
    """

    __slots__ = ("abrupt", "entry_fringe", "loop_depth")

    def __init__(self, loop_depth: int = 0):
        self.abrupt: Set[str] = set()
        self.entry_fringe: List[Tuple[_TempNode, str]] = []
        self.loop_depth = loop_depth


class CFGBuilder:
    """Lowers one callable's AST into a :class:`ControlFlowGraph`."""

    def __init__(self) -> None:
        self._nodes: List[_TempNode] = []
        self._edges: List[Tuple[_TempNode, _TempNode, str]] = []
        self._seq = 0
        self._loop_stack: List[_LoopFrame] = []
        # Innermost-first chain of exception targets: (first handler node of a
        # try's except chain, finally-stack depth when it was pushed). The
        # depth lets exception edges mark only the finally frames *inside* the
        # protected region as abruptly exited — an exception caught by this
        # try's own handler re-enters the normal path.
        self._handler_stack: List[Tuple[_TempNode, int]] = []
        self._finally_stack: List[_FinallyFrame] = []

    # ---------------------------------------------------------------- helpers

    def _new_node(self, kind: str, ast_node: Optional[ast.AST], span=None) -> _TempNode:
        """Create and register a node; the span defaults to ``ast_node``'s."""
        if not span:
            span = _span_of(ast_node) if ast_node else (-1, -1, -1, -1)
        node = _TempNode(kind, ast_node, span, self._seq)
        self._seq += 1
        self._nodes.append(node)
        return node

    def _connect(self, fringe: List[Tuple[_TempNode, str]], target: _TempNode) -> None:
        """Close every dangling ``(source, kind)`` edge of ``fringe`` on ``target``."""
        for source, kind in fringe:
            self._edges.append((source, target, kind))

    def _exception_target(self) -> Optional[_TempNode]:
        """First handler of the innermost enclosing try, or None."""
        return self._handler_stack[-1][0] if self._handler_stack else None

    def _mark_exception_transit(self, node: Optional[_TempNode] = None) -> None:
        """Mark the finally frames an in-flight exception passes through:
        every frame inside the innermost handler's protected region, or all
        frames when the exception escapes the function."""
        depth = self._handler_stack[-1][1] if self._handler_stack else 0
        transit = self._finally_stack[depth:]
        for frame in transit:
            frame.abrupt.add("exception")
        if node is not None and transit:
            transit[-1].entry_fringe.append((node, "exception"))

    def _add_exception_edge(self, node: _TempNode, exit_node: _TempNode) -> None:
        """Add ``node``'s exception edge to the handler chain (or EXIT)."""
        target = self._exception_target() or exit_node
        self._edges.append((node, target, "exception"))
        self._mark_exception_transit()

    def _finally_frames_crossed_by_loop_exit(self) -> List[_FinallyFrame]:
        """Finally frames a ``break``/``continue`` at the current position
        leaves: those whose ``try`` sits inside the targeted (innermost) loop.
        Frames entered at a shallower loop depth enclose the whole loop and
        are not exited."""
        depth = len(self._loop_stack)
        return [f for f in self._finally_stack if f.loop_depth == depth]

    # ----------------------------------------------------------------- build

    def build(self, func: ast.AST) -> ControlFlowGraph:
        """``func`` is a FunctionDef / AsyncFunctionDef whose body is lowered.
        ENTRY takes the ``def`` line's span; EXIT the end of the callable."""
        entry = self._new_node(
            "entry", None, span=(func.lineno, func.col_offset, func.lineno, func.col_offset)
        )
        end_line = getattr(func, "end_lineno", func.lineno)
        end_col = getattr(func, "end_col_offset", -1)
        self._exit = self._new_node("exit", None, span=(end_line, end_col, end_line, end_col))

        fringe = self._lower_block(func.body, [(entry, "fallthrough")])
        # Fall-off-end is an implicit `return None`.
        self._connect([(n, "return") for n, _ in fringe], self._exit)

        return self._finalize(entry, self._exit)

    # ------------------------------------------------------------- lowering

    def _lower_block(
        self, stmts: List[ast.stmt], fringe: List[Tuple[_TempNode, str]]
    ) -> List[Tuple[_TempNode, str]]:
        """Lower ``stmts`` in sequence, threading the dangling-edge fringe."""
        for stmt in stmts:
            # Dead code after return/raise/break/continue (empty fringe) is
            # lowered anyway; nodes unreachable from ENTRY are pruned at
            # finalize.
            fringe = self._lower_stmt(stmt, fringe)
        return fringe

    def _lower_stmt(
        self, stmt: ast.stmt, fringe: List[Tuple[_TempNode, str]]
    ) -> List[Tuple[_TempNode, str]]:
        """Dispatch on the statement class; returns the outgoing fringe."""
        if isinstance(stmt, ast.If):
            return self._lower_if(stmt, fringe)
        if isinstance(stmt, ast.While):
            return self._lower_while(stmt, fringe)
        if isinstance(stmt, (ast.For, ast.AsyncFor)):
            return self._lower_for(stmt, fringe)
        if isinstance(stmt, _TRY_TYPES):
            return self._lower_try(stmt, fringe)
        if _MATCH_TYPE is not None and isinstance(stmt, _MATCH_TYPE):
            return self._lower_match(stmt, fringe)
        if isinstance(stmt, (ast.With, ast.AsyncWith)):
            return self._lower_with(stmt, fringe)
        if isinstance(stmt, ast.Return):
            return self._lower_return(stmt, fringe)
        if isinstance(stmt, ast.Raise):
            return self._lower_raise(stmt, fringe)
        if isinstance(stmt, ast.Break):
            return self._lower_break(stmt, fringe)
        if isinstance(stmt, ast.Continue):
            return self._lower_continue(stmt, fringe)
        # Simple statement (incl. nested def/class = the binding statement).
        node = self._new_node(_stmt_kind(stmt), stmt)
        self._connect(fringe, node)
        if _can_raise(stmt):
            self._add_exception_edge(node, self._exit)
        resume = _resume_kind(stmt)
        if resume == "yield":
            # The generator may be abandoned at any yield.
            self._edges.append((node, self._exit, "yield"))
        return [(node, resume)]

    def _lower_if(self, stmt: ast.If, fringe):
        """``if``: a ``branch`` header with ``true``/``false`` out-edges."""
        header = self._new_node("branch", stmt, span=_span_of(stmt.test))
        self._connect(fringe, header)
        if _can_raise_expr(stmt.test):
            self._add_exception_edge(header, self._exit)
        then_fringe = self._lower_block(stmt.body, [(header, "true")])
        if stmt.orelse:
            else_fringe = self._lower_block(stmt.orelse, [(header, "false")])
        else:
            else_fringe = [(header, "false")]
        return then_fringe + else_fringe

    def _lower_while(self, stmt: ast.While, fringe):
        """``while``: a ``loop`` header; body end loops back to the header."""
        header = self._new_node("loop", stmt, span=_span_of(stmt.test))
        self._connect(fringe, header)
        if _can_raise_expr(stmt.test):
            self._add_exception_edge(header, self._exit)

        frame = _LoopFrame(header)
        self._loop_stack.append(frame)
        body_fringe = self._lower_block(stmt.body, [(header, "true")])
        self._loop_stack.pop()
        self._connect([(n, "loop_back") for n, _ in body_fringe], header)

        # `while True:` / constant-true tests never take the false edge.
        always_true = isinstance(stmt.test, ast.Constant) and bool(stmt.test.value)
        out = [] if always_true else [(header, "false")]
        if stmt.orelse:
            out = self._lower_block(stmt.orelse, out)
        return out + frame.break_fringe

    def _lower_for(self, stmt, fringe):
        """``for`` / ``async for``: like ``while``, header always can raise."""
        header = self._new_node("loop", stmt, span=_span_of(stmt.iter))
        self._connect(fringe, header)
        # The iterator protocol can raise.
        self._add_exception_edge(header, self._exit)

        frame = _LoopFrame(header)
        self._loop_stack.append(frame)
        body_fringe = self._lower_block(stmt.body, [(header, "true")])
        self._loop_stack.pop()
        self._connect([(n, "loop_back") for n, _ in body_fringe], header)

        out = [(header, "false")]
        if stmt.orelse:
            out = self._lower_block(stmt.orelse, out)
        return out + frame.break_fringe

    def _lower_match(self, stmt, fringe):
        """``match``: a ``branch`` header on the subject with one
        ``switch_case`` edge into each case body, plus a ``false`` edge when
        no case is irrefutable (the statement may match nothing). Patterns
        and guards stay atomic inside the header — the CFG is statement-level."""
        header = self._new_node("branch", stmt, span=_span_of(stmt.subject))
        self._connect(fringe, header)
        # Subject evaluation, value/class/mapping patterns (``==``,
        # ``isinstance``, ``len``) and guards can all raise: over-approximate.
        self._add_exception_edge(header, self._exit)

        out: List[Tuple[_TempNode, str]] = []
        exhaustive = False
        for case in stmt.cases:
            out.extend(self._lower_block(case.body, [(header, "switch_case")]))
            if case.guard is None and _is_irrefutable(case.pattern):
                exhaustive = True
        if not exhaustive:
            out.append((header, "false"))
        return out

    def _lower_try(self, stmt: ast.Try, fringe):
        """``try/except/else/finally`` (and ``try/except*``).

        Only the ``try`` suite is guarded by this statement's handlers. The
        ``else`` suite runs after the body completed normally and its
        exceptions propagate *outward* (through this try's ``finally``, if
        any), exactly like exceptions raised in a handler body.
        """
        has_finally = bool(stmt.finalbody)
        finally_frame = (
            _FinallyFrame(loop_depth=len(self._loop_stack)) if has_finally else None
        )

        handler_entry: Optional[_TempNode] = None
        handler_nodes: List[_TempNode] = []
        if stmt.handlers:
            for handler in stmt.handlers:
                node = self._new_node("handler", handler, span=(
                    handler.lineno,
                    handler.col_offset,
                    getattr(handler.type, "end_lineno", handler.lineno) if handler.type else handler.lineno,
                    getattr(handler.type, "end_col_offset", -1) if handler.type else -1,
                ))
                handler_nodes.append(node)
            handler_entry = handler_nodes[0]

        if finally_frame is not None:
            self._finally_stack.append(finally_frame)

        # Protected region: raises in the try body reach this try's handlers.
        if handler_entry is not None:
            self._handler_stack.append((handler_entry, len(self._finally_stack)))
        body_fringe = self._lower_block(stmt.body, fringe)
        if handler_entry is not None:
            self._handler_stack.pop()
        # The else suite is lowered *after* the handlers are popped: its
        # exceptions are not caught by this try's except clauses.
        if stmt.orelse:
            body_fringe = self._lower_block(stmt.orelse, body_fringe)

        # Handler chain: matched → handler body; unmatched → next handler,
        # falling off the chain propagates outward (outer handler or EXIT).
        handler_exit_fringes: List[Tuple[_TempNode, str]] = []
        for i, (handler, node) in enumerate(zip(stmt.handlers, handler_nodes)):
            hb_fringe = self._lower_block(handler.body, [(node, "true")])
            handler_exit_fringes.extend(hb_fringe)
            is_catch_all = handler.type is None
            if i + 1 < len(handler_nodes):
                self._edges.append((node, handler_nodes[i + 1], "false"))
            elif not is_catch_all:
                outer = self._exception_target() or self._exit
                self._edges.append((node, outer, "exception"))
                self._mark_exception_transit(node)

        normal_fringe = body_fringe + handler_exit_fringes

        if finally_frame is not None:
            self._finally_stack.pop()
            fin_entry = normal_fringe + finally_frame.entry_fringe
            fin_fringe = self._lower_block(stmt.finalbody, fin_entry)
            # Abrupt completions observed in the protected region re-emerge
            # from the finally body's end.
            for node, _kind in list(fin_fringe):
                if "return" in finally_frame.abrupt:
                    self._edges.append((node, self._exit, "return"))
                if "exception" in finally_frame.abrupt:
                    target = self._exception_target() or self._exit
                    self._edges.append((node, target, "exception"))
                # `break`/`continue` are only recorded on frames inside the
                # targeted loop, so the innermost loop here is that loop.
                if "break" in finally_frame.abrupt and self._loop_stack:
                    self._loop_stack[-1].break_fringe.append((node, "break"))
                if "continue" in finally_frame.abrupt and self._loop_stack:
                    self._edges.append((node, self._loop_stack[-1].header, "continue"))
            return fin_fringe

        return normal_fringe

    def _lower_with(self, stmt, fringe):
        """``with`` / ``async with``: the header is one ``statement`` node
        spanning up to the last context expression; the body follows it."""
        node = self._new_node("statement", stmt, span=(
            stmt.lineno,
            stmt.col_offset,
            stmt.items[-1].context_expr.end_lineno,
            stmt.items[-1].context_expr.end_col_offset,
        ))
        self._connect(fringe, node)
        self._add_exception_edge(node, self._exit)
        return self._lower_block(stmt.body, [(node, "fallthrough")])

    def _lower_return(self, stmt: ast.Return, fringe):
        """``return``: edge to EXIT, or into the innermost enclosing finally."""
        node = self._new_node("return", stmt)
        self._connect(fringe, node)
        if stmt.value is not None and _can_raise_expr(stmt.value):
            self._add_exception_edge(node, self._exit)
        if self._finally_stack:
            # Routed through the innermost finally; its end re-emits `return`.
            for frame in self._finally_stack:
                frame.abrupt.add("return")
            self._finally_stack[-1].entry_fringe.append((node, "return"))
            return []
        self._edges.append((node, self._exit, "return"))
        return []

    def _lower_raise(self, stmt: ast.Raise, fringe):
        """``raise``: exception edge to the handler chain / EXIT, no
        fall-through."""
        node = self._new_node("raise", stmt)
        self._connect(fringe, node)
        target = self._exception_target() or self._exit
        self._edges.append((node, target, "exception"))
        self._mark_exception_transit(node)
        return []

    def _lower_break(self, stmt: ast.Break, fringe):
        """``break``: leaves the innermost loop. When a ``finally`` lies
        between the statement and that loop, the edge is routed through the
        innermost such finally (whose end re-emits ``break``), mirroring
        ``return``; otherwise it dangles on the loop's break fringe."""
        node = self._new_node("statement", stmt)
        self._connect(fringe, node)
        crossed = self._finally_frames_crossed_by_loop_exit()
        if crossed:
            for frame in crossed:
                frame.abrupt.add("break")
            crossed[-1].entry_fringe.append((node, "break"))
        elif self._loop_stack:
            self._loop_stack[-1].break_fringe.append((node, "break"))
        return []

    def _lower_continue(self, stmt: ast.Continue, fringe):
        """``continue``: jumps to the innermost loop header, routed through
        an intervening ``finally`` when there is one (see ``_lower_break``)."""
        node = self._new_node("statement", stmt)
        self._connect(fringe, node)
        crossed = self._finally_frames_crossed_by_loop_exit()
        if crossed:
            for frame in crossed:
                frame.abrupt.add("continue")
            crossed[-1].entry_fringe.append((node, "continue"))
        elif self._loop_stack:
            self._edges.append((node, self._loop_stack[-1].header, "continue"))
        return []

    # ------------------------------------------------------------- finalize

    def _finalize(self, entry: _TempNode, exit_node: _TempNode) -> ControlFlowGraph:
        """Prune dead nodes, add escape edges for infinite loops, renumber in
        source-span order and emit the immutable graph."""
        # 1. Prune nodes unreachable from ENTRY (dead code).
        succ: Dict[_TempNode, List[Tuple[_TempNode, str]]] = {n: [] for n in self._nodes}
        for s, t, k in self._edges:
            succ[s].append((t, k))
        reachable: Set[_TempNode] = set()
        stack = [entry]
        while stack:
            n = stack.pop()
            if n in reachable:
                continue
            reachable.add(n)
            for t, _ in succ[n]:
                if t not in reachable:
                    stack.append(t)
        reachable.add(exit_node)  # EXIT always exists even if nothing reaches it yet

        # 2. Synthetic escape edges: any reachable node that cannot reach EXIT
        #    sits in an infinite loop; give its loop header an `exception`
        #    edge to EXIT so post-dominance stays well-formed.
        live_edges = [(s, t, k) for s, t, k in self._edges if s in reachable and t in reachable]
        pred: Dict[_TempNode, List[_TempNode]] = {n: [] for n in reachable}
        for s, t, _ in live_edges:
            pred[t].append(s)
        reaches_exit: Set[_TempNode] = set()
        stack = [exit_node]
        while stack:
            n = stack.pop()
            if n in reaches_exit:
                continue
            reaches_exit.add(n)
            for p in pred[n]:
                if p not in reaches_exit:
                    stack.append(p)
        stuck = [n for n in reachable if n not in reaches_exit]
        if stuck:
            # Prefer loop headers; fall back to every stuck node.
            headers = [n for n in stuck if n.kind == "loop"] or stuck
            for header in headers:
                live_edges.append((header, exit_node, "exception"))

        # 3. Renumber in source-span order: ENTRY = 0, EXIT = last.
        middle = sorted(
            (n for n in reachable if n is not entry and n is not exit_node),
            key=lambda n: (n.span, n.seq),
        )
        ordered = [entry] + middle + [exit_node]
        ids = {n: i for i, n in enumerate(ordered)}

        nodes = [
            CFGNode(
                id=ids[n],
                kind=n.kind,
                start_line=n.span[0],
                start_column=n.span[1],
                end_line=n.span[2],
                end_column=n.span[3],
                ast_node=n.ast_node,
            )
            for n in ordered
        ]
        # Sort and de-duplicate so the edge list is stable across runs.
        seen: Set[Tuple[int, int, str]] = set()
        edges: List[CFGEdge] = []
        for s, t, k in sorted(live_edges, key=lambda e: (ids[e[0]], ids[e[1]], e[2])):
            key = (ids[s], ids[t], k)
            if key in seen:
                continue
            seen.add(key)
            edges.append(CFGEdge(source=ids[s], target=ids[t], kind=k))

        return ControlFlowGraph(
            nodes=nodes, edges=edges, entry_id=ids[entry], exit_id=ids[exit_node]
        )


def _can_raise_expr(expr: ast.expr) -> bool:
    """Expression-level counterpart of ``_can_raise``: true when ``expr`` is,
    or contains, a call, attribute access, subscript or await."""
    return isinstance(expr, (ast.Call, ast.Attribute, ast.Subscript, ast.Await)) or _contains(
        expr, (ast.Call, ast.Attribute, ast.Subscript, ast.Await)
    )


def build_cfg(func: ast.AST) -> ControlFlowGraph:
    """Build the exceptional, statement-level CFG of one callable."""
    return CFGBuilder().build(func)
+""" + +from pipeline import chain_a +from state import bump, read_counter + + +def branchy(n): + if n > 0: + x = n + 1 + else: + x = -n + return x + + +def looped(n): + total = 0 + i = 0 + while i < n: + total = total + i + i = i + 1 + return total + + +def early_exit(n): + if n < 0: + return -1 + y = n * 2 + return y + + +def risky(n): + if n < 0: + raise ValueError("negative") + return n + + +def handles(n): + try: + v = risky(n) + ok = 1 + except ValueError: + v = 0 + ok = 0 + finally: + done = True + return v + ok + + +def with_block(path): + with open(path) as fh: + data = fh.read() + return data + + +def comprehend(items): + squares = [i * i for i in items] + i = "not-the-loop-var" + return squares, i + + +def gen(n): + k = 0 + while k < n: + yield k + k = k + 1 + + +async def slow(x): + return x + 1 + + +async def fetch(x): + y = await slow(x) + return y + + +def short_circuit(a, b): + c = a and b + d = a or b + return c, d + + +def infinite(): + while True: + pass + + +def drive(n): + r = chain_a(n) + bump(r) + return read_counter() diff --git a/test/fixtures/single_functionalities/dataflow/pipeline.py b/test/fixtures/single_functionalities/dataflow/pipeline.py new file mode 100644 index 0000000..110fd8a --- /dev/null +++ b/test/fixtures/single_functionalities/dataflow/pipeline.py @@ -0,0 +1,61 @@ +"""Interprocedural fixture: call chain, mutual recursion, aliasing, closures.""" + + +def chain_a(v): + return chain_b(v + 1) + + +def chain_b(v): + return chain_c(v * 2) + + +def chain_c(v): + return v - 3 + + +def even(n): + if n == 0: + return True + return odd(n - 1) + + +def odd(n): + if n == 0: + return False + return even(n - 1) + + +class Box: + def __init__(self, value): + self.value = value + + def get(self): + return self.value + + +def alias_flow(): + p = Box(10) + q = p + q.value = 42 + return p.get() + + +def make_adder(base): + def add(x): + return x + base + return add + + +def use_adder(n): + add5 = make_adder(5) + return add5(n) + + +def 
mutate(items): + items.append(1) + + +def caller_of_mutate(): + xs = [] + mutate(xs) + return xs diff --git a/test/fixtures/single_functionalities/dataflow/state.py b/test/fixtures/single_functionalities/dataflow/state.py new file mode 100644 index 0000000..3661faa --- /dev/null +++ b/test/fixtures/single_functionalities/dataflow/state.py @@ -0,0 +1,12 @@ +"""Module-global fixture: written in one function, read in another.""" + +counter = 0 + + +def bump(amount): + global counter + counter = counter + amount + + +def read_counter(): + return counter diff --git a/test/test_dataflow_cfg.py b/test/test_dataflow_cfg.py new file mode 100644 index 0000000..36952d6 --- /dev/null +++ b/test/test_dataflow_cfg.py @@ -0,0 +1,193 @@ +"""Stage-1 gate: the exceptional, statement-level CFG. + +Contract assertions (dataflow-graphs § verification gates): +- every node maps to a real source span; +- single ENTRY (id 0) / single EXIT (last id), ids contiguous; +- every node is reachable from ENTRY and reaches EXIT; +- every throwing construct in the fixture produces its exception edges; +- node/edge sets are stable across two runs on identical content. 
+""" + +import ast +from pathlib import Path + +import pytest + +from codeanalyzer.dataflow.cfg import EDGE_KINDS, NODE_KINDS, ControlFlowGraph, build_cfg + +FIXTURE = Path(__file__).parent / "fixtures" / "single_functionalities" / "dataflow" + + +def _cfg_of(file_name: str, func_name: str) -> ControlFlowGraph: + tree = ast.parse((FIXTURE / file_name).read_text()) + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == func_name: + return build_cfg(node) + raise AssertionError(f"{func_name} not found in {file_name}") + + +def _all_fixture_cfgs(): + cfgs = {} + for file_name in ("main.py", "pipeline.py", "state.py"): + tree = ast.parse((FIXTURE / file_name).read_text()) + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + cfgs[f"{file_name}::{node.name}"] = build_cfg(node) + return cfgs + + +def _reachable(cfg: ControlFlowGraph, start: int, forward: bool = True) -> set: + adj = {} + for e in cfg.edges: + a, b = (e.source, e.target) if forward else (e.target, e.source) + adj.setdefault(a, []).append(b) + seen, stack = set(), [start] + while stack: + n = stack.pop() + if n in seen: + continue + seen.add(n) + stack.extend(adj.get(n, [])) + return seen + + +def _edges(cfg: ControlFlowGraph, kind: str = None): + return [e for e in cfg.edges if kind is None or e.kind == kind] + + +def _node_at_line(cfg: ControlFlowGraph, line: int): + matches = [n for n in cfg.nodes if n.start_line == line] + assert matches, f"no CFG node at line {line}" + return matches[0] + + +# --------------------------------------------------------------------- gates + + +def test_every_function_has_single_entry_and_exit_with_contiguous_ids(): + for name, cfg in _all_fixture_cfgs().items(): + entries = [n for n in cfg.nodes if n.kind == "entry"] + exits = [n for n in cfg.nodes if n.kind == "exit"] + assert len(entries) == 1 and entries[0].id == 0, name + assert len(exits) == 1 and exits[0].id == 
len(cfg.nodes) - 1, name + assert sorted(n.id for n in cfg.nodes) == list(range(len(cfg.nodes))), name + + +def test_every_node_reachable_from_entry_and_reaches_exit(): + for name, cfg in _all_fixture_cfgs().items(): + ids = {n.id for n in cfg.nodes} + assert _reachable(cfg, cfg.entry_id, forward=True) == ids, name + assert _reachable(cfg, cfg.exit_id, forward=False) == ids, name + + +def test_every_node_maps_to_a_real_source_span(): + for name, cfg in _all_fixture_cfgs().items(): + for n in cfg.nodes: + assert n.start_line > 0, f"{name} node {n.id} has no source span" + + +def test_vocabulary_is_the_shared_contract(): + for name, cfg in _all_fixture_cfgs().items(): + for n in cfg.nodes: + assert n.kind in NODE_KINDS, f"{name}: unknown node kind {n.kind}" + for e in cfg.edges: + assert e.kind in EDGE_KINDS, f"{name}: unknown edge kind {e.kind}" + + +def test_stable_across_two_runs_on_identical_content(): + first = {k: (tuple((n.id, n.kind, n.start_line) for n in c.nodes), tuple(c.edges)) + for k, c in _all_fixture_cfgs().items()} + second = {k: (tuple((n.id, n.kind, n.start_line) for n in c.nodes), tuple(c.edges)) + for k, c in _all_fixture_cfgs().items()} + assert first == second + + +# ----------------------------------------------------------- fixture lowering + + +def test_branchy_if_has_true_and_false_edges(): + cfg = _cfg_of("main.py", "branchy") + branch = next(n for n in cfg.nodes if n.kind == "branch") + kinds = {e.kind for e in cfg.edges if e.source == branch.id} + assert {"true", "false"} <= kinds + + +def test_looped_has_loop_back_edge(): + cfg = _cfg_of("main.py", "looped") + header = next(n for n in cfg.nodes if n.kind == "loop") + loop_backs = [e for e in _edges(cfg, "loop_back") if e.target == header.id] + assert loop_backs, "loop-carried back edge missing" + + +def test_early_exit_multi_exit_is_normalized(): + cfg = _cfg_of("main.py", "early_exit") + returns = [n for n in cfg.nodes if n.kind == "return"] + assert len(returns) == 2 + for r in 
returns: + assert any( + e.source == r.id and e.target == cfg.exit_id and e.kind == "return" + for e in cfg.edges + ), "return node must edge to EXIT with kind=return" + + +def test_risky_raise_has_exception_edge_to_exit(): + cfg = _cfg_of("main.py", "risky") + raise_node = next(n for n in cfg.nodes if n.kind == "raise") + assert any( + e.source == raise_node.id and e.target == cfg.exit_id and e.kind == "exception" + for e in cfg.edges + ) + + +def test_handles_call_exception_edge_targets_handler(): + cfg = _cfg_of("main.py", "handles") + handler = next(n for n in cfg.nodes if n.kind == "handler") + # `v = risky(n)` can raise; its exception edge goes to the handler chain. + call_stmt = _node_at_line(cfg, 43) + assert any( + e.source == call_stmt.id and e.target == handler.id and e.kind == "exception" + for e in cfg.edges + ) + + +def test_handles_finally_is_on_normal_and_handler_paths(): + cfg = _cfg_of("main.py", "handles") + fin = _node_at_line(cfg, 49) # done = True + preds = {e.source for e in cfg.edges if e.target == fin.id} + body_end = _node_at_line(cfg, 44) # ok = 1 + handler_end = _node_at_line(cfg, 47) # ok = 0 + assert body_end.id in preds and handler_end.id in preds + + +def test_with_block_header_defines_scope_and_can_raise(): + cfg = _cfg_of("main.py", "with_block") + with_node = _node_at_line(cfg, 54) + assert any( + e.source == with_node.id and e.kind == "exception" for e in cfg.edges + ), "with header (__enter__) must carry an exception edge" + + +def test_gen_yield_edges(): + cfg = _cfg_of("main.py", "gen") + yield_stmt = _node_at_line(cfg, 68) + out = [(e.target, e.kind) for e in cfg.edges if e.source == yield_stmt.id] + kinds = {k for _, k in out} + assert "yield" in kinds + assert (cfg.exit_id, "yield") in out, "generator may be abandoned at any yield" + + +def test_fetch_await_resume_edge(): + cfg = _cfg_of("main.py", "fetch") + await_stmt = _node_at_line(cfg, 77) + assert any( + e.source == await_stmt.id and e.kind == "await_resume" for e in 
cfg.edges + ) + + +def test_infinite_loop_gets_synthetic_escape_edge(): + cfg = _cfg_of("main.py", "infinite") + header = next(n for n in cfg.nodes if n.kind == "loop") + assert any( + e.source == header.id and e.target == cfg.exit_id and e.kind == "exception" + for e in cfg.edges + ), "infinite loop header must get the documented synthetic edge to EXIT" From 6af65e0d994967417fc4c2254934a1741243c858 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 21:36:52 -0400 Subject: [PATCH 02/11] =?UTF-8?q?feat(dataflow):=20stage=202=20=E2=80=94?= =?UTF-8?q?=20post-dominators=20and=20control=20dependence?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cooper–Harper–Kennedy iterative post-dominators over the reverse CFG (unique root EXIT, guaranteed by stage 1's synthetic escape edges) and Ferrante–Ottenstein–Warren control dependence with ENTRY as the region root. Gate tests pin exact hand-computed CDG sets for the fixture's if/loop/early-return functions. (#67) --- codeanalyzer/dataflow/dominance.py | 140 +++++++++++++++++++++++++++++ test/test_dataflow_dominance.py | 104 +++++++++++++++++++++ 2 files changed, 244 insertions(+) create mode 100644 codeanalyzer/dataflow/dominance.py create mode 100644 test/test_dataflow_dominance.py diff --git a/codeanalyzer/dataflow/dominance.py b/codeanalyzer/dataflow/dominance.py new file mode 100644 index 0000000..7b6dd4d --- /dev/null +++ b/codeanalyzer/dataflow/dominance.py @@ -0,0 +1,140 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Stage 2 of the level-3 dataflow ladder: dominance and control dependence. + +Post-dominators are computed with the Cooper–Harvey–Kennedy iterative +algorithm over the reverse CFG. Infinite loops are already normalized by the +CFG builder (synthetic escape edge to EXIT), so the post-dominator tree always +has the unique root EXIT. + +Control dependence follows Ferrante–Ottenstein–Warren: for each CFG edge +``(a, b)`` where ``b`` does not post-dominate ``a``, every node on the +post-dominator-tree path from ``b`` up to (but not including) ``a``'s +immediate post-dominator is control-dependent on ``a``. + +Nodes with no branch-node control dependence are control-dependent on ENTRY — +the conventional region root, which keeps every statement anchored in the PDG +and gives interprocedural traversals a path from a callee's ENTRY to its +unconditional statements. 
def _postorder(adj: Dict[int, List[int]], root: int) -> List[int]:
    """Return the nodes reachable from ``root`` in depth-first postorder.

    The traversal is iterative: each stack frame pairs a node with an
    iterator over its adjacency list, so children are explored in list order
    and a node is emitted only once all of its children have been handled.
    """
    finished: List[int] = []
    seen: Set[int] = {root}
    frames = [(root, iter(adj.get(root, [])))]
    while frames:
        current, pending = frames[-1]
        for successor in pending:
            if successor not in seen:
                seen.add(successor)
                frames.append((successor, iter(adj.get(successor, []))))
                break  # descend; the iterator remembers where to resume
        else:
            # Adjacency list exhausted: the node is complete.
            frames.pop()
            finished.append(current)
    return finished


def post_dominators(cfg: "ControlFlowGraph") -> Dict[int, int]:
    """Map every node that can reach EXIT to its immediate post-dominator.

    EXIT maps to itself (it is the tree root). This is the
    Cooper–Harvey–Kennedy iterative scheme ("A Simple, Fast Dominance
    Algorithm") applied to the reversed CFG.
    """
    node_ids = [n.id for n in cfg.nodes]
    # In the reversed graph a node's successors are its CFG predecessors and
    # its predecessors are its CFG successors.
    cfg_preds: Dict[int, List[int]] = {nid: [] for nid in node_ids}
    cfg_succs: Dict[int, List[int]] = {nid: [] for nid in node_ids}
    for edge in cfg.edges:
        if edge.source != edge.target:  # self-loops say nothing about dominance
            cfg_preds[edge.target].append(edge.source)
            cfg_succs[edge.source].append(edge.target)

    root = cfg.exit_id
    finish_order = _postorder(cfg_preds, root)
    order_of = {nid: position for position, nid in enumerate(finish_order)}
    # Reverse postorder (root first), minus the root, which is never updated.
    candidates = [nid for nid in reversed(finish_order) if nid != root]

    ipdom: Dict[int, int] = {root: root}

    def _meet(left: int, right: int) -> int:
        """Climb the current tree from both nodes until the paths join."""
        while left != right:
            if order_of[left] < order_of[right]:
                left = ipdom[left]
            else:
                right = ipdom[right]
        return left

    progressed = True
    while progressed:
        progressed = False
        for node in candidates:
            # Only CFG successors that already have an estimate take part.
            known = [s for s in cfg_succs[node] if s in ipdom]
            if not known:
                continue
            meet = known[0]
            for other in known[1:]:
                meet = _meet(meet, other)
            if ipdom.get(node) != meet:
                ipdom[node] = meet
                progressed = True

    return ipdom


def control_dependence(cfg: "ControlFlowGraph") -> List[Tuple[int, int]]:
    """Return the sorted CDG edges ``(controller, dependent)``.

    Follows Ferrante–Ottenstein–Warren: for every CFG edge ``(a, b)`` the
    nodes met while climbing the post-dominator tree from ``b`` are dependent
    on ``a``; the climb stops at ``a``'s immediate post-dominator or at ``a``
    itself, so no self-dependence is recorded. Any non-ENTRY/EXIT node left
    without a controller is attached to ENTRY.
    """
    ipdom = post_dominators(cfg)

    edges_out: Set[Tuple[int, int]] = set()
    for edge in cfg.edges:
        branch, cursor = edge.source, edge.target
        if branch == cursor:
            continue
        limit = ipdom.get(branch)
        while cursor not in (limit, branch):
            edges_out.add((branch, cursor))
            parent = ipdom.get(cursor)
            if parent is None or parent == cursor:
                break  # fell off the tree or reached its root
            cursor = parent

    # ENTRY acts as the region root for everything not otherwise controlled.
    has_controller = {dependent for _, dependent in edges_out}
    boundary = (cfg.entry_id, cfg.exit_id)
    edges_out.update(
        (cfg.entry_id, n.id)
        for n in cfg.nodes
        if n.id not in boundary and n.id not in has_controller
    )

    return sorted(edges_out)
cfg.nodes} + assert set(ipdom) == ids, f"{name}: some node has no post-dominator" + assert ipdom[cfg.exit_id] == cfg.exit_id, name + # Tree: walking up from any node terminates at EXIT without cycles. + for n in ids: + seen = set() + cur = n + while cur != cfg.exit_id: + assert cur not in seen, f"{name}: ipdom cycle at {cur}" + seen.add(cur) + cur = ipdom[cur] + + +def test_branchy_control_dependence_exact(): + cfg = _cfg_of("main.py", "branchy") + line = _by_line(cfg) + header, then_s, else_s, ret = line[12], line[13], line[15], line[16] + expected = { + (cfg.entry_id, header), + (cfg.entry_id, ret), + (header, then_s), + (header, else_s), + } + assert set(control_dependence(cfg)) == expected + + +def test_looped_control_dependence_exact(): + cfg = _cfg_of("main.py", "looped") + line = _by_line(cfg) + s_total, s_i, header, s_add, s_inc, ret = ( + line[20], line[21], line[22], line[23], line[24], line[25], + ) + expected = { + (cfg.entry_id, s_total), + (cfg.entry_id, s_i), + (cfg.entry_id, header), + (cfg.entry_id, ret), + (header, s_add), + (header, s_inc), + } + assert set(control_dependence(cfg)) == expected + + +def test_early_exit_control_dependence_exact(): + cfg = _cfg_of("main.py", "early_exit") + line = _by_line(cfg) + header, ret1, s_y, ret2 = line[29], line[30], line[31], line[32] + expected = { + (cfg.entry_id, header), + (header, ret1), + (header, s_y), + (header, ret2), + } + assert set(control_dependence(cfg)) == expected + + +def test_infinite_loop_post_dominance_well_formed(): + cfg = _cfg_of("main.py", "infinite") + ipdom = post_dominators(cfg) + assert set(ipdom) == {n.id for n in cfg.nodes} From 7377dc39f8b9f2ff1000eea5eef570277bcf524c Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 21:42:44 -0400 Subject: [PATCH 03/11] =?UTF-8?q?feat(dataflow):=20stage=203=20=E2=80=94?= =?UTF-8?q?=20access=20paths,=20reaching=20definitions,=20DDG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
k-limited access-path model with per-scope base classification (local/ param/self/global/capture), header-only facts for compound statements, comprehension scoping, closure-capture and call-mutation rules; classic worklist reaching definitions with strong kills on exact non-wildcard paths; DDG edges via textual interference plus the type-based may-alias oracle (the locked MVP points-to substrate — unknown types conservatively alias, incompatible types don't). Gate tests cover the loop-carried dependency, scope shadowing, and the aliased write/read pair. (#67) --- codeanalyzer/dataflow/access_paths.py | 531 ++++++++++++++++++++++++++ codeanalyzer/dataflow/alias.py | 93 +++++ codeanalyzer/dataflow/defuse.py | 113 ++++++ test/test_dataflow_defuse.py | 144 +++++++ 4 files changed, 881 insertions(+) create mode 100644 codeanalyzer/dataflow/access_paths.py create mode 100644 codeanalyzer/dataflow/alias.py create mode 100644 codeanalyzer/dataflow/defuse.py create mode 100644 test/test_dataflow_defuse.py diff --git a/codeanalyzer/dataflow/access_paths.py b/codeanalyzer/dataflow/access_paths.py new file mode 100644 index 0000000..ddf3c31 --- /dev/null +++ b/codeanalyzer/dataflow/access_paths.py @@ -0,0 +1,531 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +"""Stage 3a of the level-3 dataflow ladder: the access-path variable model. + +An access path is ``base(.field | [*])*`` — ``x``, ``x.f``, ``x.f.g``, +``arr[*]`` (all subscripts collapse to ``[*]``). Depth is k-limited (default +3): ``x.f.g.h`` with k=3 becomes ``x.f.g.*``, which conservatively interferes +with every deeper path. The string form is the ``var`` label of every DDG +edge. + +Bases are classified per function scope: ``local``, ``param``, ``self`` (the +first parameter of a method), ``global`` (module binding — explicit ``global`` +declaration or a free name not bound in an enclosing function), ``capture`` +(free name bound in an enclosing function), and the pseudo-base ````. + +Per-statement facts (defs / uses) follow the documented Python rules: + +- Compound statements contribute only their *header* expressions (the CFG is + statement-level; bodies are separate nodes). +- Comprehension target variables live in their own scope: they are neither + defs nor uses of the enclosing statement (Python 3 semantics), while the + iterable and free names remain uses. +- A nested ``def``/``class`` statement defines its name and *uses* every + enclosing-scope variable the nested body captures (the closure binding is + over-approximated to the definition site) plus decorators and defaults. +- Calls mutate, over-approximately: the receiver base of a method call and + every argument that is itself an access path (a mutable reference) are + weak-defined at the call statement. Sound-leaning by contract; refined + precision is downstream's job. +- ``del x`` is a def (the name is re-bound to "undefined"). +- ``return e`` uses ``e`` and defines the pseudo-path ````. 
# Pseudo-path that every ``return`` statement defines (see ``statement_facts``).
# NOTE(review): the literal is empty in this copy while the module docs speak
# of a named pseudo-path — confirm the intended value before relying on it.
RETURN_PATH = ""

# Base-kind vocabulary (recorded per function for the SDG's formal nodes).
BASE_KINDS = ("local", "param", "self", "global", "capture")


def k_limit(path: str, k: int) -> str:
    """Truncate an access path to at most ``k`` dotted components.

    A truncated path ends in ``.*`` (``x.f.g.h`` with k=3 becomes
    ``x.f.g.*``). Only ``.`` separates components, so ``[*]`` stays attached
    to the component it follows.
    """
    parts = path.split(".")
    if len(parts) <= k:
        return path
    return ".".join(parts[:k]) + ".*"


def interferes(use: str, definition: str) -> bool:
    """Decide whether a definition can reach a use by path shape alone.

    True for an exact match and for a prefix relation in either direction on
    a component boundary (``.`` or ``[``), after trailing truncation
    wildcards have been stripped from both paths. No aliasing is considered.
    """
    if use == definition:
        return True
    u, d = use.rstrip("*").rstrip("."), definition.rstrip("*").rstrip(".")
    return (
        u == d
        or u.startswith(d + ".")
        or d.startswith(u + ".")
        or u.startswith(d + "[")
        or d.startswith(u + "[")
    )


def suffix_of(path: str) -> str:
    """Return everything after the base name (``""`` for a bare base)."""
    return path[len(base_of(path)):]


def base_of(path: str) -> str:
    """Return the leading name of a path, up to the first ``.`` or ``[``."""
    for i, ch in enumerate(path):
        if ch in ".[":
            return path[:i]
    return path


@dataclass
class FunctionScope:
    """Name classification for one callable."""

    # Parameter names in declaration order.
    params: List[str] = field(default_factory=list)
    # Receiver name (``self``/``cls``) when ``build_scope`` detected one.
    self_name: Optional[str] = None
    # Names bound inside the callable (minus global/nonlocal declarations).
    locals_: Set[str] = field(default_factory=set)
    # Declared globals plus free names not bound by an enclosing callable.
    globals_: Set[str] = field(default_factory=set)
    # Free names that are bound by an enclosing callable.
    captures: Set[str] = field(default_factory=set)

    def kind_of(self, base: str) -> str:
        """Classify ``base`` as one of ``BASE_KINDS``.

        Precedence is self, param, capture, global, local; a name found in
        none of the sets is reported as ``"global"``.
        """
        if base == self.self_name:
            return "self"
        if base in self.params:
            return "param"
        if base in self.captures:
            return "capture"
        if base in self.globals_:
            return "global"
        if base in self.locals_:
            return "local"
        return "global"  # unknown free name: a module/builtin binding


@dataclass
class StatementFacts:
    """Defs and uses (k-limited access-path strings) of one CFG node."""

    defs: Set[str] = field(default_factory=set)
    uses: Set[str] = field(default_factory=set)


def _assigned_names(func: ast.AST) -> Set[str]:
    """Collect the names bound anywhere in ``func``'s own body.

    Covers assignment/aug-assignment/annotated-assignment targets, loop
    targets, with-as, except-as, imports, walrus and ``del`` targets, and the
    names of nested defs/classes. Nested def/class/lambda bodies are not
    entered.
    """
    names: Set[str] = set()

    def collect_target(t: ast.AST) -> None:
        if isinstance(t, ast.Name):
            names.add(t.id)
        elif isinstance(t, (ast.Tuple, ast.List)):
            for el in t.elts:
                collect_target(el)
        elif isinstance(t, ast.Starred):
            collect_target(t.value)
        # Attribute/Subscript targets bind no *name*.

    def walk(node: ast.AST) -> None:
        for child in ast.iter_child_nodes(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                names.add(child.name)
                continue  # nested scope
            if isinstance(child, ast.Lambda):
                continue
            if isinstance(child, ast.Assign):
                for t in child.targets:
                    collect_target(t)
            elif isinstance(child, (ast.AugAssign, ast.AnnAssign)):
                collect_target(child.target)
            elif isinstance(child, (ast.For, ast.AsyncFor)):
                collect_target(child.target)
            elif isinstance(child, (ast.With, ast.AsyncWith)):
                for item in child.items:
                    if item.optional_vars is not None:
                        collect_target(item.optional_vars)
            elif isinstance(child, ast.ExceptHandler):
                if child.name:
                    names.add(child.name)
            elif isinstance(child, (ast.Import, ast.ImportFrom)):
                for alias in child.names:
                    names.add((alias.asname or alias.name).split(".")[0])
            elif isinstance(child, ast.NamedExpr):
                collect_target(child.target)
            elif isinstance(child, ast.Delete):
                for t in child.targets:
                    collect_target(t)
            walk(child)

    walk(func)
    return names


def _declared(func: ast.AST, decl_type) -> Set[str]:
    """Names listed by ``decl_type`` statements (``ast.Global`` or
    ``ast.Nonlocal``) in ``func``'s own body, skipping nested scopes."""
    names: Set[str] = set()

    def walk(node: ast.AST) -> None:
        for child in ast.iter_child_nodes(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Lambda)):
                continue
            if isinstance(child, decl_type):
                names.update(child.names)
            walk(child)

    walk(func)
    return names


def _param_names(func: ast.AST) -> List[str]:
    """All parameter names of a def or lambda: positional-only, positional,
    ``*args``, keyword-only, then ``**kwargs``."""
    a = func.args
    names = [p.arg for p in getattr(a, "posonlyargs", [])] + [p.arg for p in a.args]
    if a.vararg:
        names.append(a.vararg.arg)
    names.extend(p.arg for p in a.kwonlyargs)
    if a.kwarg:
        names.append(a.kwarg.arg)
    return names


def free_names(func: ast.AST) -> Set[str]:
    """Names the callable reads but does not bind.

    These are the candidates for captures or module globals. Free names of
    nested defs and lambdas are folded in, so a capture is visible in every
    scope it passes through.
    """
    bound = set(_param_names(func)) | _assigned_names(func) | _declared(func, ast.Global)
    used: Set[str] = set()

    def walk(node: ast.AST) -> None:
        for child in ast.iter_child_nodes(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                used.update(free_names(child) - {child.name})
                continue
            if isinstance(child, ast.Lambda):
                lam_bound = set(_param_names(child))
                for name in _names_loaded(child.body):
                    if name not in lam_bound:
                        used.add(name)
                continue
            if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Load):
                used.add(child.id)
            walk(child)

    walk(func)
    return used - bound


def _names_loaded(node: ast.AST) -> Set[str]:
    """Every ``Name`` read (Load context) anywhere under ``node``."""
    out: Set[str] = set()
    for n in ast.walk(node):
        if isinstance(n, ast.Name) and isinstance(n.ctx, ast.Load):
            out.add(n.id)
    return out


def build_scope(func: ast.AST, enclosing_locals: Set[str]) -> FunctionScope:
    """Classify every base name the callable touches.

    ``enclosing_locals`` is the set of names bound by enclosing callables; a
    free name found there becomes a capture, any other free name a global.
    The first parameter is treated as the receiver when it is named ``self``
    or ``cls`` and the def is not decorated with ``staticmethod``.
    """
    params = _param_names(func)
    scope = FunctionScope(params=params)
    if params and isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef)):
        decorators = {ast.unparse(d) for d in func.decorator_list}
        if params[0] in ("self", "cls") and "staticmethod" not in decorators:
            scope.self_name = params[0]
    scope.globals_ = _declared(func, ast.Global)
    nonlocals = _declared(func, ast.Nonlocal)
    scope.locals_ = _assigned_names(func) - scope.globals_ - nonlocals
    free = (free_names(func) | nonlocals) - set(params)
    scope.captures = {n for n in free if n in enclosing_locals}
    scope.globals_ |= free - scope.captures
    return scope


class _PathExtractor:
    """Turns the header expressions of one statement into def/use path sets."""

    def __init__(self, scope: FunctionScope, k: int):
        self.scope = scope
        self.k = k

    # -- expression → path (None when the expression is not a path) ---------

    def path_of(self, expr: ast.expr) -> Optional[str]:
        """Return the k-limited path for a Name/Attribute/Subscript chain
        rooted at a Name, or ``None`` for any other expression."""
        if isinstance(expr, ast.Name):
            return expr.id
        if isinstance(expr, ast.Attribute):
            inner = self.path_of(expr.value)
            return None if inner is None else k_limit(f"{inner}.{expr.attr}", self.k)
        if isinstance(expr, ast.Subscript):
            inner = self.path_of(expr.value)
            return None if inner is None else k_limit(f"{inner}[*]", self.k)
        return None

    # -- uses ----------------------------------------------------------------

    def uses_in(self, expr: ast.expr) -> Set[str]:
        """All access paths read by an expression. Comprehension targets and
        lambda parameters are scoped out."""
        uses: Set[str] = set()
        self._collect_uses(expr, uses, shadowed=set())
        return uses

    def _collect_index_uses(self, expr: ast.expr, out: Set[str], shadowed: Set[str]) -> None:
        """Collect the reads made by every subscript index along a chain.

        All subscripts collapse to ``[*]`` in the path string, so the index
        expressions (``i`` and ``j`` in ``a[i][j].f``) are otherwise
        invisible; each one is still evaluated and is therefore a use.
        """
        node = expr
        while isinstance(node, (ast.Attribute, ast.Subscript)):
            if isinstance(node, ast.Subscript):
                self._collect_uses(node.slice, out, shadowed)
            node = node.value

    def _collect_uses(self, expr: ast.expr, out: Set[str], shadowed: Set[str]) -> None:
        """Add to ``out`` every path read by ``expr``, ignoring any base name
        listed in ``shadowed`` (comprehension targets, lambda parameters)."""
        if isinstance(expr, ast.Name):
            if isinstance(expr.ctx, ast.Load) and expr.id not in shadowed:
                out.add(expr.id)
            return
        if isinstance(expr, (ast.Attribute, ast.Subscript)):
            p = self.path_of(expr)
            if p is not None and base_of(p) not in shadowed:
                out.add(p)
                # Indices at every depth are reads, not only the outermost.
                self._collect_index_uses(expr, out, shadowed)
                return
            # Not a pure path (e.g. f(x).g): fall through to children.
        if isinstance(expr, (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)):
            inner_shadow = set(shadowed)
            for comp in expr.generators:
                # Each iterable is read before its own targets start
                # shadowing; targets shadow from then on.
                self._collect_uses(comp.iter, out, inner_shadow)
                inner_shadow |= _names_loaded_targets(comp.target)
                for cond in comp.ifs:
                    self._collect_uses(cond, out, inner_shadow)
            if isinstance(expr, ast.DictComp):
                self._collect_uses(expr.key, out, inner_shadow)
                self._collect_uses(expr.value, out, inner_shadow)
            else:
                self._collect_uses(expr.elt, out, inner_shadow)
            return
        if isinstance(expr, ast.Lambda):
            lam_shadow = shadowed | set(_param_names(expr))
            self._collect_uses(expr.body, out, lam_shadow)
            return
        for child in ast.iter_child_nodes(expr):
            if isinstance(child, ast.expr):
                self._collect_uses(child, out, shadowed)
            elif isinstance(child, (ast.comprehension, ast.keyword)):
                for sub in ast.iter_child_nodes(child):
                    if isinstance(sub, ast.expr):
                        self._collect_uses(sub, out, shadowed)

    # -- defs ----------------------------------------------------------------

    def defs_of_target(self, target: ast.expr) -> Set[str]:
        """Paths written by an assignment-like target, recursing through
        tuple/list/starred targets."""
        defs: Set[str] = set()
        if isinstance(target, ast.Name):
            defs.add(target.id)
        elif isinstance(target, (ast.Attribute, ast.Subscript)):
            p = self.path_of(target)
            if p is not None:
                defs.add(p)
        elif isinstance(target, (ast.Tuple, ast.List)):
            for el in target.elts:
                defs.update(self.defs_of_target(el))
        elif isinstance(target, ast.Starred):
            defs.update(self.defs_of_target(target.value))
        return defs

    def target_reads(self, target: ast.expr) -> Set[str]:
        """Reads implied by a compound target: ``p.f = v`` reads ``p``;
        ``a[i] = v`` reads ``a`` and ``i``; ``a[i].f = v`` reads ``a[*]``
        and ``i``."""
        reads: Set[str] = set()
        if isinstance(target, (ast.Attribute, ast.Subscript)):
            inner = self.path_of(target.value)
            if inner is not None:
                reads.add(inner)
                # Indices nested inside the owner path are reads as well.
                self._collect_index_uses(target.value, reads, set())
            else:
                self._collect_uses(target.value, reads, set())
            if isinstance(target, ast.Subscript):
                self._collect_uses(target.slice, reads, set())
        elif isinstance(target, (ast.Tuple, ast.List)):
            for el in target.elts:
                reads.update(self.target_reads(el))
        elif isinstance(target, ast.Starred):
            reads.update(self.target_reads(target.value))
        return reads

    # -- call mutation (documented over-approximation) -----------------------

    def mutation_defs(self, expr: ast.expr) -> Set[str]:
        """Paths a call may mutate: the receiver path of each method call and
        every positional/keyword argument that is itself an access path."""
        defs: Set[str] = set()
        for call in _calls_in(expr):
            if isinstance(call.func, ast.Attribute):
                receiver = self.path_of(call.func.value)
                if receiver is not None:
                    defs.add(receiver)
            for arg in list(call.args) + [kw.value for kw in call.keywords]:
                p = self.path_of(arg)
                if p is not None:
                    defs.add(p)
        return defs

    def receiver_uses(self, expr: ast.expr) -> Set[str]:
        """Whole-object reads at call sites: each method call reads its
        receiver path."""
        uses: Set[str] = set()
        for call in _calls_in(expr):
            if isinstance(call.func, ast.Attribute):
                receiver = self.path_of(call.func.value)
                if receiver is not None:
                    uses.add(receiver)
        return uses


def _names_loaded_targets(target: ast.expr) -> Set[str]:
    """Every ``Name`` under a comprehension target, whatever its context."""
    out: Set[str] = set()
    for n in ast.walk(target):
        if isinstance(n, ast.Name):
            out.add(n.id)
    return out


def _calls_in(expr: ast.expr) -> List[ast.Call]:
    """All ``Call`` nodes under ``expr``, not entering nested
    def/class/lambda nodes."""
    calls: List[ast.Call] = []
    stack: List[ast.AST] = [expr]
    while stack:
        node = stack.pop()
        if isinstance(node, ast.Call):
            calls.append(node)
        for child in ast.iter_child_nodes(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Lambda)):
                continue
            stack.append(child)
    return calls


def statement_facts(
    cfg: "ControlFlowGraph", func: ast.AST, scope: FunctionScope, k: int
) -> Dict[int, StatementFacts]:
    """Compute defs/uses per CFG node id.

    Each node is expected to expose ``id``, ``kind`` and ``ast_node``.
    Compound statements contribute only their header expressions. The entry
    node defines every param/self/capture/global base recorded in ``scope``
    (the incoming state); a node without an AST statement contributes
    nothing. All paths are k-limited before being stored.
    """
    ex = _PathExtractor(scope, k)
    facts: Dict[int, StatementFacts] = {}

    for node in cfg.nodes:
        f = StatementFacts()
        stmt = node.ast_node

        def call_fx(expr: ast.expr) -> None:
            """Call effects on the current facts: over-approximate mutation
            defs plus the whole-object receiver read."""
            f.defs |= ex.mutation_defs(expr)
            f.uses |= ex.receiver_uses(expr)

        if node.kind == "entry":
            f.defs = set(scope.params) | set(scope.captures)
            if scope.self_name:
                f.defs.add(scope.self_name)
            # Globals the function reads arrive with the incoming state too.
            f.defs |= scope.globals_
        elif stmt is None:
            pass  # exit
        elif isinstance(stmt, ast.Assign):
            f.uses = ex.uses_in(stmt.value)
            for t in stmt.targets:
                f.defs |= ex.defs_of_target(t)
                f.uses |= ex.target_reads(t)
            call_fx(stmt.value)
        elif isinstance(stmt, ast.AugAssign):
            # An augmented target is both read and written.
            f.uses = ex.uses_in(stmt.value) | ex.defs_of_target(stmt.target) | ex.target_reads(stmt.target)
            f.defs = ex.defs_of_target(stmt.target)
            call_fx(stmt.value)
        elif isinstance(stmt, ast.AnnAssign):
            # A bare annotation (no value) binds nothing.
            if stmt.value is not None:
                f.uses = ex.uses_in(stmt.value)
                f.defs = ex.defs_of_target(stmt.target)
                f.uses |= ex.target_reads(stmt.target)
                call_fx(stmt.value)
        elif isinstance(stmt, ast.Return):
            if stmt.value is not None:
                f.uses = ex.uses_in(stmt.value)
                call_fx(stmt.value)
            f.defs.add(RETURN_PATH)
        elif isinstance(stmt, ast.If):
            f.uses = ex.uses_in(stmt.test)
            call_fx(stmt.test)
        elif isinstance(stmt, ast.While):
            f.uses = ex.uses_in(stmt.test)
            call_fx(stmt.test)
        elif isinstance(stmt, (ast.For, ast.AsyncFor)):
            f.uses = ex.uses_in(stmt.iter)
            f.defs = ex.defs_of_target(stmt.target)
            call_fx(stmt.iter)
            f.uses |= ex.target_reads(stmt.target)
        elif isinstance(stmt, (ast.With, ast.AsyncWith)):
            for item in stmt.items:
                f.uses |= ex.uses_in(item.context_expr)
                call_fx(item.context_expr)
                if item.optional_vars is not None:
                    f.defs |= ex.defs_of_target(item.optional_vars)
        elif isinstance(stmt, ast.ExceptHandler):
            if stmt.type is not None:
                f.uses = ex.uses_in(stmt.type)
            if stmt.name:
                f.defs.add(stmt.name)
        elif isinstance(stmt, (ast.Raise, ast.Assert)):
            for sub in ast.iter_child_nodes(stmt):
                if isinstance(sub, ast.expr):
                    f.uses |= ex.uses_in(sub)
                    call_fx(sub)
        elif isinstance(stmt, ast.Expr):
            f.uses = ex.uses_in(stmt.value)
            call_fx(stmt.value)
        elif isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            f.defs.add(stmt.name)
            # The closure binding is attributed to the definition site.
            captured = free_names(stmt) & (scope.locals_ | set(scope.params) | scope.captures)
            f.uses |= captured
            for d in stmt.decorator_list:
                f.uses |= ex.uses_in(d)
            for default in list(stmt.args.defaults) + [
                d for d in stmt.args.kw_defaults if d is not None
            ]:
                f.uses |= ex.uses_in(default)
        elif isinstance(stmt, ast.ClassDef):
            f.defs.add(stmt.name)
            for d in list(stmt.decorator_list) + list(stmt.bases):
                f.uses |= ex.uses_in(d)
        elif isinstance(stmt, ast.Delete):
            for t in stmt.targets:
                f.defs |= ex.defs_of_target(t)
        elif isinstance(stmt, (ast.Import, ast.ImportFrom)):
            for alias in stmt.names:
                f.defs.add((alias.asname or alias.name).split(".")[0])
        elif isinstance(stmt, (ast.Global, ast.Nonlocal, ast.Pass, ast.Break, ast.Continue)):
            pass
        else:  # pragma: no cover — future statement kinds stay sound
            for sub in ast.iter_child_nodes(stmt):
                if isinstance(sub, ast.expr):
                    f.uses |= ex.uses_in(sub)

        f.defs = {k_limit(p, k) for p in f.defs}
        f.uses = {k_limit(p, k) for p in f.uses}
        facts[node.id] = f

    return facts
# Spellings of the typing wrappers whose bracketed arguments are union members.
_UNION_WRAPPERS = ("typing.Optional[", "Optional[", "typing.Union[", "Union[")
# Simple names that denote "no object" and therefore never name a real type.
_NONE_NAMES = frozenset({"None", "NoneType"})


def _split_top_level(text: str, sep: str) -> list:
    """Split ``text`` on ``sep`` wherever ``sep`` occurs outside brackets.

    ``Dict[str, int] | None`` split on ``|`` yields two parts, while the comma
    inside ``Dict[...]`` is never a split point.
    """
    parts = []
    depth = 0
    start = 0
    for pos, ch in enumerate(text):
        if ch in "[(":
            depth += 1
        elif ch in "])":
            depth -= 1
        elif ch == sep and depth == 0:
            parts.append(text[start:pos])
            start = pos + 1
    parts.append(text[start:])
    return parts


def _union_members(text: str) -> list:
    """Flatten ``X | Y``, ``Optional[X]`` and ``Union[X, Y]`` into member strings.

    Quotes around a member (string forward references) are dropped.
    """
    members = []
    for raw in _split_top_level(text, "|"):
        part = raw.strip().strip("\"'").strip()
        wrapper = next(
            (w for w in _UNION_WRAPPERS if part.startswith(w) and part.endswith("]")),
            None,
        )
        if wrapper is None:
            members.append(part)
            continue
        for inner in _split_top_level(part[len(wrapper):-1], ","):
            members.extend(_union_members(inner))
    return members


def _normalize(type_name: Optional[str]) -> Optional[str]:
    """Reduce a type annotation string to one comparable simple class name.

    Generic arguments and module prefixes are dropped (``typing.List[int]`` ->
    ``List``), quotes are stripped, and ``None`` members of a union are ignored
    wherever they appear (``None | Box`` and ``Optional[Box]`` both give
    ``Box``).

    Returns ``None`` (meaning *unknown*) for empty input and for unions with
    more than one distinct non-``None`` member: picking one member arbitrarily
    would let the oracle claim two paths cannot alias when they can, which
    breaks its sound-leaning contract.
    """
    if not type_name:
        return None
    simple_names = set()
    for member in _union_members(type_name):
        simple = member.split("[")[0].strip().split(".")[-1]
        if simple and simple not in _NONE_NAMES:
            simple_names.add(simple)
    if len(simple_names) != 1:
        return None
    return simple_names.pop()


class TypeBasedAliasOracle:
    """``may_alias(p1, p2)`` for access paths in one function scope.

    ``base_types`` maps base names to their inferred type names (absent or
    ``None`` = unknown = may alias anything with the same suffix).
    """

    def __init__(self, base_types: Optional[Dict[str, Optional[str]]] = None):
        # Normalize once up front so may_alias only compares simple names.
        self._types = {k: _normalize(v) for k, v in (base_types or {}).items()}

    def may_alias(self, path_a: str, path_b: str) -> bool:
        """Return True when the two access paths may denote one location."""
        if path_a == path_b:
            return True
        suffix_a, suffix_b = suffix_of(path_a), suffix_of(path_b)
        if not suffix_a and not suffix_b:
            # Two distinct bare bases never alias (locals are not
            # addressable); base sharing rides assignments in the DDG.
            return False
        # Field-sensitive up to prefix compatibility: identical suffixes may
        # denote one location; a bare base (whole-object read/write) observes
        # every field of its object, so an empty suffix is prefix-compatible
        # with any; wildcards from k-truncation match anything deeper.
        sa = suffix_a.rstrip("*").rstrip(".")
        sb = suffix_b.rstrip("*").rstrip(".")
        prefix_compatible = (
            sa == sb
            or sa.startswith(sb)
            or sb.startswith(sa)
            or suffix_a.endswith("*")
            or suffix_b.endswith("*")
        )
        if not prefix_compatible:
            return False
        type_a = self._types.get(base_of(path_a))
        type_b = self._types.get(base_of(path_b))
        if type_a is None or type_b is None:
            return True  # unknown: conservatively compatible
        return type_a == type_b
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Stage 3b of the level-3 dataflow ladder: reaching definitions → DDG edges. + +Classic forward may-analysis with a worklist over the statement-level CFG. +SSA is an implementation shortcut some ecosystems get for free; the contract +is the def-use edges, and Python hand-builds them. + +Kill discipline (sound-leaning): + +- A def of a bare local/param path strong-kills earlier defs of the exact + same path. +- Defs of attribute paths strong-kill only the identical path string (a write + through one name never kills a potentially-aliased other name). +- Subscript (``[*]``) and k-truncated (``.*``) paths are weak updates — they + kill nothing. + +A use matches a reaching def when the paths interfere textually (exact / +prefix / wildcard — :func:`access_paths.interferes`) or when the may-alias +oracle says two suffixed paths can denote one location. 
@dataclass(frozen=True)
class DDGEdge:
    """One def-use edge of the data dependence graph."""

    source: int  # the def node
    target: int  # the use node
    var: str  # the access path being read


def _strong_kill(path: str) -> bool:
    """A def strong-kills earlier defs unless its path ends in a ``*`` wildcard."""
    return not path.endswith("*")


def reaching_definitions(
    cfg: "ControlFlowGraph", facts: "Dict[int, StatementFacts]"
) -> Dict[int, Set[Tuple[str, int]]]:
    """IN sets: ``{node: {(path, def_node), ...}}`` via worklist iteration.

    ``cfg.predecessors()`` / ``cfg.successors()`` are indexed by node id and
    yield pairs whose first element is the neighbouring node id (the second
    element is ignored here). ``facts[nid].defs`` is the set of paths node
    ``nid`` writes.
    """
    # Function-scope import: the module's import block does not provide deque.
    from collections import deque

    # The adjacency maps are loop-invariant; ask the CFG for them once.
    preds = cfg.predecessors()
    succs = cfg.successors()
    node_ids = [n.id for n in cfg.nodes]

    gen: Dict[int, Set[Tuple[str, int]]] = {}
    strong: Dict[int, Set[str]] = {}
    for nid in node_ids:
        gen[nid] = {(d, nid) for d in facts[nid].defs}
        strong[nid] = {d for d in facts[nid].defs if _strong_kill(d)}

    in_sets: Dict[int, Set[Tuple[str, int]]] = {nid: set() for nid in node_ids}
    out_sets: Dict[int, Set[Tuple[str, int]]] = {nid: set() for nid in node_ids}

    # FIFO worklist; `queued` mirrors its contents for O(1) membership tests.
    worklist = deque(node_ids)
    queued = set(node_ids)
    while worklist:
        nid = worklist.popleft()
        queued.discard(nid)
        new_in: Set[Tuple[str, int]] = set()
        for pred, _ in preds[nid]:
            new_in |= out_sets[pred]
        killed = strong[nid]
        new_out = {(path, m) for (path, m) in new_in if path not in killed} | gen[nid]
        if new_in != in_sets[nid] or new_out != out_sets[nid]:
            in_sets[nid] = new_in
            out_sets[nid] = new_out
            for succ, _ in succs[nid]:
                if succ not in queued:
                    queued.add(succ)
                    worklist.append(succ)
    return in_sets


def ddg_edges(
    cfg: "ControlFlowGraph",
    facts: "Dict[int, StatementFacts]",
    oracle: "TypeBasedAliasOracle",
) -> List[DDGEdge]:
    """Def-use edges: for every use at node n, an edge from each reaching def
    whose path interferes (textually or through may-alias).

    The result is de-duplicated and sorted by ``(source, target, var)``.
    """
    in_sets = reaching_definitions(cfg, facts)
    edges: Set[DDGEdge] = set()
    for node in cfg.nodes:
        uses = facts[node.id].uses
        if not uses:
            continue
        reaching = in_sets[node.id]
        # A (path, n) pair reaches n itself only through a real cycle, so a
        # self-edge here is precisely the loop-carried dependency.
        for use in uses:
            for def_path, def_node in reaching:
                # The oracle is consulted only when at least one side has a
                # field suffix; two bare names are settled by `interferes`.
                if interferes(use, def_path) or (
                    (suffix_of(use) or suffix_of(def_path))
                    and oracle.may_alias(use, def_path)
                ):
                    edges.add(DDGEdge(source=def_node, target=node.id, var=use))
    return sorted(edges, key=lambda e: (e.source, e.target, e.var))
import ast
from pathlib import Path

from codeanalyzer.dataflow.access_paths import (
    RETURN_PATH,
    build_scope,
    k_limit,
    statement_facts,
)
from codeanalyzer.dataflow.alias import TypeBasedAliasOracle
from codeanalyzer.dataflow.cfg import build_cfg
from codeanalyzer.dataflow.defuse import ddg_edges

# Directory holding the dataflow fixture project (main.py, pipeline.py, state.py).
FIXTURE = Path(__file__).parent / "fixtures" / "single_functionalities" / "dataflow"


def _analyzed(file_name: str, func_name: str, k: int = 3, base_types=None):
    """Parse a fixture file and run CFG -> scope -> facts -> DDG on one function.

    Returns ``(cfg, facts, edges)`` for the first function named ``func_name``
    that ``ast.walk`` encounters; raises ``AssertionError`` if there is none.
    ``base_types`` seeds the alias oracle (``None`` means no type information).
    """
    tree = ast.parse((FIXTURE / file_name).read_text())

    # NOTE(review): `enclosing` is accepted but never read by `find`.
    def find(node, enclosing):
        for child in ast.walk(node):
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)) and child.name == func_name:
                return child
        raise AssertionError(f"{func_name} not found")

    func = find(tree, set())
    cfg = build_cfg(func)
    scope = build_scope(func, enclosing_locals=set())
    facts = statement_facts(cfg, func, scope, k)
    edges = ddg_edges(cfg, facts, TypeBasedAliasOracle(base_types or {}))
    return cfg, facts, edges


def _line_of(cfg, node_id):
    """Source line on which the CFG node with id ``node_id`` starts."""
    return cfg.node_by_id(node_id).start_line


def test_k_limit_contract_example():
    """k-limiting truncates past k fields with ``.*`` and leaves shorter paths alone."""
    assert k_limit("x.f.g.h", 3) == "x.f.g.*"
    assert k_limit("x.f.g", 3) == "x.f.g"
    assert k_limit("arr[*].f", 3) == "arr[*].f"


def test_every_ddg_edge_connects_a_real_def_to_a_real_use():
    """Every edge's ``var`` is a use at its target and its source defines something."""
    for file_name, func in (
        ("main.py", "branchy"),
        ("main.py", "looped"),
        ("main.py", "handles"),
        ("pipeline.py", "alias_flow"),
        ("state.py", "bump"),
    ):
        cfg, facts, edges = _analyzed(file_name, func)
        for e in edges:
            assert e.var in facts[e.target].uses, f"{func}: {e} is not a read"
            assert facts[e.source].defs, f"{func}: {e} source defines nothing"


def test_branchy_defs_reach_the_return_through_both_arms():
    """Both definitions of ``x`` (lines 13 and 15) reach the read on line 16."""
    cfg, facts, edges = _analyzed("main.py", "branchy")
    x_edges = [e for e in edges if e.var == "x"]
    sources = {_line_of(cfg, e.source) for e in x_edges}
    targets = {_line_of(cfg, e.target) for e in x_edges}
    assert sources == {13, 15}, "both arms' defs of x must reach the join"
    assert targets == {16}


def test_looped_produces_the_loop_carried_edge():
    """The loop body's defs flow around the back edge into the next iteration."""
    cfg, facts, edges = _analyzed("main.py", "looped")
    # total = total + i (line 23) reads its own previous-iteration def.
    assert any(
        _line_of(cfg, e.source) == 23 and _line_of(cfg, e.target) == 23 and e.var == "total"
        for e in edges
    ), "loop-carried dependency missing"
    # i = i + 1 (line 24) feeds the loop test (line 22) around the back edge.
    assert any(
        _line_of(cfg, e.source) == 24 and _line_of(cfg, e.target) == 22 and e.var == "i"
        for e in edges
    )


def test_comprehension_targets_do_not_leak_across_scopes():
    """A comprehension's loop variable is neither a def nor a use of the enclosing scope."""
    cfg, facts, edges = _analyzed("main.py", "comprehend")
    comp_line, assign_line, ret_line = 60, 61, 62
    comp_node = next(n for n in cfg.nodes if n.start_line == comp_line)
    # The comprehension defines squares only — its `i` is its own scope.
    assert "i" not in facts[comp_node.id].defs
    assert "i" not in facts[comp_node.id].uses
    assert "items" in facts[comp_node.id].uses
    # The `i` read at the return resolves to line 61, never line 60.
    i_edges = [e for e in edges if e.var == "i" and _line_of(cfg, e.target) == ret_line]
    assert {_line_of(cfg, e.source) for e in i_edges} == {assign_line}


def test_alias_flow_write_through_one_name_reaches_read_through_other():
    """With matching base types, a write through ``q`` reaches a read through ``p``."""
    cfg, facts, edges = _analyzed(
        "pipeline.py", "alias_flow", base_types={"p": "Box", "q": "Box"}
    )
    # q.value = 42 (line 39) must reach the whole-object read of p at
    # p.get() (line 40) through the type-based may-alias oracle.
    assert any(
        _line_of(cfg, e.source) == 39 and _line_of(cfg, e.target) == 40
        for e in edges
    ), "may-alias edge from q.value write to p read missing"


def test_alias_edge_suppressed_when_types_are_incompatible():
    """With differing base types, no line-39 -> line-40 edge on a ``p`` path exists."""
    cfg, facts, edges = _analyzed(
        "pipeline.py", "alias_flow", base_types={"p": "Box", "q": "int"}
    )
    alias_edges = [
        e
        for e in edges
        if _line_of(cfg, e.source) == 39 and _line_of(cfg, e.target) == 40 and e.var.startswith("p")
    ]
    assert not alias_edges, "incompatible types must not alias"


def test_bump_reads_incoming_global_and_param():
    """The assignment on line 8 reads ``counter`` and ``amount`` as defined at ENTRY."""
    cfg, facts, edges = _analyzed("state.py", "bump")
    assign = next(n for n in cfg.nodes if n.start_line == 8)
    assert {"counter", "amount"} <= facts[assign.id].uses
    assert "counter" in facts[assign.id].defs
    entry_edges = {e.var for e in edges if e.source == cfg.entry_id and e.target == assign.id}
    assert {"counter", "amount"} <= entry_edges


def test_return_defines_the_return_pseudo_path():
    """Every node of kind ``"return"`` defines ``RETURN_PATH``."""
    cfg, facts, edges = _analyzed("main.py", "early_exit")
    for n in cfg.nodes:
        if n.kind == "return":
            assert RETURN_PATH in facts[n.id].defs
(#67) --- codeanalyzer/dataflow/pdg.py | 99 ++++++++++++++++++++++++++++++++++++ test/test_dataflow_pdg.py | 68 +++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 codeanalyzer/dataflow/pdg.py create mode 100644 test/test_dataflow_pdg.py diff --git a/codeanalyzer/dataflow/pdg.py b/codeanalyzer/dataflow/pdg.py new file mode 100644 index 0000000..d9bd5fa --- /dev/null +++ b/codeanalyzer/dataflow/pdg.py @@ -0,0 +1,99 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Stage 4 of the level-3 dataflow ladder: PDG assembly. + +Per callable, the PDG is the union of the stage-2 control-dependence edges +(``CDG``) and the stage-3 def-use edges (``DDG``), over the same +``(signature, node_id)`` nodes. Nothing new is computed here — this module is +bookkeeping plus the intraprocedural backward slice that gates it: reverse +reachability over CDG ∪ DDG from a criterion node, expected to match a +hand-computed node set exactly on the fixture. 
@dataclass(frozen=True)
class PDGEdge:
    """A single dependence edge between two node ids of one callable."""

    source: int
    target: int
    type: str  # "CDG" | "DDG"
    var: Optional[str] = None  # access path on DDG edges


@dataclass
class FunctionPDG:
    """One callable's intraprocedural graphs, keyed externally by signature."""

    cfg: "ControlFlowGraph"
    edges: List[PDGEdge]
    scope: "FunctionScope"
    facts: "Dict[int, StatementFacts]" = field(default_factory=dict)


def build_pdg(
    func: ast.AST,
    enclosing_locals: Set[str],
    oracle: "TypeBasedAliasOracle",
    k: int = 3,
) -> FunctionPDG:
    """Run CFG -> dominance -> def-use for one callable and bundle the PDG.

    The returned edge list holds the control-dependence edges (``"CDG"``) and
    the def-use edges (``"DDG"``, carrying the access path in ``var``), sorted
    by ``(source, target, type, var)``.
    """
    cfg = build_cfg(func)
    scope = build_scope(func, enclosing_locals)
    facts = statement_facts(cfg, func, scope, k)

    control = [
        PDGEdge(source=src, target=dst, type="CDG")
        for src, dst in control_dependence(cfg)
    ]
    data = [
        PDGEdge(source=dep.source, target=dep.target, type="DDG", var=dep.var)
        for dep in ddg_edges(cfg, facts, oracle)
    ]
    ordered = sorted(
        control + data,
        key=lambda edge: (edge.source, edge.target, edge.type, edge.var or ""),
    )
    return FunctionPDG(cfg=cfg, edges=ordered, scope=scope, facts=facts)


def intraprocedural_backward_slice(pdg: FunctionPDG, criterion: int) -> Set[int]:
    """Backward slice: every node id from which ``criterion`` is reachable
    along CDG or DDG edges, the criterion itself included. The stage-4 gate."""
    # Index edges by target so each step walks from a node to its sources.
    incoming: Dict[int, List[int]] = {}
    for edge in pdg.edges:
        if edge.target in incoming:
            incoming[edge.target].append(edge.source)
        else:
            incoming[edge.target] = [edge.source]

    visited: Set[int] = {criterion}
    frontier = [criterion]
    while frontier:
        current = frontier.pop()
        for origin in incoming.get(current, ()):
            if origin not in visited:
                visited.add(origin)
                frontier.append(origin)
    return visited
test_early_exit_slice_excludes_the_other_arm(): + pdg = _pdg_of("main.py", "early_exit") + criterion = _id_at_line(pdg, 32) # return y + slice_ids = intraprocedural_backward_slice(pdg, criterion) + # Hand-computed: ENTRY, the branch header (29), y = n * 2 (31), and the + # criterion itself. `return -1` (30) is control-dependent on the same + # branch but contributes nothing to y — it must NOT appear. + assert _lines(pdg, slice_ids) == {29, 31, 32} + assert pdg.cfg.entry_id in slice_ids + assert _id_at_line(pdg, 30) not in slice_ids + + +def test_branchy_slice_includes_both_arms_and_the_branch(): + pdg = _pdg_of("main.py", "branchy") + criterion = _id_at_line(pdg, 16) # return x + slice_ids = intraprocedural_backward_slice(pdg, criterion) + assert _lines(pdg, slice_ids) == {12, 13, 15, 16} + + +def test_looped_slice_of_return_total_is_the_whole_loop(): + pdg = _pdg_of("main.py", "looped") + criterion = _id_at_line(pdg, 25) # return total + slice_ids = intraprocedural_backward_slice(pdg, criterion) + assert _lines(pdg, slice_ids) == {20, 21, 22, 23, 24, 25} From 789bd1c7b64be44bde474076104022d0ca5d1467 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 21:47:19 -0400 Subject: [PATCH 05/11] =?UTF-8?q?feat(dataflow):=20stage=205=20=E2=80=94?= =?UTF-8?q?=20alias=20oracle=20wiring,=20Tarjan=20SCC,=20global=20qualific?= =?UTF-8?q?ation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Iterative Tarjan SCC condensation of the frozen call-graph oracle (reverse topological schedule for bottom-up summaries); call mutations become suffixed weak defs so caller-visible mutation is distinguishable from local rebinding; global bases gain module::name qualification for the interprocedural build. 
def qualify_globals(paths: Set[str], scope: "FunctionScope", qualifier: str) -> Set[str]:
    """Return ``paths`` with every global base rewritten to ``qualifier::name``.

    The ``::`` separator keeps the qualifier out of the field-path grammar.
    A path is left untouched when its base is already qualified, is the
    return pseudo-path, is not classified ``"global"`` by ``scope.kind_of``,
    or names an attribute of the ``builtins`` module.
    """
    import builtins as _builtins

    def rewrite(path: str) -> str:
        base = base_of(path)
        if "::" in base or base == RETURN_PATH:
            return path
        if scope.kind_of(base) != "global" or hasattr(_builtins, base):
            return path
        # Keep everything after the base (field suffix) exactly as written.
        return f"{qualifier}::{base}" + path[len(base):]

    return {rewrite(path) for path in paths}
def strongly_connected_components(
    nodes: List[str], edges: List[Tuple[str, str]]
) -> List[List[str]]:
    """Tarjan SCCs in reverse topological order (callees before callers).

    Deterministic: roots and successors are visited in sorted order and each
    component's members are sorted. Edges touching a name that is not in
    ``nodes`` are ignored. The traversal keeps an explicit frame stack
    instead of recursing.
    """
    graph: Dict[str, List[str]] = {name: [] for name in nodes}
    for src, dst in sorted(set(edges)):
        if src in graph and dst in graph:
            graph[src].append(dst)

    order: Dict[str, int] = {}  # discovery index per node
    low: Dict[str, int] = {}  # smallest discovery index known reachable
    open_nodes: List[str] = []  # Tarjan stack of nodes not yet assigned
    is_open: Set[str] = set()
    components: List[List[str]] = []
    next_index = 0

    for start in sorted(graph):
        if start in order:
            continue
        # Each frame is (node, position of the next successor to examine).
        frames: List[Tuple[str, int]] = [(start, 0)]
        while frames:
            current, cursor = frames.pop()
            if cursor == 0:
                # First time this node is popped: discover it.
                order[current] = low[current] = next_index
                next_index += 1
                open_nodes.append(current)
                is_open.add(current)

            targets = graph[current]
            descended = False
            for position in range(cursor, len(targets)):
                target = targets[position]
                if target not in order:
                    # Suspend this node and descend into the unseen successor.
                    frames.append((current, position + 1))
                    frames.append((target, 0))
                    descended = True
                    break
                if target in is_open:
                    low[current] = min(low[current], order[target])
            if descended:
                continue

            if low[current] == order[current]:
                # `current` roots a component: everything above it on the
                # open stack, itself included, belongs to that component.
                cut = len(open_nodes) - 1
                while open_nodes[cut] != current:
                    cut -= 1
                members = open_nodes[cut:]
                del open_nodes[cut:]
                is_open.difference_update(members)
                components.append(sorted(members))
            if frames:
                # Report the finished child's low-link to its suspended parent.
                parent = frames[-1][0]
                low[parent] = min(low[parent], low[current])

    # Tarjan emits SCCs in reverse topological order already.
    return components
+ sccs = strongly_connected_components(nodes, edges) + assert ["x", "y", "z"] in sccs From c6f990f385fca8187638ca0564d9f6cb5dcbad4e Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 21:53:04 -0400 Subject: [PATCH 06/11] =?UTF-8?q?feat(dataflow):=20stages=206=E2=80=937=20?= =?UTF-8?q?=E2=80=94=20function=20summaries=20and=20SDG=20assembly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Relational summaries (params/captures/read-globals → return/mutations/ written-globals) composed bottom-up over the Tarjan condensation DAG, monotone fixpoint within SCCs, callee global footprints injected at callsites and reaching definitions re-solved; HRB parameter structure (formal/actual in/out nodes in the owning function's id space after EXIT), CALL/PARAM_IN/PARAM_OUT edges, SUMMARY edges from composed flows, globals as extra formals, closure captures bound at definition sites; builder maps symbol-table signatures to AST by (file, line) and treats the call graph and Jedi callsite resolutions as frozen oracles. Gates: arity, no dangling endpoints, transitive-chain SUMMARY, cross- file global flow, deterministic double-run. 
(#67) --- codeanalyzer/dataflow/builder.py | 255 +++++++++++++++++ codeanalyzer/dataflow/defuse.py | 4 +- codeanalyzer/dataflow/sdg.py | 424 +++++++++++++++++++++++++++++ codeanalyzer/dataflow/summaries.py | 217 +++++++++++++++ test/test_dataflow_sdg.py | 188 +++++++++++++ 5 files changed, 1086 insertions(+), 2 deletions(-) create mode 100644 codeanalyzer/dataflow/builder.py create mode 100644 codeanalyzer/dataflow/sdg.py create mode 100644 codeanalyzer/dataflow/summaries.py create mode 100644 test/test_dataflow_sdg.py diff --git a/codeanalyzer/dataflow/builder.py b/codeanalyzer/dataflow/builder.py new file mode 100644 index 0000000..e867606 --- /dev/null +++ b/codeanalyzer/dataflow/builder.py @@ -0,0 +1,255 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""The level-3 orchestrator: symbol table + call graph → program graphs. + +``build_program_graphs`` is the single entry point ``Codeanalyzer.analyze`` +calls at ``-a 3``. 
def _walk_callables(
    module: PyModule,
) -> List[Tuple[PyCallable, Tuple[PyCallable, ...]]]:
    """List each callable of ``module`` paired with its enclosing callables.

    The walk is pre-order: a callable is emitted first, then everything under
    its inner callables, then everything under its inner classes. Classes are
    transparent — a method inherits the chain of the class that holds it, and
    only callables ever extend the chain. Module-level functions are visited
    before module-level classes.
    """

    def visit_callable(node, chain):
        # Emit the callable itself before descending into what it encloses.
        yield node, chain
        nested_chain = chain + (node,)
        for inner in (node.inner_callables or {}).values():
            yield from visit_callable(inner, nested_chain)
        for klass in (node.inner_classes or {}).values():
            yield from visit_class(klass, nested_chain)

    def visit_class(klass, chain):
        # A class contributes no entry of its own and leaves the chain as is.
        for method in (klass.methods or {}).values():
            yield from visit_callable(method, chain)
        for inner in (klass.inner_classes or {}).values():
            yield from visit_class(inner, chain)

    found: List[Tuple[PyCallable, Tuple[PyCallable, ...]]] = []
    for function in (module.functions or {}).values():
        found.extend(visit_callable(function, ()))
    for klass in (module.classes or {}).values():
        found.extend(visit_class(klass, ()))
    return found
def build_program_graphs(
    app: PyApplication,
    k: int = DEFAULT_K_LIMIT,
) -> ProgramGraphsIR:
    """Build CFG/PDG per callable and the whole-program SDG.

    Args:
        app: The analyzed application. Its ``symbol_table`` supplies the
            modules and callables; its ``call_graph`` and each callable's
            ``call_sites`` supply the already-resolved call targets.
        k: Access-path length limit, forwarded to the PDG builder, the path
            extractor and the SDG assembler.

    Returns:
        The assembled program graphs. A module that cannot be read, decoded
        or parsed, and a callable with no matching ``def`` node, is skipped
        with a warning rather than aborting the whole build.
    """
    class_idx = _class_index(app)

    infos: Dict[str, FunctionInfo] = {}
    # The symbol-table entry each FunctionInfo was built from. Keeping our own
    # map means the second pass never has to look a signature up in a
    # separately built index that might not contain it.
    callables: Dict[str, PyCallable] = {}

    for _file_key, module in sorted(app.symbol_table.items()):
        path = Path(module.file_path)
        try:
            # Parse from bytes: the compiler then honours the file's own
            # encoding declaration / BOM instead of the locale default that
            # ``read_text()`` would apply.
            tree = ast.parse(path.read_bytes(), filename=str(path))
        except (OSError, SyntaxError, ValueError) as exc:
            # ValueError covers undecodable input and embedded NUL bytes on
            # interpreters that do not report those as SyntaxError.
            logger.warning(f"level 3: skipping {path} (unparseable: {exc})")
            continue

        # def-line -> function node; this is the join key back to the
        # symbol table's ``start_line``.
        def_index: Dict[int, ast.AST] = {}
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                def_index[node.lineno] = node

        for pycallable, chain in _walk_callables(module):
            func = def_index.get(pycallable.start_line)
            if func is None or func.name != pycallable.name:
                logger.warning(
                    f"level 3: no AST match for {pycallable.signature} "
                    f"({path}:{pycallable.start_line}); treated as external"
                )
                continue

            # Names bound by enclosing callables: candidates for captures.
            enclosing_locals: Set[str] = set()
            for enclosing in chain:
                enclosing_ast = def_index.get(enclosing.start_line)
                if enclosing_ast is not None:
                    enclosing_locals |= _locals_of(enclosing_ast)

            oracle = TypeBasedAliasOracle(_base_types(pycallable))
            pdg = build_pdg(
                func,
                enclosing_locals=enclosing_locals,
                oracle=oracle,
                k=k,
                global_qualifier=module.module_name,
            )
            infos[pycallable.signature] = FunctionInfo(
                signature=pycallable.signature, pdg=pdg, oracle=oracle
            )
            callables[pycallable.signature] = pycallable

    # Callsites and nested defs, now that every signature is known.
    for sig, info in infos.items():
        pycallable = callables[sig]
        extractor = _PathExtractor(info.pdg.scope, k)

        # Locate each ast.Call by exact position, with a per-line fallback;
        # the first call found at a position/line wins.
        calls_by_pos: Dict[Tuple[int, int], Tuple[int, ast.Call]] = {}
        calls_by_line: Dict[int, Tuple[int, ast.Call]] = {}
        for node in info.pdg.cfg.nodes:
            if node.ast_node is None:
                continue
            for call in _calls_in(node.ast_node):
                pos = (call.lineno, call.col_offset)
                calls_by_pos.setdefault(pos, (node.id, call))
                calls_by_line.setdefault(call.lineno, (node.id, call))

        for site in pycallable.call_sites or []:
            target = site.callee_signature
            if not target:
                continue
            if target in class_idx and target not in infos:
                target = f"{target}.__init__"  # constructor → its initializer
            if target not in infos:
                continue  # external or unrecovered: pass-through posture

            located = calls_by_pos.get((site.start_line, site.start_column))
            if located is None:
                located = calls_by_line.get(site.start_line)
            if located is None:
                continue
            node_id, call = located

            receiver_path: Optional[str] = None
            if isinstance(call.func, ast.Attribute):
                receiver_path = extractor.path_of(call.func.value)
            elif site.is_constructor_call:
                # p = Box(...) binds the constructed object (self) to p.
                owner = info.pdg.cfg.node_by_id(node_id).ast_node
                if (
                    isinstance(owner, ast.Assign)
                    and len(owner.targets) == 1
                    and isinstance(owner.targets[0], (ast.Name, ast.Attribute))
                ):
                    receiver_path = extractor.path_of(owner.targets[0])

            info.call_sites.append(
                CallSite(
                    node_id=node_id,
                    targets=(target,),
                    arg_paths=_match_args(
                        call, callables[target], extractor, receiver_path
                    ),
                    line=site.start_line,
                )
            )

        for node in info.pdg.cfg.nodes:
            if isinstance(node.ast_node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                nested_sig = f"{sig}.{node.ast_node.name}"
                if nested_sig in infos:
                    info.nested_defs.append((node.id, nested_sig))

    call_edges = [
        (e.source, e.target)
        for e in (app.call_graph or [])
        if e.source in infos and e.target in infos
    ]
    # Callsite resolutions are part of the same oracle (they may include
    # constructor retargets the edge list lacks).
    for sig, info in infos.items():
        for cs in info.call_sites:
            for t in cs.targets:
                call_edges.append((sig, t))

    summaries = compute_summaries(infos, sorted(set(call_edges)))
    return assemble_sdg(infos, summaries, k)
/dev/null +++ b/codeanalyzer/dataflow/sdg.py @@ -0,0 +1,424 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Stage 7 of the level-3 dataflow ladder: SDG assembly (Horwitz–Reps–Binkley). + +Parameter-passing structure per function and callsite: + +- **formal_in** nodes: one per parameter (var = the parameter name), one per + captured variable (``:name``), one per transitively-read global + (``:module::name``); +- **formal_out** nodes: the return value (````), each caller-visibly + mutated parameter, each written global; +- **actual_in / actual_out** nodes at each callsite, mirroring the callee's + formals that the callsite binds (positional/keyword-matched arguments, the + receiver as ``self``, globals from the callee's summary footprint); +- closure captures bind at the nested function's *definition* statement: an + ``actual_in`` at the def node, ``PARAM_IN`` to the nested callable's + ```` formal. + +Parameter nodes share the owning function's node-id space, allocated after +EXIT (the CFG keeps its ``ENTRY = 0 … EXIT = last CFG id`` contract; parameter +nodes are PDG/SDG-level, deterministically ordered). 
# Tags prepended to the ``var`` of non-parameter formals/actuals. They must be
# distinct: code that recognises capture formals with
# ``var.startswith(CAPTURE_PREFIX)`` would otherwise also match globals.
# NOTE(review): the exact spellings are reconstructed — confirm against any
# consumer that hard-codes them instead of importing these constants.
CAPTURE_PREFIX = "<capture>:"
GLOBAL_PREFIX = "<global>:"


@dataclass
class ParamNode:
    # Node id within the owning function's id space.
    id: int
    kind: str  # formal_in | formal_out | actual_in | actual_out
    # Parameter name, prefixed capture/global name, or the return path.
    var: str
    call_node: Optional[int] = None  # owning callsite statement (actuals)
    # Source span; -1 when unknown.
    start_line: int = -1
    end_line: int = -1


@dataclass(frozen=True)
class SDGEdge:
    # Each endpoint is a (function signature, node id) pair.
    source_sig: str
    source_node: int
    target_sig: str
    target_node: int
    type: str  # CALL | PARAM_IN | PARAM_OUT | SUMMARY
    var: Optional[str] = None


@dataclass
class FunctionGraphs:
    """One callable's complete level-3 graphs, ready for emission."""

    pdg: FunctionPDG
    ddg: List[DDGEdge] = field(default_factory=list)  # augmented, final
    param_nodes: List[ParamNode] = field(default_factory=list)
    extra_edges: List[PDGEdge] = field(default_factory=list)  # param wiring
    summary: Optional[FunctionSummary] = None


@dataclass
class ProgramGraphsIR:
    # Function signature -> that function's graphs.
    functions: Dict[str, FunctionGraphs] = field(default_factory=dict)
    # Cross-function (and same-function SUMMARY) edges.
    sdg_edges: List[SDGEdge] = field(default_factory=list)
    k_limit: int = 3
# Key suffix under which a function's formal-OUT id map is stored in the
# shared ``formal_ids`` table. The formal-IN map lives under the bare
# signature, so the two maps must never share a key.
# NOTE(review): assumes no real signature equals "<other signature><out>".
_FORMAL_OUT_KEY_SUFFIX = "<out>"


class _FunctionAssembler:
    """Allocates parameter nodes and wiring edges for one function.

    Parameter nodes take ids starting at ``len(cfg.nodes)``, i.e. after the
    function's CFG nodes. ``build_formals`` must run for every function
    before any ``build_actuals`` call, because actuals point at callee
    formal ids.
    """

    def __init__(self, info: FunctionInfo, summary: FunctionSummary, facts, ddg):
        self.info = info
        self.summary = summary
        self.facts = facts
        self.ddg = ddg
        self.cfg = info.pdg.cfg
        self.scope = info.pdg.scope
        self.next_id = len(self.cfg.nodes)
        self.param_nodes: List[ParamNode] = []
        self.extra: List[PDGEdge] = []
        self.formal_in: Dict[str, int] = {}  # var -> node id
        self.formal_out: Dict[str, int] = {}
        # (call_node, var) -> node id
        self.actual_in: Dict[Tuple[int, str], int] = {}
        self.actual_out: Dict[Tuple[int, str], int] = {}
        entry = self.cfg.node_by_id(self.cfg.entry_id)
        exit_ = self.cfg.node_by_id(self.cfg.exit_id)
        self._entry_span = (entry.start_line, entry.end_line)
        self._exit_span = (exit_.start_line, exit_.end_line)
        # DDG edges grouped by target (original order kept) so that binding
        # an actual does not rescan the whole edge list.
        self._ddg_by_target: Dict[int, List[DDGEdge]] = {}
        for edge in ddg:
            self._ddg_by_target.setdefault(edge.target, []).append(edge)

    def _alloc(self, kind: str, var: str, span, call_node=None) -> int:
        """Create the next parameter node and return its id."""
        nid = self.next_id
        self.next_id += 1
        self.param_nodes.append(
            ParamNode(
                id=nid, kind=kind, var=var, call_node=call_node,
                start_line=span[0], end_line=span[1],
            )
        )
        return nid

    # ---------------------------------------------------------------- formals

    def build_formals(self) -> None:
        """Allocate formal_in/formal_out nodes and wire them into the PDG."""
        scope, summary = self.scope, self.summary
        for p in list(scope.params):
            self.formal_in[p] = self._alloc("formal_in", p, self._entry_span)
        for c in sorted(scope.captures):
            var = CAPTURE_PREFIX + c
            self.formal_in[var] = self._alloc("formal_in", var, self._entry_span)
        for g in sorted(summary.global_reads):
            var = GLOBAL_PREFIX + g
            self.formal_in[var] = self._alloc("formal_in", var, self._entry_span)

        self.formal_out[RETURN_PATH] = self._alloc(
            "formal_out", RETURN_PATH, self._exit_span
        )
        for p in sorted(summary.mutated_params):
            self.formal_out[p] = self._alloc("formal_out", p, self._exit_span)
        for g in sorted(summary.global_writes):
            var = GLOBAL_PREFIX + g
            self.formal_out[var] = self._alloc("formal_out", var, self._exit_span)

        # Wiring: formal_in → first uses (mirror the ENTRY-def DDG edges).
        entry = self.cfg.entry_id
        for e in self.ddg:
            if e.source != entry:
                continue
            b = base_of(e.var)
            if b in self.formal_in:
                fid = self.formal_in[b]
            elif CAPTURE_PREFIX + b in self.formal_in:
                fid = self.formal_in[CAPTURE_PREFIX + b]
            elif "::" in b and GLOBAL_PREFIX + b in self.formal_in:
                fid = self.formal_in[GLOBAL_PREFIX + b]
            else:
                continue
            self.extra.append(PDGEdge(source=fid, target=e.target, type="DDG", var=e.var))

        # Wiring: defining nodes → formal_out.
        param_names = set(scope.params)
        if scope.self_name:
            param_names.add(scope.self_name)
        for nid, f in self.facts.items():
            if nid == entry:
                continue
            if RETURN_PATH in f.defs:
                self.extra.append(
                    PDGEdge(
                        source=nid,
                        target=self.formal_out[RETURN_PATH],
                        type="DDG",
                        var=RETURN_PATH,
                    )
                )
            # Sorted: ``defs`` is a set, and the emitted edge order must not
            # depend on string hashing.
            for d in sorted(f.defs):
                b = base_of(d)
                if "::" in b and GLOBAL_PREFIX + b in self.formal_out:
                    self.extra.append(
                        PDGEdge(
                            source=nid,
                            target=self.formal_out[GLOBAL_PREFIX + b],
                            type="DDG",
                            var=d,
                        )
                    )
                elif b in param_names and suffix_of(d) and b in self.formal_out:
                    self.extra.append(
                        PDGEdge(source=nid, target=self.formal_out[b], type="DDG", var=d)
                    )

    # ---------------------------------------------------------------- actuals

    def _defs_reaching_call_matching(self, call_node: int, path: Optional[str]):
        """Sources of DDG in-edges of the call node whose var matches the
        actual's access path (all of them when the actual is an expression)."""
        return [
            (e.source, e.var)
            for e in self._ddg_by_target.get(call_node, ())
            if path is None or interferes(e.var, path) or interferes(path, e.var)
        ]

    def _ensure_actual_in(
        self, owner_node: int, callee_sig: str, var: str, path: Optional[str], span
    ) -> int:
        """Return the actual_in id binding ``var`` of ``callee_sig`` at
        ``owner_node``, allocating and wiring it on first use: a CDG edge from
        the owning statement, plus DDG edges from the matching definitions
        that reach that statement."""
        key = (owner_node, f"{callee_sig}::{var}")
        if key not in self.actual_in:
            aid = self._alloc("actual_in", var, span, owner_node)
            self.actual_in[key] = aid
            self.extra.append(PDGEdge(source=owner_node, target=aid, type="CDG"))
            for src, src_var in self._defs_reaching_call_matching(owner_node, path):
                self.extra.append(
                    PDGEdge(source=src, target=aid, type="DDG", var=src_var)
                )
        return self.actual_in[key]

    def build_actuals(
        self,
        summaries: Dict[str, FunctionSummary],
        formal_ids: Dict[str, Dict[str, int]],
        sdg_edges: List[SDGEdge],
    ) -> None:
        """Allocate actual_in/actual_out nodes per callsite and append the
        CALL / PARAM_IN / PARAM_OUT / SUMMARY edges to ``sdg_edges``.

        ``formal_ids`` maps a signature to its formal-in ids and
        ``signature + _FORMAL_OUT_KEY_SUFFIX`` to its formal-out ids.
        """
        sig = self.info.signature
        node_span = {
            n.id: (n.start_line, n.end_line) for n in self.cfg.nodes
        }

        for cs in sorted(self.info.call_sites, key=lambda c: (c.node_id, c.targets)):
            span = node_span.get(cs.node_id, (-1, -1))
            for target in cs.targets:
                callee_summary = summaries.get(target)
                callee_formals = formal_ids.get(target)
                if callee_summary is None or callee_formals is None:
                    continue  # external — conservative pass-through already applies

                # CALL: callsite statement → callee ENTRY.
                sdg_edges.append(
                    SDGEdge(
                        source_sig=sig, source_node=cs.node_id,
                        target_sig=target, target_node=0, type="CALL",
                    )
                )

                bound_in: Dict[str, int] = {}  # formal key -> actual_in id
                bound_out: Dict[str, int] = {}  # formal key -> actual_out id

                # Argument actual_ins for the callee formals this site binds.
                for param, path in cs.arg_paths:
                    if param not in callee_formals:
                        continue
                    aid = self._ensure_actual_in(cs.node_id, target, param, path, span)
                    bound_in[f"param:{param}"] = aid
                    sdg_edges.append(
                        SDGEdge(
                            source_sig=sig, source_node=aid,
                            target_sig=target,
                            target_node=callee_formals[param],
                            type="PARAM_IN", var=param,
                        )
                    )

                # Global actual_ins from the callee's read footprint.
                for g in sorted(callee_summary.global_reads):
                    fvar = GLOBAL_PREFIX + g
                    if fvar not in callee_formals:
                        continue
                    aid = self._ensure_actual_in(cs.node_id, target, fvar, g, span)
                    bound_in[f"global:{g}"] = aid
                    sdg_edges.append(
                        SDGEdge(
                            source_sig=sig, source_node=aid,
                            target_sig=target, target_node=callee_formals[fvar],
                            type="PARAM_IN", var=fvar,
                        )
                    )

                # actual_outs: return, mutated bound params, written globals.
                out_specs: List[Tuple[str, str]] = [("return", RETURN_PATH)]
                for p in sorted(callee_summary.mutated_params):
                    if cs.arg_path_of(p) is not None:
                        out_specs.append((f"param:{p}", p))
                for g in sorted(callee_summary.global_writes):
                    out_specs.append((f"global:{g}", GLOBAL_PREFIX + g))

                callee_formal_outs = formal_ids.get(
                    target + _FORMAL_OUT_KEY_SUFFIX, {}
                )
                for key_name, fvar in out_specs:
                    if fvar not in callee_formal_outs:
                        continue
                    key = (cs.node_id, f"{target}::out::{fvar}")
                    if key not in self.actual_out:
                        oid = self._alloc("actual_out", fvar, span, cs.node_id)
                        self.actual_out[key] = oid
                        self.extra.append(
                            PDGEdge(source=cs.node_id, target=oid, type="CDG")
                        )
                        self.extra.append(
                            PDGEdge(source=oid, target=cs.node_id, type="DDG", var=fvar)
                        )
                    bound_out[key_name] = self.actual_out[key]
                    sdg_edges.append(
                        SDGEdge(
                            source_sig=target,
                            source_node=callee_formal_outs[fvar],
                            target_sig=sig, target_node=self.actual_out[key],
                            type="PARAM_OUT", var=fvar,
                        )
                    )

                # SUMMARY: actual_in → actual_out per callee transitive flow.
                for in_key, out_key in sorted(callee_summary.flows):
                    a_in = bound_in.get(in_key)
                    a_out = bound_out.get(out_key)
                    if a_in is not None and a_out is not None:
                        sdg_edges.append(
                            SDGEdge(
                                source_sig=sig, source_node=a_in,
                                target_sig=sig, target_node=a_out,
                                type="SUMMARY", var=None,
                            )
                        )

        # Closure captures: bind at the nested callable's def statement.
        for def_node, nested_sig in sorted(self.info.nested_defs):
            nested_formals = formal_ids.get(nested_sig)
            if not nested_formals:
                continue
            span = node_span.get(def_node, (-1, -1))
            for fvar, fid in sorted(nested_formals.items()):
                if not fvar.startswith(CAPTURE_PREFIX):
                    continue
                name = fvar[len(CAPTURE_PREFIX):]
                aid = self._ensure_actual_in(def_node, nested_sig, fvar, name, span)
                sdg_edges.append(
                    SDGEdge(
                        source_sig=self.info.signature,
                        source_node=aid,
                        target_sig=nested_sig, target_node=fid,
                        type="PARAM_IN", var=fvar,
                    )
                )


def assemble_sdg(
    infos: Dict[str, FunctionInfo],
    summaries: Dict[str, FunctionSummary],
    k: int,
) -> ProgramGraphsIR:
    """Stitch every function's PDG into the whole-program SDG.

    Args:
        infos: Per-function inputs, keyed by signature.
        summaries: Final function summaries, keyed by signature.
        k: Access-path limit, recorded on the result as ``k_limit``.

    Returns:
        The per-function graphs plus the de-duplicated, sorted SDG edge list.
    """
    ir = ProgramGraphsIR(k_limit=k)

    # Pass 1: solve each function against the final summaries and lay out its
    # formal nodes (their ids must exist before callsites reference them).
    assemblers: Dict[str, _FunctionAssembler] = {}
    formal_ids: Dict[str, Dict[str, int]] = {}
    for sig in sorted(infos):
        info = infos[sig]
        summary, facts, ddg = solve_function(info, summaries)
        asm = _FunctionAssembler(info, summary, facts, ddg)
        asm.build_formals()
        assemblers[sig] = asm
        formal_ids[sig] = dict(asm.formal_in)
        # Separate key: storing this under ``sig`` would replace the
        # formal-in map written on the line above.
        formal_ids[sig + _FORMAL_OUT_KEY_SUFFIX] = dict(asm.formal_out)

    # Pass 2: callsite actuals and cross-function edges.
    sdg_edges: List[SDGEdge] = []
    for sig in sorted(assemblers):
        assemblers[sig].build_actuals(summaries, formal_ids, sdg_edges)

    for sig, asm in assemblers.items():
        ir.functions[sig] = FunctionGraphs(
            pdg=asm.info.pdg,
            ddg=asm.ddg,
            param_nodes=asm.param_nodes,
            extra_edges=asm.extra,
            summary=asm.summary,
        )

    ir.sdg_edges = sorted(
        set(sdg_edges),
        key=lambda e: (e.source_sig, e.source_node, e.target_sig, e.target_node, e.type, e.var or ""),
    )
    return ir
+ +A summary is relational: which formal inputs (parameters, captures, read +globals) may flow to which formal outputs (the return value, caller-visible +parameter mutations, written globals). Summaries compose bottom-up over the +SCC condensation DAG of the call-graph oracle; within an SCC (mutual +recursion) all members iterate to a monotone fixpoint — the domains (formal +keys and qualified global names) are finite and effects only grow, so +termination is structural. k-limiting bounds the access-path vocabulary. + +At statement granularity a callsite node is already a transformer (all its +defs depend on all its uses), so the composition step callee summaries +actually contribute is the *global footprint*: a callsite node gains the +callee's transitive global reads as uses and writes as defs, the reaching +definitions are re-solved, and flows are re-derived. External/unmodeled +callees default to conservative pass-through (their argument paths are +already weak-defined and used at the call statement). + +Summary flow keys: ``param:NAME``, ``capture:NAME``, ``global:MODULE::NAME`` +for inputs; ``return``, ``param:NAME`` (mutation), ``global:MODULE::NAME`` +for outputs. +""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Set, Tuple + +from codeanalyzer.dataflow.access_paths import RETURN_PATH, base_of, suffix_of +from codeanalyzer.dataflow.alias import TypeBasedAliasOracle +from codeanalyzer.dataflow.defuse import DDGEdge, ddg_edges +from codeanalyzer.dataflow.pdg import FunctionPDG +from codeanalyzer.dataflow.scc import strongly_connected_components + + +@dataclass(frozen=True) +class CallSite: + """One resolved call at one CFG statement node (builder-provided).""" + + node_id: int + targets: Tuple[str, ...] 
def augmented_facts(info: FunctionInfo, summaries: Dict[str, FunctionSummary]):
    """Copy the function's per-node facts and widen every callsite node with
    its callees' global footprint.

    Each node gets a fresh fact object of the same type with copied ``defs``
    and ``uses`` sets, so the PDG's own facts are never modified. For every
    call target that has a summary, the callsite node's uses gain that
    summary's ``global_reads`` and its defs gain ``global_writes``; targets
    without a summary add nothing.
    """
    enriched = {
        node_id: type(fact)(defs=set(fact.defs), uses=set(fact.uses))
        for node_id, fact in info.pdg.facts.items()
    }
    for site in info.call_sites:
        for callee_sig in site.targets:
            callee = summaries.get(callee_sig)
            if callee is not None:
                site_fact = enriched[site.node_id]
                site_fact.uses |= callee.global_reads
                site_fact.defs |= callee.global_writes
    return enriched
iteration: inject callee footprints, re-solve reaching + definitions, derive flows. Returns (summary, augmented facts, DDG).""" + facts = augmented_facts(info, summaries) + ddg = ddg_edges(info.pdg.cfg, facts, info.oracle) + + # Forward adjacency over DDG ∪ CDG (a statement transforms all its + # inputs into all its outputs — statement-granularity posture). + adj: Dict[int, List[int]] = {} + for e in ddg: + adj.setdefault(e.source, []).append(e.target) + for e in info.pdg.edges: + if e.type == "CDG": + adj.setdefault(e.source, []).append(e.target) + + entry = info.pdg.cfg.entry_id + scope = info.pdg.scope + + # Seeds: the ENTRY-def DDG edges, grouped by formal key. + seeds: Dict[str, Set[int]] = {} + for e in ddg: + if e.source != entry: + continue + b = base_of(e.var) + if b == scope.self_name or b in scope.params: + key = f"param:{b}" + elif b in scope.captures: + key = f"capture:{b}" + elif _is_global(e.var): + key = f"global:{b}" + else: + continue + seeds.setdefault(key, set()).add(e.target) + + def reach(start: Set[int]) -> Set[int]: + seen: Set[int] = set() + stack = list(start) + while stack: + n = stack.pop() + if n in seen: + continue + seen.add(n) + stack.extend(adj.get(n, [])) + return seen + + summary = FunctionSummary() + param_names = set(scope.params) + if scope.self_name: + param_names.add(scope.self_name) + + for nid, f in facts.items(): + if nid == entry: + continue + for d in f.defs: + b = base_of(d) + if _is_global(d): + summary.global_writes.add(b) + elif b in param_names and suffix_of(d): + summary.mutated_params.add(b) + for u in f.uses: + if _is_global(u): + summary.global_reads.add(base_of(u)) + + for key, start in seeds.items(): + for nid in reach(start): + f = facts[nid] + if RETURN_PATH in f.defs: + summary.flows.add((key, "return")) + for d in f.defs: + b = base_of(d) + if _is_global(d): + summary.flows.add((key, f"global:{b}")) + elif b in param_names and suffix_of(d): + summary.flows.add((key, f"param:{b}")) + + return summary, 
def compute_summaries(
    infos: Dict[str, FunctionInfo],
    call_edges: List[Tuple[str, str]],
) -> Dict[str, FunctionSummary]:
    """Bottom-up composition over the SCC condensation DAG, monotone fixpoint
    within each SCC.

    Args:
        infos: Per-function inputs, keyed by signature.
        call_edges: ``(caller, callee)`` signature pairs of the call graph.

    Returns:
        One summary per signature in ``infos``.
    """
    order = strongly_connected_components(sorted(infos), call_edges)
    summaries: Dict[str, FunctionSummary] = {}
    for scc in order:
        members = [s for s in scc if s in infos]

        if len(members) == 1:
            only = members[0]
            # Solving consults ``summaries`` only for the targets of this
            # function's own call sites. If it never targets itself, its
            # result cannot depend on its own summary, so a second
            # "did anything change?" pass would just repeat the first.
            self_recursive = any(
                only in cs.targets for cs in infos[only].call_sites
            )
            if not self_recursive:
                summaries[only], _, _ = solve_function(infos[only], summaries)
                continue

        # Recursive component: iterate all members until none changes.
        changed = True
        while changed:
            changed = False
            for sig in members:
                new, _, _ = solve_function(infos[sig], summaries)
                if summaries.get(sig) != new:
                    summaries[sig] = new
                    changed = True
    return summaries
def _sig(ir_or_app, suffix: str) -> str:
    """Resolve ``suffix`` to the single function signature it identifies.

    A signature matches when it equals ``suffix`` or ends with
    ``"." + suffix``. Fails with an assertion when nothing matches or when
    more than one signature does.
    """
    known = ir_or_app.functions
    dotted_tail = "." + suffix
    hits = []
    for candidate in known:
        if candidate == suffix or candidate.endswith(dotted_tail):
            hits.append(candidate)
    assert hits, f"no function graph for *{suffix}: have {sorted(known)[:10]}..."
    assert len(hits) == 1, f"ambiguous suffix {suffix}: {hits}"
    (only,) = hits
    return only
assert formal.var == e.var + + +def test_param_out_sources_are_formal_outs(ir): + for e in ir.sdg_edges: + if e.type != "PARAM_OUT": + continue + callee = ir.functions[e.source_sig] + formal = next(p for p in callee.param_nodes if p.id == e.source_node) + assert formal.kind == "formal_out" + + +def test_call_edges_target_callee_entry(ir): + calls = [e for e in ir.sdg_edges if e.type == "CALL"] + assert calls, "no CALL edges assembled" + for e in calls: + assert e.target_node == 0 # ENTRY + + +def test_summary_edge_exists_for_the_transitive_chain_flow(ir): + drive = _sig(ir, "drive") + chain_a = _sig(ir, "chain_a") + # drive's callsite r = chain_a(n): the callee's param:v → return flow + # must surface as an actual_in → actual_out SUMMARY edge at the site. + summaries = [ + e for e in ir.sdg_edges + if e.type == "SUMMARY" and e.source_sig == drive and e.target_sig == drive + ] + assert summaries, "no SUMMARY edge at drive's chain_a callsite" + # And chain_a itself summarizes its call to chain_b. + assert any( + e.type == "SUMMARY" and e.source_sig == chain_a for e in ir.sdg_edges + ) + + +def test_global_flow_is_stitched_across_files(ir): + drive = _sig(ir, "drive") + bump = _sig(ir, "bump") + read_counter = _sig(ir, "read_counter") + # bump's write formal flows out to drive's callsite... + out_edges = [ + e for e in ir.sdg_edges + if e.type == "PARAM_OUT" and e.source_sig == bump and e.target_sig == drive + and (e.var or "").startswith(GLOBAL_PREFIX) + ] + assert out_edges, "bump's global write does not reach drive" + # ...and read_counter's read formal is fed from drive's callsite. 
+ in_edges = [ + e for e in ir.sdg_edges + if e.type == "PARAM_IN" and e.source_sig == drive and e.target_sig == read_counter + and (e.var or "").startswith(GLOBAL_PREFIX) + ] + assert in_edges, "read_counter's global read is not bound at drive" + + +def test_closure_capture_binds_at_definition_site(ir): + make_adder = _sig(ir, "make_adder") + add = _sig(ir, "make_adder.add") + edges = [ + e for e in ir.sdg_edges + if e.type == "PARAM_IN" and e.source_sig == make_adder and e.target_sig == add + and e.var == CAPTURE_PREFIX + "base" + ] + assert edges, "capture formal for `base` is not bound at the def site" + + +def test_mutation_flows_back_through_param_out(ir): + caller = _sig(ir, "caller_of_mutate") + mutate = _sig(ir, "mutate") + edges = [ + e for e in ir.sdg_edges + if e.type == "PARAM_OUT" and e.source_sig == mutate and e.target_sig == caller + and e.var == "items" + ] + assert edges, "mutate's param mutation does not flow back to the caller" + + +def test_assembly_is_deterministic(fixture_app): + a = build_program_graphs(fixture_app) + b = build_program_graphs(fixture_app) + assert a.sdg_edges == b.sdg_edges + for sig in a.functions: + assert [ + (p.id, p.kind, p.var) for p in a.functions[sig].param_nodes + ] == [(p.id, p.kind, p.var) for p in b.functions[sig].param_nodes] From 43e0e69f513002b9626232f8b7a77ebf0335d5d2 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 21:55:15 -0400 Subject: [PATCH 07/11] =?UTF-8?q?feat(dataflow):=20stage=208a=20=E2=80=94?= =?UTF-8?q?=20two-phase=20context-sensitive=20backward=20slicing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Classic HRB traversal over the assembled SDG: phase 1 ascends and skips across callsites via SUMMARY edges (never PARAM_OUT), phase 2 descends (never PARAM_IN/CALL) — call–return matching without re-descent. 
Gate pins an exact hand-computed interprocedural slice (caller_of_mutate → mutate) plus cross-file global descent and no-reascend properties. (#67) --- codeanalyzer/dataflow/slicing.py | 93 ++++++++++++++++++++++++++ test/test_dataflow_slicing.py | 110 +++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 codeanalyzer/dataflow/slicing.py create mode 100644 test/test_dataflow_slicing.py diff --git a/codeanalyzer/dataflow/slicing.py b/codeanalyzer/dataflow/slicing.py new file mode 100644 index 0000000..8f50146 --- /dev/null +++ b/codeanalyzer/dataflow/slicing.py @@ -0,0 +1,93 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Stage 8 of the level-3 dataflow ladder: backward slicing as an SDG query. + +The classic Horwitz–Reps–Binkley two-phase traversal, which is what makes the +slice *context-sensitive* without re-descending into callees: + +- **Phase 1** walks backward over every dependence edge **except PARAM_OUT**: + it ascends from the criterion to callers (PARAM_IN/CALL reversed) and steps + *across* callsites through SUMMARY edges, but never descends into a callee. 
Node = Tuple[str, int]  # (signature, node_id)


def _reverse_adjacency(ir: "ProgramGraphsIR") -> Dict[Node, List[Tuple[Node, str]]]:
    """Index every dependence edge by its target.

    Returns ``target -> [(source, edge_type), ...]`` covering, per function,
    the CDG-typed entries of ``pdg.edges``, all ``ddg`` entries (labelled
    ``"DDG"``) and all ``extra_edges`` (labelled with their own type), plus
    the interprocedural ``ir.sdg_edges``.
    """
    incoming: Dict[Node, List[Tuple[Node, str]]] = {}

    for sig, graphs in ir.functions.items():
        # Intraprocedural edges: both endpoints live in the same function.
        local = [(e.source, e.target, "CDG") for e in graphs.pdg.edges if e.type == "CDG"]
        local += [(e.source, e.target, "DDG") for e in graphs.ddg]
        local += [(e.source, e.target, e.type) for e in graphs.extra_edges]
        for src_id, tgt_id, label in local:
            incoming.setdefault((sig, tgt_id), []).append(((sig, src_id), label))

    for e in ir.sdg_edges:
        origin = (e.source_sig, e.source_node)
        incoming.setdefault((e.target_sig, e.target_node), []).append((origin, e.type))
    return incoming


def backward_slice(ir: "ProgramGraphsIR", signature: str, node_id: int) -> Set[Node]:
    """Context-sensitive backward slice of ``(signature, node_id)``.

    Two reverse reachability passes over the edge index:

    1. from the criterion, following every edge type except ``PARAM_OUT``;
    2. from everything pass 1 reached, following every edge type except
       ``PARAM_IN`` and ``CALL``.

    Returns the union of both passes as ``(signature, node_id)`` pairs; the
    criterion itself is always included.

    Raises:
        KeyError: if ``signature`` is not a key of ``ir.functions``.
    """
    if signature not in ir.functions:
        raise KeyError(f"unknown signature: {signature}")
    incoming = _reverse_adjacency(ir)

    def reach(start, blocked):
        # Iterative DFS over reversed edges, ignoring the blocked edge types.
        visited: Set[Node] = set()
        pending = list(start)
        while pending:
            current = pending.pop()
            if current in visited:
                continue
            visited.add(current)
            pending.extend(
                pred
                for pred, label in incoming.get(current, ())
                if label not in blocked and pred not in visited
            )
        return visited

    ascent = reach({(signature, node_id)}, frozenset({"PARAM_OUT"}))
    descent = reach(ascent, frozenset({"PARAM_IN", "CALL"}))
    return ascent | descent
def _cfg_id(ir, sig: str, line: int) -> int:
    """Return the id of the first non-ENTRY CFG node of ``sig`` starting on ``line``.

    Nodes are scanned in the order ``pdg.cfg.nodes`` lists them, and the
    first whose ``start_line`` equals ``line`` and whose ``kind`` is not
    ``"entry"`` wins.

    Raises:
        AssertionError: when no such node exists.  This names the function
            and the line, so a stale line number in a test reads as a clear
            failure rather than a bare ``StopIteration``.
    """
    for node in ir.functions[sig].pdg.cfg.nodes:
        if node.start_line == line and node.kind != "entry":
            return node.id
    raise AssertionError(f"{sig}: no CFG node starts at line {line}")


def _param_id(ir, sig: str, kind: str, var: str, call_node=None) -> int:
    """Return the id of the unique parameter node of ``sig`` matching the filters.

    A parameter node matches when its ``kind`` and ``var`` equal the given
    values and, if ``call_node`` is provided, its ``call_node`` equals it
    too.  Exactly one match is required; anything else fails the test with
    the list of candidate ids.
    """
    fg = ir.functions[sig]
    matches = [
        p.id
        for p in fg.param_nodes
        if p.kind == kind and p.var == var and (call_node is None or p.call_node == call_node)
    ]
    assert len(matches) == 1, f"{sig} {kind} {var}: {matches}"
    return matches[0]
-a 1/2 emit no program_graphs and their pipeline is untouched. (#67) --- codeanalyzer/__main__.py | 47 ++++++++++- codeanalyzer/core.py | 22 +++++- codeanalyzer/dataflow/builder.py | 98 +++++++++++++++++++++++ codeanalyzer/options/options.py | 4 + codeanalyzer/schema/__init__.py | 20 +++++ codeanalyzer/schema/py_schema.py | 131 +++++++++++++++++++++++++++++++ test/test_dataflow_emission.py | 101 ++++++++++++++++++++++++ 7 files changed, 419 insertions(+), 4 deletions(-) create mode 100644 test/test_dataflow_emission.py diff --git a/codeanalyzer/__main__.py b/codeanalyzer/__main__.py index 4d9db86..950951b 100644 --- a/codeanalyzer/__main__.py +++ b/codeanalyzer/__main__.py @@ -114,11 +114,31 @@ def main( typer.Option( "-a", "--analysis-level", - help="Analysis depth: 1=symbol table+Jedi call graph, 2=+PyCG call graph.", + help="Analysis depth: 1=symbol table+Jedi call graph, 2=+PyCG call " + "graph, 3=+native dataflow graphs (CFG/PDG/SDG).", min=1, - max=2, + max=3, ), ] = 1, + graphs: Annotated[ + str, + typer.Option( + "--graphs", + help="Level 3 only: comma-separated program-graph sections to emit " + "(cfg, dfg, pdg, sdg). Default: all. `dfg` emits the PDG's data " + "edges only; `sdg` implies the dependence edges it stitches.", + ), + ] = "cfg,dfg,pdg,sdg", + graph_field_depth: Annotated[ + int, + typer.Option( + "--graph-field-depth", + help="Level 3 only: k-limit on access-path depth (x.f.g.h with " + "k=3 becomes x.f.g.*). Mandatory bound — it is what guarantees " + "the interprocedural fixpoint terminates.", + min=1, + ), + ] = 3, using_ray: Annotated[ bool, typer.Option("--ray/--no-ray", help="Enable Ray for distributed analysis."), @@ -243,6 +263,27 @@ def main( ), ] = 50, ): + # Flag validation (strict: unrecognized values error out, never fall back). 
+ selected_graphs = [g.strip() for g in graphs.split(",") if g.strip()] + from codeanalyzer.dataflow.builder import VALID_GRAPHS + + unknown_graphs = [g for g in selected_graphs if g not in VALID_GRAPHS] + if unknown_graphs: + logger.error( + f"Unrecognized --graphs value(s): {', '.join(unknown_graphs)} " + f"(valid: {', '.join(VALID_GRAPHS)})." + ) + raise typer.Exit(code=2) + if not selected_graphs: + logger.error("--graphs requires at least one of: " + ", ".join(VALID_GRAPHS)) + raise typer.Exit(code=2) + if analysis_level < 3 and graphs != "cfg,dfg,pdg,sdg": + logger.error("--graphs is a level-3 option; pass -a 3 to emit program graphs.") + raise typer.Exit(code=2) + if analysis_level < 3 and graph_field_depth != 3: + logger.error("--graph-field-depth is a level-3 option; pass -a 3.") + raise typer.Exit(code=2) + options = AnalysisOptions( input=input, output=output, @@ -254,6 +295,8 @@ def main( neo4j_password=neo4j_password, neo4j_database=neo4j_database, analysis_level=analysis_level, + graphs=",".join(selected_graphs), + graph_field_depth=graph_field_depth, using_ray=using_ray, rebuild_analysis=rebuild_analysis, skip_tests=skip_tests, diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index dad43c9..4a42ad2 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -454,10 +454,28 @@ def analyze(self) -> PyApplication: .external_symbols(external_symbols) .build() ) - + + if self.analysis_level >= 3: + # Level 3: native dataflow graphs (CFG/PDG/SDG) over the same + # signatures, gated so -a 1/-a 2 timings stay untouched. 
VALID_GRAPHS = ("cfg", "dfg", "pdg", "sdg")


def to_program_graphs(ir: ProgramGraphsIR, graphs: Set[str]):
    """Render the IR as the ``program_graphs`` schema section.

    ``graphs`` is the set of selected section names and decides what each
    function section carries:

    - ``cfg``: the CFG nodes and edges;
    - ``pdg`` or ``sdg``: the CDG-typed edges of ``pdg.edges``;
    - ``dfg``, ``pdg`` or ``sdg``: the DDG edges, plus those ``extra_edges``
      that are DDG-typed (all of them when ``pdg``/``sdg`` is selected);
    - ``sdg``: the parameter nodes and the top-level ``sdg_edges``.

    A function's ``pdg`` attribute is only assigned when at least one
    dependence edge was selected for it; its edges are sorted by
    ``(source, target, type, var)``.  Functions are visited in sorted
    signature order.

    Returns:
        A ``PyProgramGraphs`` with ``schema_version`` ``"1.0.0"`` and the
        IR's ``k_limit``.
    """
    from codeanalyzer.schema.py_schema import (
        PyCFG,
        PyCFGEdge,
        PyFunctionGraphs,
        PyGraphNode,
        PyParamNode,
        PyPDG,
        PyPDGEdge,
        PyProgramGraphs,
        PySDGEdge,
        PySDGEndpoint,
    )

    emit_cfg = "cfg" in graphs
    emit_sdg = "sdg" in graphs
    emit_control = emit_sdg or "pdg" in graphs
    emit_data = emit_control or "dfg" in graphs

    def edge_order(edge):
        # ``var`` may be None; normalise so the tuple stays comparable.
        return (edge.source, edge.target, edge.type, edge.var or "")

    per_function: Dict[str, "PyFunctionGraphs"] = {}
    for signature in sorted(ir.functions):
        fn = ir.functions[signature]
        section = PyFunctionGraphs()

        if emit_cfg:
            cfg_nodes = []
            for node in fn.pdg.cfg.nodes:
                cfg_nodes.append(
                    PyGraphNode(
                        id=node.id,
                        kind=node.kind,
                        start_line=node.start_line,
                        end_line=node.end_line,
                        start_column=node.start_column,
                        end_column=node.end_column,
                    )
                )
            cfg_edges = []
            for flow in fn.pdg.cfg.edges:
                cfg_edges.append(
                    PyCFGEdge(source=flow.source, target=flow.target, kind=flow.kind)
                )
            section.cfg = PyCFG(nodes=cfg_nodes, edges=cfg_edges)

        dependence: List["PyPDGEdge"] = []
        if emit_control:
            for dep in fn.pdg.edges:
                if dep.type == "CDG":
                    dependence.append(
                        PyPDGEdge(source=dep.source, target=dep.target, type="CDG")
                    )
        if emit_data:
            for dep in fn.ddg:
                dependence.append(
                    PyPDGEdge(
                        source=dep.source, target=dep.target, type="DDG", var=dep.var
                    )
                )
            for dep in fn.extra_edges:
                # With only ``dfg`` selected, keep just the data edges.
                if emit_control or dep.type == "DDG":
                    dependence.append(
                        PyPDGEdge(
                            source=dep.source,
                            target=dep.target,
                            type=dep.type,
                            var=dep.var,
                        )
                    )
        if dependence:
            section.pdg = PyPDG(edges=sorted(dependence, key=edge_order))

        if emit_sdg:
            params = []
            for param in fn.param_nodes:
                params.append(
                    PyParamNode(
                        id=param.id,
                        kind=param.kind,
                        var=param.var,
                        call_node=param.call_node,
                        start_line=param.start_line,
                        end_line=param.end_line,
                    )
                )
            section.param_nodes = params

        per_function[signature] = section

    interprocedural = []
    if emit_sdg:
        for link in ir.sdg_edges:
            interprocedural.append(
                PySDGEdge(
                    source=PySDGEndpoint(
                        signature=link.source_sig, node=link.source_node
                    ),
                    target=PySDGEndpoint(
                        signature=link.target_sig, node=link.target_node
                    ),
                    type=link.type,
                    var=link.var,
                )
            )

    return PyProgramGraphs(
        schema_version="1.0.0",
        k_limit=ir.k_limit,
        functions=per_function,
        sdg_edges=interprocedural,
    )
``id`` is the source-span + order index within the callable (synthetic ENTRY = 0, EXIT = last CFG id); + ``(signature, id)`` is the cross-section join key.""" + + id: int + kind: Literal[ + "entry", "exit", "statement", "branch", "loop", "return", "raise", "handler" + ] = "statement" + start_line: int = -1 + end_line: int = -1 + start_column: int = -1 + end_column: int = -1 + + +@builder +@msgpk +class PyCFGEdge(BaseModel): + """Control-flow successor edge (shared cross-language kind vocabulary).""" + + source: int + target: int + kind: Literal[ + "fallthrough", + "true", + "false", + "switch_case", + "loop_back", + "exception", + "return", + "break", + "continue", + "yield", + "await_resume", + ] = "fallthrough" + + +@builder +@msgpk +class PyPDGEdge(BaseModel): + """Dependence edge: control (``CDG``) or data (``DDG``, labeled with the + k-limited access path being read).""" + + source: int + target: int + type: Literal["CDG", "DDG"] = "DDG" + var: Optional[str] = None + + +@builder +@msgpk +class PyParamNode(BaseModel): + """HRB parameter-passing node, sharing the owning callable's id space + (allocated after EXIT). 
``call_node`` is the owning callsite statement for + actuals; ``var`` is the parameter name, ````, ``:name``, + or ``:module::name``.""" + + id: int + kind: Literal["formal_in", "formal_out", "actual_in", "actual_out"] + var: str + call_node: Optional[int] = None + start_line: int = -1 + end_line: int = -1 + + +@builder +@msgpk +class PyCFG(BaseModel): + """One callable's control-flow graph.""" + + nodes: List[PyGraphNode] = [] + edges: List[PyCFGEdge] = [] + + +@builder +@msgpk +class PyPDG(BaseModel): + """One callable's dependence edges (over the same node ids as the CFG + plus its parameter nodes).""" + + edges: List[PyPDGEdge] = [] + + +@builder +@msgpk +class PyFunctionGraphs(BaseModel): + """The per-callable level-3 sections, keyed by signature.""" + + cfg: Optional[PyCFG] = None + pdg: Optional[PyPDG] = None + param_nodes: List[PyParamNode] = [] + + +@builder +@msgpk +class PySDGEndpoint(BaseModel): + """A ``(signature, node)`` reference into a function's emitted graphs.""" + + signature: str + node: int + + +@builder +@msgpk +class PySDGEdge(BaseModel): + """Interprocedural dependence edge. ``CALL``/``PARAM_IN``/``PARAM_OUT`` + cross functions; ``SUMMARY`` connects a callsite's actual_in to its + actual_out within the caller (the callee's transitive flow).""" + + source: PySDGEndpoint + target: PySDGEndpoint + type: Literal["CALL", "PARAM_IN", "PARAM_OUT", "SUMMARY"] + var: Optional[str] = None + + +@builder +@msgpk +class PyProgramGraphs(BaseModel): + """The optional level-3 top-level section of ``analysis.json`` (present + only at ``-a 3``), versioned independently of the application schema.""" + + schema_version: str = "1.0.0" + k_limit: int = 3 + functions: Dict[str, PyFunctionGraphs] = {} + sdg_edges: List[PySDGEdge] = [] + + @builder @msgpk class PyApplication(BaseModel): @@ -380,3 +509,5 @@ class PyApplication(BaseModel): # builtin members), keyed by signature. 
def test_level3_emits_validating_program_graphs(cli_runner, tmp_path):
    """``-a 3`` writes an ``analysis.json`` whose ``program_graphs`` validates.

    Checks the section's version and default k-limit, that the default
    selector emits both CFG and PDG per function, and that *both* endpoints
    of every SDG edge resolve to an emitted CFG or parameter node.
    """
    out, result = _invoke(cli_runner, tmp_path, "--analysis-level", "3")
    assert result.exit_code == 0, result.output
    raw = (out / "analysis.json").read_text()
    application = model_validate_json(PyApplication, raw)
    pg = application.program_graphs
    assert pg is not None
    assert pg.schema_version == "1.0.0"
    assert pg.k_limit == 3
    assert pg.functions and pg.sdg_edges
    # Every function section carries all default graphs.
    some = next(iter(pg.functions.values()))
    assert some.cfg is not None and some.pdg is not None
    # No dangling SDG endpoints at the schema level either.  Build each
    # function's id set once, then check the source AND the target of every
    # edge (a dangling target is as much a defect as a dangling source).
    valid_ids = {
        sig: {n.id for n in fg.cfg.nodes} | {p.id for p in fg.param_nodes}
        for sig, fg in pg.functions.items()
    }
    for e in pg.sdg_edges:
        for endpoint in (e.source, e.target):
            assert endpoint.signature in valid_ids, e
            assert endpoint.node in valid_ids[endpoint.signature], e
-0400 Subject: [PATCH 09/11] =?UTF-8?q?feat(dataflow):=20stage=208b=20=E2=80=94?= =?UTF-8?q?=20CPG=20projection=20through=20the=20Neo4j=20emitter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CFGNode label (merge key id = #) carrying both CFG statements and HRB parameter nodes, plus the shared cross-language edge vocabulary HAS_CFG_NODE / CFG_NEXT / CDG / DDG / PARAM_IN / PARAM_OUT / SUMMARY (deliberately unprefixed — parity clause). Additive schema.neo4j.json bump to 1.2.0; sample app extended so the conformance tests exercise every new row family; count-parity and no-dangling gates on the real fixture at -a 3. CALL stays at the callable level (PY_CALLS twin). (#67) --- codeanalyzer/neo4j/project.py | 96 ++++++++++++++++++++++++++++++++++ codeanalyzer/neo4j/schema.py | 27 +++++++++- schema.neo4j.json | 97 ++++++++++++++++++++++++++++++++++- test/sample_graph_app.py | 90 ++++++++++++++++++++++++++++++++ test/test_dataflow_cpg.py | 74 ++++++++++++++++++++++++++ 5 files changed, 381 insertions(+), 3 deletions(-) create mode 100644 test/test_dataflow_cpg.py diff --git a/codeanalyzer/neo4j/project.py b/codeanalyzer/neo4j/project.py index b1d5b65..c1aafc1 100644 --- a/codeanalyzer/neo4j/project.py +++ b/codeanalyzer/neo4j/project.py @@ -74,9 +74,105 @@ def project(app: PyApplication, app_name: str) -> GraphRows: "PY_CALLS", src, tgt, _call_edge_props(e.weight, list(e.provenance or [])) ) + # Level-3 CPG overlay (present only at -a 3): the same program_graphs IR + # projected as :CFGNode nodes and the shared cross-language edge types. 
def _signature_modules(app: PyApplication) -> dict:
    """Map each callable signature to the symbol-table key of its module.

    Used to stamp the ``_module`` property on CFGNode rows.  When the same
    signature is yielded more than once, the last module seen wins.
    """
    from codeanalyzer.semantic_analysis.call_graph import _walk_module_callables

    return {
        callable_.signature: file_key
        for file_key, mod in app.symbol_table.items()
        for callable_ in _walk_module_callables(mod)
    }


def _cfg_node_ref(b: RowBuilder, sig: str, node_id: int) -> NodeRef:
    """Reference the CFGNode whose merge key ``id`` is ``"<sig>#<node_id>"``.

    ``b`` is accepted for call-site symmetry but is not used.
    """
    key = f"{sig}#{node_id}"
    return NodeRef("CFGNode", "id", key)


def _project_program_graphs(b: RowBuilder, app: PyApplication) -> None:
    """Emit the CFG/PDG/SDG rows of ``app.program_graphs`` into ``b``.

    Per function, in this order: one ``CFGNode`` (merge key ``id`` =
    ``"<signature>#<node id>"``) plus a ``HAS_CFG_NODE`` edge from the
    owning symbol for every CFG node, the same for every parameter node
    (which additionally carries ``var`` and ``call_node``), then a
    ``CFG_NEXT`` edge (prop ``kind``) per CFG edge and one edge per PDG edge
    typed by the edge's own ``type`` (prop ``var``).  Finally, every SDG
    edge except ``CALL`` becomes an edge of its own type between CFGNode
    references.
    """
    graphs = app.program_graphs
    module_of = _signature_modules(app)

    def emit_node(owner, sig, node_id, props):
        # Create the node row first, then hang it off the owning symbol.
        ref = b.node(["CFGNode"], "id", f"{sig}#{node_id}", prune(props))
        b.edge("HAS_CFG_NODE", owner, ref)

    for sig, fn in graphs.functions.items():
        owner = _sym(sig)
        module = module_of.get(sig)
        cfg = fn.cfg
        pdg = fn.pdg

        for node in (cfg.nodes if cfg else []):
            emit_node(
                owner,
                sig,
                node.id,
                {
                    "kind": node.kind,
                    "start_line": node.start_line,
                    "end_line": node.end_line,
                    "_module": module,
                },
            )

        for param in fn.param_nodes or []:
            emit_node(
                owner,
                sig,
                param.id,
                {
                    "kind": param.kind,
                    "var": param.var,
                    "call_node": param.call_node,
                    "start_line": param.start_line,
                    "end_line": param.end_line,
                    "_module": module,
                },
            )

        for flow in (cfg.edges if cfg else []):
            src = _cfg_node_ref(b, sig, flow.source)
            tgt = _cfg_node_ref(b, sig, flow.target)
            b.edge("CFG_NEXT", src, tgt, {"kind": flow.kind})

        for dep in (pdg.edges if pdg else []):
            src = _cfg_node_ref(b, sig, dep.source)
            tgt = _cfg_node_ref(b, sig, dep.target)
            b.edge(dep.type, src, tgt, prune({"var": dep.var}))  # CDG | DDG

    for link in graphs.sdg_edges:
        if link.type == "CALL":
            # Calls are already carried by the callable-level PY_CALLS twin.
            continue
        src = _cfg_node_ref(b, link.source.signature, link.source.node)
        tgt = _cfg_node_ref(b, link.target.signature, link.target.node)
        # NOTE(review): the schema declares SUMMARY without a `var` property
        # while this passes one whenever the edge has it — confirm SUMMARY
        # edges never carry `var`, or add it to the schema.
        b.edge(link.type, src, tgt, prune({"var": link.var}))  # PARAM_IN | PARAM_OUT | SUMMARY
@@ -176,6 +176,23 @@ class RelType: "_module": "string", }, ), + # Level-3 CPG overlay (present only at -a 3). The label and edge types + # below are the shared cross-language dataflow vocabulary — deliberately + # NOT PY_-prefixed. `id` = "<signature>#<node_id>"; parameter-passing + # nodes (formal/actual in/out) ride the same label with `var`/`call_node`. + NodeLabel( + "CFGNode", + "CFGNode", + "id", + { + "id": "string", + "kind": "string", + "var": "string", + "call_node": "integer", + **_SPAN, + "_module": "string", + }, + ), ] _DECL_TARGETS = ["PyClass", "PyCallable"] @@ -203,6 +220,14 @@ class RelType: {"imported_names": "string[]", "aliases": "string[]"}, ), RelType("PY_DECORATED_BY", ["PyCallable"], ["PyDecorator"]), + # Level-3 CPG overlay (shared cross-language vocabulary, -a 3 only). + RelType("HAS_CFG_NODE", ["PyCallable"], ["CFGNode"]), + RelType("CFG_NEXT", ["CFGNode"], ["CFGNode"], {"kind": "string"}), + RelType("CDG", ["CFGNode"], ["CFGNode"]), + RelType("DDG", ["CFGNode"], ["CFGNode"], {"var": "string"}), + RelType("PARAM_IN", ["CFGNode"], ["CFGNode"], {"var": "string"}), + RelType("PARAM_OUT", ["CFGNode"], ["CFGNode"], {"var": "string"}), + RelType("SUMMARY", ["CFGNode"], ["CFGNode"]), ] diff --git a/schema.neo4j.json b/schema.neo4j.json index f75ab6e..8098d3e 100644 --- a/schema.neo4j.json +++ b/schema.neo4j.json @@ -1,5 +1,5 @@ { - "schema_version": "1.1.0", + "schema_version": "1.2.0", "generator": "codeanalyzer-python", "marker_labels": [], "node_labels": [ @@ -135,6 +135,20 @@ "end_line": "integer", "_module": "string" } + }, + { + "label": "CFGNode", + "merge_label": "CFGNode", + "key": "id", + "properties": { + "id": "string", + "kind": "string", + "var": "string", + "call_node": "integer", + "start_line": "integer", + "end_line": "integer", + "_module": "string" + } } ], "relationship_types": [ @@ -260,6 +274,84 @@ "PyDecorator" ], "properties": {} + }, + { + "type": "HAS_CFG_NODE", + "from": [ + "PyCallable" + ], + "to": [ + "CFGNode" + ], + "properties":
{} + }, + { + "type": "CFG_NEXT", + "from": [ + "CFGNode" + ], + "to": [ + "CFGNode" + ], + "properties": { + "kind": "string" + } + }, + { + "type": "CDG", + "from": [ + "CFGNode" + ], + "to": [ + "CFGNode" + ], + "properties": {} + }, + { + "type": "DDG", + "from": [ + "CFGNode" + ], + "to": [ + "CFGNode" + ], + "properties": { + "var": "string" + } + }, + { + "type": "PARAM_IN", + "from": [ + "CFGNode" + ], + "to": [ + "CFGNode" + ], + "properties": { + "var": "string" + } + }, + { + "type": "PARAM_OUT", + "from": [ + "CFGNode" + ], + "to": [ + "CFGNode" + ], + "properties": { + "var": "string" + } + }, + { + "type": "SUMMARY", + "from": [ + "CFGNode" + ], + "to": [ + "CFGNode" + ], + "properties": {} } ], "constraints": [ @@ -270,7 +362,8 @@ "CREATE CONSTRAINT pydecorator_name IF NOT EXISTS FOR (x:PyDecorator) REQUIRE x.name IS UNIQUE", "CREATE CONSTRAINT pycallsite_id IF NOT EXISTS FOR (x:PyCallSite) REQUIRE x.id IS UNIQUE", "CREATE CONSTRAINT pyattribute_id IF NOT EXISTS FOR (x:PyAttribute) REQUIRE x.id IS UNIQUE", - "CREATE CONSTRAINT pyvariable_id IF NOT EXISTS FOR (x:PyVariable) REQUIRE x.id IS UNIQUE" + "CREATE CONSTRAINT pyvariable_id IF NOT EXISTS FOR (x:PyVariable) REQUIRE x.id IS UNIQUE", + "CREATE CONSTRAINT cfgnode_id IF NOT EXISTS FOR (x:CFGNode) REQUIRE x.id IS UNIQUE" ], "indexes": [ "CREATE INDEX py_callable_name IF NOT EXISTS FOR (c:PyCallable) ON (c.name)", diff --git a/test/sample_graph_app.py b/test/sample_graph_app.py index c9a98c5..69f2221 100644 --- a/test/sample_graph_app.py +++ b/test/sample_graph_app.py @@ -11,12 +11,22 @@ from codeanalyzer.schema import ( PyApplication, PyCallable, + PyCFG, + PyCFGEdge, PyClass, PyClassAttribute, PyComment, PyExternalSymbol, + PyFunctionGraphs, + PyGraphNode, PyImport, PyModule, + PyParamNode, + PyPDG, + PyPDGEdge, + PyProgramGraphs, + PySDGEdge, + PySDGEndpoint, PyVariableDeclaration, ) from codeanalyzer.schema.py_schema import PyCallEdge, PyCallsite @@ -147,10 +157,90 @@ def make_sample_app() -> 
PyApplication: ), ] + # A miniature level-3 section exercising every CPG row family: + # helper's CFG (entry → callsite stmt → exit), a CDG/DDG pair, its HRB + # parameter nodes, and PARAM_IN/PARAM_OUT/SUMMARY edges into announce. + helper_graphs = PyFunctionGraphs( + cfg=PyCFG( + nodes=[ + PyGraphNode(id=0, kind="entry", start_line=17, end_line=17), + PyGraphNode(id=1, kind="statement", start_line=18, end_line=18), + PyGraphNode(id=2, kind="exit", start_line=20, end_line=20), + ], + edges=[ + PyCFGEdge(source=0, target=1, kind="fallthrough"), + PyCFGEdge(source=1, target=2, kind="return"), + PyCFGEdge(source=1, target=2, kind="exception"), + ], + ), + pdg=PyPDG( + edges=[ + PyPDGEdge(source=0, target=1, type="CDG"), + PyPDGEdge(source=0, target=1, type="DDG", var="url"), + ] + ), + param_nodes=[ + PyParamNode(id=3, kind="formal_out", var="", start_line=20, end_line=20), + PyParamNode(id=4, kind="actual_in", var="self", call_node=1, start_line=18, end_line=18), + PyParamNode(id=5, kind="actual_out", var="", call_node=1, start_line=18, end_line=18), + ], + ) + announce_graphs = PyFunctionGraphs( + cfg=PyCFG( + nodes=[ + PyGraphNode(id=0, kind="entry", start_line=10, end_line=10), + PyGraphNode(id=1, kind="return", start_line=11, end_line=11), + PyGraphNode(id=2, kind="exit", start_line=12, end_line=12), + ], + edges=[ + PyCFGEdge(source=0, target=1, kind="fallthrough"), + PyCFGEdge(source=1, target=2, kind="return"), + ], + ), + pdg=PyPDG(edges=[PyPDGEdge(source=0, target=1, type="CDG")]), + param_nodes=[ + PyParamNode(id=3, kind="formal_in", var="self", start_line=10, end_line=10), + PyParamNode(id=4, kind="formal_out", var="", start_line=12, end_line=12), + ], + ) + program_graphs = PyProgramGraphs( + schema_version="1.0.0", + k_limit=3, + functions={ + "src.service.helper": helper_graphs, + "src.service.Service.announce": announce_graphs, + }, + sdg_edges=[ + PySDGEdge( + source=PySDGEndpoint(signature="src.service.helper", node=1), + 
target=PySDGEndpoint(signature="src.service.Service.announce", node=0), + type="CALL", + ), + PySDGEdge( + source=PySDGEndpoint(signature="src.service.helper", node=4), + target=PySDGEndpoint(signature="src.service.Service.announce", node=3), + type="PARAM_IN", + var="self", + ), + PySDGEdge( + source=PySDGEndpoint(signature="src.service.Service.announce", node=4), + target=PySDGEndpoint(signature="src.service.helper", node=5), + type="PARAM_OUT", + var="", + ), + PySDGEdge( + source=PySDGEndpoint(signature="src.service.helper", node=4), + target=PySDGEndpoint(signature="src.service.helper", node=5), + type="SUMMARY", + ), + ], + ) + return PyApplication( symbol_table={"src/service.py": service_mod, "src/util.py": util_mod}, call_graph=call_graph, # The ghost edge's target (requests.get) is a library member, recorded as a # first-class external symbol so the projection emits a :PyExternal for it. external_symbols={"requests.get": PyExternalSymbol(name="get", module="requests")}, + program_graphs=program_graphs, ) diff --git a/test/test_dataflow_cpg.py b/test/test_dataflow_cpg.py new file mode 100644 index 0000000..8a303e6 --- /dev/null +++ b/test/test_dataflow_cpg.py @@ -0,0 +1,74 @@ +"""Stage-8b gate: the CPG projection of the level-3 graphs. + +- CFGNode row count equals the JSON section's node count (CFG + parameter + nodes) — the contract's count-parity assertion; +- every CFG_NEXT/CDG/DDG/PARAM_IN/PARAM_OUT/SUMMARY edge endpoint references + an emitted CFGNode id (deferred-edge/no-dangling gate); +- the Cypher snapshot renders and contains the overlay's vocabulary. + +Loading into a live Neo4j is exercised by the (container-gated) bolt tests; +these stay fast and deterministic. 
+""" + +from pathlib import Path + +import pytest + +from codeanalyzer.core import Codeanalyzer +from codeanalyzer.neo4j import project +from codeanalyzer.neo4j.cypher import render_cypher +from codeanalyzer.options import AnalysisOptions + +FIXTURE = Path(__file__).parent / "fixtures" / "single_functionalities" / "dataflow" + +CPG_EDGE_TYPES = {"CFG_NEXT", "CDG", "DDG", "PARAM_IN", "PARAM_OUT", "SUMMARY"} + + +@pytest.fixture(scope="module") +def level3_app(tmp_path_factory): + cache = tmp_path_factory.mktemp("dataflow-cpg-cache") + options = AnalysisOptions( + input=FIXTURE, analysis_level=3, no_venv=True, cache_dir=cache + ) + with Codeanalyzer(options) as analyzer: + return analyzer.analyze() + + +@pytest.fixture(scope="module") +def rows(level3_app): + return project(level3_app, "dataflow-fixture") + + +def test_cfg_node_count_matches_the_json_section(level3_app, rows): + expected = sum( + len(fg.cfg.nodes if fg.cfg else []) + len(fg.param_nodes or []) + for fg in level3_app.program_graphs.functions.values() + ) + emitted = [n for n in rows.nodes if "CFGNode" in n.labels] + assert expected > 0 + assert len(emitted) == expected + + +def test_no_dangling_cpg_edge_endpoints(rows): + cfg_ids = {n.value for n in rows.nodes if "CFGNode" in n.labels} + cpg_edges = [e for e in rows.edges if e.type in CPG_EDGE_TYPES] + assert cpg_edges, "no CPG edges projected" + for e in cpg_edges: + if e.from_ref.label == "CFGNode": + assert e.from_ref.value in cfg_ids, e + if e.to_ref.label == "CFGNode": + assert e.to_ref.value in cfg_ids, e + + +def test_every_callable_with_graphs_owns_its_cfg_nodes(level3_app, rows): + has_edges = [e for e in rows.edges if e.type == "HAS_CFG_NODE"] + owned = {e.to_ref.value for e in has_edges} + cfg_ids = {n.value for n in rows.nodes if "CFGNode" in n.labels} + assert owned == cfg_ids, "every CFGNode must be owned by its callable" + + +def test_cypher_snapshot_renders_the_overlay(level3_app, rows): + cypher = render_cypher(rows, 
"dataflow-fixture") + assert ":CFGNode" in cypher + for t in CPG_EDGE_TYPES: + assert t in cypher, f"{t} missing from the snapshot" From ac7acb36504cb2a7c1d84a4e3adb8ea9daf4c25d Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Wed, 1 Jul 2026 22:27:47 -0400 Subject: [PATCH 10/11] docs(dataflow): analysis levels, Architecture & Tooling, schema decisions README gains the level table, the locked level-3 substrate decisions (CFG from stdlib ast, hand-built reaching defs, type-based may-alias MVP, documented unsoundness), a level-3 usage example, and a regenerated --help block; CHANGELOG Unreleased entry; level-3 schema decision log tracked at .claude/SCHEMA_DECISIONS.md as SDK-model input (un-ignored past the global .claude exclude). (#67) --- .claude/SCHEMA_DECISIONS.md | 61 +++++ .gitignore | 5 + CHANGELOG.md | 28 +++ README.md | 486 ++++++++++++++++++------------------ 4 files changed, 334 insertions(+), 246 deletions(-) create mode 100644 .claude/SCHEMA_DECISIONS.md diff --git a/.claude/SCHEMA_DECISIONS.md b/.claude/SCHEMA_DECISIONS.md new file mode 100644 index 0000000..1036a91 --- /dev/null +++ b/.claude/SCHEMA_DECISIONS.md @@ -0,0 +1,61 @@ +# Schema decisions — codeanalyzer-python + +Decision log for schema-affecting choices, kept as input for the CLDK SDK +model work (the frontend skill encodes these as shared Pydantic models). The +level-1/2 schema (`PyApplication`, symbol table, call graph) predates this +log; entries below start at level 3. + +## Level 3 — `program_graphs` (issue #67, schema_version 1.0.0) + +Contract baseline: the CLDK dataflow-graphs contract (shared node kinds, edge +types, JSON shapes; `(signature, node_id)` identity; `CFG_NEXT`/`CDG`/`DDG`/ +`CALL`/`PARAM_IN`/`PARAM_OUT`/`SUMMARY` vocabulary). Divergences and +additions, all additive: + +1. **Parameter nodes are first-class and live in a per-function + `param_nodes` list**, not inside `cfg.nodes`. 
The contract's CFG gate + (single ENTRY/EXIT, every node reachable-from-ENTRY/reaches-EXIT, EXIT = + last CFG id) stays exact over `cfg.nodes`; HRB parameter-passing nodes + (`formal_in`/`formal_out`/`actual_in`/`actual_out`) share the function's + id space with ids allocated after EXIT, and carry `var` (the parameter + name, ``, `:name`, or `:module::name`) plus + `call_node` (owning callsite statement) for actuals. +2. **SUMMARY edges are emitted in `sdg_edges` with both endpoints in the same + signature** (actual_in → actual_out at one callsite). The contract comment + says "cross-function only"; SUMMARY is inherently intra-function in HRB + form and cannot be typed as a `pdg` edge (`CDG|DDG`), so it rides + `sdg_edges`. CALL/PARAM_IN/PARAM_OUT remain cross-function. +3. **Globals are qualified `module::name`** (double colon keeps the qualifier + out of the field-path grammar `base(.field|[*])*`). Cross-module global + identity holds when access flows through the defining module's functions; + direct `from m import g` rebinding is a documented precision loss. +4. **The return value is the pseudo-path ``**, defined at every + return statement and wired to the `formal_out` of the same name. +5. **Python-specific CFG edge kinds used from the shared vocabulary:** + `yield` (resume successor + abandonment edge to EXIT) and `await_resume`. + No renamed or repurposed kinds; node kinds used: `entry`, `exit`, + `statement`, `branch`, `loop`, `return`, `raise`, `handler`. +6. **Infinite loops get a synthetic `exception` edge header → EXIT** (any + Python loop can exit via an async signal), keeping post-dominance rooted. +7. **Call mutations are suffixed weak defs** (`xs.*`): caller-visible + mutation is distinguishable from local rebinding, which decides + `formal_out` allocation for parameters. +8. **`dfg` has no separate section** (per contract): `--graphs dfg` emits the + PDG with only DDG edges; `sdg` implies the dependence edges it stitches. +9. 
**Taint (`taint_flows`) is not emitted by this analyzer** — deliberately + deferred to the CLDK SDK, where labeled SDG reachability is shared across + languages; only source/sink/sanitizer model packs are per-language. + +## Level-3 CPG (Neo4j) — schema.neo4j.json 1.2.0 (additive) + +- New label `CFGNode` (merge key `id` = `<signature>#<node_id>`; props + `kind`, `var`, `call_node`, `start_line`, `end_line`, `_module`). Both CFG + statements and parameter nodes ride this one label, distinguished by + `kind` — the parity clause's label set stays minimal. +- New edge types `HAS_CFG_NODE` (PyCallable → CFGNode), `CFG_NEXT` (prop + `kind`), `CDG`, `DDG` (prop `var`), `PARAM_IN`, `PARAM_OUT`, `SUMMARY` — + deliberately **not** `PY_`-prefixed: this vocabulary is the shared + cross-language CPG contract. +- `CALL` SDG edges are not projected: the callable-level `PY_CALLS` twin + already carries calls; callsite-statement granularity is recoverable via + `PY_HAS_CALLSITE`/`PY_RESOLVES_TO`. diff --git a/.gitignore b/.gitignore index ff95233..c9bdeb3 100644 --- a/.gitignore +++ b/.gitignore @@ -187,3 +187,8 @@ node_modules/ # Track this repo's CLAUDE.md even though a global gitignore excludes CLAUDE.md !CLAUDE.md + +# Track the schema decision log (SDK-model input) past a global .claude ignore +!.claude/ +.claude/* +!.claude/SCHEMA_DECISIONS.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 0280167..8616fa8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,34 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- **`--analysis-level 3`: native dataflow graphs** (#67).
Whole-program dependence graphs built + in-process from the stdlib `ast` — per-callable exceptional **CFG**s (statement-level, synthetic + ENTRY/EXIT, first-class exception/yield/await edges), **PDG**s (Ferrante–Ottenstein–Warren + control dependence + reaching-definitions data dependence over k-limited access paths), and a + Horwitz–Reps–Binkley **SDG** (formal/actual parameter nodes, CALL/PARAM_IN/PARAM_OUT edges, + SUMMARY edges from bottom-up relational function summaries over the Tarjan SCC condensation; + globals as extra formals, closure captures bound at definition sites). Emitted as the + `program_graphs` section of `analysis.json` (own `schema_version` 1.0.0), keyed by the same + callable signatures as the symbol table and call graph. +- **Context-sensitive backward slicing** as an SDG query (`codeanalyzer.dataflow.slicing`, HRB + two-phase traversal). Taint is deliberately left to the CLDK SDK — post-SDG it is + language-independent labeled reachability. +- **CPG overlay in the Neo4j projection** at level 3: `CFGNode` nodes plus the shared + cross-language `HAS_CFG_NODE`/`CFG_NEXT`/`CDG`/`DDG`/`PARAM_IN`/`PARAM_OUT`/`SUMMARY` edge + vocabulary. `schema.neo4j.json` bumped additively to **1.2.0**. +- **New flags**: `--graphs cfg,dfg,pdg,sdg` (scopes the emitted sections; strict validation — + unknown values or use below `-a 3` exit non-zero) and `--graph-field-depth` (access-path + k-limit, default 3 — the bound that guarantees the interprocedural fixpoint terminates). +- **Alias oracle (MVP)**: type-based may-alias using Jedi-inferred types (unknown types + conservatively alias); frozen behind `may_alias()` for a later points-to upgrade. + +### Changed +- `-a/--analysis-level` now accepts `3`; levels stay cumulative (level 3 includes PyCG + enrichment). `-a 1`/`-a 2` output and timings are unchanged. 
+ ## [0.3.0] - 2026-06-27 ### Added diff --git a/README.md b/README.md index 43130dc..20a49f0 100644 --- a/README.md +++ b/README.md @@ -51,9 +51,13 @@ and merges them with the Jedi-derived edges, also backfilling callees Jedi could - **Symbol table** — modules, classes, functions, methods, variables, decorators, imports, and docstrings, with precise source spans. -- **Call graph** — Jedi's lexical resolver by default, with optional **CodeQL**-resolved edges - (`--codeql`) for RPC / third-party / dynamically-dispatched targets, merged with the Jedi edges; - CodeQL also backfills callees Jedi could not resolve. +- **Call graph** — Jedi's lexical resolver by default (level 1), with optional **PyCG**-resolved + edges merged in at `--analysis-level 2` (provenance-tagged, coupling-aware sharding for large + apps). +- **Dataflow graphs (level 3)** — native, whole-program dependence graphs built from Python's own + `ast`: per-callable exceptional **CFG**s and **PDG**s (control + data dependence), stitched into + a Horwitz–Reps–Binkley **SDG** with parameter/summary edges, emitted as the `program_graphs` + section at `--analysis-level 3` and queryable with a context-sensitive backward slicer. - **Neo4j output** — project the analysis into a labeled property graph: a self-contained `graph.cypher` snapshot, or an **incremental** push to a live database over Bolt. - **Versioned schema** — a machine-readable, version-stamped Neo4j schema contract (`--emit schema`), @@ -143,247 +147,189 @@ $ canpy --help Static Analysis on Python source code using Jedi, PyCG and Tree sitter. -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --input -i PATH Path to the │ -│ project root │ -│ directory (not │ -│ required for │ -│ --emit schema). │ -│ --output -o PATH Output directory │ -│ for artifacts. │ -│ --format -f [json|msgpack] Output format │ -│ for --emit json: │ -│ json or msgpack. 
│ -│ [default: json] │ -│ --emit [json|neo4j|sche Output target: │ -│ ma] json │ -│ (analysis.json, │ -│ default) | neo4j │ -│ (graph.cypher or │ -│ live Bolt push) │ -│ | schema (the │ -│ Neo4j │ -│ schema.json │ -│ contract). │ -│ [default: json] │ -│ --app-name TEXT Logical │ -│ application name │ -│ for the graph │ -│ :PyApplication │ -│ anchor (default: │ -│ input dir name). │ -│ --neo4j-uri TEXT Push the graph │ -│ to a live Neo4j │ -│ over Bolt │ -│ (incremental); │ -│ omit to write │ -│ graph.cypher. │ -│ [env var: │ -│ NEO4J_URI] │ -│ --neo4j-user TEXT Neo4j username. │ -│ [env var: │ -│ NEO4J_USERNAME] │ -│ [default: neo4j] │ -│ --neo4j-password TEXT Neo4j password. │ -│ Prefer the env │ -│ var over the │ -│ flag (the flag │ -│ is visible in │ -│ shell history / │ -│ process list). │ -│ [env var: │ -│ NEO4J_PASSWORD] │ -│ [default: neo4j] │ -│ --neo4j-database TEXT Neo4j database │ -│ name (default: │ -│ server default). │ -│ [env var: │ -│ NEO4J_DATABASE] │ -│ --analysis-level -a INTEGER RANGE Analysis depth: │ -│ [1<=x<=2] 1=symbol │ -│ table+Jedi call │ -│ graph, 2=+PyCG │ -│ call graph. │ -│ [default: 1] │ -│ --ray --no-ray Enable Ray for │ -│ distributed │ -│ analysis. │ -│ [default: │ -│ no-ray] │ -│ --eager --lazy Enable eager or │ -│ lazy analysis. │ -│ Defaults to │ -│ lazy. │ -│ [default: lazy] │ -│ --skip-tests --include-tests Skip test files │ -│ in analysis. │ -│ [default: │ -│ skip-tests] │ -│ --no-venv --venv Skip virtualenv │ -│ creation and │ -│ dependency │ -│ installation; │ -│ resolve imports │ -│ against the │ -│ ambient Python │ -│ environment │ -│ instead. │ -│ [default: venv] │ -│ --file-name PATH Analyze only the │ -│ specified file │ -│ (relative to │ -│ input │ -│ directory). │ -│ --cache-dir -c PATH Directory to │ -│ store analysis │ -│ cache. Defaults │ -│ to │ -│ '.codeanalyzer' │ -│ in the input │ -│ directory. │ -│ --clear-cache --keep-cache Clear cache │ -│ after analysis. │ -│ By default, │ -│ cache is │ -│ retained. 
│ -│ [default: │ -│ keep-cache] │ -│ -v INTEGER Increase │ -│ verbosity: -v, │ -│ -vv, -vvv │ -│ [default: 0] │ -│ --pycg-shard --no-pycg-shard Shard PyCG │ -│ call-graph │ -│ analysis by │ -│ Python package │ -│ (level 2 only). │ -│ When the project │ -│ exceeds the │ -│ 500-file │ -│ ceiling, PyCG is │ -│ run │ -│ independently │ -│ per top-level │ -│ package with │ -│ cross-package │ -│ imports treated │ -│ as ghost nodes. │ -│ Without this │ -│ flag, projects │ -│ over the ceiling │ -│ fall back to │ -│ Jedi-only edges. │ -│ [default: │ -│ no-pycg-shard] │ -│ --pycg-shard-cei… INTEGER RANGE Maximum files │ -│ [x>=1] per shard when │ -│ --pycg-shard is │ -│ active (default │ -│ 100). Shards │ -│ exceeding this │ -│ limit are │ -│ skipped; their │ -│ call edges are │ -│ omitted from the │ -│ call graph (Jedi │ -│ edges for those │ -│ packages are │ -│ still included). │ -│ Lower values are │ -│ safer for │ -│ packages with │ -│ deep class │ -│ hierarchies or │ -│ heavy import │ -│ graphs. │ -│ [default: 100] │ -│ --pycg-shard-tim… INTEGER RANGE Per-shard │ -│ [x>=0] wall-clock │ -│ timeout in │ -│ seconds when │ -│ --pycg-shard is │ -│ active (default │ -│ 120). A shard │ -│ that exceeds │ -│ this limit is │ -│ skipped │ -│ gracefully. │ -│ PyCG's fixpoint │ -│ is bimodal: it │ -│ either converges │ -│ quickly or │ -│ diverges │ -│ indefinitely, so │ -│ the timeout acts │ -│ as a final │ -│ safety net after │ -│ the file-count │ -│ ceiling. Set to │ -│ 0 to disable. │ -│ POSIX only │ -│ (macOS / Linux); │ -│ ignored on │ -│ Windows. │ -│ [default: 120] │ -│ --pycg-shard-str… [jedi|package] How --pycg-shard │ -│ groups files │ -│ (level 2 only). │ -│ 'jedi' (default) │ -│ partitions the │ -│ Jedi │ -│ module-dependen… │ -│ graph (SCC + │ -│ Louvain) so │ -│ tightly-coupled │ -│ modules │ -│ co-compute and │ -│ few call edges │ -│ are severed │ -│ between shards; │ -│ import cycles │ -│ are never split. 
│ -│ 'package' uses │ -│ the legacy │ -│ one-shard-per-p… │ -│ grouping. │ -│ [default: jedi] │ -│ --pycg-max-iter INTEGER RANGE Cap on PyCG's │ -│ [x>=-1] fixpoint passes │ -│ per │ -│ shard/project │ -│ (level 2; │ -│ default 50). │ -│ PyCG iterates │ -│ until its │ -│ points-to state │ -│ stops changing, │ -│ but its │ -│ access-path │ -│ domain has no │ -│ convergence │ -│ bound, so heavy │ -│ metaclass/mixin │ -│ code (e.g. an │ -│ ORM) can loop │ -│ with each pass │ -│ costing seconds. │ -│ The cap returns │ -│ a │ -│ sound-but-incom… │ -│ call graph │ -│ instead of │ -│ looping until │ -│ the timeout │ -│ kills it. Set to │ -│ -1 for PyCG's │ -│ unbounded │ -│ run-to-converge… │ -│ behaviour. │ -│ [default: 50] │ -│ --help Show this │ -│ message and │ -│ exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮ +│ --version Show the canpy version │ +│ and exit. │ +│ --input -i PATH Path to the project root │ +│ directory (not required │ +│ for --emit schema). │ +│ --output -o PATH Output directory for │ +│ artifacts. │ +│ --format -f [json|msgpack] Output format for --emit │ +│ json: json or msgpack. │ +│ [default: json] │ +│ --emit [json|neo4j|schema] Output target: json │ +│ (analysis.json, default) │ +│ | neo4j (graph.cypher or │ +│ live Bolt push) | schema │ +│ (the Neo4j schema.json │ +│ contract). │ +│ [default: json] │ +│ --app-name TEXT Logical application name │ +│ for the graph │ +│ :PyApplication anchor │ +│ (default: input dir │ +│ name). │ +│ --neo4j-uri TEXT Push the graph to a live │ +│ Neo4j over Bolt │ +│ (incremental); omit to │ +│ write graph.cypher. │ +│ [env var: NEO4J_URI] │ +│ --neo4j-user TEXT Neo4j username. │ +│ [env var: NEO4J_USERNAME] │ +│ [default: neo4j] │ +│ --neo4j-password TEXT Neo4j password. 
Prefer │ +│ the env var over the flag │ +│ (the flag is visible in │ +│ shell history / process │ +│ list). │ +│ [env var: NEO4J_PASSWORD] │ +│ [default: neo4j] │ +│ --neo4j-database TEXT Neo4j database name │ +│ (default: server │ +│ default). │ +│ [env var: NEO4J_DATABASE] │ +│ --analysis-level -a INTEGER RANGE [1<=x<=3] Analysis depth: 1=symbol │ +│ table+Jedi call graph, │ +│ 2=+PyCG call graph, │ +│ 3=+native dataflow graphs │ +│ (CFG/PDG/SDG). │ +│ [default: 1] │ +│ --graphs TEXT Level 3 only: │ +│ comma-separated │ +│ program-graph sections to │ +│ emit (cfg, dfg, pdg, │ +│ sdg). Default: all. `dfg` │ +│ emits the PDG's data │ +│ edges only; `sdg` implies │ +│ the dependence edges it │ +│ stitches. │ +│ [default: │ +│ cfg,dfg,pdg,sdg] │ +│ --graph-field-depth INTEGER RANGE [x>=1] Level 3 only: k-limit on │ +│ access-path depth │ +│ (x.f.g.h with k=3 becomes │ +│ x.f.g.*). Mandatory bound │ +│ — it is what guarantees │ +│ the interprocedural │ +│ fixpoint terminates. │ +│ [default: 3] │ +│ --ray --no-ray Enable Ray for │ +│ distributed analysis. │ +│ [default: no-ray] │ +│ --eager --lazy Enable eager or lazy │ +│ analysis. Defaults to │ +│ lazy. │ +│ [default: lazy] │ +│ --skip-tests --include-tests Skip test files in │ +│ analysis. │ +│ [default: skip-tests] │ +│ --no-venv --venv Skip virtualenv creation │ +│ and dependency │ +│ installation; resolve │ +│ imports against the │ +│ ambient Python │ +│ environment instead. │ +│ [default: venv] │ +│ --file-name PATH Analyze only the │ +│ specified file (relative │ +│ to input directory). │ +│ --cache-dir -c PATH Directory to store │ +│ analysis cache. Defaults │ +│ to '.codeanalyzer' in the │ +│ input directory. │ +│ --clear-cache --keep-cache Clear cache after │ +│ analysis. By default, │ +│ cache is retained. 
│ +│ [default: keep-cache] │ +│ -v INTEGER Increase verbosity: -v, │ +│ -vv, -vvv │ +│ [default: 0] │ +│ --pycg-shard --no-pycg-shard Shard PyCG call-graph │ +│ analysis by Python │ +│ package (level 2 only). │ +│ When the project exceeds │ +│ the 500-file ceiling, │ +│ PyCG is run independently │ +│ per top-level package │ +│ with cross-package │ +│ imports treated as ghost │ +│ nodes. Without this flag, │ +│ projects over the ceiling │ +│ fall back to Jedi-only │ +│ edges. │ +│ [default: no-pycg-shard] │ +│ --pycg-shard-ceiling INTEGER RANGE [x>=1] Maximum files per shard │ +│ when --pycg-shard is │ +│ active (default 100). │ +│ Shards exceeding this │ +│ limit are skipped; their │ +│ call edges are omitted │ +│ from the call graph (Jedi │ +│ edges for those packages │ +│ are still included). │ +│ Lower values are safer │ +│ for packages with deep │ +│ class hierarchies or │ +│ heavy import graphs. │ +│ [default: 100] │ +│ --pycg-shard-timeout INTEGER RANGE [x>=0] Per-shard wall-clock │ +│ timeout in seconds when │ +│ --pycg-shard is active │ +│ (default 120). A shard │ +│ that exceeds this limit │ +│ is skipped gracefully. │ +│ PyCG's fixpoint is │ +│ bimodal: it either │ +│ converges quickly or │ +│ diverges indefinitely, so │ +│ the timeout acts as a │ +│ final safety net after │ +│ the file-count ceiling. │ +│ Set to 0 to disable. │ +│ POSIX only (macOS / │ +│ Linux); ignored on │ +│ Windows. │ +│ [default: 120] │ +│ --pycg-shard-strategy [jedi|package] How --pycg-shard groups │ +│ files (level 2 only). │ +│ 'jedi' (default) │ +│ partitions the Jedi │ +│ module-dependency graph │ +│ (SCC + Louvain) so │ +│ tightly-coupled modules │ +│ co-compute and few call │ +│ edges are severed between │ +│ shards; import cycles are │ +│ never split. 'package' │ +│ uses the legacy │ +│ one-shard-per-package-di… │ +│ grouping. │ +│ [default: jedi] │ +│ --pycg-max-iter INTEGER RANGE [x>=-1] Cap on PyCG's fixpoint │ +│ passes per shard/project │ +│ (level 2; default 50). 
│ +│ PyCG iterates until its │ +│ points-to state stops │ +│ changing, but its │ +│ access-path domain has no │ +│ convergence bound, so │ +│ heavy metaclass/mixin │ +│ code (e.g. an ORM) can │ +│ loop with each pass │ +│ costing seconds. The cap │ +│ returns a │ +│ sound-but-incomplete call │ +│ graph instead of looping │ +│ until the timeout kills │ +│ it. Set to -1 for PyCG's │ +│ unbounded │ +│ run-to-convergence │ +│ behaviour. │ +│ [default: 50] │ +│ --help Show this message and │ +│ exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -428,6 +374,53 @@ $ canpy --help canpy --input ./my-python-project --eager --cache-dir /path/to/custom-cache ``` +7. **Native dataflow graphs (level 3) — CFG/PDG/SDG + slicing:** + ```sh + canpy --input ./my-python-project -a 3 --output ./out # + program_graphs section + canpy --input ./my-python-project -a 3 --graphs cfg,pdg # scope the emitted sections + canpy --input ./my-python-project -a 3 --graph-field-depth 2 # tighter access-path k-limit + ``` + Level 3 also enriches the Neo4j projection (`--emit neo4j`) with the CPG overlay + (`:CFGNode` nodes and `CFG_NEXT`/`CDG`/`DDG`/`PARAM_IN`/`PARAM_OUT`/`SUMMARY` edges). + +## Analysis levels + +| Level | Flag | What it adds | Cost | +| --- | --- | --- | --- | +| 1 | `-a 1` (default) | Symbol table + Jedi resolver call graph | Cheap | +| 2 | `-a 2` | PyCG call-graph enrichment (provenance-merged) | Moderate | +| 3 | `-a 3` | Native CFG/PDG/SDG (`program_graphs`) + CPG Neo4j overlay + backward slicing | Heavy, whole-program | + +Levels are cumulative — `-a 3` includes level 2's call graph (the SDG is stitched over it). +Nothing at level 3 runs unless requested: `-a 1`/`-a 2` timings and output are unaffected. 
+ +## Architecture & Tooling + +Locked level-3 substrate decisions +([#67](https://github.com/codellm-devkit/codeanalyzer-python/issues/67)): + +- **CFG source:** hand-built from the stdlib `ast` module — the same parse the symbol-table + builder uses, so graph nodes join back to symbol-table signatures by construction. One + synthetic `ENTRY`/`EXIT` per callable, statement-level nodes keyed `(signature, node_id)` + in source-span order, exceptional edges first-class. +- **Def-use source:** hand-built reaching definitions (classic forward worklist) over k-limited + access paths (`--graph-field-depth`, default 3) — no usable SSA library exists for Python. +- **Points-to oracle:** a **type-based may-alias MVP stub** — two access paths may alias iff + their suffixes are prefix-compatible and their bases' Jedi-inferred types are compatible + (unknown types conservatively alias). Frozen behind `may_alias()`; upgrading to a real + points-to substrate is staged follow-up work. Call dispatch comes from the merged + Jedi(+PyCG) call graph, treated as a frozen oracle. +- **Summaries:** relational formal-in → formal-out flows composed bottom-up over the Tarjan + SCC condensation of the call graph, monotone fixpoint within SCCs; globals ride as extra + formals (`:module::name`), closure captures bind at definition sites. +- **Clients:** backward slicing ships in-process (two-phase context-sensitive HRB traversal, + `codeanalyzer.dataflow.slicing`). Taint is deliberately left to the CLDK SDK: once the SDG + is emitted it is language-independent labeled reachability. +- **Precision posture:** sound-leaning and over-approximate — prefer false positives to missed + flows. **Known unsoundness (documented, not silently absorbed):** `eval`/`exec`, reflection + (`getattr`/`setattr` with dynamic names), monkey-patching, C extensions, `import` side + effects, and module top-level statements (globals are modeled as formals instead). 
+ ## Output targets `canpy` builds one analysis in memory and can emit it three ways (`--emit`): @@ -438,8 +431,9 @@ A `PyApplication` document — the canonical CLDK contract: ```jsonc { - "symbol_table": { /* file path → module (classes, functions, variables, imports, …) */ }, - "call_graph": [ /* CALL_DEP edges: { source, target, weight, provenance } keyed by callable signature */ ] + "symbol_table": { /* file path → module (classes, functions, variables, imports, …) */ }, + "call_graph": [ /* CALL_DEP edges: { source, target, weight, provenance } keyed by callable signature */ ], + "program_graphs": { /* -a 3 only: schema_version, k_limit, per-callable { cfg, pdg, param_nodes }, sdg_edges */ } } ``` From ddae36c11985810db59342d3f06812e887494d05 Mon Sep 17 00:00:00 2001 From: Rahul Krishna Date: Thu, 2 Jul 2026 00:16:10 -0400 Subject: [PATCH 11/11] fix(neo4j): namespace the CPG overlay per language (PyCFGNode, PY_* edges) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unprefixed CFGNode/CFG_NEXT/CDG/DDG/PARAM_IN/PARAM_OUT/SUMMARY would mingle analyzers' dependence edges in a Neo4j database holding more than one language's graph — SDK backends scope queries by label/type prefix. The vocabulary stays cross-language in shape (same suffixes, props, semantics) but is PY_-namespaced in the projection like every other row family; the JSON program_graphs section keeps the unprefixed contract since each analysis.json is its own namespace. Decision recorded in .claude/SCHEMA_DECISIONS.md. 
(#67) --- .claude/SCHEMA_DECISIONS.md | 20 +++++++++++---- CHANGELOG.md | 8 +++--- README.md | 4 ++- codeanalyzer/neo4j/project.py | 32 ++++++++++++------------ codeanalyzer/neo4j/schema.py | 31 ++++++++++++----------- schema.neo4j.json | 46 +++++++++++++++++------------------ test/test_dataflow_cpg.py | 22 ++++++++--------- 7 files changed, 91 insertions(+), 72 deletions(-) diff --git a/.claude/SCHEMA_DECISIONS.md b/.claude/SCHEMA_DECISIONS.md index 1036a91..02731dc 100644 --- a/.claude/SCHEMA_DECISIONS.md +++ b/.claude/SCHEMA_DECISIONS.md @@ -48,14 +48,24 @@ additions, all additive: ## Level-3 CPG (Neo4j) — schema.neo4j.json 1.2.0 (additive) -- New label `CFGNode` (merge key `id` = `<signature>#<node_id>`; props +- New label `PyCFGNode` (merge key `id` = `<signature>#<node_id>`; props `kind`, `var`, `call_node`, `start_line`, `end_line`, `_module`). Both CFG statements and parameter nodes ride this one label, distinguished by `kind` — the parity clause's label set stays minimal. -- New edge types `HAS_CFG_NODE` (PyCallable → CFGNode), `CFG_NEXT` (prop - `kind`), `CDG`, `DDG` (prop `var`), `PARAM_IN`, `PARAM_OUT`, `SUMMARY` — - deliberately **not** `PY_`-prefixed: this vocabulary is the shared - cross-language CPG contract. +- New edge types `PY_HAS_CFG_NODE` (PyCallable → PyCFGNode), `PY_CFG_NEXT` + (prop `kind`), `PY_CDG`, `PY_DDG` (prop `var`), `PY_PARAM_IN`, + `PY_PARAM_OUT`, `PY_SUMMARY`. +- **Namespacing decision (maintainer, 2026-07-02):** the CPG vocabulary is + cross-language in *shape* (same suffix names, properties, semantics) but + **per-language-prefixed in the Neo4j projection**, like every other row + family (`PySymbol`, `PY_CALLS`, …). Rationale: SDK Neo4j backends scope + queries by label/type prefix; unprefixed `DDG`/`CFGNode` in a database + holding multiple languages' graphs would mingle analyzers' dependence + edges with no way to separate them. Each analyzer uses its language tag + (`TS_`/`TSCFGNode` for TypeScript, etc.).
The **JSON** `program_graphs` + section keeps the unprefixed shared vocabulary — it lives inside each + analyzer's own `analysis.json`, so there is no shared namespace to + collide in; the SDK strips/adds the prefix at the projection boundary. - `CALL` SDG edges are not projected: the callable-level `PY_CALLS` twin already carries calls; callsite-statement granularity is recoverable via `PY_HAS_CALLSITE`/`PY_RESOLVES_TO`. diff --git a/CHANGELOG.md b/CHANGELOG.md index 8616fa8..f1be928 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,9 +20,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Context-sensitive backward slicing** as an SDG query (`codeanalyzer.dataflow.slicing`, HRB two-phase traversal). Taint is deliberately left to the CLDK SDK — post-SDG it is language-independent labeled reachability. -- **CPG overlay in the Neo4j projection** at level 3: `CFGNode` nodes plus the shared - cross-language `HAS_CFG_NODE`/`CFG_NEXT`/`CDG`/`DDG`/`PARAM_IN`/`PARAM_OUT`/`SUMMARY` edge - vocabulary. `schema.neo4j.json` bumped additively to **1.2.0**. +- **CPG overlay in the Neo4j projection** at level 3: `PyCFGNode` nodes plus the + `PY_HAS_CFG_NODE`/`PY_CFG_NEXT`/`PY_CDG`/`PY_DDG`/`PY_PARAM_IN`/`PY_PARAM_OUT`/`PY_SUMMARY` + edge vocabulary — cross-language in shape, PY_-namespaced like every other row family so a + multi-language database never mingles analyzers' dependence edges. `schema.neo4j.json` + bumped additively to **1.2.0**. - **New flags**: `--graphs cfg,dfg,pdg,sdg` (scopes the emitted sections; strict validation — unknown values or use below `-a 3` exit non-zero) and `--graph-field-depth` (access-path k-limit, default 3 — the bound that guarantees the interprocedural fixpoint terminates). 
diff --git a/README.md b/README.md index 20a49f0..e8e0afe 100644 --- a/README.md +++ b/README.md @@ -381,7 +381,9 @@ $ canpy --help canpy --input ./my-python-project -a 3 --graph-field-depth 2 # tighter access-path k-limit ``` Level 3 also enriches the Neo4j projection (`--emit neo4j`) with the CPG overlay - (`:CFGNode` nodes and `CFG_NEXT`/`CDG`/`DDG`/`PARAM_IN`/`PARAM_OUT`/`SUMMARY` edges). + (`:PyCFGNode` nodes and `PY_CFG_NEXT`/`PY_CDG`/`PY_DDG`/`PY_PARAM_IN`/`PY_PARAM_OUT`/ + `PY_SUMMARY` edges — the cross-language dataflow vocabulary, PY_-namespaced like every + other row family so multi-language databases never mingle analyzers' edges). ## Analysis levels diff --git a/codeanalyzer/neo4j/project.py b/codeanalyzer/neo4j/project.py index c1aafc1..fc42173 100644 --- a/codeanalyzer/neo4j/project.py +++ b/codeanalyzer/neo4j/project.py @@ -75,7 +75,7 @@ def project(app: PyApplication, app_name: str) -> GraphRows: ) # Level-3 CPG overlay (present only at -a 3): the same program_graphs IR - # projected as :CFGNode nodes and the shared cross-language edge types. + # projected as :PyCFGNode nodes and PY_-namespaced dependence edge types. if app.program_graphs is not None: _project_program_graphs(b, app) @@ -99,16 +99,18 @@ def _signature_modules(app: PyApplication) -> dict: def _cfg_node_ref(b: RowBuilder, sig: str, node_id: int) -> NodeRef: - return NodeRef("CFGNode", "id", f"{sig}#{node_id}") + return NodeRef("PyCFGNode", "id", f"{sig}#{node_id}") def _project_program_graphs(b: RowBuilder, app: PyApplication) -> None: - """CFG/PDG/SDG rows: node label ``CFGNode`` (merge key ``id`` = - ``#``) and edge types ``HAS_CFG_NODE`` / ``CFG_NEXT`` - (prop ``kind``) / ``CDG`` / ``DDG`` (prop ``var``) / ``PARAM_IN`` / - ``PARAM_OUT`` / ``SUMMARY`` — the shared cross-language vocabulary, so no - ``PY_`` prefix. 
Parameter nodes ride the same label with their HRB kinds - plus ``var``/``call_node`` props (an additive, recorded extension).""" + """CFG/PDG/SDG rows: node label ``PyCFGNode`` (merge key ``id`` = + ``#``) and edge types ``PY_HAS_CFG_NODE`` / + ``PY_CFG_NEXT`` (prop ``kind``) / ``PY_CDG`` / ``PY_DDG`` (prop ``var``) / + ``PY_PARAM_IN`` / ``PY_PARAM_OUT`` / ``PY_SUMMARY``. The vocabulary is + cross-language in shape but PY_-namespaced like every other row family, so + a multi-language database never mingles analyzers' dependence edges. + Parameter nodes ride the same label with their HRB kinds plus + ``var``/``call_node`` props (an additive, recorded extension).""" pg = app.program_graphs sig_module = _signature_modules(app) @@ -117,7 +119,7 @@ def _project_program_graphs(b: RowBuilder, app: PyApplication) -> None: module = sig_module.get(sig) for n in (fg.cfg.nodes if fg.cfg else []): ref = b.node( - ["CFGNode"], + ["PyCFGNode"], "id", f"{sig}#{n.id}", prune( @@ -129,10 +131,10 @@ def _project_program_graphs(b: RowBuilder, app: PyApplication) -> None: } ), ) - b.edge("HAS_CFG_NODE", owner, ref) + b.edge("PY_HAS_CFG_NODE", owner, ref) for p in fg.param_nodes or []: ref = b.node( - ["CFGNode"], + ["PyCFGNode"], "id", f"{sig}#{p.id}", prune( @@ -146,17 +148,17 @@ def _project_program_graphs(b: RowBuilder, app: PyApplication) -> None: } ), ) - b.edge("HAS_CFG_NODE", owner, ref) + b.edge("PY_HAS_CFG_NODE", owner, ref) for e in (fg.cfg.edges if fg.cfg else []): b.edge( - "CFG_NEXT", + "PY_CFG_NEXT", _cfg_node_ref(b, sig, e.source), _cfg_node_ref(b, sig, e.target), {"kind": e.kind}, ) for e in (fg.pdg.edges if fg.pdg else []): b.edge( - e.type, # CDG | DDG + f"PY_{e.type}", # PY_CDG | PY_DDG _cfg_node_ref(b, sig, e.source), _cfg_node_ref(b, sig, e.target), prune({"var": e.var}), @@ -166,7 +168,7 @@ def _project_program_graphs(b: RowBuilder, app: PyApplication) -> None: if e.type == "CALL": continue # the callable-level PY_CALLS twin already carries calls b.edge( - 
e.type, # PARAM_IN | PARAM_OUT | SUMMARY + f"PY_{e.type}", # PY_PARAM_IN | PY_PARAM_OUT | PY_SUMMARY _cfg_node_ref(b, e.source.signature, e.source.node), _cfg_node_ref(b, e.target.signature, e.target.node), prune({"var": e.var}), diff --git a/codeanalyzer/neo4j/schema.py b/codeanalyzer/neo4j/schema.py index 5085bab..f43c1b0 100644 --- a/codeanalyzer/neo4j/schema.py +++ b/codeanalyzer/neo4j/schema.py @@ -176,13 +176,15 @@ class RelType: "_module": "string", }, ), - # Level-3 CPG overlay (present only at -a 3). The label and edge types - # below are the shared cross-language dataflow vocabulary — deliberately - # NOT PY_-prefixed. `id` = "#"; parameter-passing - # nodes (formal/actual in/out) ride the same label with `var`/`call_node`. + # Level-3 CPG overlay (present only at -a 3). The dataflow vocabulary is + # shared cross-language in *shape* (same suffixes, props, semantics) but + # namespaced per language like every other row family — a multi-language + # Neo4j database must never mingle one analyzer's dependence edges with + # another's. `id` = "#"; parameter-passing nodes + # (formal/actual in/out) ride the same label with `var`/`call_node`. NodeLabel( - "CFGNode", - "CFGNode", + "PyCFGNode", + "PyCFGNode", "id", { "id": "string", @@ -220,14 +222,15 @@ class RelType: {"imported_names": "string[]", "aliases": "string[]"}, ), RelType("PY_DECORATED_BY", ["PyCallable"], ["PyDecorator"]), - # Level-3 CPG overlay (shared cross-language vocabulary, -a 3 only). 
- RelType("HAS_CFG_NODE", ["PyCallable"], ["CFGNode"]), - RelType("CFG_NEXT", ["CFGNode"], ["CFGNode"], {"kind": "string"}), - RelType("CDG", ["CFGNode"], ["CFGNode"]), - RelType("DDG", ["CFGNode"], ["CFGNode"], {"var": "string"}), - RelType("PARAM_IN", ["CFGNode"], ["CFGNode"], {"var": "string"}), - RelType("PARAM_OUT", ["CFGNode"], ["CFGNode"], {"var": "string"}), - RelType("SUMMARY", ["CFGNode"], ["CFGNode"]), + # Level-3 CPG overlay (-a 3 only): the cross-language dataflow vocabulary, + # PY_-namespaced so per-language SDK backends can scope their queries. + RelType("PY_HAS_CFG_NODE", ["PyCallable"], ["PyCFGNode"]), + RelType("PY_CFG_NEXT", ["PyCFGNode"], ["PyCFGNode"], {"kind": "string"}), + RelType("PY_CDG", ["PyCFGNode"], ["PyCFGNode"]), + RelType("PY_DDG", ["PyCFGNode"], ["PyCFGNode"], {"var": "string"}), + RelType("PY_PARAM_IN", ["PyCFGNode"], ["PyCFGNode"], {"var": "string"}), + RelType("PY_PARAM_OUT", ["PyCFGNode"], ["PyCFGNode"], {"var": "string"}), + RelType("PY_SUMMARY", ["PyCFGNode"], ["PyCFGNode"]), ] diff --git a/schema.neo4j.json b/schema.neo4j.json index 8098d3e..a9d7e07 100644 --- a/schema.neo4j.json +++ b/schema.neo4j.json @@ -137,8 +137,8 @@ } }, { - "label": "CFGNode", - "merge_label": "CFGNode", + "label": "PyCFGNode", + "merge_label": "PyCFGNode", "key": "id", "properties": { "id": "string", @@ -276,80 +276,80 @@ "properties": {} }, { - "type": "HAS_CFG_NODE", + "type": "PY_HAS_CFG_NODE", "from": [ "PyCallable" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": {} }, { - "type": "CFG_NEXT", + "type": "PY_CFG_NEXT", "from": [ - "CFGNode" + "PyCFGNode" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": { "kind": "string" } }, { - "type": "CDG", + "type": "PY_CDG", "from": [ - "CFGNode" + "PyCFGNode" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": {} }, { - "type": "DDG", + "type": "PY_DDG", "from": [ - "CFGNode" + "PyCFGNode" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": { "var": "string" } }, { - "type": "PARAM_IN", + 
"type": "PY_PARAM_IN", "from": [ - "CFGNode" + "PyCFGNode" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": { "var": "string" } }, { - "type": "PARAM_OUT", + "type": "PY_PARAM_OUT", "from": [ - "CFGNode" + "PyCFGNode" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": { "var": "string" } }, { - "type": "SUMMARY", + "type": "PY_SUMMARY", "from": [ - "CFGNode" + "PyCFGNode" ], "to": [ - "CFGNode" + "PyCFGNode" ], "properties": {} } @@ -363,7 +363,7 @@ "CREATE CONSTRAINT pycallsite_id IF NOT EXISTS FOR (x:PyCallSite) REQUIRE x.id IS UNIQUE", "CREATE CONSTRAINT pyattribute_id IF NOT EXISTS FOR (x:PyAttribute) REQUIRE x.id IS UNIQUE", "CREATE CONSTRAINT pyvariable_id IF NOT EXISTS FOR (x:PyVariable) REQUIRE x.id IS UNIQUE", - "CREATE CONSTRAINT cfgnode_id IF NOT EXISTS FOR (x:CFGNode) REQUIRE x.id IS UNIQUE" + "CREATE CONSTRAINT pycfgnode_id IF NOT EXISTS FOR (x:PyCFGNode) REQUIRE x.id IS UNIQUE" ], "indexes": [ "CREATE INDEX py_callable_name IF NOT EXISTS FOR (c:PyCallable) ON (c.name)", diff --git a/test/test_dataflow_cpg.py b/test/test_dataflow_cpg.py index 8a303e6..279ac09 100644 --- a/test/test_dataflow_cpg.py +++ b/test/test_dataflow_cpg.py @@ -1,9 +1,9 @@ """Stage-8b gate: the CPG projection of the level-3 graphs. -- CFGNode row count equals the JSON section's node count (CFG + parameter +- PyCFGNode row count equals the JSON section's node count (CFG + parameter nodes) — the contract's count-parity assertion; -- every CFG_NEXT/CDG/DDG/PARAM_IN/PARAM_OUT/SUMMARY edge endpoint references - an emitted CFGNode id (deferred-edge/no-dangling gate); +- every PY_CFG_NEXT/PY_CDG/PY_DDG/PY_PARAM_IN/PY_PARAM_OUT/PY_SUMMARY edge + endpoint references an emitted PyCFGNode id (deferred-edge/no-dangling gate); - the Cypher snapshot renders and contains the overlay's vocabulary. 
Loading into a live Neo4j is exercised by the (container-gated) bolt tests; @@ -21,7 +21,7 @@ FIXTURE = Path(__file__).parent / "fixtures" / "single_functionalities" / "dataflow" -CPG_EDGE_TYPES = {"CFG_NEXT", "CDG", "DDG", "PARAM_IN", "PARAM_OUT", "SUMMARY"} +CPG_EDGE_TYPES = {"PY_CFG_NEXT", "PY_CDG", "PY_DDG", "PY_PARAM_IN", "PY_PARAM_OUT", "PY_SUMMARY"} @pytest.fixture(scope="module") @@ -44,31 +44,31 @@ def test_cfg_node_count_matches_the_json_section(level3_app, rows): len(fg.cfg.nodes if fg.cfg else []) + len(fg.param_nodes or []) for fg in level3_app.program_graphs.functions.values() ) - emitted = [n for n in rows.nodes if "CFGNode" in n.labels] + emitted = [n for n in rows.nodes if "PyCFGNode" in n.labels] assert expected > 0 assert len(emitted) == expected def test_no_dangling_cpg_edge_endpoints(rows): - cfg_ids = {n.value for n in rows.nodes if "CFGNode" in n.labels} + cfg_ids = {n.value for n in rows.nodes if "PyCFGNode" in n.labels} cpg_edges = [e for e in rows.edges if e.type in CPG_EDGE_TYPES] assert cpg_edges, "no CPG edges projected" for e in cpg_edges: - if e.from_ref.label == "CFGNode": + if e.from_ref.label == "PyCFGNode": assert e.from_ref.value in cfg_ids, e - if e.to_ref.label == "CFGNode": + if e.to_ref.label == "PyCFGNode": assert e.to_ref.value in cfg_ids, e def test_every_callable_with_graphs_owns_its_cfg_nodes(level3_app, rows): - has_edges = [e for e in rows.edges if e.type == "HAS_CFG_NODE"] + has_edges = [e for e in rows.edges if e.type == "PY_HAS_CFG_NODE"] owned = {e.to_ref.value for e in has_edges} - cfg_ids = {n.value for n in rows.nodes if "CFGNode" in n.labels} + cfg_ids = {n.value for n in rows.nodes if "PyCFGNode" in n.labels} assert owned == cfg_ids, "every CFGNode must be owned by its callable" def test_cypher_snapshot_renders_the_overlay(level3_app, rows): cypher = render_cypher(rows, "dataflow-fixture") - assert ":CFGNode" in cypher + assert ":PyCFGNode" in cypher for t in CPG_EDGE_TYPES: assert t in cypher, f"{t} missing 
from the snapshot"