From ee69180fd8f1add01cd8d874fac3e5e7ce36e3b1 Mon Sep 17 00:00:00 2001 From: robertyluo Date: Mon, 29 Jun 2026 23:10:30 +0800 Subject: [PATCH] feat(config): add `include` to force gitignored first-party source into the index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `.gitignore` keeps files out of the index, which is wrong when the gitignored files are real first-party source tracked by a SECOND VCS. In a project that uses SVN/Perforce alongside Git, some source is committed to that VCS and deliberately `.gitignore`d so it never lands in Git. Git never lists those files, so CodeGraph never indexed them — and neither existing knob helped: `includeIgnored` only revives *embedded git repos* inside an ignored dir (findIgnoredEmbeddedRepos/findNestedGitRepos), and `exclude` is the opposite (drop tracked files, #999). Add a project-level `include` whitelist to `codegraph.json`: gitignore-style patterns whose matching source files are indexed even when gitignored / not git-tracked. { "include": ["Tools/", "Tools/**", "Local/typescript/"] } Implementation: - project-config: parse/validate/cache `include` exactly like `exclude` (warn-and-skip on malformed input, never throw), exposed as `loadIncludePatterns`. - extraction: - `collectIncludedFiles` actively discovers the whitelisted files off disk (git can't list them) via a targeted walk of each pattern's static prefix (`includeStaticRoots`), overriding `.gitignore` — but never resurfacing a built-in default-ignored dir (node_modules/dist/…), `.git`, or the CodeGraph data dir, and honoring `exclude`. - union those files into BOTH enumeration paths: `getGitVisibleFiles` (git) and `scanDirectoryWalk` (non-git). `sync` rides the same scan, so adds/mods/removals of included files reconcile automatically. 
- make `ScopeIgnore` include-aware so the file watcher watches the included files and the gitignored directories leading to them; `exclude` still wins and default-ignored dirs are still pruned. Precedence: exclude > include > .gitignore/defaults; built-in default-ignored dirs are never re-included. Docs: configuration.md, README, and a CHANGELOG [Unreleased] entry. Tests: __tests__/include-config.test.ts — loader (parse/validate/cache), `scanDirectory` behavior on git and non-git paths, recursive `**` glob, exclude-wins, no node_modules resurrection, and `buildScopeIgnore` (watcher-facing) scope. Co-authored-by: Cursor --- CHANGELOG.md | 3 + README.md | 15 ++ __tests__/include-config.test.ts | 242 ++++++++++++++++++ .../docs/getting-started/configuration.md | 26 +- src/extraction/index.ts | 198 +++++++++++++- src/project-config.ts | 67 ++++- 6 files changed, 545 insertions(+), 6 deletions(-) create mode 100644 __tests__/include-config.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 143f8e92f..ecff3b094 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### New Features + +- You can now force gitignored first-party source **into** the index with an `include` list in `codegraph.json`. The case this solves: a project tracked by a second VCS (SVN, Perforce, …) alongside Git, where some real source is committed to that VCS and deliberately listed in `.gitignore` so it never lands in Git — git never lists those files, so CodeGraph never indexed them, and neither `includeIgnored` (which only revives *embedded git repositories* inside a gitignored directory) nor `exclude` (its opposite) could help. 
Add a root `codegraph.json` with, e.g., `{ "include": ["Tools/", "Local/typescript/"] }` and CodeGraph discovers those files directly off disk — overriding `.gitignore` — and indexes them during a full index, incremental `sync`, and file-watching, on both git and non-git projects. Patterns are gitignore-style and matched against project-root-relative paths (a directory, a recursive `**` glob, or a single file). An explicit `exclude` still wins, and built-in skips like `node_modules`, `dist`, and `.git` are never re-included. This complements the existing `exclude` (its opposite — keep tracked files *out*) and `includeIgnored` (opt *in* to gitignored embedded repos). ## [1.1.3] - 2026-06-29 diff --git a/README.md b/README.md index 7b6c33932..2b5cfef2c 100644 --- a/README.md +++ b/README.md @@ -618,6 +618,21 @@ watch: } ``` +Conversely, when real source is gitignored on purpose — a project under a second +VCS (SVN, Perforce) that `.gitignore`s its own source so it stays out of Git — +force it back in with `include` (the opposite of `exclude`; `includeIgnored` +only revives embedded git repos, not plain source): + +```json +{ + "include": ["Tools/", "Local/typescript/"] +} +``` + +CodeGraph discovers those files off disk, overriding `.gitignore`, on index, +sync, and watch. An explicit `exclude` still wins, and built-in skips +(`node_modules`, `dist`, `.git`) are never re-included. + ### Custom file extensions If your project uses a non-standard extension for a [supported diff --git a/__tests__/include-config.test.ts b/__tests__/include-config.test.ts new file mode 100644 index 000000000..27bc1be22 --- /dev/null +++ b/__tests__/include-config.test.ts @@ -0,0 +1,242 @@ +/** + * `codegraph.json` `include` — force first-party source INTO the index even when + * `.gitignore` would drop it. 
+ * + * The whitelist `includeIgnored` never was: that one only revives *embedded git + * repos* inside ignored dirs (#622/#699), so pure source gitignored out of Git + * (the SVN+Git dual-VCS case — committed to SVN, `.gitignore`d so it never lands + * in Git) had no way in. Three layers under test: + * 1. Loader: parse/validate/cache, mirroring the `exclude` loader. + * 2. Behavior: `scanDirectory` adds included paths on BOTH the git + * (`git ls-files`) and non-git (filesystem walk) enumeration paths. + * 3. Scope: `buildScopeIgnore` (the watcher's source of truth) treats an + * included file — and the gitignored dirs leading to it — as not-ignored. + * + * Invariants: an explicit `exclude` still wins; built-in default-ignored dirs + * (`node_modules`, …) are never resurfaced; every loader failure mode degrades + * to the zero-config default (force nothing in), never a throw. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'node:fs'; +import * as path from 'node:path'; +import * as os from 'node:os'; +import { execFileSync } from 'node:child_process'; +import { + loadIncludePatterns, + loadExcludePatterns, + loadExtensionOverrides, + loadIncludeIgnoredPatterns, + clearProjectConfigCache, +} from '../src/project-config'; +import { scanDirectory, buildScopeIgnore } from '../src/extraction'; + +describe('include loader (codegraph.json)', () => { + let dir: string; + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-include-')); + clearProjectConfigCache(); + }); + afterEach(() => { + clearProjectConfigCache(); + fs.rmSync(dir, { recursive: true, force: true }); + }); + const writeConfig = (obj: unknown) => + fs.writeFileSync( + path.join(dir, 'codegraph.json'), + typeof obj === 'string' ? 
obj : JSON.stringify(obj) + ); + + it('returns an empty list when there is no codegraph.json (the default)', () => { + expect(loadIncludePatterns(dir)).toEqual([]); + }); + + it('loads a well-formed pattern array', () => { + writeConfig({ include: ['Tools/', 'Local/**'] }); + expect(loadIncludePatterns(dir)).toEqual(['Tools/', 'Local/**']); + }); + + it('trims whitespace and drops blank / non-string entries', () => { + writeConfig({ include: [' Tools/ ', '', ' ', 42, null, 'Local/'] }); + expect(loadIncludePatterns(dir)).toEqual(['Tools/', 'Local/']); + }); + + it('ignores a non-array include value without throwing', () => { + writeConfig({ include: 'Tools/' }); + expect(loadIncludePatterns(dir)).toEqual([]); + }); + + it('ignores malformed JSON without throwing', () => { + writeConfig('{ not: valid json '); + expect(loadIncludePatterns(dir)).toEqual([]); + }); + + it('coexists with extensions / includeIgnored / exclude in one file (shared single parse)', () => { + writeConfig({ + extensions: { '.foo': 'typescript' }, + includeIgnored: ['pkgs/'], + exclude: ['static/'], + include: ['Tools/'], + }); + expect(loadExtensionOverrides(dir)).toEqual({ '.foo': 'typescript' }); + expect(loadIncludeIgnoredPatterns(dir)).toEqual(['pkgs/']); + expect(loadExcludePatterns(dir)).toEqual(['static/']); + expect(loadIncludePatterns(dir)).toEqual(['Tools/']); + }); + + it('picks up a changed config (mtime-invalidated cache)', () => { + writeConfig({ include: ['Tools/'] }); + expect(loadIncludePatterns(dir)).toEqual(['Tools/']); + + writeConfig({ include: ['Local/'] }); + const future = new Date(Date.now() + 2000); + fs.utimesSync(path.join(dir, 'codegraph.json'), future, future); + + expect(loadIncludePatterns(dir)).toEqual(['Local/']); + }); + + it('drops the patterns again when the config file is removed', () => { + writeConfig({ include: ['Tools/'] }); + expect(loadIncludePatterns(dir)).toEqual(['Tools/']); + fs.rmSync(path.join(dir, 'codegraph.json')); + 
expect(loadIncludePatterns(dir)).toEqual([]); + }); +}); + +describe('include behavior — scanDirectory force-indexes gitignored source', () => { + let dir: string; + const mk = (rel: string, content = 'export const x = 1;\n') => { + const p = path.join(dir, rel); + fs.mkdirSync(path.dirname(p), { recursive: true }); + fs.writeFileSync(p, content); + }; + const writeConfig = (obj: unknown) => + fs.writeFileSync(path.join(dir, 'codegraph.json'), JSON.stringify(obj)); + const scan = () => scanDirectory(dir).map((f) => f.replace(/\\/g, '/')); + + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-include-scan-')); + clearProjectConfigCache(); + }); + afterEach(() => { + clearProjectConfigCache(); + fs.rmSync(dir, { recursive: true, force: true }); + }); + + const gitInit = () => { + execFileSync('git', ['init', '-q'], { cwd: dir }); + execFileSync('git', ['add', '-A'], { cwd: dir }); + execFileSync('git', ['-c', 'user.email=a@b.c', '-c', 'user.name=t', 'commit', '-qm', 'x'], { cwd: dir }); + }; + + it('indexes a .gitignored source dir when include opts it in (git path) — the core fix', () => { + mk('app/main.ts'); + mk('Tools/gen.py', 'def gen():\n return 1\n'); + fs.writeFileSync(path.join(dir, '.gitignore'), 'Tools/\n'); // SVN-only source, kept out of Git + gitInit(); // Tools/ is gitignored → NOT tracked + + // Sanity: without include the gitignored source is invisible. + let files = scan(); + expect(files).toContain('app/main.ts'); + expect(files.some((f) => f.startsWith('Tools/'))).toBe(false); + + // With include the gitignored source is forced in, app code still there. 
+ writeConfig({ include: ['Tools/'] }); + clearProjectConfigCache(); + files = scan(); + expect(files).toContain('app/main.ts'); + expect(files).toContain('Tools/gen.py'); + }); + + it('forces gitignored source in on the non-git filesystem-walk path too', () => { + mk('app/main.ts'); + mk('Tools/gen.py', 'def gen():\n return 1\n'); + fs.writeFileSync(path.join(dir, '.gitignore'), 'Tools/\n'); + // No git init → scanDirectory falls back to the filesystem walk (which still + // honours .gitignore), so Tools/ must be re-added by include. + writeConfig({ include: ['Tools/'] }); + clearProjectConfigCache(); + const files = scan(); + expect(files).toContain('app/main.ts'); + expect(files).toContain('Tools/gen.py'); + }); + + it('supports a recursive ** glob and nested dirs', () => { + mk('src/a.ts'); + mk('Local/ts/a.ts'); + mk('Local/ts/nested/b.ts'); + fs.writeFileSync(path.join(dir, '.gitignore'), 'Local/\n'); + gitInit(); + writeConfig({ include: ['Local/**'] }); + clearProjectConfigCache(); + const files = scan(); + expect(files).toContain('Local/ts/a.ts'); + expect(files).toContain('Local/ts/nested/b.ts'); + }); + + it('lets an explicit exclude win over include', () => { + mk('Tools/keep.py', 'def k():\n return 1\n'); + mk('Tools/secret/drop.py', 'def d():\n return 1\n'); + fs.writeFileSync(path.join(dir, '.gitignore'), 'Tools/\n'); + gitInit(); + writeConfig({ include: ['Tools/'], exclude: ['Tools/secret/'] }); + clearProjectConfigCache(); + const files = scan(); + expect(files).toContain('Tools/keep.py'); + expect(files.some((f) => f.startsWith('Tools/secret/'))).toBe(false); + }); + + it('never resurrects a built-in default-ignored dir (node_modules) via include', () => { + mk('src/a.ts'); + mk('node_modules/pkg/index.js'); + gitInit(); + // Even explicitly opting node_modules in must not pull it into the graph. 
+ writeConfig({ include: ['node_modules/'] }); + clearProjectConfigCache(); + const files = scan(); + expect(files).toContain('src/a.ts'); + expect(files.some((f) => f.startsWith('node_modules/'))).toBe(false); + }); + + it('is a no-op with no include config (gitignored source stays out)', () => { + mk('app/main.ts'); + mk('Tools/gen.py', 'def gen():\n return 1\n'); + fs.writeFileSync(path.join(dir, '.gitignore'), 'Tools/\n'); + gitInit(); + const files = scan(); + expect(files).toContain('app/main.ts'); + expect(files.some((f) => f.startsWith('Tools/'))).toBe(false); + }); +}); + +describe('include scope — buildScopeIgnore keeps included paths watchable', () => { + let dir: string; + beforeEach(() => { + dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-include-scope-')); + clearProjectConfigCache(); + execFileSync('git', ['init', '-q'], { cwd: dir }); + fs.writeFileSync(path.join(dir, '.gitignore'), 'Tools/\nOther/\n'); + fs.writeFileSync(path.join(dir, 'codegraph.json'), JSON.stringify({ include: ['Tools/'] })); + }); + afterEach(() => { + clearProjectConfigCache(); + fs.rmSync(dir, { recursive: true, force: true }); + }); + + it('does not ignore an included file, nor the gitignored dir leading to it', () => { + const scope = buildScopeIgnore(dir); + // The included file and its (gitignored) directory are watchable. + expect(scope.ignores('Tools/gen.py')).toBe(false); + expect(scope.ignores('Tools/')).toBe(false); + // A different gitignored dir that was NOT opted in stays ignored. 
+ expect(scope.ignores('Other/')).toBe(true); + expect(scope.ignores('Other/x.py')).toBe(true); + }); + + it('still ignores everything when no include is configured', () => { + fs.writeFileSync(path.join(dir, 'codegraph.json'), JSON.stringify({})); + clearProjectConfigCache(); + const scope = buildScopeIgnore(dir); + expect(scope.ignores('Tools/gen.py')).toBe(true); + expect(scope.ignores('Tools/')).toBe(true); + }); +}); diff --git a/site/src/content/docs/getting-started/configuration.md b/site/src/content/docs/getting-started/configuration.md index 9d9abeb24..d63784e72 100644 --- a/site/src/content/docs/getting-started/configuration.md +++ b/site/src/content/docs/getting-started/configuration.md @@ -1,9 +1,9 @@ --- title: Configuration -description: CodeGraph is zero-config by default, with one optional codegraph.json for custom extensions, excluding tracked directories, and indexing nested git repositories. +description: CodeGraph is zero-config by default, with one optional codegraph.json for custom extensions, excluding tracked directories, indexing gitignored source, and indexing nested git repositories. --- -Next to none — CodeGraph is **zero-config by default**, with nothing to write or keep in sync to get started. Language support is automatic from the file extension; there's nothing to wire up per language. The one optional file, `codegraph.json`, covers [custom file extensions](#custom-file-extensions), [excluding tracked directories](#excluding-a-tracked-directory), and [indexing nested git repositories](#indexing-nested-git-repositories). +Next to none — CodeGraph is **zero-config by default**, with nothing to write or keep in sync to get started. Language support is automatic from the file extension; there's nothing to wire up per language. 
The one optional file, `codegraph.json`, covers [custom file extensions](#custom-file-extensions), [excluding tracked directories](#excluding-a-tracked-directory), [indexing gitignored source](#indexing-gitignored-source-a-second-vcs), and [indexing nested git repositories](#indexing-nested-git-repositories). ## What it skips out of the box @@ -31,6 +31,28 @@ Each entry is a gitignore-style pattern, matched against project-root-relative p Re-index (`codegraph index`) after adding or changing `exclude`. +## Indexing gitignored source (a second VCS) + +`.gitignore` keeps files out of the index — which is usually what you want, but not when the gitignored files are real first-party source. The case this exists for: a project tracked by **SVN, Perforce, or another VCS alongside Git**, where some source is committed to that VCS and deliberately listed in `.gitignore` so it never lands in Git. That source is still yours and you want it in the graph, but git never lists it, so CodeGraph never sees it. (`includeIgnored` doesn't help — it only revives *embedded git repositories* inside a gitignored directory, not plain source.) + +List those paths under `include` in `codegraph.json` to force them in: + +```json +{ + "include": ["Tools/", "Local/typescript/"] +} +``` + +Each entry is a gitignore-style pattern, matched against project-root-relative paths (a directory like `"Tools/"`, a recursive `"Tools/**"` glob, or a single file all work). CodeGraph discovers the matching files directly off disk — overriding `.gitignore` — and indexes them everywhere it looks at files: the full index, incremental `sync`, and file-watching. + +A few things to know: + +- An explicit [`exclude`](#excluding-a-tracked-directory) still wins — listing the same path in both keeps it out. +- Built-in skips like `node_modules`, `dist`, and `.git` are never re-included, even when an `include` pattern would match inside them. 
+- This is the opposite of `exclude` (which keeps tracked files *out*); it's for source git itself never tracks. + +Re-index (`codegraph index`) after adding or changing `include`. + ## Custom file extensions If your project uses a non-standard extension for a [supported language](/codegraph/reference/languages/) — say `.dota_lua` for Lua, or `.tpl` for PHP — those files are skipped by default, because the extension isn't one CodeGraph recognizes. Map them with an optional `codegraph.json` at your project root: diff --git a/src/extraction/index.ts b/src/extraction/index.ts index f96ec6099..6b40850ef 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -21,7 +21,7 @@ import { QueryBuilder } from '../db/queries'; import { extractFromSource } from './tree-sitter'; import { ParseWorkerPool, resolveParsePoolSize } from './parse-pool'; import { detectLanguage, isSourceFile, isLanguageSupported, isFileLevelOnlyLanguage, initGrammars, loadGrammarsForLanguages } from './grammars'; -import { loadExtensionOverrides, loadIncludeIgnoredPatterns, loadExcludePatterns } from '../project-config'; +import { loadExtensionOverrides, loadIncludeIgnoredPatterns, loadExcludePatterns, loadIncludePatterns } from '../project-config'; import { isCodeGraphDataDir } from '../directory'; import { logDebug, logWarn } from '../errors'; import { validatePathWithinRoot, normalizePath } from '../utils'; @@ -322,6 +322,153 @@ function loadExcludeMatcher(rootDir: string): Ignore | null { return patterns.length > 0 ? ignore().add(patterns) : null; } +/** + * Matcher for the project's `codegraph.json` `include` patterns — first-party + * source to force INTO the index even when `.gitignore` drops it (the general + * whitelist `includeIgnored` never was — that one only revives *embedded git + * repos*). The case it exists for: a project under a second VCS (SVN/Perforce) + * `.gitignore`s its own real source so it stays out of Git, yet we still want it + * indexed. 
Returns `null` when nothing is force-included (the zero-config + * default → no overhead, no extra walk). Built once per scan/sync/scope + * operation from the scan root. + */ +function loadIncludeMatcher(rootDir: string): Ignore | null { + const patterns = loadIncludePatterns(rootDir); + return patterns.length > 0 ? ignore().add(patterns) : null; +} + +/** Glob metacharacters that end the static (literal) prefix of an `include` pattern. */ +const GLOB_META = /[*?[\]{}!]/; + +/** + * The static directory prefix of each `include` pattern — the literal leading + * path up to the first glob segment — trailing-slashed, used to (a) walk only + * the opted-in subtrees in `collectIncludedFiles` and (b) let `ScopeIgnore` keep + * the watcher descending toward them. `Tools/` stays `Tools/`; a recursive + * `Tools/**` glob yields `Tools/`; `src/local/file.ts` yields `src/local/` (the + * file's dir); a pattern that starts with a glob (like a leading `**`) yields + * `''`, meaning "no static root — walk the whole tree". Duplicates and roots + * nested under a broader root are collapsed so each subtree is walked once. + */ +function includeStaticRoots(patterns: string[]): string[] { + const roots = new Set(); + for (const pattern of patterns) { + let p = pattern.replace(/^\/+/, ''); + const trailingSlash = p.endsWith('/'); + if (trailingSlash) p = p.slice(0, -1); + const segs = p.split('/').filter(Boolean); + const lead: string[] = []; + for (const s of segs) { + if (GLOB_META.test(s)) break; + lead.push(s); + } + const hadWildcard = lead.length < segs.length; + // A wholly-literal pattern with no trailing slash names a file (or a dir we + // can't tell apart) — drop its last segment so we walk the containing dir + // and let the matcher pick the file. A trailing slash or a glob means the + // remaining `lead` is already the directory to walk. 
+ if (!hadWildcard && !trailingSlash && lead.length > 0) lead.pop(); + if (lead.length === 0) { + roots.clear(); + roots.add(''); + return ['']; // a top-level glob forces a whole-tree walk; nothing narrower matters + } + roots.add(lead.join('/') + '/'); + } + // Collapse roots nested under a broader one (e.g. drop `a/b/` if `a/` is present). + const all = [...roots]; + return all.filter((r) => !all.some((other) => other !== r && r.startsWith(other))); +} + +/** + * Actively discover the source files an `include` whitelist forces in. `git + * ls-files` never lists gitignored files, so a filtered filesystem walk of just + * the opted-in subtrees (`includeStaticRoots`) is the only way to find them. + * Returns project-root-relative, normalized source-file paths. + * + * A file is collected when it MATCHES `include`, is NOT hit by `exclude` (an + * explicit exclude always wins), is a recognized source file, and does not live + * under a built-in default-ignored dir (`node_modules`, `dist`, …), `.git`, or + * CodeGraph's data dir — those are never resurfaced, mirroring `ScopeIgnore`. + * `.gitignore` is deliberately NOT consulted: overriding it is the whole point. 
+ */ +function collectIncludedFiles( + rootDir: string, + include: Ignore, + exclude: Ignore | null, + roots: string[], + overrides: Record, +): Set { + const out = new Set(); + const defaults = defaultsOnlyIgnore(); + const visited = new Set(); + + const consider = (abs: string, rel: string, isDir: boolean): void => { + if (isDir) { + if (defaults.ignores(rel + '/')) return; // never node_modules/dist/… via include + walk(abs); + } else { + if (defaults.ignores(rel)) return; + if (!include.ignores(rel)) return; + if (exclude && exclude.ignores(rel)) return; + if (!isSourceFile(rel, overrides)) return; + out.add(rel); + } + }; + + function walk(absDir: string): void { + let realDir: string; + try { + realDir = fs.realpathSync(absDir); + } catch { + return; + } + if (visited.has(realDir)) return; // symlink-cycle guard + visited.add(realDir); + + let entries: fs.Dirent[]; + try { + entries = fs.readdirSync(absDir, { withFileTypes: true }); + } catch { + return; + } + for (const entry of entries) { + if (entry.name === '.git' || isCodeGraphDataDir(entry.name)) continue; + const abs = path.join(absDir, entry.name); + const rel = normalizePath(path.relative(rootDir, abs)); + if (!rel || rel.startsWith('..')) continue; + if (entry.isSymbolicLink()) { + try { + const st = fs.statSync(fs.realpathSync(abs)); + consider(abs, rel, st.isDirectory()); + } catch { + // broken symlink — skip + } + continue; + } + consider(abs, rel, entry.isDirectory()); + } + } + + for (const root of roots) { + walk(root === '' ? rootDir : path.join(rootDir, root)); + } + return out; +} + +/** + * The included source files (`codegraph.json` `include`) for a scan root, or an + * empty set when nothing is force-included. Centralizes loading the matcher, + * roots, exclude, and overrides so both enumeration paths (git and filesystem + * walk) add the same files. 
+ */ +function collectIncludedFilesForRoot(rootDir: string): Set { + const include = loadIncludeMatcher(rootDir); + if (!include) return new Set(); + const roots = includeStaticRoots(loadIncludePatterns(rootDir)); + return collectIncludedFiles(rootDir, include, loadExcludeMatcher(rootDir), roots, loadExtensionOverrides(rootDir)); +} + /** * `git ls-files --directory` collapses a wholly-untracked/ignored directory into * one entry — and when the command's own cwd is such a directory (the indexed @@ -469,6 +616,17 @@ export class ScopeIgnore { * exclude applies even to tracked files and even inside embedded repos. */ private exclude: Ignore | null = null, + /** + * Project `codegraph.json` `include` patterns — first-party source forced + * INTO the index despite `.gitignore`. When a path matches, it is NOT + * ignored (so the watcher watches it), overriding `.gitignore`/`rootMatcher` + * — but never `exclude` (checked first) and never a built-in default-ignored + * dir. `includeRoots` are the static prefixes so a gitignored ANCESTOR + * directory of an included subtree still isn't pruned by the directory + * walker/watcher. + */ + private include: Ignore | null = null, + private includeRoots: string[] = [], ) { // Longest root first so paths in nested embedded repos hit the innermost matcher. this.embedded = [...embedded].sort((a, b) => b.root.length - a.root.length); @@ -479,6 +637,18 @@ export class ScopeIgnore { // path: it must drop git-TRACKED paths (which `.gitignore` can't) and apply // everywhere, including ancestors of embedded repos. if (this.exclude && this.exclude.ignores(rel)) return true; + // User `include`: force first-party source in despite `.gitignore`. Never + // resurfaces a built-in default-ignored dir (node_modules/dist/…), so an + // include pattern can't accidentally pull in dependency/build trees. 
+ if (this.include && !this.defaults.ignores(rel)) { + if (rel.endsWith('/')) { + // A directory on (or leading to) an included subtree must stay walkable + // so the watcher/walker descends to reach the forced-in files. + if (this.includeRoots.some((r) => r.startsWith(rel) || rel.startsWith(r))) return false; + } else if (this.include.ignores(rel)) { + return false; + } + } for (const { root, matcher } of this.embedded) { if (rel.startsWith(root)) { const inner = rel.slice(root.length); @@ -504,10 +674,13 @@ export class ScopeIgnore { */ export function buildScopeIgnore(rootDir: string, embeddedRoots?: Iterable): ScopeIgnore { const roots = embeddedRoots ? [...embeddedRoots] : discoverEmbeddedRepoRoots(rootDir); + const include = loadIncludeMatcher(rootDir); return new ScopeIgnore( buildDefaultIgnore(rootDir), roots.map((root) => ({ root, matcher: buildDefaultIgnore(path.join(rootDir, root)) })), loadExcludeMatcher(rootDir), + include, + include ? includeStaticRoots(loadIncludePatterns(rootDir)) : [], ); } @@ -755,7 +928,14 @@ function getGitVisibleFiles(rootDir: string): Set | null { // not the parent's: the parent's .gitignore hides the child repo from git, // not from the index. (#514) const ig = buildScopeIgnore(rootDir, embeddedRoots); - return new Set([...files].filter((f) => !ig.ignores(f))); + const visible = new Set([...files].filter((f) => !ig.ignores(f))); + // Force-include first-party source the project whitelisted in + // `codegraph.json` `include`. These are gitignored, so `git ls-files` never + // listed them above — discover them directly off disk and add them. (The + // common SVN+Git dual-VCS case: source committed to SVN, gitignored out of + // Git, but still wanted in the graph.) 
+ for (const f of collectIncludedFilesForRoot(rootDir)) visible.add(f); + return visible; } catch { return null; } @@ -1062,6 +1242,20 @@ function scanDirectoryWalk( const exclude = loadExcludeMatcher(rootDir); if (exclude) baseMatchers.push({ dir: rootDir, ig: exclude }); walk(rootDir, baseMatchers); + + // Force-include first-party source whitelisted in `codegraph.json` `include` + // — the walk above honours `.gitignore`, so anything gitignored was dropped; + // add it back here (deduped). Mirrors the git path's union. + const included = collectIncludedFilesForRoot(rootDir); + if (included.size > 0) { + const seen = new Set(files); + for (const f of included) { + if (!seen.has(f)) { + files.push(f); + seen.add(f); + } + } + } return files; } diff --git a/src/project-config.ts b/src/project-config.ts index b27f08d40..ed03d7fc7 100644 --- a/src/project-config.ts +++ b/src/project-config.ts @@ -53,6 +53,20 @@ export interface ProjectConfig { * and your `.gitignore`. */ exclude?: string[]; + /** + * Gitignore-style patterns for first-party source to force INTO the index even + * when `.gitignore` would drop it — the general whitelist `includeIgnored` + * never was (that one only revives *embedded git repos* inside ignored dirs). + * The case this exists for: a project under a second VCS (SVN, Perforce, …) + * deliberately `.gitignore`s its own real source so it never lands in Git, yet + * that source must still be indexed. Matched against project-root-relative + * paths, so `"Tools/"`, a recursive `"Tools/**"` glob, or `"Local/typescript"` + * all work. + * Built-in default-ignored dirs (`node_modules`, `dist`, …), `.git`, and + * CodeGraph's own data dir are never resurfaced; an explicit `exclude` still + * wins. Absent/empty (the default) forces nothing in. + */ + include?: string[]; } /** Parsed, validated view of a project's `codegraph.json`. 
*/ @@ -60,6 +74,7 @@ interface ParsedConfig { extensions: Record; includeIgnored: string[]; exclude: string[]; + include: string[]; } interface CacheEntry { @@ -81,6 +96,7 @@ const EMPTY_CONFIG: ParsedConfig = Object.freeze({ extensions: EMPTY_EXTENSIONS, includeIgnored: Object.freeze([]) as unknown as string[], exclude: Object.freeze([]) as unknown as string[], + include: Object.freeze([]) as unknown as string[], }); /** @@ -132,10 +148,16 @@ function parseConfig(file: string): ParsedConfig { const extensions = extractExtensions(parsed, file); const includeIgnored = extractIncludeIgnored(parsed, file); const exclude = extractExclude(parsed, file); - if (extensions === EMPTY_EXTENSIONS && includeIgnored.length === 0 && exclude.length === 0) { + const include = extractInclude(parsed, file); + if ( + extensions === EMPTY_EXTENSIONS && + includeIgnored.length === 0 && + exclude.length === 0 && + include.length === 0 + ) { return EMPTY_CONFIG; } - return { extensions, includeIgnored, exclude }; + return { extensions, includeIgnored, exclude, include }; } /** @@ -214,6 +236,34 @@ function extractExclude(parsed: object, file: string): string[] { return out; } +/** + * Validate the `include` patterns: an array of non-empty gitignore-style strings + * naming first-party source to force INTO the index despite `.gitignore` — the + * whitelist for SVN/Perforce-only source a project gitignores out of Git (the + * general case `includeIgnored` never covered). A non-array value or a + * non-string/blank entry warns-and-skips; never throws. Patterns are kept + * verbatim (trimmed) so they match exactly as a `.gitignore` line would, against + * project-root-relative paths. 
+ */ +function extractInclude(parsed: object, file: string): string[] { + const raw = (parsed as ProjectConfig).include; + if (raw === undefined) return []; + if (!Array.isArray(raw)) { + logWarn(`Ignoring "include" in ${PROJECT_CONFIG_FILENAME}: must be an array of gitignore-style patterns`, { file }); + return []; + } + + const out: string[] = []; + for (const entry of raw) { + if (typeof entry !== 'string' || !entry.trim()) { + logWarn(`Ignoring an "include" entry in ${PROJECT_CONFIG_FILENAME}: every pattern must be a non-empty string`, { file }); + continue; + } + out.push(entry.trim()); + } + return out; +} + /** * Load the parsed `codegraph.json` for a project, mtime-cached. A missing or * malformed file yields the zero-config default. One `stat` (and at most one @@ -275,6 +325,19 @@ export function loadExcludePatterns(rootDir: string): string[] { return loadParsedConfig(rootDir).exclude; } +/** + * Load the validated `include` patterns for a project, mtime-cached. + * + * These name first-party source to force INTO the index even when `.gitignore` + * would drop it — the whitelist for SVN/Perforce-only source a project + * gitignores out of Git. An empty result — the zero-config default — forces + * nothing in. Built-in default-ignored dirs, `.git`, and CodeGraph's data dir + * are never resurfaced, and an explicit `exclude` still wins. + */ +export function loadIncludePatterns(rootDir: string): string[] { + return loadParsedConfig(rootDir).include; +} + /** Test/maintenance hook: forget cached config (e.g. after rewriting it in a test). */ export function clearProjectConfigCache(): void { cache.clear();