Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ java-codebase-rag install --non-interactive --agent claude-code

After `pip install --upgrade java-codebase-rag`, run `java-codebase-rag update` to refresh shipped artifacts and catch up the index (Lance + graph).

All indexing lifecycle commands (`init`, `increment`, `reprocess`, `install`, `update`) show a unified `Vectors → Optimize → Graph` progress bar on stderr during the index build (powered by `rich`); pass `--quiet` to suppress it.

### Manual registration

If you prefer manual configuration, see [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) for the full CLI reference.
Expand Down
102 changes: 95 additions & 7 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from __future__ import annotations

import argparse
import contextlib
import hashlib
import json
import logging
Expand Down Expand Up @@ -84,6 +85,53 @@ def _verbose_stderr_line(content: str) -> None:
print(content, file=sys.stderr, flush=True)


def _emit_graph_progress(parts: dict[str, object], *, verbose: bool) -> None:
    """Write a single ``JCIRAG_PROGRESS kind=graph …`` record to stderr.

    The record is only produced when ``verbose`` is true. The parent process
    (``pipeline.run_build_ast_graph`` / ``run_incremental_graph``) passes
    ``--verbose`` in both default and verbose modes and withholds it only for
    ``--quiet``; so the structured line is rendered by the parent in default
    mode, relayed raw in verbose mode, and never emitted in quiet mode.

    Args:
        parts: Progress fields to report. Only the keys ``pass``, ``done``,
            ``total``, ``status`` and ``elapsed_s`` are used; any other key is
            ignored. Values are rendered with their ``str()`` form.
        verbose: When false the call is a no-op.

    The fields always appear in the fixed order listed above (after the
    leading ``kind=graph``) so the parser and tests can pin substrings.
    """
    if verbose:
        ordered_keys = ("pass", "done", "total", "status", "elapsed_s")
        tokens = [
            "kind=graph",
            # Keep only the fields the caller supplied, in protocol order.
            *(f"{name}={parts[name]}" for name in ordered_keys if name in parts),
        ]
        _verbose_stderr_line("JCIRAG_PROGRESS " + " ".join(tokens))


# Pass-1 per-file tick cadence: a progress line is emitted only when the parsed
# file count is a multiple of this value, which bounds stderr volume on huge
# trees without making the bar feel stale. A final tick on pass completion
# carries status=done.
_PASS1_TICK_EVERY = 25


@contextlib.contextmanager
def _graph_pass_progress(pass_label: str, *, verbose: bool):
    """Bracket one graph pass with structured progress records.

    Emits ``pass=<label> status=running`` on entry and
    ``pass=<label> status=done elapsed_s=<seconds>`` on exit. Intended for
    passes 2–6, each of which advances the rendered bar by 1/6.

    Args:
        pass_label: Pass identifier as it should appear in the record,
            e.g. ``"2/6"``.
        verbose: When false nothing is emitted and no timing is taken; the
            wrapped body simply runs.

    Usage: ``with _graph_pass_progress("2/6", verbose=verbose): …``
    """
    if not verbose:
        yield
        return
    _emit_graph_progress({"pass": pass_label, "status": "running"}, verbose=verbose)
    # Use a monotonic clock for the duration: wall-clock time can jump
    # (NTP sync, manual changes) and would yield a wrong or negative elapsed_s.
    started = time.monotonic()
    try:
        yield
    finally:
        # The closing record is emitted from ``finally`` so it is written even
        # when the wrapped body raises; the exception still propagates.
        elapsed = time.monotonic() - started
        _emit_graph_progress(
            {"pass": pass_label, "status": "done", "elapsed_s": f"{elapsed:.2f}"},
            verbose=verbose,
        )


class _VerbosePassHeartbeats:
"""Emit ``[tag] running … Ns elapsed`` every 5s on stderr while in scope (verbose only)."""

Expand Down Expand Up @@ -837,21 +885,50 @@ def _register_type(
return entry


def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool, scope_files: set[str] | None = None) -> dict[str, JavaFileAst]:
def pass1_parse(
root: Path,
tables: GraphTables,
*,
verbose: bool,
scope_files: set[str] | None = None,
removed_files: set[str] | None = None,
) -> dict[str, JavaFileAst]:
"""Walk files, parse them, populate node indexes. Returns path -> AST.

Args:
root: Source root directory.
tables: GraphTables to populate.
verbose: Whether to emit progress output.
scope_files: Optional set of relative POSIX paths to parse. If None, parse all files.
removed_files: Optional set of relative POSIX paths that no longer exist
on disk (incremental deletions). These are members of ``scope_files``
(they were deleted, so they participate in scoped deletion) but are
never visited by the parse walk, so they must be excluded from the
pass-1 total to keep ``done`` from undercounting then two-way-clamping.
"""
asts: dict[str, JavaFileAst] = {}
ignore = LayeredIgnore(root)
t0 = time.time()
n_files = 0
if verbose:
_verbose_stderr_line(_PASS1_START)
# Count-first: one filtered walk (no parsing) to set the EXACT total before
# the parse loop ticks. Single-layer ignore → the count is exact, so the
# rendered bar is determinate. For a scoped (incremental) parse the total is
# the number of files that will actually be visited: scope minus any removed
# files (which are members of scope for deletion but gone from disk, so the
# parse walk never ticks them); for a full rebuild it is the non-ignored
# .java count.
if verbose:
if scope_files is not None:
removed = removed_files if removed_files is not None else set()
pass1_total = len(scope_files - removed)
else:
pass1_total = sum(1 for _ in iter_java_source_files(root, ignore=ignore))
_emit_graph_progress(
{"pass": "1/6", "done": 0, "total": pass1_total, "status": "running"},
verbose=verbose,
)
slow_sec = 0.0
raw_slow = os.environ.get("JAVA_CODEBASE_RAG_TEST_GRAPH_SLOW_SEC", "").strip()
if raw_slow:
Expand All @@ -871,6 +948,11 @@ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool, scope_files:
if scope_files is not None and rel not in scope_files:
continue
n_files += 1
if verbose and (n_files % _PASS1_TICK_EVERY == 0):
_emit_graph_progress(
{"pass": "1/6", "done": n_files, "status": "running"},
verbose=verbose,
)
try:
content = p.read_bytes()
except OSError:
Expand Down Expand Up @@ -906,6 +988,10 @@ def pass1_parse(root: Path, tables: GraphTables, *, verbose: bool, scope_files:

if verbose:
elapsed = time.time() - t0
_emit_graph_progress(
{"pass": "1/6", "done": n_files, "status": "done", "elapsed_s": f"{elapsed:.2f}"},
verbose=verbose,
)
_verbose_stderr_line(
f"[graph] pass 1 · parsed {n_files} files in {elapsed:.2f}s: "
f"{len(tables.types)} types, {len(tables.members)} members, "
Expand Down Expand Up @@ -1145,7 +1231,7 @@ def pass2_edges(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
seen_inj: set[tuple[str, str, str, str]] = set()
if verbose:
_verbose_stderr_line(_PASS2_START)
with _VerbosePassHeartbeats("[graph] pass 2", verbose=verbose):
with _graph_pass_progress("2/6", verbose=verbose), _VerbosePassHeartbeats("[graph] pass 2", verbose=verbose):
for fqn, entry in tables.types.items():
ast = asts.get(entry.file_path)
if ast is None:
Expand Down Expand Up @@ -1818,7 +1904,7 @@ def pass3_calls(tables: GraphTables, asts: dict[str, JavaFileAst], *, verbose: b
_verbose_stderr_line(_PASS3_START)
_build_member_indexes(tables)
stats = CallResolutionStats()
with _VerbosePassHeartbeats("[graph] pass 3", verbose=verbose):
with _graph_pass_progress("3/6", verbose=verbose), _VerbosePassHeartbeats("[graph] pass 3", verbose=verbose):
for rel_path, file_ast in asts.items():
try:
_process_file_calls(file_ast, rel_path, tables, stats)
Expand Down Expand Up @@ -1972,7 +2058,7 @@ def pass4_routes(
meta_chain = collect_annotation_meta_chain(prs)
if verbose:
_verbose_stderr_line(_PASS4_START)
with _VerbosePassHeartbeats("[graph] pass 4", verbose=verbose):
with _graph_pass_progress("4/6", verbose=verbose), _VerbosePassHeartbeats("[graph] pass 4", verbose=verbose):

for ast in asts.values():
stats.routes_skipped_unresolved += ast.routes_skipped_unresolved
Expand Down Expand Up @@ -2149,7 +2235,7 @@ def _phantom_async_route_id(call: OutgoingCallDecl) -> str:

if verbose:
_verbose_stderr_line(_PASS5_START)
with _VerbosePassHeartbeats("[graph] pass 5", verbose=verbose):
with _graph_pass_progress("5/6", verbose=verbose), _VerbosePassHeartbeats("[graph] pass 5", verbose=verbose):
for member in sorted(tables.members, key=lambda x: x.node_id):
if member.decl.is_constructor:
continue
Expand Down Expand Up @@ -2551,7 +2637,7 @@ def _micro_factor(member: MemberEntry | None) -> float:

if verbose:
_verbose_stderr_line(_PASS6_START)
with _VerbosePassHeartbeats("[graph] pass 6", verbose=verbose):
with _graph_pass_progress("6/6", verbose=verbose), _VerbosePassHeartbeats("[graph] pass 6", verbose=verbose):
for row in tables.http_call_rows:
if row.match != "unresolved":
continue
Expand Down Expand Up @@ -3586,7 +3672,9 @@ def incremental_rebuild(
_verbose_stderr_line("[increment] rebuilding scoped files (passes 1-4)")

tables = GraphTables()
asts = pass1_parse(source_root, tables, verbose=verbose, scope_files=scope_files)
asts = pass1_parse(
source_root, tables, verbose=verbose, scope_files=scope_files, removed_files=removed
)

# Load existing types and members for cross-file resolution (only from unchanged files)
_load_existing_types(conn, tables, exclude_files=scope_files)
Expand Down
42 changes: 38 additions & 4 deletions docs/JAVA-CODEBASE-RAG-CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ java-codebase-rag install --scope user
- `--agent {claude-code,qwen-code,gigacode}` — Agent host to configure (can be passed multiple times).
- `--scope {project,user}` — Installation scope (default: `project`). Project scope writes to `.<host>/` in the project repo; user scope writes to `~/.<host>/` (globally available).
- `--model MODEL` — Embedding model path or `auto` (default: `auto`, downloads `sentence-transformers/all-MiniLM-L6-v2` on first run).
- `--quiet` / `-q` — Suppress the indexing progress stream on stderr (wizard prompts unchanged).
- `--verbose` / `-v` — Raw-relay subprocess output during the indexing sub-step (no progress bar).

**Exit codes:**
- `0` — Success (all stages completed).
Expand All @@ -55,7 +57,7 @@ java-codebase-rag install --scope user
3. Agent host selection — Claude Code, Qwen Code, GigaCode (multi-select).
4. Install scope — project or user.
5. MCP entrypoint resolution + artifact deployment — config, skill, agent files.
6. Index + finish — YAML generation, `.gitignore` update, `init`.
6. Index + finish — YAML generation, `.gitignore` update, `init`. Stage 6's indexing sub-step renders the unified `Vectors → Optimize → Graph` progress on **stderr** (see [Indexing progress](#indexing-progress-stderr)); the wizard's conversational stdout is unchanged.

**Re-running `install`:** If `.java-codebase-rag.yml` exists, the installer shows current values and offers "Update" (pre-filled) or "Start fresh". Existing MCP entries are updated in-place (merged, not duplicated). Skill/agent files trigger overwrite confirmation.

Expand All @@ -78,13 +80,14 @@ java-codebase-rag update --force
**Flags:**
- `--force` — Overwrite all artifacts even if content matches.
- `--dry-run` — Print changes without writing files.
- `--quiet` / `-q` — Suppress the indexing progress stream on stderr (wizard stdout unchanged).
- `--verbose` / `-v` — Raw-relay subprocess output during the indexing sub-step (no progress bar).

**Behavior:**
- Detects previously configured agent hosts (scans both project-level and user-level config files).
- Refreshes skill and agent files (versioned assets from the package).
- Updates MCP entrypoint path if `java-codebase-rag-mcp` has moved.
- Runs an incremental index update (Lance + graph) if an index exists — same as `java-codebase-rag increment`.
- Skips MCP config if the entry already exists and is correct.
- Runs an incremental index update (Lance + graph) if an index exists — same as `java-codebase-rag increment`. The indexing sub-step renders the unified `Vectors → Optimize → Graph` progress on **stderr** (see [Indexing progress](#indexing-progress-stderr)); it no longer runs silently.

**Exit codes:**
- `0` — Success.
Expand All @@ -95,14 +98,45 @@ java-codebase-rag update --force

- **TTY:** human-readable `pprint` of the payload on stdout (except **successful selective `reprocess`** with `--vectors-only` / `--graph-only`, which prints `Rebuilt:` / `Skipped:` lines instead of dumping the full dict).
- **Piped / non-TTY:** **single JSON object** per invocation on stdout (no trailing noise). Use this in scripts and CI.
- **Lifecycle stderr:** `init`, `increment`, `reprocess`, and `erase` stream subprocess progress (and relayed child stdout) to **stderr**; pass **`--quiet`** to suppress that stream. **stdout** stays the JSON/pprint payload only.
- **Lifecycle stderr:** `init`, `increment`, `reprocess`, `install`, `update`, and `erase` stream subprocess progress (and relayed child stdout) to **stderr**; pass **`--quiet`** to suppress that stream. **stdout** stays the JSON/pprint payload (`init`/`increment`/`reprocess`) or the wizard conversational text (`install`/`update`) only.

Example:

```bash
java-codebase-rag meta --source-root /path/to/java/repo --index-dir /path/to/.java-codebase-rag | jq .ontology_version
```

### Indexing progress (stderr)

All five lifecycle commands that build the index (`init`, `increment`, `reprocess`, `install`, `update`) render the **same unified progress** on **stderr** during indexing: a header line, a three-phase list `Vectors → Optimize → Graph`, and a footer line. The phase list is the single source of truth for "what's happening right now":

- **Vectors** — the `cocoindex update` Lance catch-up / full reprocess.
- **Optimize** — the serialized Lance table compaction that runs after a successful vectors phase.
- **Graph** — the `build_ast_graph.py` Kuzu/LadybugDB build (full or incremental).

**Determinate vs indeterminate per command:**

| Phase | Determinate? |
| ----- | ------------ |
| `Vectors` (full `init` / `reprocess`) | Approximately determinate — a pre-walk estimates the file count; the bar **clamps to 100% on completion** (the pre-walk overcounts because it also includes ignored/empty files). |
| `Vectors` (incremental `increment` / `update`) | Indeterminate — CocoIndex's `memo=True` cache only calls the per-file function for changed files, so no denominator is known up front. A pulsing bar plus a "files touched: N" counter. |
| `Optimize` | Always indeterminate (no item count exposed by Lance compaction). |
| `Graph` (full `init` / `reprocess`) | Determinate — pass 1 does a count-first filtered walk for an exact total; passes 2–6 are five further known steps, each advancing the bar by one sixth. |
| `Graph` (incremental `increment` / `update`) | Determinate when it runs; falls back to a full rebuild on schema change. |

**Flags, TTY, and failure:**

| Mode | Behaviour |
| ---- | --------- |
| TTY (default) | `rich` `Live` region — the multi-line phase display (spinner + bar + `%` + ETA). |
| Non-TTY / CI | `rich` auto-disables; concise throttled stderr lines (~every 5 s per phase + a terminal line) so CI logs still show progress. |
| `--quiet` / `-q` | Suppresses the entire progress stream (no header, phases, or footer). The stdout payload is unchanged. |
| `--verbose` / `-v` | Bypasses parsing; relays raw subprocess output verbatim (Lance warnings, brownfield events, the raw `JCIRAG_PROGRESS` protocol lines). No `Live` region. |
| Phase failure | The failing phase renders a red `✗`; the footer carries `(exit=N)`. The `rich` `Live` region is torn down cleanly so the error stays visible. |
| Missing `cocoindex` / builder binary | The pre-spawn stub emits a `status=failed` line; no phase is left hung at `running`. |

> **Behaviour change (this release).** `install` and `update` now emit their indexing progress on **stderr** (previously `install` printed indexing chatter to stdout, and `update` ran the whole indexing step with `quiet=True` — completely silent). The wizard conversational stdout for both commands is otherwise unchanged. `update`'s previously-ignored `--quiet` / `--verbose` flags, and `install`'s previously-ignored `--verbose` flag, are now wired through (`install` already honored `--quiet`).

## Environment variables (summary)

| Variable | Role |
Expand Down
Loading
Loading