Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,19 @@ index_dir: ./.java-codebase-rag
embedding:
# Hub id OR local directory containing the sentence-transformers model files.
# - Hub id example: `sentence-transformers/all-MiniLM-L6-v2`
# - Local path examples: `/opt/models/minilm`, `~/models/minilm`, `$MODEL_DIR/minilm`
# - Local path examples: `/opt/models/minilm`, `~/models/minilm`, `$MODEL_DIR/minilm`, `./models/minilm`
# - Resolution applies expanduser + expandvars when the value is path-shaped
# (starts with `/`, `./`, `../`, `~`, or contains `$`). Same rule for
# `SBERT_MODEL` and `--embedding-model` after precedence picks the string.
# Plain `org/name` is treated as a hub id and passed through unchanged.
# A relative path without `./` (e.g. `models/minilm`) is ambiguous with
# hub-id shape — prepend `./` if you mean a local directory.
# (starts with `/`, `./`, `../`, `~`, or contains `$`); a result still
# `./` / `../`-prefixed after that expansion is then resolved to absolute.
# Same rule for `SBERT_MODEL` and `--embedding-model` after precedence picks
# the string. Plain `org/name` is treated as a hub id and passed through
# unchanged. A relative path without `./` (e.g. `models/minilm`) is
# ambiguous with hub-id shape — prepend `./` if you mean a local directory.
# - Relative base (mirrors `index_dir`): a YAML `model` resolves against THIS
# config file's directory; `SBERT_MODEL` / `--embedding-model` resolve
# against the resolved `source_root`. So a committed `model: ./models/minilm`
# is portable across machines and across the CLI indexer vs the MCP reader,
# regardless of process CWD.
# - Env: SBERT_MODEL. CLI: --embedding-model. Default: sentence-transformers/all-MiniLM-L6-v2
model: sentence-transformers/all-MiniLM-L6-v2

Expand Down Expand Up @@ -220,7 +226,7 @@ async_producer_overrides:
| Field | Expanded? | Notes |
|---|---|---|
| `index_dir` | partial | `~` expanded; `$VAR` is NOT expanded. A YAML relative path resolves against the config file's directory (same base as `source_root`); the default `./.java-codebase-rag` sits beside the resolved `source_root`. |
| `embedding.model` (when path-shaped) | yes | Path-shape = starts with `/`, `./`, `../`, `~`, or contains `$`. Plain `org/name` is treated as a hub id and passed through. Applies to the value after CLI > env > YAML > default precedence. Long-lived MCP hosts also apply the same expansion when reading `SBERT_MODEL` from the process environment (so table metadata and search agree with `index_common` defaults). |
| `embedding.model` (when path-shaped) | yes | Path-shape = starts with `/`, `./`, `../`, `~`, or contains `$`; `~` / `$VAR` are expanded, then a result still `./` / `../`-prefixed is resolved to absolute. Plain `org/name` is treated as a hub id and passed through. Relative base (mirrors `index_dir`): a YAML `model` resolves against the config file's directory; `SBERT_MODEL` / `--embedding-model` resolve against `source_root`. Applies after CLI > env > YAML > default precedence. Long-lived MCP hosts also apply the same expansion when reading `SBERT_MODEL` from the process environment (so table metadata and search agree with `index_common` defaults). |
| `embedding.device` | n/a | Device strings (`cpu`, `cuda`, `mps`) aren't paths. |
| `microservice_roots[*]` | no | Each entry is a directory **name** relative to `source_root`, not an arbitrary path. |
| Brownfield `path:` / `topic:` values | no | These are URL paths and Kafka topic names, not filesystem paths. Literal characters preserved. |
Expand Down
59 changes: 54 additions & 5 deletions java_codebase_rag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,36 @@ def cocoindex_subprocess_env_defaults() -> dict[str, str]:
_UNRESOLVED_VAR_RE = re.compile(r"\$(\w+|\{[^}]+\})")


def maybe_expand_embedding_model_path(value: str) -> str:
"""Expand ``~`` and ``$VAR`` when *value* is path-shaped.
def maybe_expand_embedding_model_path(
value: str,
*,
config_dir: Path | None = None,
source_root: Path | None = None,
source: SettingSource | None = None,
) -> str:
"""Expand ``~`` / ``$VAR`` for path-shaped values and resolve relatives to absolute.

Path-shape: starts with ``/``, ``./``, ``../``, ``~``, or contains ``$``.
Plain ``org/name`` (hub id) does not match and is passed through unchanged.

Used for ``embedding.model`` after precedence resolution and for runtime
``SBERT_MODEL`` reads (e.g. MCP) so the string matches ``ResolvedOperatorConfig``.
Relative resolution mirrors :func:`_resolve_index_dir_path` so a committed
config is portable regardless of process CWD:

* YAML values (``source == "yaml"``) resolve against ``config_dir`` (the
directory holding ``.java-codebase-rag.yml``).
* CLI / env values resolve against ``source_root``.

Only a result that still starts with ``./`` or ``../`` *after* ``~`` /
``$VAR`` expansion is re-based — so hub ids (``org/name``), absolute paths,
``~/``-expanded paths, and an env var that already yielded an absolute path
are all left untouched.

When no base is supplied (the runtime ``SBERT_MODEL`` read via
:func:`resolved_sbert_model_for_process_env`), relative resolution is
skipped: the value is returned ``expandvars`` / ``expanduser``-expanded but
not re-based, matching the prior best-effort behavior. The main resolution
path (:func:`resolve_operator_config`) supplies a base, so the absolute path
it stores is what downstream loaders receive.
"""
needs_expand = value.startswith(("/", "./", "../", "~")) or "$" in value
if not needs_expand:
Expand All @@ -70,9 +92,31 @@ def maybe_expand_embedding_model_path(value: str) -> str:
f"java-codebase-rag: path-shaped model string contains unresolved variable: {expanded}",
file=sys.stderr,
)
if expanded.startswith(("./", "../")):
base = _embedding_model_base(
source=source, config_dir=config_dir, source_root=source_root
)
if base is not None:
return str((base / expanded).resolve())
return expanded


def _embedding_model_base(
*,
source: SettingSource | None,
config_dir: Path | None,
source_root: Path | None,
) -> Path | None:
"""Base directory for a relative ``embedding.model``.

Mirrors :func:`_resolve_index_dir_path`: YAML values anchor on the config
file's directory; CLI / env values anchor on the resolved ``source_root``.
"""
if source == "yaml":
return config_dir
return source_root


def resolved_sbert_model_for_process_env(import_time_default: str) -> str:
"""``SBERT_MODEL`` from the process environment, with the same expansion as YAML/CLI resolution.

Expand Down Expand Up @@ -387,7 +431,12 @@ def resolve_operator_config(
yaml_path=("embedding", "model"),
default=_DEFAULT_EMBEDDING_MODEL,
)
model = maybe_expand_embedding_model_path(model)
model = maybe_expand_embedding_model_path(
model,
config_dir=config_dir,
source_root=root,
source=model_src,
)
device, device_src = _pick_optional_device(
cli_val=cli_embedding_device,
env_key="SBERT_DEVICE",
Expand Down
149 changes: 149 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,155 @@ def test_existing_behavior_unchanged(self, tmp_path, monkeypatch):
assert result.index_dir == tmp_path / ".java-codebase-rag"


class TestEmbeddingModelRelativePath:
"""``embedding.model`` relative paths resolve against a base directory.

Mirrors ``index_dir`` (see ``TestIndexDirRelativeToConfigDir``): a relative
model path in YAML resolves against the config file's directory; a relative
model path from CLI / env resolves against the resolved ``source_root``.
This makes a committed ``.java-codebase-rag.yml`` portable — the model loads
from the same absolute path for the CLI indexer and the MCP reader, instead
of resolving against an unreliable process CWD.
"""

def test_yaml_relative_model_resolves_against_config_dir(self, tmp_path, monkeypatch):
"""``embedding.model: ./models/minilm`` (YAML) -> <config_dir>/models/minilm."""
monkeypatch.delenv("SBERT_MODEL", raising=False)
monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)

config_dir = tmp_path / "ctx"
config_dir.mkdir()
(config_dir / YAML_CONFIG_FILENAMES[0]).write_text(
"embedding:\n model: ./models/minilm\n"
)
monkeypatch.chdir(config_dir)

result = resolve_operator_config(source_root=None)
assert result.embedding_model == str((config_dir / "models/minilm").resolve())
assert result.embedding_model_source == "yaml"

def test_yaml_double_dot_model_resolves_against_config_dir(self, tmp_path, monkeypatch):
"""``embedding.model: ../shared/minilm`` (YAML) -> <config_dir>/../shared/minilm."""
monkeypatch.delenv("SBERT_MODEL", raising=False)
monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)

config_dir = tmp_path / "ctx"
config_dir.mkdir()
(config_dir / YAML_CONFIG_FILENAMES[0]).write_text(
"embedding:\n model: ../shared/minilm\n"
)
monkeypatch.chdir(config_dir)

result = resolve_operator_config(source_root=None)
assert result.embedding_model == str((tmp_path / "shared/minilm").resolve())

def test_env_relative_model_resolves_against_source_root(self, tmp_path, monkeypatch):
"""``SBERT_MODEL=./models/minilm`` (env) -> <source_root>/models/minilm.

Config sets ``source_root: ../`` so source_root (tmp_path) differs from
config_dir (tmp_path/ctx); the env-sourced model must anchor on
source_root, not config_dir — matching ``index_dir``'s env base.
"""
monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False)

config_dir = tmp_path / "ctx"
config_dir.mkdir()
(config_dir / YAML_CONFIG_FILENAMES[0]).write_text("source_root: ../\n")
monkeypatch.chdir(config_dir)
monkeypatch.setenv("SBERT_MODEL", "./models/minilm")

result = resolve_operator_config(source_root=None)
assert result.source_root == tmp_path
assert result.embedding_model == str((tmp_path / "models/minilm").resolve())
assert result.embedding_model_source == "env"

def test_cli_relative_model_resolves_against_source_root(self, tmp_path, monkeypatch):
"""``--embedding-model ./models/minilm`` (CLI) -> <source_root>/models/minilm."""
monkeypatch.delenv("SBERT_MODEL", raising=False)

result = resolve_operator_config(
source_root=tmp_path, cli_embedding_model="./models/minilm"
)
assert result.embedding_model == str((tmp_path / "models/minilm").resolve())
assert result.embedding_model_source == "cli"


class TestMaybeExpandEmbeddingModelPath:
"""Unit tests pinning the expansion/resolution helper's contract."""

def test_no_base_leaves_relative_unchanged(self):
"""Without a base dir, relative paths are NOT resolved.

``resolved_sbert_model_for_process_env`` (the MCP runtime read of
``SBERT_MODEL``) calls this with no base; it must stay a no-op for
relative values so MCP behavior is unchanged there. The main resolution
path supplies a base, so the absolute path it produces is what reaches
the lazy loader in practice.
"""
from java_codebase_rag.config import maybe_expand_embedding_model_path

assert maybe_expand_embedding_model_path("./models/minilm") == "./models/minilm"
assert maybe_expand_embedding_model_path("../shared/minilm") == "../shared/minilm"

def test_hub_id_passthrough(self):
from java_codebase_rag.config import maybe_expand_embedding_model_path

assert maybe_expand_embedding_model_path("org/name") == "org/name"
assert (
maybe_expand_embedding_model_path("sentence-transformers/all-MiniLM-L6-v2")
== "sentence-transformers/all-MiniLM-L6-v2"
)

def test_absolute_passthrough(self):
from java_codebase_rag.config import maybe_expand_embedding_model_path

assert maybe_expand_embedding_model_path("/opt/models/minilm") == "/opt/models/minilm"

def test_env_var_expansion_preserved(self, monkeypatch):
from java_codebase_rag.config import maybe_expand_embedding_model_path

monkeypatch.setenv("MODEL_DIR", "/opt/models")
assert maybe_expand_embedding_model_path("${MODEL_DIR}/minilm") == "/opt/models/minilm"
assert maybe_expand_embedding_model_path("$MODEL_DIR/minilm") == "/opt/models/minilm"

def test_tilde_expansion_preserved(self, monkeypatch):
from java_codebase_rag.config import maybe_expand_embedding_model_path

monkeypatch.setenv("HOME", "/home/user")
assert maybe_expand_embedding_model_path("~/models/minilm") == "/home/user/models/minilm"

def test_yaml_base_resolves_relative(self, tmp_path):
from java_codebase_rag.config import maybe_expand_embedding_model_path

out = maybe_expand_embedding_model_path(
"./models/minilm", config_dir=tmp_path, source="yaml"
)
assert out == str((tmp_path / "models/minilm").resolve())

def test_cli_env_base_is_source_root(self, tmp_path):
from java_codebase_rag.config import maybe_expand_embedding_model_path

for src in ("cli", "env"):
out = maybe_expand_embedding_model_path(
"./models/minilm", source_root=tmp_path, source=src
)
assert out == str((tmp_path / "models/minilm").resolve())

def test_absolute_after_env_var_not_rebased(self, tmp_path, monkeypatch):
"""An env var that already yields an absolute path is left absolute.

Guards the ``${HUB_ID}`` edge: only ``./`` / ``../``-prefixed results are
re-based, so a var holding ``org/name`` or an absolute path is untouched.
"""
from java_codebase_rag.config import maybe_expand_embedding_model_path

monkeypatch.setenv("MODEL_DIR", "/opt/models")
out = maybe_expand_embedding_model_path(
"${MODEL_DIR}/minilm", config_dir=tmp_path, source="yaml"
)
assert out == "/opt/models/minilm"


def test_cocoindex_subprocess_env_defaults_uses_real_inflight_env_var() -> None:
"""The throttle must use CocoIndex's REAL env var name.

Expand Down
Loading