From ca4f9fbb7c5be9db305abbd9a334ad23d3da4e2e Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sun, 14 Jun 2026 22:09:34 +0300 Subject: [PATCH] feat(config): resolve relative embedding.model against config dir / source_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A relative `embedding.model` (e.g. `./models/minilm`) was handed to sentence-transformers verbatim and resolved against process CWD — unlike `index_dir` and `source_root`, which anchor on the config file's directory. That made a committed `.java-codebase-rag.yml` non-portable, and could let the CLI indexer and the MCP reader load different models from the same config when their CWDs differed. `maybe_expand_embedding_model_path` now takes optional `config_dir` / `source_root` / `source` kwargs and, after the existing `~` / `$VAR` expansion, resolves a result still `./` / `../`-prefixed to absolute — mirroring `_resolve_index_dir_path`: YAML values anchor on the config file's directory, CLI / env values on `source_root`. Hub ids, absolute paths, `~/`-expanded values, and an env var that already yielded an absolute path are all left untouched. With no base supplied, relative resolution is skipped, so the MCP runtime read (`resolved_sbert_model_for_process_env`) is byte-for-byte unchanged — it receives an already-absolute path from `apply_to_os_environ` in the normal flow. The #239 MCP YAML-loading fix is untouched. Co-Authored-By: Claude --- docs/CONFIGURATION.md | 20 +++-- java_codebase_rag/config.py | 59 ++++++++++++-- tests/test_config.py | 149 ++++++++++++++++++++++++++++++++++++ 3 files changed, 216 insertions(+), 12 deletions(-) diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 68a3b2d..9820b1f 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -98,13 +98,19 @@ index_dir: ./.java-codebase-rag embedding: # Hub id OR local directory containing the sentence-transformers model files. # - Hub id example: `sentence-transformers/all-MiniLM-L6-v2` - # - Local path examples: `/opt/models/minilm`, `~/models/minilm`, `$MODEL_DIR/minilm` + # - Local path examples: `/opt/models/minilm`, `~/models/minilm`, `$MODEL_DIR/minilm`, `./models/minilm` # - Resolution applies expanduser + expandvars when the value is path-shaped - # (starts with `/`, `./`, `../`, `~`, or contains `$`). Same rule for - # `SBERT_MODEL` and `--embedding-model` after precedence picks the string. - # Plain `org/name` is treated as a hub id and passed through unchanged. - # A relative path without `./` (e.g. `models/minilm`) is ambiguous with - # hub-id shape — prepend `./` if you mean a local directory. + # (starts with `/`, `./`, `../`, `~`, or contains `$`); a result still + # `./` / `../`-prefixed after that expansion is then resolved to absolute. + # Same rule for `SBERT_MODEL` and `--embedding-model` after precedence picks + # the string. Plain `org/name` is treated as a hub id and passed through + # unchanged. A relative path without `./` (e.g. `models/minilm`) is + # ambiguous with hub-id shape — prepend `./` if you mean a local directory. + # - Relative base (mirrors `index_dir`): a YAML `model` resolves against THIS + # config file's directory; `SBERT_MODEL` / `--embedding-model` resolve + # against the resolved `source_root`. So a committed `model: ./models/minilm` + # is portable across machines and across the CLI indexer vs the MCP reader, + # regardless of process CWD. # - Env: SBERT_MODEL. CLI: --embedding-model. Default: sentence-transformers/all-MiniLM-L6-v2 model: sentence-transformers/all-MiniLM-L6-v2 @@ -220,7 +226,7 @@ async_producer_overrides: | Field | Expanded? | Notes | |---|---|---| | `index_dir` | partial | `~` expanded; `$VAR` is NOT expanded. A YAML relative path resolves against the config file's directory (same base as `source_root`); the default `./.java-codebase-rag` sits beside the resolved `source_root`. | -| `embedding.model` (when path-shaped) | yes | Path-shape = starts with `/`, `./`, `../`, `~`, or contains `$`. Plain `org/name` is treated as a hub id and passed through. Applies to the value after CLI > env > YAML > default precedence. Long-lived MCP hosts also apply the same expansion when reading `SBERT_MODEL` from the process environment (so table metadata and search agree with `index_common` defaults). | +| `embedding.model` (when path-shaped) | yes | Path-shape = starts with `/`, `./`, `../`, `~`, or contains `$`; `~` / `$VAR` are expanded, then a result still `./` / `../`-prefixed is resolved to absolute. Plain `org/name` is treated as a hub id and passed through. Relative base (mirrors `index_dir`): a YAML `model` resolves against the config file's directory; `SBERT_MODEL` / `--embedding-model` resolve against `source_root`. Applies after CLI > env > YAML > default precedence. Long-lived MCP hosts also apply the same expansion when reading `SBERT_MODEL` from the process environment (so table metadata and search agree with `index_common` defaults). | | `embedding.device` | n/a | Device strings (`cpu`, `cuda`, `mps`) aren't paths. | | `microservice_roots[*]` | no | Each entry is a directory **name** relative to `source_root`, not an arbitrary path. | | Brownfield `path:` / `topic:` values | no | These are URL paths and Kafka topic names, not filesystem paths. Literal characters preserved. | diff --git a/java_codebase_rag/config.py b/java_codebase_rag/config.py index 23a9308..0b7a2de 100644 --- a/java_codebase_rag/config.py +++ b/java_codebase_rag/config.py @@ -52,14 +52,36 @@ def cocoindex_subprocess_env_defaults() -> dict[str, str]: _UNRESOLVED_VAR_RE = re.compile(r"\$(\w+|\{[^}]+\})") -def maybe_expand_embedding_model_path(value: str) -> str: - """Expand ``~`` and ``$VAR`` when *value* is path-shaped. +def maybe_expand_embedding_model_path( + value: str, + *, + config_dir: Path | None = None, + source_root: Path | None = None, + source: SettingSource | None = None, +) -> str: + """Expand ``~`` / ``$VAR`` for path-shaped values and resolve relatives to absolute. Path-shape: starts with ``/``, ``./``, ``../``, ``~``, or contains ``$``. Plain ``org/name`` (hub id) does not match and is passed through unchanged. - Used for ``embedding.model`` after precedence resolution and for runtime - ``SBERT_MODEL`` reads (e.g. MCP) so the string matches ``ResolvedOperatorConfig``. + Relative resolution mirrors :func:`_resolve_index_dir_path` so a committed + config is portable regardless of process CWD: + + * YAML values (``source == "yaml"``) resolve against ``config_dir`` (the + directory holding ``.java-codebase-rag.yml``). + * CLI / env values resolve against ``source_root``. + + Only a result that still starts with ``./`` or ``../`` *after* ``~`` / + ``$VAR`` expansion is re-based — so hub ids (``org/name``), absolute paths, + ``~/``-expanded paths, and an env var that already yielded an absolute path + are all left untouched. + + When no base is supplied (the runtime ``SBERT_MODEL`` read via + :func:`resolved_sbert_model_for_process_env`), relative resolution is + skipped: the value is returned ``expandvars`` / ``expanduser``-expanded but + not re-based, matching the prior best-effort behavior. The main resolution + path (:func:`resolve_operator_config`) supplies a base, so the absolute path + it stores is what downstream loaders receive. """ needs_expand = value.startswith(("/", "./", "../", "~")) or "$" in value if not needs_expand: @@ -70,9 +92,31 @@ def maybe_expand_embedding_model_path(value: str) -> str: f"java-codebase-rag: path-shaped model string contains unresolved variable: {expanded}", file=sys.stderr, ) + if expanded.startswith(("./", "../")): + base = _embedding_model_base( + source=source, config_dir=config_dir, source_root=source_root + ) + if base is not None: + return str((base / expanded).resolve()) return expanded +def _embedding_model_base( + *, + source: SettingSource | None, + config_dir: Path | None, + source_root: Path | None, +) -> Path | None: + """Base directory for a relative ``embedding.model``. + + Mirrors :func:`_resolve_index_dir_path`: YAML values anchor on the config + file's directory; CLI / env values anchor on the resolved ``source_root``. + """ + if source == "yaml": + return config_dir + return source_root + + def resolved_sbert_model_for_process_env(import_time_default: str) -> str: """``SBERT_MODEL`` from the process environment, with the same expansion as YAML/CLI resolution. @@ -387,7 +431,12 @@ def resolve_operator_config( yaml_path=("embedding", "model"), default=_DEFAULT_EMBEDDING_MODEL, ) - model = maybe_expand_embedding_model_path(model) + model = maybe_expand_embedding_model_path( + model, + config_dir=config_dir, + source_root=root, + source=model_src, + ) device, device_src = _pick_optional_device( cli_val=cli_embedding_device, env_key="SBERT_DEVICE", diff --git a/tests/test_config.py b/tests/test_config.py index a07d2f4..0d97a26 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -292,6 +292,155 @@ def test_existing_behavior_unchanged(self, tmp_path, monkeypatch): assert result.index_dir == tmp_path / ".java-codebase-rag" +class TestEmbeddingModelRelativePath: + """``embedding.model`` relative paths resolve against a base directory. + + Mirrors ``index_dir`` (see ``TestIndexDirRelativeToConfigDir``): a relative + model path in YAML resolves against the config file's directory; a relative + model path from CLI / env resolves against the resolved ``source_root``. + This makes a committed ``.java-codebase-rag.yml`` portable — the model loads + from the same absolute path for the CLI indexer and the MCP reader, instead + of resolving against an unreliable process CWD. + """ + + def test_yaml_relative_model_resolves_against_config_dir(self, tmp_path, monkeypatch): + """``embedding.model: ./models/minilm`` (YAML) -> /models/minilm.""" + monkeypatch.delenv("SBERT_MODEL", raising=False) + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + + config_dir = tmp_path / "ctx" + config_dir.mkdir() + (config_dir / YAML_CONFIG_FILENAMES[0]).write_text( + "embedding:\n model: ./models/minilm\n" + ) + monkeypatch.chdir(config_dir) + + result = resolve_operator_config(source_root=None) + assert result.embedding_model == str((config_dir / "models/minilm").resolve()) + assert result.embedding_model_source == "yaml" + + def test_yaml_double_dot_model_resolves_against_config_dir(self, tmp_path, monkeypatch): + """``embedding.model: ../shared/minilm`` (YAML) -> /../shared/minilm.""" + monkeypatch.delenv("SBERT_MODEL", raising=False) + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + + config_dir = tmp_path / "ctx" + config_dir.mkdir() + (config_dir / YAML_CONFIG_FILENAMES[0]).write_text( + "embedding:\n model: ../shared/minilm\n" + ) + monkeypatch.chdir(config_dir) + + result = resolve_operator_config(source_root=None) + assert result.embedding_model == str((tmp_path / "shared/minilm").resolve()) + + def test_env_relative_model_resolves_against_source_root(self, tmp_path, monkeypatch): + """``SBERT_MODEL=./models/minilm`` (env) -> /models/minilm. + + Config sets ``source_root: ../`` so source_root (tmp_path) differs from + config_dir (tmp_path/ctx); the env-sourced model must anchor on + source_root, not config_dir — matching ``index_dir``'s env base. + """ + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + + config_dir = tmp_path / "ctx" + config_dir.mkdir() + (config_dir / YAML_CONFIG_FILENAMES[0]).write_text("source_root: ../\n") + monkeypatch.chdir(config_dir) + monkeypatch.setenv("SBERT_MODEL", "./models/minilm") + + result = resolve_operator_config(source_root=None) + assert result.source_root == tmp_path + assert result.embedding_model == str((tmp_path / "models/minilm").resolve()) + assert result.embedding_model_source == "env" + + def test_cli_relative_model_resolves_against_source_root(self, tmp_path, monkeypatch): + """``--embedding-model ./models/minilm`` (CLI) -> /models/minilm.""" + monkeypatch.delenv("SBERT_MODEL", raising=False) + + result = resolve_operator_config( + source_root=tmp_path, cli_embedding_model="./models/minilm" + ) + assert result.embedding_model == str((tmp_path / "models/minilm").resolve()) + assert result.embedding_model_source == "cli" + + +class TestMaybeExpandEmbeddingModelPath: + """Unit tests pinning the expansion/resolution helper's contract.""" + + def test_no_base_leaves_relative_unchanged(self): + """Without a base dir, relative paths are NOT resolved. + + ``resolved_sbert_model_for_process_env`` (the MCP runtime read of + ``SBERT_MODEL``) calls this with no base; it must stay a no-op for + relative values so MCP behavior is unchanged there. The main resolution + path supplies a base, so the absolute path it produces is what reaches + the lazy loader in practice. + """ + from java_codebase_rag.config import maybe_expand_embedding_model_path + + assert maybe_expand_embedding_model_path("./models/minilm") == "./models/minilm" + assert maybe_expand_embedding_model_path("../shared/minilm") == "../shared/minilm" + + def test_hub_id_passthrough(self): + from java_codebase_rag.config import maybe_expand_embedding_model_path + + assert maybe_expand_embedding_model_path("org/name") == "org/name" + assert ( + maybe_expand_embedding_model_path("sentence-transformers/all-MiniLM-L6-v2") + == "sentence-transformers/all-MiniLM-L6-v2" + ) + + def test_absolute_passthrough(self): + from java_codebase_rag.config import maybe_expand_embedding_model_path + + assert maybe_expand_embedding_model_path("/opt/models/minilm") == "/opt/models/minilm" + + def test_env_var_expansion_preserved(self, monkeypatch): + from java_codebase_rag.config import maybe_expand_embedding_model_path + + monkeypatch.setenv("MODEL_DIR", "/opt/models") + assert maybe_expand_embedding_model_path("${MODEL_DIR}/minilm") == "/opt/models/minilm" + assert maybe_expand_embedding_model_path("$MODEL_DIR/minilm") == "/opt/models/minilm" + + def test_tilde_expansion_preserved(self, monkeypatch): + from java_codebase_rag.config import maybe_expand_embedding_model_path + + monkeypatch.setenv("HOME", "/home/user") + assert maybe_expand_embedding_model_path("~/models/minilm") == "/home/user/models/minilm" + + def test_yaml_base_resolves_relative(self, tmp_path): + from java_codebase_rag.config import maybe_expand_embedding_model_path + + out = maybe_expand_embedding_model_path( + "./models/minilm", config_dir=tmp_path, source="yaml" + ) + assert out == str((tmp_path / "models/minilm").resolve()) + + def test_cli_env_base_is_source_root(self, tmp_path): + from java_codebase_rag.config import maybe_expand_embedding_model_path + + for src in ("cli", "env"): + out = maybe_expand_embedding_model_path( + "./models/minilm", source_root=tmp_path, source=src + ) + assert out == str((tmp_path / "models/minilm").resolve()) + + def test_absolute_after_env_var_not_rebased(self, tmp_path, monkeypatch): + """An env var that already yields an absolute path is left absolute. + + Guards the ``${HUB_ID}`` edge: only ``./`` / ``../``-prefixed results are + re-based, so a var holding ``org/name`` or an absolute path is untouched. + """ + from java_codebase_rag.config import maybe_expand_embedding_model_path + + monkeypatch.setenv("MODEL_DIR", "/opt/models") + out = maybe_expand_embedding_model_path( + "${MODEL_DIR}/minilm", config_dir=tmp_path, source="yaml" + ) + assert out == "/opt/models/minilm" + + def test_cocoindex_subprocess_env_defaults_uses_real_inflight_env_var() -> None: """The throttle must use CocoIndex's REAL env var name.