From 1b72761c428bec0889cf6318adaf39176a3afe23 Mon Sep 17 00:00:00 2001 From: Daniel Frankcom Date: Wed, 17 Jun 2026 16:04:18 -0700 Subject: [PATCH 1/3] Add a mongot search sidecar to the replica-set target Signed-off-by: Daniel Frankcom --- dev/compose.yaml | 49 ++++++++++++++++++- dev/mongot.yml | 30 ++++++++++++ documentdb_tests/framework/engine_registry.py | 37 +++++++++++++- documentdb_tests/framework/preconditions.py | 6 +++ 4 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 dev/mongot.yml diff --git a/dev/compose.yaml b/dev/compose.yaml index 4258cecfb..b9ee7622e 100644 --- a/dev/compose.yaml +++ b/dev/compose.yaml @@ -27,7 +27,7 @@ # query: # # A service with no `x-test-target` is not a test target and is ignored by the -# registry. +# registry (e.g. the mongot sidecar, which is reached only through its mongod). # # Memory: each mongod caps its WiredTiger cache (--wiredTigerCacheSizeGB). By # default a mongod sizes its cache to ~50% of the host/VM RAM; with several @@ -60,7 +60,26 @@ services: mongo-replset: image: mongo:8.2.4 profiles: ["mongo-replset", "all"] - command: ["--replSet", "rs0", "--bind_ip_all", "--wiredTigerCacheSizeGB", "1.5"] + command: + - "--replSet" + - "rs0" + - "--bind_ip_all" + - "--wiredTigerCacheSizeGB" + - "1.5" + # Point at the mongot search sidecar so this replica set also serves the + # search surfaces. mongot is transparent to all other behavior, so the + # set behaves identically to a plain replica set apart from gaining + # search; it is one target, not two. 
+ - "--setParameter" + - "mongotHost=mongot:27028" + - "--setParameter" + - "searchIndexManagementHostAndPort=mongot:27028" + - "--setParameter" + - "useGrpcForSearch=true" + - "--setParameter" + - "skipAuthenticationToMongot=true" + - "--setParameter" + - "skipAuthenticationToSearchIndexManagementServer=true" ports: - "27018:27017" healthcheck: @@ -71,3 +90,29 @@ services: x-test-target: engine: mongodb query: directConnection=true + + # mongot: the search sidecar for the mongo-replset target. Not a test target + # on its own; the suite reaches it only through mongo-replset. mongot is + # MongoDB Search Community Edition (SSPL, same license as the server). It + # replicates from the replica set as an authenticated sync source and reads + # its password from a file, so the entrypoint writes that file (a fixed + # local-dev secret, matched by the searchCoordinator user the harness creates + # on the replica set) with owner-only permissions before launching. It retries + # the connection until that user exists. + mongot: + image: mongodb/mongodb-community-search:latest + profiles: ["mongo-replset", "all"] + entrypoint: + - "sh" + - "-c" + - > + umask 077 && + mkdir -p /mongot-secrets && + printf '%s' "$$MONGOT_SYNC_PASSWORD" > /mongot-secrets/passwordFile && + exec /mongot-community/mongot --config /mongot-config/mongot.yml + environment: + # Fixed local-dev secret shared with the searchCoordinator user the + # harness provisions on mongo-replset. Not a real credential. + MONGOT_SYNC_PASSWORD: "searchSyncPassword" + volumes: + - ./mongot.yml:/mongot-config/mongot.yml:ro diff --git a/dev/mongot.yml b/dev/mongot.yml new file mode 100644 index 000000000..9860f54a2 --- /dev/null +++ b/dev/mongot.yml @@ -0,0 +1,30 @@ +# mongot configuration for the mongo-replset target (dev/compose.yaml service +# "mongot"). mongot is MongoDB Search Community Edition (SSPL), the same license +# as the server. 
It runs alongside the replica set's mongod and serves the +# search and vector search surfaces. +# +# mongot replicates from the mongod replica set as a sync source. It requires an +# authenticated connection (it has no unauthenticated mode), so it logs in as a +# dedicated user holding the searchCoordinator role. The harness creates that +# user; the mongot entrypoint writes its password file (see dev/compose.yaml). +syncSource: + replicaSet: + hostAndPort: "mongo-replset:27017" + username: "searchSyncUser" + passwordFile: "/mongot-secrets/passwordFile" + authSource: "admin" + tls: false +storage: + dataPath: "/var/lib/mongot" +server: + grpc: + # mongod reaches mongot here (see mongotHost / searchIndexManagementHostAndPort + # on the mongo-replset service). Bound on all interfaces so the mongod + # container can connect over the compose network. + address: "0.0.0.0:27028" + tls: + mode: "disabled" +healthCheck: + address: "0.0.0.0:8080" +logging: + verbosity: INFO diff --git a/documentdb_tests/framework/engine_registry.py b/documentdb_tests/framework/engine_registry.py index 774ab7810..1e2bf1664 100644 --- a/documentdb_tests/framework/engine_registry.py +++ b/documentdb_tests/framework/engine_registry.py @@ -102,6 +102,15 @@ def _is_reachable(connection_string: str) -> bool: # replSetInitiate error code when the set is already initiated (e.g. a race # between concurrent callers); treated as success. _ALREADY_INITIALIZED = 23 +# createUser error code when the user already exists (idempotent re-runs). +_USER_ALREADY_EXISTS = 51003 + +# The user mongot authenticates as to replicate from a search-enabled mongod. +# Its name and password are a fixed local-dev secret matched by the mongot +# sidecar's config (see dev/mongot.yml and the mongot service in +# dev/compose.yaml); it is not a real credential. 
+_SEARCH_SYNC_USER = "searchSyncUser" +_SEARCH_SYNC_PASSWORD = "searchSyncPassword" def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: @@ -120,12 +129,14 @@ def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: that already initiated it (AlreadyInitialized) is tolerated. After initiating, it waits up to ``timeout_s`` for a primary to be elected - so callers can write immediately. + so callers can write immediately. A search-enabled mongod additionally has + the searchCoordinator user mongot needs provisioned once it is primary. """ client: MongoClient = MongoClient(connection_string, serverSelectionTimeoutMS=5000) try: try: client.admin.command("replSetGetStatus") + _ensure_search_user(client) # Idempotent; a no-op off a search target. return # Already initiated. except OperationFailure as exc: if exc.code != _NOT_YET_INITIALIZED: @@ -140,6 +151,7 @@ def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: deadline = time.monotonic() + timeout_s while time.monotonic() < deadline: if client.admin.command("hello").get("isWritablePrimary"): + _ensure_search_user(client) return time.sleep(0.5) raise TimeoutError( @@ -149,6 +161,29 @@ def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: client.close() +def _ensure_search_user(client: MongoClient) -> None: + """Provision the searchCoordinator user a search-enabled mongod needs. + + A search target points at a mongot sidecar (a non-empty ``mongotHost``). + mongot replicates from this mongod as an authenticated sync source, so it + needs a user with the searchCoordinator role to log in as. This creates that + user (idempotently) once the server is primary. It is a no-op on a target + without a mongot sidecar. + """ + if not client.admin.command({"getParameter": 1, "mongotHost": 1}).get("mongotHost"): + return # Not a search target. 
+ try: + client.admin.command( + "createUser", + _SEARCH_SYNC_USER, + pwd=_SEARCH_SYNC_PASSWORD, + roles=[{"role": "searchCoordinator", "db": "admin"}], + ) + except OperationFailure as exc: + if exc.code != _USER_ALREADY_EXISTS: + raise + + def live_targets(compose_path: Path = COMPOSE_PATH) -> list[Target]: """Return the declared targets that are currently reachable.""" return [t for t in load_targets(compose_path) if _is_reachable(t.connection_string)] diff --git a/documentdb_tests/framework/preconditions.py b/documentdb_tests/framework/preconditions.py index c090c95b4..eac418a3a 100644 --- a/documentdb_tests/framework/preconditions.py +++ b/documentdb_tests/framework/preconditions.py @@ -56,11 +56,16 @@ "unforced_compact": "compact succeeds without force", "reindex": "reIndex is permitted", "local_rename": "renaming into the unreplicated local database is permitted", + "search": "search and vector search surfaces are available", } # The capabilities each (engine, topology) target has. To add an engine or # topology, add an entry here; every test then gates correctly. _CAPABILITIES_BY_PROFILE: dict[tuple[str, str], frozenset[str]] = { + # A replica set, wired to a mongot search sidecar so it also serves the + # search surfaces (see dev/compose.yaml). mongot is transparent to all other + # behavior, so this is a replica set that additionally has the search + # capability, not a distinct topology. 
("mongodb", "replica_set"): frozenset( { "change_streams", @@ -70,6 +75,7 @@ "cluster_time", "cluster_read_concern", "quorum_write_concern", + "search", } ), ("mongodb", "standalone"): frozenset( From 64ffe591d7bc4304fee86017b8bdcdbf08bbe05e Mon Sep 17 00:00:00 2001 From: Daniel Frankcom Date: Tue, 23 Jun 2026 14:12:43 -0700 Subject: [PATCH 2/3] Add vectorSearch stage tests Signed-off-by: Daniel Frankcom --- .../test_stages_position_vectorSearch.py | 216 +++++++ .../operator/stages/vectorSearch/__init__.py | 0 .../operator/stages/vectorSearch/conftest.py | 237 ++++++++ .../vectorSearch/test_smoke_vectorSearch.py | 47 +- .../test_vectorSearch_core_matching.py | 156 +++++ .../vectorSearch/test_vectorSearch_exact.py | 228 +++++++ .../test_vectorSearch_explain_options.py | 230 +++++++ .../vectorSearch/test_vectorSearch_filter.py | 300 ++++++++++ .../test_vectorSearch_filter_parse_errors.py | 202 +++++++ ...st_vectorSearch_filter_predicate_errors.py | 259 ++++++++ ...st_vectorSearch_index_definition_errors.py | 139 +++++ .../test_vectorSearch_index_path.py | 165 ++++++ .../test_vectorSearch_index_path_errors.py | 283 +++++++++ .../vectorSearch/test_vectorSearch_limit.py | 285 +++++++++ .../vectorSearch/test_vectorSearch_nested.py | 560 ++++++++++++++++++ .../test_vectorSearch_num_candidates.py | 325 ++++++++++ .../test_vectorSearch_parent_filter.py | 369 ++++++++++++ .../test_vectorSearch_query_vector.py | 287 +++++++++ .../test_vectorSearch_query_vector_errors.py | 327 ++++++++++ ...test_vectorSearch_required_field_errors.py | 253 ++++++++ .../vectorSearch/test_vectorSearch_scoring.py | 202 +++++++ ...est_vectorSearch_search_node_preference.py | 271 +++++++++ .../test_vectorSearch_stage_basics.py | 218 +++++++ .../test_vectorSearch_stored_source.py | 251 ++++++++ .../stages/vectorSearch/utils/__init__.py | 0 .../vectorSearch/utils/vectorSearch_common.py | 44 ++ documentdb_tests/framework/error_codes.py | 4 + documentdb_tests/framework/property_checks.py | 17 + 28 files 
changed, 5863 insertions(+), 12 deletions(-) create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_vectorSearch.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/conftest.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_core_matching.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_exact.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_explain_options.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_parse_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_predicate_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_definition_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_limit.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_nested.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_num_candidates.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_parent_filter.py create mode 
100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_required_field_errors.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_scoring.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_search_node_preference.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stage_basics.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stored_source.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/__init__.py create mode 100644 documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/vectorSearch_common.py diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_vectorSearch.py b/documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_vectorSearch.py new file mode 100644 index 000000000..a2885ebcb --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_vectorSearch.py @@ -0,0 +1,216 @@ +"""Tests for $vectorSearch pipeline position constraints and stage placement.""" + +from __future__ import annotations + +import time + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + FACET_PIPELINE_INVALID_STAGE_ERROR, + 
LOOKUP_SUB_PIPELINE_NOT_ALLOWED_ERROR, + NOT_FIRST_STAGE_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import DOUBLE_ZERO + +pytestmark = pytest.mark.requires(search=True) + +_POSITION_CORPUS = [ + {"_id": 1, "vec": [1.0, DOUBLE_ZERO, DOUBLE_ZERO]}, + {"_id": 2, "vec": [0.8, 0.2, DOUBLE_ZERO]}, + {"_id": 3, "vec": [0.6, 0.4, DOUBLE_ZERO]}, +] + +_INDEX_READY_TIMEOUT_SECONDS = 120 + + +@pytest.fixture(scope="module") +def position_collection(engine_client, worker_id): + """A module-scoped collection with a READY cosine vectorSearch index, shared + read-only across the placement cases so the index is built and polled once + rather than per test. The collection carries a fixed name so the + $unionWith/$lookup sub-pipeline cases can reference it as their source.""" + db_name = fixtures.generate_database_name("stages_vectorSearch_position", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["position"] + coll.insert_many([dict(doc) for doc in _POSITION_CORPUS]) + db.command( + { + "createSearchIndexes": coll.name, + "indexes": [ + { + "name": "vs_position_index", + "type": "vectorSearch", + "definition": { + "fields": [ + { + "type": "vector", + "path": "vec", + "numDimensions": 3, + "similarity": "cosine", + }, + ] + }, + } + ], + } + ) + deadline = time.monotonic() + _INDEX_READY_TIMEOUT_SECONDS + while time.monotonic() < deadline: + indexes = list(coll.aggregate([{"$listSearchIndexes": {}}])) + if indexes and indexes[0].get("status") == "READY": + break + time.sleep(2) + else: + raise TimeoutError("vectorSearch index did not reach READY state") + yield coll + fixtures.cleanup_database(engine_client, db_name) + + +# Property [Stage Placement Allowed]: $vectorSearch succeeds as the first stage +# of the main pipeline and as the first stage of a $unionWith sub-pipeline. 
+VECTORSEARCH_PLACEMENT_TESTS: list[StageTestCase] = [ + StageTestCase( + "first_stage_main_pipeline", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_position_index", + "path": "vec", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + } + }, + {"$project": {"_id": 1}}, + ], + expected=[{"_id": 1}, {"_id": 2}, {"_id": 3}], + msg="$vectorSearch should succeed as the first stage of the main pipeline", + ), + StageTestCase( + "first_stage_union_with_sub_pipeline", + pipeline=[ + {"$match": {"_id": {"$lt": 0}}}, + { + "$unionWith": { + "coll": "position", + "pipeline": [ + { + "$vectorSearch": { + "index": "vs_position_index", + "path": "vec", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + } + }, + {"$project": {"_id": 1}}, + ], + } + }, + ], + expected=[{"_id": 1}, {"_id": 2}, {"_id": 3}], + msg="$vectorSearch should succeed as the first stage of a $unionWith sub-pipeline", + ), +] + +# Property [Stage Placement Errors]: $vectorSearch is rejected when it is not the +# first stage of a pipeline, when nested in a $facet sub-pipeline, and when +# nested in a $lookup sub-pipeline. 
+VECTORSEARCH_PLACEMENT_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "not_first_stage", + pipeline=[ + {"$match": {"_id": 1}}, + { + "$vectorSearch": { + "index": "vs_position_index", + "path": "vec", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + } + }, + ], + error_code=NOT_FIRST_STAGE_ERROR, + msg="$vectorSearch should be rejected when it is not the first stage", + ), + StageTestCase( + "inside_facet", + pipeline=[ + { + "$facet": { + "results": [ + { + "$vectorSearch": { + "index": "vs_position_index", + "path": "vec", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + } + }, + ], + } + }, + ], + error_code=FACET_PIPELINE_INVALID_STAGE_ERROR, + msg="$vectorSearch should be rejected inside a $facet sub-pipeline", + ), + StageTestCase( + "inside_lookup_sub_pipeline", + pipeline=[ + { + "$lookup": { + "from": "position", + "pipeline": [ + { + "$vectorSearch": { + "index": "vs_position_index", + "path": "vec", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + } + }, + ], + "as": "matches", + } + }, + ], + error_code=LOOKUP_SUB_PIPELINE_NOT_ALLOWED_ERROR, + msg="$vectorSearch should be rejected inside a $lookup sub-pipeline", + ), +] + +VECTORSEARCH_POSITION_TESTS: list[StageTestCase] = ( + VECTORSEARCH_PLACEMENT_TESTS + VECTORSEARCH_PLACEMENT_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_POSITION_TESTS)) +def test_vectorSearch_position(position_collection, test_case: StageTestCase): + """Test $vectorSearch pipeline position constraints and rejections.""" + result = execute_command( + position_collection, + { + "aggregate": position_collection.name, + "pipeline": test_case.pipeline, + "cursor": {}, + }, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git 
a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/__init__.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/conftest.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/conftest.py new file mode 100644 index 000000000..f928016ce --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/conftest.py @@ -0,0 +1,237 @@ +"""Package-scoped fixtures for $vectorSearch stage tests. + +Each vectorSearch index is heavyweight (created, then polled until READY), so the +corpora and indexes are built once per package here rather than per test file.""" + +from __future__ import annotations + +import time +from datetime import datetime, timezone + +import pytest +from bson import Int64 + +from documentdb_tests.framework.test_constants import DOUBLE_ZERO + +from .utils.vectorSearch_common import ( + _FILTER_OID_A, + _FILTER_OID_B, + _FILTER_UUID_A, + _FILTER_UUID_B, +) + +_VECTOR_CORPUS = [ + { + "_id": 1, + "vc": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "ve": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "vd": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "v8": [1.0] + [DOUBLE_ZERO] * 7, + "meta": {"vec": [1.0, DOUBLE_ZERO, DOUBLE_ZERO]}, + "cat": "x", + "year": 1999, + "count": Int64(10), + "rating": 4.5, + "active": True, + "oid": _FILTER_OID_A, + "uid": _FILTER_UUID_A, + "created": datetime(2020, 1, 1, tzinfo=timezone.utc), + "tags": ["x", "y"], + "opt": "p", + "name": "a", + }, + { + "_id": 2, + "vc": [0.8, 0.2, DOUBLE_ZERO], + "ve": [0.8, 0.2, DOUBLE_ZERO], + "vd": [0.8, 0.2, DOUBLE_ZERO], + "v8": [DOUBLE_ZERO, 1.0] + [DOUBLE_ZERO] * 6, + "meta": {"vec": [0.8, 0.2, DOUBLE_ZERO]}, + "cat": "x", + "year": 2000, + "count": Int64(20), + "rating": 3.0, + "active": False, + "oid": _FILTER_OID_B, + "uid": _FILTER_UUID_B, + "created": datetime(2021, 6, 15, 
tzinfo=timezone.utc), + "tags": ["y", "z"], + "opt": "p", + "name": "b", + }, + { + "_id": 3, + "vc": [0.6, 0.4, DOUBLE_ZERO], + "ve": [0.6, 0.4, DOUBLE_ZERO], + "vd": [0.6, 0.4, DOUBLE_ZERO], + "v8": [DOUBLE_ZERO] * 2 + [1.0] + [DOUBLE_ZERO] * 5, + "meta": {"vec": [0.6, 0.4, DOUBLE_ZERO]}, + "cat": "y", + "year": 2001, + "count": Int64(30), + "rating": 5.0, + "active": True, + "oid": _FILTER_OID_A, + "uid": _FILTER_UUID_A, + "created": datetime(2022, 12, 31, tzinfo=timezone.utc), + "tags": ["z"], + "opt": None, + "name": "c", + }, + { + "_id": 4, + "vc": [0.2, 0.8, DOUBLE_ZERO], + "ve": [0.2, 0.8, DOUBLE_ZERO], + "vd": [0.2, 0.8, DOUBLE_ZERO], + "v8": [DOUBLE_ZERO] * 3 + [1.0] + [DOUBLE_ZERO] * 4, + "meta": {"vec": [0.2, 0.8, DOUBLE_ZERO]}, + "cat": "y", + "year": 2010, + "count": Int64(40), + "rating": 2.0, + "active": False, + "oid": _FILTER_OID_B, + "uid": _FILTER_UUID_B, + "created": datetime(2023, 3, 3, tzinfo=timezone.utc), + "tags": [], + "name": "d", + }, + { + "_id": 5, + "vc": [DOUBLE_ZERO, 1.0, DOUBLE_ZERO], + "ve": [DOUBLE_ZERO, 1.0, DOUBLE_ZERO], + "vd": [DOUBLE_ZERO, 1.0, DOUBLE_ZERO], + "v8": [DOUBLE_ZERO] * 4 + [1.0] + [DOUBLE_ZERO] * 3, + "meta": {"vec": [DOUBLE_ZERO, 1.0, DOUBLE_ZERO]}, + "cat": "y", + "year": 2010, + "count": Int64(50), + "rating": 4.0, + "active": True, + "oid": _FILTER_OID_A, + "uid": _FILTER_UUID_A, + "created": datetime(2024, 7, 7, tzinfo=timezone.utc), + "tags": ["x"], + "name": "e", + }, +] + +# Mirror each document's cosine vector under a precomposed (NFC) Unicode field +# name (precomposed e-acute, U+00E9) so the index has a real Unicode-named vector +# path to contrast against the decomposed (NFD) query form. 
+for _doc in _VECTOR_CORPUS: + _doc["caf\u00e9_vec"] = _doc["vc"] + _doc["v8c"] = _doc["v8"] + +_INDEX_READY_TIMEOUT_SECONDS = 120 + + +@pytest.fixture(scope="package") +def vector_search_collection(engine_client, worker_id): + """Provide a collection with a READY cosine vectorSearch index over a fixed corpus.""" + db_name = f"vs_core_{worker_id}" + db = engine_client[db_name] + coll = db["vectors"] + db.drop_collection(coll.name) + db.create_collection(coll.name) + coll.insert_many([dict(doc) for doc in _VECTOR_CORPUS]) + db.command( + { + "createSearchIndexes": coll.name, + "indexes": [ + { + "name": "vs_core_index", + "type": "vectorSearch", + "definition": { + "fields": [ + { + "type": "vector", + "path": "vc", + "numDimensions": 3, + "similarity": "cosine", + }, + { + "type": "vector", + "path": "ve", + "numDimensions": 3, + "similarity": "euclidean", + }, + { + "type": "vector", + "path": "vd", + "numDimensions": 3, + "similarity": "dotProduct", + }, + {"type": "filter", "path": "cat"}, + {"type": "filter", "path": "year"}, + {"type": "filter", "path": "count"}, + {"type": "filter", "path": "rating"}, + {"type": "filter", "path": "active"}, + {"type": "filter", "path": "oid"}, + {"type": "filter", "path": "uid"}, + {"type": "filter", "path": "created"}, + {"type": "filter", "path": "tags"}, + {"type": "filter", "path": "opt"}, + { + "type": "vector", + "path": "v8", + "numDimensions": 8, + "similarity": "euclidean", + }, + { + "type": "vector", + "path": "v8c", + "numDimensions": 8, + "similarity": "cosine", + }, + { + "type": "vector", + "path": "meta.vec", + "numDimensions": 3, + "similarity": "cosine", + }, + { + "type": "vector", + "path": "caf\u00e9_vec", + "numDimensions": 3, + "similarity": "cosine", + }, + ] + }, + } + ], + } + ) + deadline = time.monotonic() + _INDEX_READY_TIMEOUT_SECONDS + while time.monotonic() < deadline: + indexes = list(coll.aggregate([{"$listSearchIndexes": {}}])) + if indexes and indexes[0].get("status") == "READY": + break + 
time.sleep(2) + else: + raise TimeoutError("vectorSearch index did not reach READY state") + yield coll + engine_client.drop_database(db_name) + + +@pytest.fixture(scope="package") +def vector_search_no_index_collection(engine_client, worker_id): + """Provide an empty collection with no vectorSearch index for spec-error tests.""" + db_name = f"vs_core_noindex_{worker_id}" + db = engine_client[db_name] + coll = db["vectors"] + db.drop_collection(coll.name) + db.create_collection(coll.name) + yield coll + engine_client.drop_database(db_name) + + +@pytest.fixture(scope="package") +def vector_search_absent_collection(engine_client, worker_id): + """Provide a handle to a collection that does not exist (never created).""" + db_name = f"vs_core_absent_{worker_id}" + db = engine_client[db_name] + coll = db["absent_vectors"] + db.drop_collection(coll.name) + yield coll + engine_client.drop_database(db_name) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_smoke_vectorSearch.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_smoke_vectorSearch.py index 83fd9d6b2..35e58aeba 100644 --- a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_smoke_vectorSearch.py +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_smoke_vectorSearch.py @@ -1,36 +1,59 @@ """ Smoke test for $vectorSearch stage. -Tests basic $vectorSearch stage functionality. +Tests basic $vectorSearch stage functionality against a mongot-backed search +target. Gated with requires(search=True) so it is deselected on non-search +targets rather than unconditionally skipped. 
""" +import time + import pytest from documentdb_tests.framework.assertions import assertSuccess from documentdb_tests.framework.executor import execute_command -pytestmark = pytest.mark.smoke +pytestmark = [pytest.mark.smoke, pytest.mark.requires(search=True)] + +_INDEX_READY_TIMEOUT_SECONDS = 120 -@pytest.mark.skip(reason="Requires Atlas Search configuration - not available on standard MongoDB") +@pytest.mark.aggregate def test_smoke_vectorSearch(collection): """Test basic $vectorSearch stage behavior.""" - # Create vector index with vectorOptions + collection.insert_many([{"_id": 1, "name": "test", "embedding": [0.1, 0.2, 0.3]}]) + execute_command( collection, { - "createIndexes": collection.name, + "createSearchIndexes": collection.name, "indexes": [ { - "key": {"embedding": "vector"}, "name": "embedding_vector", - "vectorOptions": {"type": "hnsw", "dimensions": 3.0, "similarity": "euclidean"}, + "type": "vectorSearch", + "definition": { + "fields": [ + { + "type": "vector", + "path": "embedding", + "numDimensions": 3, + "similarity": "euclidean", + } + ] + }, } ], }, ) - collection.insert_many([{"_id": 1, "name": "test", "embedding": [0.1, 0.2, 0.3]}]) + deadline = time.monotonic() + _INDEX_READY_TIMEOUT_SECONDS + while time.monotonic() < deadline: + indexes = list(collection.aggregate([{"$listSearchIndexes": {}}])) + if indexes and indexes[0].get("status") == "READY": + break + time.sleep(2) + else: + raise TimeoutError("vectorSearch index did not reach READY state") result = execute_command( collection, @@ -39,11 +62,11 @@ def test_smoke_vectorSearch(collection): "pipeline": [ { "$vectorSearch": { - "queryVector": [0.1, 0.2, 0.3], - "path": "embedding", - "numCandidates": 10.0, - "limit": 5.0, "index": "embedding_vector", + "path": "embedding", + "queryVector": [0.1, 0.2, 0.3], + "numCandidates": 10, + "limit": 5, } } ], diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_core_matching.py 
b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_core_matching.py new file mode 100644 index 000000000..14916ee3c --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_core_matching.py @@ -0,0 +1,156 @@ +"""Tests for the $vectorSearch stage: core matching and result semantics.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Eq, + Len, + NotExists, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [Result Cardinality At Most Limit]: when fewer documents match than +# limit, all available documents are returned with no error and no padding. +VECTORSEARCH_CARDINALITY_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "limit_exceeds_collection_returns_all", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 100, + "limit": 100, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg="$vectorSearch should return all available documents without padding when " + "limit exceeds the collection size", + ), +] + +# Property [Full Document Shape]: by default the stage returns the complete source +# document with its fields intact and no injected similarity-score field. 
+VECTORSEARCH_FULL_DOCUMENT_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "full_document_returned", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 1, + } + }, + ], + expected={ + "cursor.firstBatch": Len(1), + "cursor.firstBatch.0._id": Eq(1), + "cursor.firstBatch.0.cat": Eq("x"), + "cursor.firstBatch.0.year": Eq(1999), + "cursor.firstBatch.0.name": Eq("a"), + "cursor.firstBatch.0.vc": Eq([1.0, DOUBLE_ZERO, DOUBLE_ZERO]), + "cursor.firstBatch.0.vectorSearchScore": NotExists(), + "cursor.firstBatch.0.score": NotExists(), + }, + msg="$vectorSearch should return the full source document with no injected " + "score field by default", + ), +] + +# Property [Missing Collection Tolerance]: a well-formed query against a +# nonexistent collection, or an existing empty collection with no index, returns +# zero results with no namespace or index error. +VECTORSEARCH_MISSING_COLLECTION_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "nonexistent_collection_zero_results", + collection_fixture="vector_search_absent_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should return zero results with no error against a " + "nonexistent collection", + ), + VectorSearchTest( + "empty_collection_zero_results", + collection_fixture="vector_search_no_index_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should return zero results with no error against an empty " + "collection that has no index", 
+ ), +] + +VECTORSEARCH_CORE_MATCHING_ALL_TESTS = ( + VECTORSEARCH_CARDINALITY_TESTS + + VECTORSEARCH_FULL_DOCUMENT_TESTS + + VECTORSEARCH_MISSING_COLLECTION_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_CORE_MATCHING_ALL_TESTS)) +def test_vectorSearch_core_matching(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: core matching and result semantics.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_exact.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_exact.py new file mode 100644 index 000000000..570010c68 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_exact.py @@ -0,0 +1,228 @@ +"""Tests for the $vectorSearch stage: exact ANN/ENN selection.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Eq, + PerDoc, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = 
pytest.mark.requires(search=True) + +# Property [exact ANN Selection]: when exact is absent (omitted or null) or +# false, ANN runs with numCandidates and returns documents ordered by similarity +# up to limit. +VECTORSEARCH_EXACT_ANN_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "exact_omitted_defaults_ann", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + } + }, + ], + expected=PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}), + msg="$vectorSearch should default to ANN and return similarity-ordered " + "documents up to limit when exact is omitted", + ), + VectorSearchTest( + "exact_false_ann", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": False, + "numCandidates": 10, + "limit": 3, + } + }, + ], + expected=PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}), + msg="$vectorSearch should run ANN and return similarity-ordered documents " + "up to limit when exact is false", + ), + VectorSearchTest( + "exact_null_treated_as_absent", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": None, + "numCandidates": 10, + "limit": 3, + } + }, + ], + expected=PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}), + msg="$vectorSearch should treat exact null as field-absent and apply ANN " + "when numCandidates is present", + ), +] + +# Property [exact ENN Selection]: exact:true runs ENN and succeeds whether +# numCandidates is omitted or null, returning documents ordered by similarity up +# to limit. 
+VECTORSEARCH_EXACT_ENN_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "exact_true_num_candidates_omitted", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + "limit": 3, + } + }, + ], + expected=PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}), + msg="$vectorSearch should run ENN without numCandidates and return " + "similarity-ordered documents up to limit", + ), + VectorSearchTest( + "exact_true_num_candidates_null", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + "numCandidates": None, + "limit": 3, + } + }, + ], + expected=PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}), + msg="$vectorSearch should run ENN with numCandidates null and return " + "similarity-ordered documents up to limit", + ), + VectorSearchTest( + "exact_true_scores_match_ann", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + "limit": 5, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected=[ + {"_id": 1, "score": pytest.approx(1.0)}, + {"_id": 2, "score": pytest.approx(0.9850712418556213)}, + {"_id": 3, "score": pytest.approx(0.9160251617431641)}, + {"_id": 4, "score": pytest.approx(0.6212677955627441)}, + {"_id": 5, "score": pytest.approx(0.5)}, + ], + msg="$vectorSearch ENN should return the same similarity-ordered ids and " + "scores as the equivalent ANN query", + ), +] + +# Property [exact Type Strictness]: a non-boolean exact value of any BSON type is +# rejected as a non-boolean with no coercion of numeric or string truthiness. 
+VECTORSEARCH_EXACT_TYPE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"exact_type_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": val, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} exact value as a non-boolean", + ) + for tid, val in [ + ("int32_one", 1), + ("int32_zero", 0), + ("int64", Int64(1)), + ("double_one", 1.0), + ("double_zero", DOUBLE_ZERO), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("string_true", "true"), + ("string_empty", ""), + ("array", [True]), + ("object", {"a": 1}), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +VECTORSEARCH_EXACT_ALL_TESTS = ( + VECTORSEARCH_EXACT_ANN_TESTS + + VECTORSEARCH_EXACT_ENN_TESTS + + VECTORSEARCH_EXACT_TYPE_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_EXACT_ALL_TESTS)) +def test_vectorSearch_exact(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: exact ANN/ENN selection.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_explain_options.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_explain_options.py new file mode 100644 index 
000000000..986c23c01 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_explain_options.py @@ -0,0 +1,230 @@ +"""Tests for the $vectorSearch stage: explainOptions behavior and errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Eq, + Exists, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [explainOptions Trace Element Type Non-Validation]: on a genuine +# explain aggregate explainOptions.traceDocumentIds accepts an element of any +# non-null BSON type (the documented "array of objectIDs" is not enforced) and +# echoes the supplied ids back into the explain output. +VECTORSEARCH_EXPLAIN_OPTIONS_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"explain_options_{tid}", + explain=True, + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 3, + "explainOptions": {"traceDocumentIds": ids}, + } + } + ], + expected={ + "ok": Eq(1.0), + # pymongo decodes a subtype-0 BSON binary element to plain bytes, + # whose equality with the Binary input is subtype-sensitive, so the + # binary case asserts the echoed field is present rather than equal. 
+ "stages.0.$vectorSearch.explainOptions.traceDocumentIds": ( + Exists() if tid == "binary" else Eq(ids) + ), + }, + msg=f"$vectorSearch should accept a {tid} traceDocumentIds element on a " + "genuine explain and echo it into the explain output", + ) + for tid, ids in [ + ("objectid", [ObjectId("5a9427648b0beebeb69537a5"), ObjectId("5a9427648b0beebeb69537b6")]), + ("int32", [1, 2]), + ("int64", [Int64(1), Int64(2)]), + ("double", [1.5, 2.5]), + ("decimal128", [DECIMAL128_ONE_AND_HALF]), + ("string", ["x", "y"]), + ("bool", [True, False]), + ("object", [{"a": 1}]), + ("array", [[1, 2]]), + ("datetime", [datetime(2020, 1, 1, tzinfo=timezone.utc)]), + ("timestamp", [Timestamp(1, 1)]), + ("binary", [Binary(b"\x01\x02\x03")]), + ("regex", [Regex(".*", "i")]), + ("code", [Code("function(){}")]), + ("minkey", [MinKey()]), + ("maxkey", [MaxKey()]), + ("mixed", [1, "x", 1.5, ObjectId("5a9427648b0beebeb69537a5")]), + ] +] + +# Property [explainOptions Structure Validation]: explainOptions and its +# traceDocumentIds sub-field are structurally validated independent of index +# existence, so a malformed shape is rejected against a collection with no index. 
+VECTORSEARCH_EXPLAIN_OPTIONS_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"explain_options_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "explainOptions": explain_options, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=msg, + ) + for tid, explain_options, msg in [ + ( + "non_document", + 5, + "$vectorSearch should reject a non-document explainOptions value", + ), + ( + "unknown_subfield", + {"bogus": 1}, + "$vectorSearch should reject an unknown explainOptions sub-field", + ), + ( + "trace_document_ids_non_array", + {"traceDocumentIds": 5}, + "$vectorSearch should reject a non-array traceDocumentIds", + ), + ( + "trace_document_ids_empty", + {"traceDocumentIds": []}, + "$vectorSearch should reject an empty traceDocumentIds array", + ), + ] +] + +# Property [explainOptions Requires Explain Mode]: explainOptions present on a +# non-explain query is rejected once the index resolves, because the option is +# only valid when the query is run in explain mode. +VECTORSEARCH_EXPLAIN_OPTIONS_NON_EXPLAIN_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "explain_options_non_explain_query", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "explainOptions": {"traceDocumentIds": [1, 2]}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject explainOptions on a non-explain query", + ), +] + +# Property [explainOptions Requires Non-Empty traceDocumentIds]: on a genuine +# explain query, explainOptions must carry a non-empty traceDocumentIds, so +# omitting it entirely and supplying a null element are both rejected. 
+VECTORSEARCH_EXPLAIN_OPTIONS_EXPLAIN_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "explain_options_empty_document", + explain=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "explainOptions": {}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject an explainOptions with no traceDocumentIds " + "on an explain query", + ), + VectorSearchTest( + "explain_options_trace_null_element", + explain=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "explainOptions": {"traceDocumentIds": [None]}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a null traceDocumentIds element on an explain query", + ), +] + +VECTORSEARCH_EXPLAIN_OPTIONS_ALL_TESTS = ( + VECTORSEARCH_EXPLAIN_OPTIONS_TESTS + + VECTORSEARCH_EXPLAIN_OPTIONS_ERROR_TESTS + + VECTORSEARCH_EXPLAIN_OPTIONS_NON_EXPLAIN_ERROR_TESTS + + VECTORSEARCH_EXPLAIN_OPTIONS_EXPLAIN_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_EXPLAIN_OPTIONS_ALL_TESTS)) +def test_vectorSearch_explain_options(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: explainOptions behavior and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + aggregate = {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}} + command = ( + {"explain": aggregate, "verbosity": "queryPlanner"} if test_case.explain else aggregate + ) + result = execute_command(coll, command) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter.py 
b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter.py new file mode 100644 index 000000000..bcbbca4a0 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter.py @@ -0,0 +1,300 @@ +"""Tests for the $vectorSearch stage: filter pre-filtering.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Int64, +) + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + _FILTER_OID_A, + _FILTER_UUID_A, + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [filter Match All]: an empty filter document applies no predicate and +# retains every document. +VECTORSEARCH_FILTER_MATCH_ALL_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "match_all_empty_filter", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {}, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg="$vectorSearch should retain every document for an empty filter", + ), +] + +# Property [filter Per-Field Operators]: each supported per-field operator, and +# the $eq shorthand, pre-filters to exactly the documents matching that predicate. 
+VECTORSEARCH_FILTER_OPERATOR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"operator_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": flt, + } + }, + ], + expected={"cursor.firstBatch": [Len(len(ids)), *(Contains("_id", i) for i in ids)]}, + msg=f"$vectorSearch should pre-filter with the {tid} operator", + ) + for tid, flt, ids in [ + ("shorthand_eq", {"year": 2001}, [3]), + ("eq", {"year": {"$eq": 2010}}, [4, 5]), + ("ne", {"year": {"$ne": 2010}}, [1, 2, 3]), + ("gt", {"year": {"$gt": 2000}}, [3, 4, 5]), + ("gte", {"year": {"$gte": 2001}}, [3, 4, 5]), + ("lt", {"year": {"$lt": 2001}}, [1, 2]), + ("lte", {"year": {"$lte": 2000}}, [1, 2]), + ("in", {"year": {"$in": [1999, 2001]}}, [1, 3]), + ("nin", {"year": {"$nin": [1999, 2001]}}, [2, 4, 5]), + ("exists", {"cat": {"$exists": True}}, [1, 2, 3, 4, 5]), + ("not", {"year": {"$not": {"$gt": 2000}}}, [1, 2]), + ] +] + +# Property [filter Combinators]: top-level $and, $or, $nor, and implicit +# multi-field AND compose their arms into the correct combined predicate. 
+VECTORSEARCH_FILTER_COMBINATOR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"combinator_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": flt, + } + }, + ], + expected={"cursor.firstBatch": [Len(len(ids)), *(Contains("_id", i) for i in ids)]}, + msg=f"$vectorSearch should compose the {tid} combinator correctly", + ) + for tid, flt, ids in [ + ("and", {"$and": [{"cat": "y"}, {"year": {"$gte": 2010}}]}, [4, 5]), + ("or", {"$or": [{"cat": "x"}, {"year": 2001}]}, [1, 2, 3]), + ("nor", {"$nor": [{"cat": "x"}]}, [3, 4, 5]), + ("implicit_and", {"cat": "y", "active": True}, [3, 5]), + ] +] + +# Property [filter Value Types]: each supported predicate value type pre-filters +# correctly, including null as a direct value and element membership on an +# array-valued field. +VECTORSEARCH_FILTER_VALUE_TYPE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"value_type_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": flt, + } + }, + ], + expected={"cursor.firstBatch": [Len(len(ids)), *(Contains("_id", i) for i in ids)]}, + msg=f"$vectorSearch should pre-filter with a {tid} predicate value", + ) + for tid, flt, ids in [ + ("string", {"cat": "x"}, [1, 2]), + ("int32", {"year": 2000}, [2]), + ("int64", {"count": Int64(20)}, [2]), + ("double", {"rating": 4.5}, [1]), + ("boolean", {"active": True}, [1, 3, 5]), + ("object_id", {"oid": _FILTER_OID_A}, [1, 3, 5]), + ("date", {"created": {"$gt": datetime(2023, 1, 1, tzinfo=timezone.utc)}}, [4, 5]), + ("uuid", {"uid": _FILTER_UUID_A}, [1, 3, 5]), + ("null_direct", {"opt": None}, [3]), + ("array_element_membership", {"tags": "x"}, [1, 5]), + ] +] + +# Property [filter Numeric In Mixing]: a $in/$nin list mixing int 
and double +# elements is accepted as same-type numeric and brackets by numeric value. +VECTORSEARCH_FILTER_NUMERIC_IN_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"numeric_in_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": flt, + } + }, + ], + expected={"cursor.firstBatch": [Len(len(ids)), *(Contains("_id", i) for i in ids)]}, + msg=f"$vectorSearch should accept a mixed int/double {tid} list", + ) + for tid, flt, ids in [ + ("in", {"rating": {"$in": [4.0, 5]}}, [3, 5]), + ("nin", {"rating": {"$nin": [4.0, 5]}}, [1, 2, 4]), + ] +] + +# Property [filter Cross-Type Bracketing]: a predicate whose value type differs +# from the indexed field's type brackets to no match without erroring. +VECTORSEARCH_FILTER_CROSS_TYPE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "cross_type_string_vs_numeric", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$gt": "a"}}, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should return no results for a cross-type comparison without erroring", + ), +] + +# Property [filter Limit And Mode]: the filter is applied before limit, a filter +# matching nothing yields an empty result, and filtering is identical under ANN +# and ENN. 
+VECTORSEARCH_FILTER_LIMIT_MODE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "applied_before_limit", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 1, + "filter": {"cat": "y"}, + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 3)]}, + msg="$vectorSearch should apply the filter before limit truncates by score", + ), + VectorSearchTest( + "matches_nothing_empty", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": 3000}, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should return an empty result for a filter that matches nothing", + ), + VectorSearchTest( + "enn_filter", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + "limit": 5, + "filter": {"cat": "x"}, + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$vectorSearch should apply the filter identically under ENN", + ), +] + +VECTORSEARCH_FILTER_ALL_TESTS = ( + VECTORSEARCH_FILTER_MATCH_ALL_TESTS + + VECTORSEARCH_FILTER_OPERATOR_TESTS + + VECTORSEARCH_FILTER_COMBINATOR_TESTS + + VECTORSEARCH_FILTER_VALUE_TYPE_TESTS + + VECTORSEARCH_FILTER_NUMERIC_IN_TESTS + + VECTORSEARCH_FILTER_CROSS_TYPE_TESTS + + VECTORSEARCH_FILTER_LIMIT_MODE_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_FILTER_ALL_TESTS)) +def test_vectorSearch_filter(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: filter pre-filtering.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, 
"pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_parse_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_parse_errors.py new file mode 100644 index 000000000..11943e3f2 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_parse_errors.py @@ -0,0 +1,202 @@ +"""Tests for the $vectorSearch stage: filter parse and shape errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BAD_VALUE_ERROR, + EXPRESSION_NOT_OBJECT_ERROR, + NEAR_NOT_ALLOWED_ERROR, + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [filter Scalar Type Rejection]: a non-object, non-array scalar filter +# value is rejected at parse time as not an object. 
+VECTORSEARCH_FILTER_SCALAR_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"filter_scalar_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": val, + } + } + ], + error_code=EXPRESSION_NOT_OBJECT_ERROR, + msg=f"$vectorSearch should reject a {tid} scalar filter value as not an object", + ) + for tid, val in [ + ("string", "x"), + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +# Property [filter Array Rejection]: an array filter value is rejected as not a +# document rather than treated as an empty filter. +VECTORSEARCH_FILTER_ARRAY_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "filter_array_empty", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": [], + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject an empty-array filter as not a document", + ), +] + +# Property [filter MQL Parse Rejection]: a filter operator rejected by the MQL +# parser surfaces a parse error rather than an executor validation error. 
+VECTORSEARCH_FILTER_MQL_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"filter_mql_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": flt, + } + } + ], + error_code=BAD_VALUE_ERROR, + msg=f"$vectorSearch should reject a {tid} filter as an MQL parse error", + ) + for tid, flt in [ + ("text", {"$text": {"$search": "x"}}), + ("comment", {"year": {"$comment": "x"}}), + ("unknown_top_level", {"$bad": 1}), + ] +] + +# Property [filter Geo Operator Rejection]: a geospatial filter operator is +# rejected because it requires sorting geospatial data. +VECTORSEARCH_FILTER_GEO_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "filter_geo_near", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$near": [0, 0]}}, + } + } + ], + error_code=NEAR_NOT_ALLOWED_ERROR, + msg="$vectorSearch should reject a geo operator in a filter", + ), +] + +# Property [filter Combinator Argument Validation]: a top-level logical +# combinator requires a non-empty array argument. 
+VECTORSEARCH_FILTER_COMBINATOR_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"filter_combinator_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": flt, + } + } + ], + error_code=BAD_VALUE_ERROR, + msg=f"$vectorSearch should reject a {tid} combinator argument", + ) + for tid, flt in [ + ("empty_array", {"$and": []}), + ("non_array", {"$and": {"year": 1}}), + ] +] + +VECTORSEARCH_FILTER_PARSE_ERRORS_ALL_TESTS = ( + VECTORSEARCH_FILTER_SCALAR_ERROR_TESTS + + VECTORSEARCH_FILTER_ARRAY_ERROR_TESTS + + VECTORSEARCH_FILTER_MQL_ERROR_TESTS + + VECTORSEARCH_FILTER_GEO_ERROR_TESTS + + VECTORSEARCH_FILTER_COMBINATOR_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_FILTER_PARSE_ERRORS_ALL_TESTS)) +def test_vectorSearch_filter_parse_errors(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: filter parse and shape errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_predicate_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_predicate_errors.py new file mode 100644 index 000000000..38b7ee482 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_filter_predicate_errors.py @@ -0,0 +1,259 @@ +"""Tests for the $vectorSearch stage: filter predicate errors.""" + +from __future__ import annotations + +import pytest +from bson 
import ( + Code, + MaxKey, + MinKey, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BAD_VALUE_ERROR, + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [filter Unsupported Operator Rejection]: a parseable filter operator +# that is not a supported comparison operator is rejected, each surfacing its own +# diagnostic message. +VECTORSEARCH_FILTER_OPERATOR_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "filter_operator_regex", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$regex": "x"}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject an unsupported per-field filter operator", + ), + VectorSearchTest( + "filter_operator_expr", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"$expr": {"$gt": ["$year", 1]}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a $expr filter operator", + ), + VectorSearchTest( + "filter_operator_json_schema", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"$jsonSchema": {"required": ["year"]}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a $jsonSchema filter operator", + ), +] 

# Property [filter Value Type Rejection]: a predicate value whose BSON type is
# not among the supported filter value types is rejected.
VECTORSEARCH_FILTER_VALUE_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"filter_value_type_{type_label}",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "filter": {"year": predicate_value},
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {type_label} filter value as an unsupported type",
    )
    # Keyed by the case-id suffix; each value becomes the predicate on "year".
    for type_label, predicate_value in {
        "decimal128": DECIMAL128_ONE_AND_HALF,
        "object": {"a": 1},
        "array": [1, 2],
        "timestamp": Timestamp(1, 1),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "non_uuid_binary": Binary(b"\x01\x02\x03"),
        # MinKey/MaxKey are supplied as the operand of a $gt comparison.
        "minkey": {"$gt": MinKey()},
        "maxkey": {"$gt": MaxKey()},
    }.items()
]

# Property [filter Field Not Indexed]: a filter referencing a field not indexed
# as the filter type is rejected, and a dollar-prefixed key is parsed as a
# top-level operator rather than a field name.
VECTORSEARCH_FILTER_FIELD_NOT_INDEXED_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        case_id,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "filter": field_filter,
                }
            }
        ],
        error_code=expected_code,
        msg=failure_msg,
    )
    # Full case id -> (filter document, expected error code, assertion message).
    # The two cases differ in error code, so the code travels with each case.
    for case_id, (field_filter, expected_code, failure_msg) in {
        "filter_field_not_indexed": (
            {"_id": 1},
            UNKNOWN_ERROR,
            "$vectorSearch should reject a filter on a field not indexed as the filter type",
        ),
        "filter_dollar_prefixed_key": (
            {"$year": 1},
            BAD_VALUE_ERROR,
            "$vectorSearch should parse a dollar-prefixed filter key as a top-level "
            "operator and reject it",
        ),
    }.items()
]

# Property [filter Element Constraints]: $in element lists must be non-empty,
# same-type, and free of null elements, and $exists requires a boolean argument.
+VECTORSEARCH_FILTER_ELEMENT_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "filter_in_empty", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$in": []}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject an empty $in list in a filter", + ), + VectorSearchTest( + "filter_in_mixed_type", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$in": [1, "x"]}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a mixed-type $in list in a filter", + ), + VectorSearchTest( + "filter_in_null_element", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$in": [None]}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a null element in a filter $in list", + ), + VectorSearchTest( + "filter_exists_non_boolean", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"year": {"$exists": 1}}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a non-boolean $exists argument in a filter", + ), +] + +VECTORSEARCH_FILTER_PREDICATE_ERRORS_ALL_TESTS = ( + VECTORSEARCH_FILTER_OPERATOR_ERROR_TESTS + + VECTORSEARCH_FILTER_VALUE_TYPE_ERROR_TESTS + + VECTORSEARCH_FILTER_FIELD_NOT_INDEXED_ERROR_TESTS + + VECTORSEARCH_FILTER_ELEMENT_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_FILTER_PREDICATE_ERRORS_ALL_TESTS)) +def test_vectorSearch_filter_predicate_errors(test_case: 
VectorSearchTest, engine_client, request): + """$vectorSearch: filter predicate errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_definition_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_definition_errors.py new file mode 100644 index 000000000..b378a0a71 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_definition_errors.py @@ -0,0 +1,139 @@ +"""Tests for the $vectorSearch stage: vectorSearch index definition errors. + +These are createSearchIndexes-time validation errors for definition options that +the $vectorSearch surface owns (nestedRoot, storedSource, and the vector field's +numDimensions, against which a query vector's length is checked). 
They fail +synchronously at index-create time, so unlike the query-path tests they do not +build a READY index or run an aggregate.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, + VECTOR_SEARCH_NESTED_ROOT_MISMATCH_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_case import BaseTestCase +from documentdb_tests.framework.test_constants import DOUBLE_ZERO + +pytestmark = pytest.mark.requires(search=True) + + +@dataclass(frozen=True) +class IndexDefinitionErrorTest(BaseTestCase): + """A vectorSearch index definition that createSearchIndexes must reject.""" + + definition: dict[str, Any] = field(default_factory=dict) + + +# Property [nestedRoot Path Mismatch]: a vectorSearch index whose nestedRoot does +# not name any vector field path in the definition is rejected at index-create time. +VECTORSEARCH_NESTED_ROOT_MISMATCH_ERROR_TESTS: list[IndexDefinitionErrorTest] = [ + IndexDefinitionErrorTest( + "nested_root_does_not_match_vector_path", + definition={ + "nestedRoot": "nonexistent", + "fields": [ + { + "type": "vector", + "path": "reviews.embedding", + "numDimensions": 3, + "similarity": "cosine", + } + ], + }, + error_code=VECTOR_SEARCH_NESTED_ROOT_MISMATCH_ERROR, + msg="createSearchIndexes should reject a vectorSearch index whose nestedRoot " + "does not match any vector field path", + ), +] + +# Property [storedSource true Unsupported]: a vectorSearch index defined with +# storedSource: true is rejected at index-create time, because only include, +# exclude, or false are accepted. 
+VECTORSEARCH_STORED_SOURCE_TRUE_ERROR_TESTS: list[IndexDefinitionErrorTest] = [ + IndexDefinitionErrorTest( + "stored_source_true_unsupported", + definition={ + "storedSource": True, + "fields": [ + { + "type": "vector", + "path": "embedding", + "numDimensions": 3, + "similarity": "cosine", + } + ], + }, + error_code=UNKNOWN_ERROR, + msg="createSearchIndexes should reject a vectorSearch index defined with " + "storedSource true", + ), +] + +# Property [numDimensions Bounds]: a vector field whose numDimensions falls +# outside the accepted range is rejected at index-create time, asserted at both +# boundaries. +VECTORSEARCH_NUM_DIMENSIONS_BOUNDS_ERROR_TESTS: list[IndexDefinitionErrorTest] = [ + IndexDefinitionErrorTest( + f"num_dimensions_bounds_{tid}", + definition={ + "fields": [ + { + "type": "vector", + "path": "embedding", + "numDimensions": ndim, + "similarity": "cosine", + } + ], + }, + error_code=UNKNOWN_ERROR, + msg=f"createSearchIndexes should reject a vector field with a {tid} " + "numDimensions as out of bounds", + ) + for tid, ndim in [ + ("below_lower", 0), + ("above_upper", 8193), + ] +] + +VECTORSEARCH_INDEX_DEFINITION_ERROR_TESTS = ( + VECTORSEARCH_NESTED_ROOT_MISMATCH_ERROR_TESTS + + VECTORSEARCH_STORED_SOURCE_TRUE_ERROR_TESTS + + VECTORSEARCH_NUM_DIMENSIONS_BOUNDS_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_INDEX_DEFINITION_ERROR_TESTS)) +def test_vectorSearch_index_definition_errors(test_case: IndexDefinitionErrorTest, collection): + """$vectorSearch: vectorSearch index definition errors.""" + collection.insert_one( + { + "_id": 1, + "embedding": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "reviews": [{"embedding": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], "rating": 5}], + } + ) + result = execute_command( + collection, + { + "createSearchIndexes": collection.name, + "indexes": [ + {"name": "vidx", "type": "vectorSearch", "definition": test_case.definition} + ], + }, + ) + assertResult( + result, + 
error_code=test_case.error_code, + msg=test_case.msg, + raw_res=True, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path.py new file mode 100644 index 000000000..14a61b1e9 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path.py @@ -0,0 +1,165 @@ +"""Tests for the $vectorSearch stage: index and path resolution.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Eq, + Len, + PerDoc, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [index Exact Name Match]: the correct existing index name resolves and +# returns the collection's similarity-ordered documents. +VECTORSEARCH_INDEX_MATCH_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "match_correct_name", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected=PerDoc( + {"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}, {"_id": Eq(4)}, {"_id": Eq(5)} + ), + msg="$vectorSearch should resolve the correct existing index name and return results", + ), +] + +# Property [index Name Silent Miss]: an index name that is not byte-for-byte the +# existing index name returns zero results with no error, because matching is +# exact and literal rather than fuzzy or expression-evaluated. 
VECTORSEARCH_INDEX_SILENT_MISS_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"silent_miss_{variant}",
        raw_res=True,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": index_name,
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            },
        ],
        # Raw command reply is inspected: the first batch must be empty.
        expected={"cursor.firstBatch": Len(0)},
        msg=f"$vectorSearch should silently return zero results for a {variant} index name",
    )
    # Keyed by the case-id suffix; each value is an index name that differs
    # from the "vs_core_index" name used by the neighbouring tables.
    for variant, index_name in {
        "nonexistent": "no_such_index",
        "case_variant": "vs_core_index".upper(),
        "leading_space": " vs_core_index",
        "trailing_space": "vs_core_index ",
        "tab": "vs_core_index\t",
        "newline": "vs_core_index\n",
        "dollar_prefix": "$vs_core_index",
        "dollar_only": "$",
        "dollar_now_variable": "$$NOW",
        "double_dollar": "$$",
        "null_byte": "vs_core_index\x00",
        "control_char": "vs\x01core",
        "punctuation": '{vs}"core",;',
        # "e" + U+0301 combining acute accent, not the precomposed "é" (U+00E9).
        "unicode_combining": "vse\u0301core",
        "cjk": "向量索引",
        "emoji": "🔍index",
    }.items()
]

# Property [index Nonexistent Skips Dimension Check]: a nonexistent index name
# combined with a dimension-mismatched queryVector still returns zero results with
# no error, because the dimension check is skipped when the named index is absent.
+VECTORSEARCH_INDEX_DIMENSION_SKIP_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "nonexistent_skips_dimension_check", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "no_such_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should return zero results without a dimension error when " + "the named index does not exist", + ), +] + +# Property [path Field Resolution]: a path naming the correct vector-indexed +# field resolves and returns the collection's similarity-ordered documents, +# including a dot-notation path resolved against a nested-path index. +VECTORSEARCH_PATH_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"path_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": path, + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected=PerDoc( + {"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}, {"_id": Eq(4)}, {"_id": Eq(5)} + ), + msg=f"$vectorSearch should resolve the {tid} vector path and return results", + ) + for tid, path in [ + ("top_level", "ve"), + ("dot_notation_nested", "meta.vec"), + ] +] + +VECTORSEARCH_INDEX_PATH_ALL_TESTS = ( + VECTORSEARCH_INDEX_MATCH_TESTS + + VECTORSEARCH_INDEX_SILENT_MISS_TESTS + + VECTORSEARCH_INDEX_DIMENSION_SKIP_TESTS + + VECTORSEARCH_PATH_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_INDEX_PATH_ALL_TESTS)) +def test_vectorSearch_index_path(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: index and path resolution.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + 
raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path_errors.py new file mode 100644 index 000000000..09a5abb6d --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_index_path_errors.py @@ -0,0 +1,283 @@ +"""Tests for the $vectorSearch stage: index and path errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [index Type Strictness]: a non-string index value of any BSON type is +# rejected as a non-string with no coercion. 
VECTORSEARCH_INDEX_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"index_type_{bson_type}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": index_value,
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {bson_type} index value as a non-string",
    )
    # Keyed by the case-id suffix; each value is passed verbatim as "index".
    for bson_type, index_value in {
        "int32": 1,
        "int64": Int64(1),
        "double": 1.5,
        "decimal128": DECIMAL128_ONE_AND_HALF,
        "bool": True,
        "array": ["a"],
        "object": {"a": 1},
        "objectid": ObjectId("5a9427648b0beebeb69537a5"),
        "datetime": datetime(2020, 1, 1, tzinfo=timezone.utc),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [index Empty String]: an empty-string index is rejected as empty,
# distinct from both the non-string type error and the nonexistent-name silent
# miss.
VECTORSEARCH_INDEX_EMPTY_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "index_empty_string",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    # A string, but an empty one.
                    "index": "",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg="$vectorSearch should reject an empty-string index",
    ),
]

# Property [path Type Strictness]: a non-string path value of any BSON type is
# rejected as a non-string with no coercion.
VECTORSEARCH_PATH_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"path_type_{bson_type}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": path_value,
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {bson_type} path value as a non-string",
    )
    # Keyed by the case-id suffix; each value is passed verbatim as "path".
    for bson_type, path_value in {
        "int32": 1,
        "int64": Int64(1),
        "double": 1.5,
        "decimal128": DECIMAL128_ONE_AND_HALF,
        "bool": True,
        "array": ["a"],
        "object": {"a": 1},
        "objectid": ObjectId("5a9427648b0beebeb69537a5"),
        "datetime": datetime(2020, 1, 1, tzinfo=timezone.utc),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [path Not Indexed As Vector]: a path that does not name a
# vector-indexed field is a hard error, whether the field is nonexistent, an
# existing non-vector field, or a field indexed only as the filter type.
VECTORSEARCH_PATH_NOT_INDEXED_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"path_not_indexed_{field_kind}",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": field_path,
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {field_kind} path as not indexed as vector",
    )
    # Keyed by the case-id suffix; each value is a string path to query.
    for field_kind, field_path in {
        "nonexistent_field": "no_such_field",
        "non_vector_field": "name",
        "filter_only_field": "cat",
    }.items()
]

# Property [path No Field-Path Syntax Validation]: a malformed field-path string
# is looked up literally and produces the not-indexed-as-vector error, never a
# field-path syntax error.
VECTORSEARCH_PATH_SYNTAX_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"path_syntax_{shape}",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": malformed_path,
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should look up a {shape} path literally and reject it as "
        "not indexed as vector",
    )
    # Keyed by the case-id suffix; each value is a path string passed as-is.
    for shape, malformed_path in {
        "empty": "",
        "leading_dot": ".x",
        "trailing_dot": "x.",
        "empty_component": "a..b",
        "null_byte": "a\x00b",
        # Fifty single-character components joined by dots.
        "deep_nesting": ".".join("a" * 50),
        "very_long": "a" * 10_000,
    }.items()
]

# Property [path Literal Not Expression]: a dollar-prefixed or variable-like path
# string is matched literally with no expression or variable evaluation, yielding
# the not-indexed-as-vector error.
VECTORSEARCH_PATH_LITERAL_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"path_literal_{form}",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": literal_path,
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should treat a {form} path as a literal name, not an "
        "expression, and reject it as not indexed as vector",
    )
    # Keyed by the case-id suffix; each value is a dollar-prefixed path string.
    for form, literal_path in {
        "field_ref": "$embedding",
        "now_variable": "$$NOW",
        "root_variable": "$$ROOT",
        "dollar_only": "$",
        "double_dollar": "$$",
    }.items()
]

# Property [path Exact Byte Matching]: path matching is byte-for-byte, so a
# case variant or whitespace-padded form of the real field name does not match
# and yields the not-indexed-as-vector error.
+VECTORSEARCH_PATH_EXACT_MATCH_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"path_exact_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": path, + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should not match a {tid} of the real field name and " + "reject it as not indexed as vector", + ) + for tid, path in [ + ("case_variant", "vc".upper()), + ("leading_space", " vc"), + ("trailing_space", "vc "), + # "cafe" + combining acute (U+0301): the decomposed (NFD) form of the + # indexed precomposed "caf\u00e9_vec" (U+00E9), a distinct byte sequence + # that must not match. + ("non_normalized_unicode", "cafe\u0301_vec"), + ] +] + +VECTORSEARCH_INDEX_PATH_ERRORS_ALL_TESTS = ( + VECTORSEARCH_INDEX_TYPE_ERROR_TESTS + + VECTORSEARCH_INDEX_EMPTY_ERROR_TESTS + + VECTORSEARCH_PATH_TYPE_ERROR_TESTS + + VECTORSEARCH_PATH_NOT_INDEXED_ERROR_TESTS + + VECTORSEARCH_PATH_SYNTAX_ERROR_TESTS + + VECTORSEARCH_PATH_LITERAL_ERROR_TESTS + + VECTORSEARCH_PATH_EXACT_MATCH_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_INDEX_PATH_ERRORS_ALL_TESTS)) +def test_vectorSearch_index_path_errors(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: index and path errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_limit.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_limit.py new file mode 100644 index 000000000..fffe48977 --- /dev/null +++ 
b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_limit.py @@ -0,0 +1,285 @@ +"""Tests for the $vectorSearch stage: limit acceptance and errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Decimal128, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, + VECTOR_SEARCH_LIMIT_NOT_NUMBER_ERROR, + VECTOR_SEARCH_LIMIT_NOT_POSITIVE_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Eq, + PerDoc, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_NEGATIVE_ZERO, + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT32_OVERFLOW, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [limit Numeric Type Acceptance]: limit accepts an int32, Int64, or +# whole-number double and is treated as the integer value, returning exactly that +# many top-similarity documents. 
VECTORSEARCH_LIMIT_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"limit_{numeric_kind}",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": limit_value,
                }
            },
        ],
        expected=expected_docs,
        msg=f"$vectorSearch should accept a {numeric_kind} limit and return the integer "
        "number of top-similarity documents",
    )
    # Case-id suffix -> (limit to send, per-document expectation). Each
    # expectation lists exactly as many documents as the limit's integer value.
    for numeric_kind, (limit_value, expected_docs) in {
        "int32": (2, PerDoc({"_id": Eq(1)}, {"_id": Eq(2)})),
        "int64": (Int64(3), PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)})),
        "whole_double": (3.0, PerDoc({"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)})),
    }.items()
]

# Property [limit Type Strictness]: a non-number limit value of any BSON type is
# rejected at the parse layer as not a number, with no coercion.
VECTORSEARCH_LIMIT_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"limit_type_{bson_type}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": limit_value,
                }
            }
        ],
        error_code=VECTOR_SEARCH_LIMIT_NOT_NUMBER_ERROR,
        msg=f"$vectorSearch should reject a {bson_type} limit value as a non-number",
    )
    # Keyed by the case-id suffix; each value is passed verbatim as "limit".
    for bson_type, limit_value in {
        "bool_true": True,
        "bool_false": False,
        "string": "5",
        "array": [5],
        "object": {"a": 5},
        "objectid": ObjectId("5a9427648b0beebeb69537a5"),
        "datetime": datetime(2020, 1, 1, tzinfo=timezone.utc),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [limit Literal Not Expression]: a limit given as an expression object
# is rejected at the BSON-type layer as a non-number, with no expression
# evaluation.
VECTORSEARCH_LIMIT_LITERAL_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "limit_literal_object",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    # An expression document, sent where a number is required.
                    "limit": {"$literal": 3},
                }
            }
        ],
        error_code=VECTOR_SEARCH_LIMIT_NOT_NUMBER_ERROR,
        msg="$vectorSearch should reject an expression-object limit without evaluating it",
    ),
]

# Property [limit Fractional Rejection]: a double limit whose truncation is at
# least one but is not a whole number is rejected as a non-integer.
VECTORSEARCH_LIMIT_FRACTIONAL_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "limit_fractional_one_point_five",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    # Truncates to 1, yet is not a whole number.
                    "limit": 1.5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg="$vectorSearch should reject a fractional double limit",
    ),
]

# Property [limit Decimal128 Rejection]: a Decimal128 limit is rejected as a
# non-integer even when its value is whole, unlike a whole-number double.
VECTORSEARCH_LIMIT_DECIMAL_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "limit_decimal_whole",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    # Whole-valued, but carried as Decimal128.
                    "limit": Decimal128("3"),
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg="$vectorSearch should reject a whole-valued Decimal128 limit",
    ),
]

# Property [limit Positivity]: a limit whose truncation is less than one is
# rejected as non-positive, checked before the integer-ness check.
VECTORSEARCH_LIMIT_NOT_POSITIVE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"limit_not_positive_{value_kind}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": limit_value,
                }
            }
        ],
        error_code=VECTOR_SEARCH_LIMIT_NOT_POSITIVE_ERROR,
        msg=f"$vectorSearch should reject a non-positive limit ({value_kind})",
    )
    # Keyed by the case-id suffix; each value is passed verbatim as "limit".
    for value_kind, limit_value in {
        "zero_int32": 0,
        "zero_double": DOUBLE_ZERO,
        "negative_zero_double": DOUBLE_NEGATIVE_ZERO,
        "fractional_half": 0.5,
        "negative_int": -1,
        "negative_double": -1.5,
        "nan": FLOAT_NAN,
        "negative_infinity": FLOAT_NEGATIVE_INFINITY,
    }.items()
]

# Property [limit Int32 Overflow]: a positive whole limit that does not fit in a
# 32-bit integer is rejected as overflowing.
VECTORSEARCH_LIMIT_OVERFLOW_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"limit_overflow_{value_kind}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": limit_value,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a limit that exceeds int32 range ({value_kind})",
    )
    # Keyed by the case-id suffix; the same overflow constant is sent once as
    # an Int64 and once as a double, alongside positive infinity.
    for value_kind, limit_value in {
        "int64_just_over": Int64(INT32_OVERFLOW),
        "double_just_over": float(INT32_OVERFLOW),
        "positive_infinity": FLOAT_INFINITY,
    }.items()
]

# Property [limit ENN Ceiling]: under ENN (exact true, no numCandidates) a limit
# at or above the exact-search ceiling is rejected during ENN execution.
+VECTORSEARCH_LIMIT_ENN_CEILING_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "limit_enn_ceiling", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + "limit": 2_147_483_631, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject an ENN limit at the exact-search ceiling", + ), +] + +VECTORSEARCH_LIMIT_ALL_TESTS = ( + VECTORSEARCH_LIMIT_TESTS + + VECTORSEARCH_LIMIT_TYPE_ERROR_TESTS + + VECTORSEARCH_LIMIT_LITERAL_ERROR_TESTS + + VECTORSEARCH_LIMIT_FRACTIONAL_ERROR_TESTS + + VECTORSEARCH_LIMIT_DECIMAL_ERROR_TESTS + + VECTORSEARCH_LIMIT_NOT_POSITIVE_ERROR_TESTS + + VECTORSEARCH_LIMIT_OVERFLOW_ERROR_TESTS + + VECTORSEARCH_LIMIT_ENN_CEILING_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_LIMIT_ALL_TESTS)) +def test_vectorSearch_limit(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: limit acceptance and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_nested.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_nested.py new file mode 100644 index 000000000..38309fcbc --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_nested.py @@ -0,0 +1,560 @@ +"""Tests for the $vectorSearch stage: nestedRoot scoping and nestedOptions.""" + +from __future__ import annotations + +import time +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + 
Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +_INDEX_READY_TIMEOUT_SECONDS = 120 + +_NESTED_CORPUS = [ + { + "_id": 1, + "country": "US", + "reviews": [ + {"rating": 5, "embedding": [1.0, DOUBLE_ZERO, DOUBLE_ZERO]}, + {"rating": 3, "embedding": [0.9, 0.1, DOUBLE_ZERO]}, + ], + }, + { + "_id": 2, + "country": "US", + "reviews": [{"rating": 2, "embedding": [DOUBLE_ZERO, 1.0, DOUBLE_ZERO]}], + }, + { + "_id": 3, + "country": "CA", + "reviews": [{"rating": 5, "embedding": [0.8, 0.2, DOUBLE_ZERO]}], + }, +] + + +@pytest.fixture(scope="module") +def nested_vector_search_collection(engine_client, worker_id): + """Provide a collection with a READY nestedRoot vectorSearch index over a fixed corpus.""" + db_name = f"vs_nested_{worker_id}" + db = engine_client[db_name] + coll = db["nested_vectors"] + db.drop_collection(coll.name) + db.create_collection(coll.name) + coll.insert_many([dict(doc) for doc in _NESTED_CORPUS]) + db.command( + { + "createSearchIndexes": coll.name, + "indexes": [ + { + "name": "vs_nested_index", + "type": "vectorSearch", + "definition": { + "nestedRoot": "reviews", + "fields": [ + { + "type": "vector", + "path": "reviews.embedding", + "numDimensions": 3, + "similarity": "cosine", + }, + {"type": "filter", "path": "reviews.rating"}, + {"type": "filter", "path": "country"}, + ], + }, + } + ], + } + ) + deadline = time.monotonic() + _INDEX_READY_TIMEOUT_SECONDS + while 
time.monotonic() < deadline: + indexes = list(coll.aggregate([{"$listSearchIndexes": {}}])) + if indexes and indexes[0].get("status") == "READY": + break + time.sleep(2) + else: + raise TimeoutError("nestedRoot vectorSearch index did not reach READY state") + yield coll + engine_client.drop_database(db_name) + + +# Property [parentFilter Nested Root Scoping]: on a nestedRoot index parentFilter +# pre-filters root-level fields while filter pre-filters nested-level fields, the +# two AND-combine, and a predicate referencing the other level returns zero results. +VECTORSEARCH_PARENT_FILTER_NESTED_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "nested_parent_filter_root_field", + collection_fixture="nested_vector_search_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 20, + "limit": 5, + "parentFilter": {"country": "US"}, + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$vectorSearch parentFilter should pre-filter root-level fields on a nestedRoot index", + ), + VectorSearchTest( + "nested_filter_nested_field", + collection_fixture="nested_vector_search_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 20, + "limit": 5, + "filter": {"reviews.rating": 5}, + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 3)]}, + msg="$vectorSearch filter should pre-filter nested-level fields on a nestedRoot index", + ), + VectorSearchTest( + "nested_filter_and_parent_filter", + collection_fixture="nested_vector_search_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], 
+ "numCandidates": 20, + "limit": 5, + "filter": {"reviews.rating": 5}, + "parentFilter": {"country": "US"}, + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$vectorSearch should AND-combine nested filter with root parentFilter", + ), + VectorSearchTest( + "nested_parent_filter_on_nested_field", + collection_fixture="nested_vector_search_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 20, + "limit": 5, + "parentFilter": {"reviews.rating": 5}, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch parentFilter on a nested field should return zero results", + ), + VectorSearchTest( + "nested_filter_on_root_field", + collection_fixture="nested_vector_search_collection", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 20, + "limit": 5, + "filter": {"country": "US"}, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch filter on a root field should return zero results", + ), +] + +# Property [nestedOptions scoreMode]: on a nestedRoot index nestedOptions.scoreMode +# combines a parent's matching nested-array child scores, with "avg" averaging the +# child scores and "max" taking the maximum. 
VECTORSEARCH_NESTED_OPTIONS_SCORE_MODE_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"score_mode_{mode}",
        collection_fixture="nested_vector_search_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_nested_index",
                    "path": "reviews.embedding",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 20,
                    "limit": 5,
                    "nestedOptions": {"scoreMode": mode},
                }
            },
            # Sort by _id so the projected scores compare against a fixed order.
            {"$sort": {"_id": 1}},
            {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}},
        ],
        expected=[
            # Only the expected score of _id 1 differs between the two modes.
            {"_id": 1, "score": pytest.approx(first_parent_score)},
            {"_id": 2, "score": pytest.approx(0.5)},
            {"_id": 3, "score": pytest.approx(0.9850712418556213)},
        ],
        msg=description,
    )
    for mode, first_parent_score, description in (
        (
            "avg",
            0.9984709024429321,
            "$vectorSearch should average a parent's matching child scores for scoreMode avg",
        ),
        (
            "max",
            1.0,
            "$vectorSearch should take the maximum of a parent's matching child "
            "scores for scoreMode max",
        ),
    )
]

# Property [nestedOptions Default Max]: omitting nestedOptions, or providing an
# empty nestedOptions document, yields the same parent score as scoreMode "max".
VECTORSEARCH_NESTED_OPTIONS_DEFAULT_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"default_max_{variant}",
        collection_fixture="nested_vector_search_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_nested_index",
                    "path": "reviews.embedding",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 20,
                    "limit": 5,
                    # Per-case extra keys go last, matching where nestedOptions sits in the spec.
                    **extra_spec,
                }
            },
            {"$sort": {"_id": 1}},
            {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}},
        ],
        expected=[
            {"_id": 1, "score": pytest.approx(1.0)},
            {"_id": 2, "score": pytest.approx(0.5)},
            {"_id": 3, "score": pytest.approx(0.9850712418556213)},
        ],
        msg=description,
    )
    for variant, extra_spec, description in (
        (
            "omitted",
            {},
            "$vectorSearch should default to scoreMode max when nestedOptions is omitted",
        ),
        (
            "empty_document",
            {"nestedOptions": {}},
            "$vectorSearch should default to scoreMode max for an empty nestedOptions document",
        ),
        (
            "score_mode_null",
            {"nestedOptions": {"scoreMode": None}},
            "$vectorSearch should treat a null scoreMode as absent and default to scoreMode max",
        ),
    )
]

# Property [nestedOptions Non-Object Rejection]: a nestedOptions value that is
# not a document is rejected as not a document, with no type coercion.
VECTORSEARCH_NESTED_OPTIONS_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"nested_options_type_{type_name}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "nestedOptions": non_document,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {type_name} nestedOptions value as not a document",
    )
    for type_name, non_document in {
        "string": "x",
        "int32": 1,
        "int64": Int64(1),
        "double": 1.5,
        "decimal128": DECIMAL128_ONE_AND_HALF,
        "bool": True,
        "array": [],
        "objectid": ObjectId("5a9427648b0beebeb69537a5"),
        "datetime": datetime(2020, 1, 1, tzinfo=timezone.utc),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [nestedOptions Null As Omitted]: nestedOptions null is treated as
# field-absent and the query succeeds on a flat index as if nestedOptions were
# omitted, in contrast to an empty nestedOptions document, which is rejected on a
# flat index for lacking a nested root.
VECTORSEARCH_NESTED_OPTIONS_NULL_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "nested_options_null_omitted_flat_index",
        raw_res=True,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "nestedOptions": None,
                }
            },
        ],
        expected={
            # Five results, containing _id 1 through 5.
            "cursor.firstBatch": [Len(5), *(Contains("_id", doc_id) for doc_id in range(1, 6))]
        },
        msg="$vectorSearch should treat nestedOptions null as omitted "
        "and succeed on a flat index",
    ),
]

# Property [nestedOptions Requires Nested Root]: nestedOptions on a flat
# (non-nestedRoot) index is rejected because the index has no nested root for a
# nested-array score mode to apply to.
VECTORSEARCH_NESTED_OPTIONS_FLAT_INDEX_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "nested_options_on_flat_index",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "nestedOptions": {"scoreMode": "avg"},
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg="$vectorSearch should reject nestedOptions on a flat index lacking a nested root",
    ),
]

# Property [nestedOptions Invalid scoreMode]: a scoreMode outside the accepted
# set is rejected, and the accepted values are matched case-sensitively.
+VECTORSEARCH_NESTED_OPTIONS_SCORE_MODE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"nested_options_score_mode_{tid}", + collection_fixture="nested_vector_search_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "nestedOptions": {"scoreMode": score_mode}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject the {tid} scoreMode as unsupported", + ) + for tid, score_mode in [ + ("unrecognized", "median"), + ("case_variant", "AVG"), + ] +] + +# Property [nestedOptions scoreMode Type Strictness]: a non-string scoreMode value +# of any BSON type is rejected as not a string with no coercion. +VECTORSEARCH_NESTED_OPTIONS_SCORE_MODE_TYPE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"nested_options_score_mode_type_{tid}", + collection_fixture="nested_vector_search_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "nestedOptions": {"scoreMode": val}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} scoreMode value as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("array", ["max"]), + ("object", {"a": 1}), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +# Property [nestedOptions Unknown Sub-Field Rejection]: an unrecognized sub-field +# of nestedOptions is rejected, unlike unknown top-level spec fields which are +# 
silently ignored. +VECTORSEARCH_NESTED_OPTIONS_UNKNOWN_SUBFIELD_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "nested_options_unknown_subfield", + collection_fixture="nested_vector_search_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_nested_index", + "path": "reviews.embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "nestedOptions": {"bogus": 1}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject an unrecognized nestedOptions sub-field", + ), +] + +VECTORSEARCH_NESTED_ALL_TESTS = ( + VECTORSEARCH_PARENT_FILTER_NESTED_TESTS + + VECTORSEARCH_NESTED_OPTIONS_SCORE_MODE_TESTS + + VECTORSEARCH_NESTED_OPTIONS_DEFAULT_TESTS + + VECTORSEARCH_NESTED_OPTIONS_TYPE_ERROR_TESTS + + VECTORSEARCH_NESTED_OPTIONS_NULL_TESTS + + VECTORSEARCH_NESTED_OPTIONS_FLAT_INDEX_ERROR_TESTS + + VECTORSEARCH_NESTED_OPTIONS_SCORE_MODE_ERROR_TESTS + + VECTORSEARCH_NESTED_OPTIONS_SCORE_MODE_TYPE_ERROR_TESTS + + VECTORSEARCH_NESTED_OPTIONS_UNKNOWN_SUBFIELD_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_NESTED_ALL_TESTS)) +def test_vectorSearch_nested(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: nestedRoot scoping and nestedOptions.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_num_candidates.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_num_candidates.py new file mode 100644 index 000000000..6fb8572d0 --- /dev/null +++ 
b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_num_candidates.py @@ -0,0 +1,325 @@ +"""Tests for the $vectorSearch stage: numCandidates acceptance and errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Decimal128, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Eq, + PerDoc, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT32_MAX, + INT32_OVERFLOW, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [numCandidates Numeric Type and Range Acceptance]: numCandidates +# accepts an int32, Int64, or whole-number double, and both range bounds are +# accepted. 
VECTORSEARCH_NUM_CANDIDATES_RANGE_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"range_{case_id}",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": candidates,
                    "limit": top_k,
                }
            },
        ],
        # Every case expects _id 1..limit, one per returned document, in that order.
        expected=PerDoc(*({"_id": Eq(doc_id)} for doc_id in range(1, top_k + 1))),
        msg=description,
    )
    for case_id, candidates, top_k, description in (
        ("int32", 5, 3, "$vectorSearch should accept a int32 numCandidates within range"),
        ("int64", Int64(5), 3, "$vectorSearch should accept a int64 numCandidates within range"),
        (
            "whole_double",
            5.0,
            3,
            "$vectorSearch should accept a whole_double numCandidates within range",
        ),
        (
            "lower_boundary_one",
            1,
            1,
            "$vectorSearch should accept the lower numCandidates bound of 1",
        ),
        (
            "upper_boundary_ten_thousand",
            10_000,
            5,
            "$vectorSearch should accept the upper numCandidates bound of 10000",
        ),
    )
]

# Property [numCandidates Non-Integer Double]: a double numCandidates whose
# value is not a whole number, including NaN, is rejected as not an integer.
VECTORSEARCH_NUM_CANDIDATES_DOUBLE_NON_INTEGER_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"num_candidates_double_non_integer_{case_id}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": bad_double,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {case_id} double numCandidates as not an integer",
    )
    for case_id, bad_double in {"fractional": 10.5, "nan": FLOAT_NAN}.items()
]

# Property [numCandidates Type Strictness]: a numCandidates of any non-integer
# BSON type other than double is rejected as not an integer with no coercion.
VECTORSEARCH_NUM_CANDIDATES_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"num_candidates_type_{type_name}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": non_integer,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {type_name} numCandidates as not an integer",
    )
    for type_name, non_integer in {
        "decimal128_whole": Decimal128("3"),
        "bool": True,
        "string": "5",
        "array": [5],
        "object": {"a": 5},
        "objectid": ObjectId("5a9427648b0beebeb69537a5"),
        "datetime": datetime(2020, 1, 1, tzinfo=timezone.utc),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [numCandidates Literal Not Expression]: a numCandidates given as an
# expression object is rejected at the BSON-type layer as not an integer, with no
# expression evaluation.
VECTORSEARCH_NUM_CANDIDATES_LITERAL_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "num_candidates_literal_object",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    # An expression object in place of the literal integer.
                    "numCandidates": {"$literal": 10},
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg="$vectorSearch should reject an expression-object numCandidates "
        "without evaluating it",
    ),
]

# Property [numCandidates Bounds]: an integer-valued numCandidates outside the
# accepted range is rejected as out of bounds, with integer-valued doubles routed
# to the bounds check rather than the type check.
VECTORSEARCH_NUM_CANDIDATES_BOUNDS_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"num_candidates_bounds_{case_id}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": out_of_range,
                    "limit": 1,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {case_id} numCandidates as out of bounds",
    )
    for case_id, out_of_range in {
        "lower_zero": 0,
        "negative_whole_double": -5.0,
        "upper_10001": 10_001,
        "int32_max": INT32_MAX,
    }.items()
]

# Property [numCandidates Int32 Range]: a numCandidates that does not fit in a
# signed 32-bit integer is rejected as overflowing above the range or
# underflowing below it.
VECTORSEARCH_NUM_CANDIDATES_INT32_RANGE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"num_candidates_int32_range_{case_id}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": unrepresentable,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {case_id} numCandidates as outside int32 range",
    )
    for case_id, unrepresentable in {
        "int64_just_over": Int64(INT32_OVERFLOW),
        "double_just_over": float(INT32_OVERFLOW),
        "positive_infinity": FLOAT_INFINITY,
        "negative_infinity": FLOAT_NEGATIVE_INFINITY,
    }.items()
]

# Property [numCandidates Less Than Limit]: under ANN a numCandidates smaller
# than limit is rejected because numCandidates must be greater than or equal to
# limit.
VECTORSEARCH_NUM_CANDIDATES_LESS_THAN_LIMIT_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        "num_candidates_less_than_limit",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    # One below the limit that follows.
                    "numCandidates": 4,
                    "limit": 5,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg="$vectorSearch should reject a numCandidates smaller than limit under ANN",
    ),
]

# Property [numCandidates Forbidden With Exact]: a non-null numCandidates is
# rejected when exact is true, because ENN forbids numCandidates rather than
# merely ignoring it.
+VECTORSEARCH_NUM_CANDIDATES_EXACT_FORBIDDEN_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "num_candidates_forbidden_with_exact", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a non-null numCandidates when exact is true", + ), +] + +VECTORSEARCH_NUM_CANDIDATES_ALL_TESTS = ( + VECTORSEARCH_NUM_CANDIDATES_RANGE_TESTS + + VECTORSEARCH_NUM_CANDIDATES_DOUBLE_NON_INTEGER_ERROR_TESTS + + VECTORSEARCH_NUM_CANDIDATES_TYPE_ERROR_TESTS + + VECTORSEARCH_NUM_CANDIDATES_LITERAL_ERROR_TESTS + + VECTORSEARCH_NUM_CANDIDATES_BOUNDS_ERROR_TESTS + + VECTORSEARCH_NUM_CANDIDATES_INT32_RANGE_ERROR_TESTS + + VECTORSEARCH_NUM_CANDIDATES_LESS_THAN_LIMIT_ERROR_TESTS + + VECTORSEARCH_NUM_CANDIDATES_EXACT_FORBIDDEN_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_NUM_CANDIDATES_ALL_TESTS)) +def test_vectorSearch_num_candidates(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: numCandidates acceptance and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_parent_filter.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_parent_filter.py new file mode 100644 index 000000000..6b4502faf --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_parent_filter.py @@ -0,0 +1,369 @@ +"""Tests 
for the $vectorSearch stage: parentFilter pre-filtering and errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + _FILTER_OID_A, + _FILTER_UUID_A, + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [parentFilter Null As Omitted]: parentFilter null is treated as +# field-absent and the query succeeds as if parentFilter were not specified, +# diverging from filter where null errors. +VECTORSEARCH_PARENT_FILTER_NULL_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "parent_filter_null_omitted", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "parentFilter": None, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg="$vectorSearch should treat parentFilter null as omitted and return all documents", + ), +] + +# Property [parentFilter Flat Index Acceptance]: parentFilter is accepted on a +# flat index as a second pre-filter on filter-type fields, where an empty +# document matches all and a real predicate AND-combines with filter. 
VECTORSEARCH_PARENT_FILTER_FLAT_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"parent_filter_{case_id}",
        raw_res=True,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    # Per-case filter keys go last, in the order given by the case.
                    **filter_spec,
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(len(matching_ids)),
                *(Contains("_id", doc_id) for doc_id in matching_ids),
            ]
        },
        msg=description,
    )
    for case_id, filter_spec, matching_ids, description in (
        (
            "empty_matches_all",
            {"parentFilter": {}},
            [1, 2, 3, 4, 5],
            "$vectorSearch should retain every document for an empty parentFilter "
            "on a flat index",
        ),
        (
            "second_prefilter",
            {"parentFilter": {"cat": "x"}},
            [1, 2],
            "$vectorSearch should apply parentFilter as a pre-filter on a flat index",
        ),
        (
            "and_combines_with_filter",
            {"filter": {"cat": "x"}, "parentFilter": {"year": {"$gte": 2000}}},
            [2],
            "$vectorSearch should AND-combine parentFilter with filter on a flat index",
        ),
    )
]

# Property [parentFilter Operator Parity]: parentFilter supports the same
# per-field operators and $eq shorthand as filter.
VECTORSEARCH_PARENT_FILTER_OPERATOR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"parent_operator_{operator_name}",
        raw_res=True,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "parentFilter": predicate,
                }
            },
        ],
        # Expect exactly as many results as listed ids, and each listed id present.
        expected={
            "cursor.firstBatch": [
                Len(len(matching_ids)),
                *[Contains("_id", doc_id) for doc_id in matching_ids],
            ]
        },
        msg=f"$vectorSearch should pre-filter parentFilter with the {operator_name} operator",
    )
    for operator_name, predicate, matching_ids in (
        ("shorthand_eq", {"year": 2001}, [3]),
        ("eq", {"year": {"$eq": 2010}}, [4, 5]),
        ("ne", {"year": {"$ne": 2010}}, [1, 2, 3]),
        ("gt", {"year": {"$gt": 2000}}, [3, 4, 5]),
        ("gte", {"year": {"$gte": 2001}}, [3, 4, 5]),
        ("lt", {"year": {"$lt": 2001}}, [1, 2]),
        ("lte", {"year": {"$lte": 2000}}, [1, 2]),
        ("in", {"year": {"$in": [1999, 2001]}}, [1, 3]),
        ("nin", {"year": {"$nin": [1999, 2001]}}, [2, 4, 5]),
        ("exists", {"cat": {"$exists": True}}, [1, 2, 3, 4, 5]),
        ("not", {"year": {"$not": {"$gt": 2000}}}, [1, 2]),
    )
]

# Property [parentFilter Combinator Parity]: parentFilter composes the same
# top-level combinators and implicit multi-field AND as filter.
VECTORSEARCH_PARENT_FILTER_COMBINATOR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"parent_combinator_{combinator}",
        raw_res=True,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "parentFilter": predicate,
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(len(matching_ids)),
                *[Contains("_id", doc_id) for doc_id in matching_ids],
            ]
        },
        msg=f"$vectorSearch should compose the {combinator} combinator in parentFilter",
    )
    for combinator, predicate, matching_ids in (
        ("and", {"$and": [{"cat": "y"}, {"year": {"$gte": 2010}}]}, [4, 5]),
        ("or", {"$or": [{"cat": "x"}, {"year": 2001}]}, [1, 2, 3]),
        ("nor", {"$nor": [{"cat": "x"}]}, [3, 4, 5]),
        ("implicit_and", {"cat": "y", "active": True}, [3, 5]),
    )
]

# Property [parentFilter Value Type Parity]: parentFilter pre-filters with the
# same predicate value types as filter, including null as a direct value and
# element membership on an array-valued field.
VECTORSEARCH_PARENT_FILTER_VALUE_TYPE_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"parent_value_type_{type_name}",
        raw_res=True,
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "parentFilter": predicate,
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(len(matching_ids)),
                *[Contains("_id", doc_id) for doc_id in matching_ids],
            ]
        },
        msg=f"$vectorSearch should pre-filter parentFilter with a {type_name} predicate value",
    )
    for type_name, predicate, matching_ids in (
        ("string", {"cat": "x"}, [1, 2]),
        ("int32", {"year": 2000}, [2]),
        ("int64", {"count": Int64(20)}, [2]),
        ("double", {"rating": 4.5}, [1]),
        ("boolean", {"active": True}, [1, 3, 5]),
        ("object_id", {"oid": _FILTER_OID_A}, [1, 3, 5]),
        ("date", {"created": {"$gt": datetime(2023, 1, 1, tzinfo=timezone.utc)}}, [4, 5]),
        ("uuid", {"uid": _FILTER_UUID_A}, [1, 3, 5]),
        ("null_direct", {"opt": None}, [3]),
        ("array_element_membership", {"tags": "x"}, [1, 5]),
    )
]

# Property [parentFilter Type Rejection]: a non-object parentFilter value of any
# BSON type, including an array, is uniformly rejected on the mongot executor as
# not a document, with no parse-time object check (diverging from filter null).
VECTORSEARCH_PARENT_FILTER_TYPE_ERROR_TESTS: list[VectorSearchTest] = [
    VectorSearchTest(
        f"parent_filter_type_{type_name}",
        collection_fixture="vector_search_no_index_collection",
        pipeline=[
            {
                "$vectorSearch": {
                    "index": "vs_core_index",
                    "path": "vc",
                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
                    "numCandidates": 10,
                    "limit": 5,
                    "parentFilter": non_document,
                }
            }
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$vectorSearch should reject a {type_name} parentFilter value as not a document",
    )
    for type_name, non_document in {
        "string": "x",
        "int32": 1,
        "int64": Int64(1),
        "double": 1.5,
        "decimal128": DECIMAL128_ONE_AND_HALF,
        "bool": True,
        "array": [],
        "objectid": ObjectId("5a9427648b0beebeb69537a5"),
        "datetime": datetime(2020, 1, 1, tzinfo=timezone.utc),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [parentFilter MQL Uniformity]: every MQL violation in parentFilter,
# including constructs that produce distinct error codes under filter, surfaces
# uniformly as the executor validation error with no mongod-side parse layer.
+VECTORSEARCH_PARENT_FILTER_MQL_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"parent_mql_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "parentFilter": flt, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} parentFilter as an executor error", + ) + for tid, flt in [ + ("geo_near", {"year": {"$near": [0, 0]}}), + ("text", {"$text": {"$search": "x"}}), + ("comment", {"year": {"$comment": "x"}}), + ("regex_op", {"year": {"$regex": "x"}}), + ("unknown_top_level", {"$bad": 1}), + ("dollar_prefixed_key", {"$year": 1}), + ("not_top_level", {"$not": {"year": 1}}), + ("empty_and", {"$and": []}), + ] +] + +# Property [parentFilter Field Not Indexed]: a parentFilter referencing a field +# not indexed as the filter type is a hard error rather than a silent miss. +VECTORSEARCH_PARENT_FILTER_FIELD_NOT_INDEXED_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "parent_filter_field_not_indexed", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "parentFilter": {"_id": 1}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a parentFilter on a field not indexed as the filter type", + ), +] + +VECTORSEARCH_PARENT_FILTER_ALL_TESTS = ( + VECTORSEARCH_PARENT_FILTER_NULL_TESTS + + VECTORSEARCH_PARENT_FILTER_FLAT_TESTS + + VECTORSEARCH_PARENT_FILTER_OPERATOR_TESTS + + VECTORSEARCH_PARENT_FILTER_COMBINATOR_TESTS + + VECTORSEARCH_PARENT_FILTER_VALUE_TYPE_TESTS + + VECTORSEARCH_PARENT_FILTER_TYPE_ERROR_TESTS + + VECTORSEARCH_PARENT_FILTER_MQL_ERROR_TESTS + + VECTORSEARCH_PARENT_FILTER_FIELD_NOT_INDEXED_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_PARENT_FILTER_ALL_TESTS)) +def 
test_vectorSearch_parent_filter(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: parentFilter pre-filtering and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector.py new file mode 100644 index 000000000..832481d52 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector.py @@ -0,0 +1,287 @@ +"""Tests for the $vectorSearch stage: queryVector accepted forms.""" + +from __future__ import annotations + +import pytest +from bson import ( + Int64, +) +from bson.binary import Binary, BinaryVectorDtype + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, + INT64_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Baseline cosine scores for the shared query vector, used to assert that every +# accepted numeric and float32-BinData query vector form scores identically. 
+_COSINE_QUERY_SCORES = [ + {"_id": 1, "score": pytest.approx(1.0)}, + {"_id": 2, "score": pytest.approx(0.9850712418556213)}, + {"_id": 3, "score": pytest.approx(0.9160251617431641)}, + {"_id": 4, "score": pytest.approx(0.6212677955627441)}, + {"_id": 5, "score": pytest.approx(0.5)}, +] + +# Property [queryVector Numeric And Float32 Equivalence]: every accepted numeric +# array form and the equivalent float32 BinData query vector yield scores +# identical to the double-array baseline. +VECTORSEARCH_QUERY_VECTOR_EQUIVALENCE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"qv_equivalence_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": qv, + "numCandidates": 10, + "limit": 5, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected=_COSINE_QUERY_SCORES, + msg=f"$vectorSearch should accept a {tid} queryVector and score it " + "identically to the double array", + ) + for tid, qv in [ + ("int32_array", [1, 0, 0]), + ("int64_array", [Int64(1), INT64_ZERO, INT64_ZERO]), + ("mixed_array", [1, DOUBLE_ZERO, 0]), + ( + "float32_bindata", + Binary.from_vector([1.0, DOUBLE_ZERO, DOUBLE_ZERO], BinaryVectorDtype.FLOAT32), + ), + ] +] + +# Property [queryVector Signed Components]: a query vector containing negative +# components is accepted by cosine, euclidean, and dotProduct indexes. 
+VECTORSEARCH_QUERY_VECTOR_SIGN_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"qv_mixed_sign_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": path, + "queryVector": [0.5, -0.5, 0.5], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg=f"$vectorSearch should accept a mixed-sign queryVector on a {tid} index", + ) + for tid, path in [ + ("cosine", "vc"), + ("euclidean", "ve"), + ("dot_product", "vd"), + ] +] + +# Property [queryVector Float32 Narrowing]: a query vector whose elements narrow +# to a finite, nonzero float32 value is accepted and returns results. +VECTORSEARCH_QUERY_VECTOR_FLOAT32_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "qv_float32_max_boundary", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [3.4028235e38, DOUBLE_ZERO, 1.0], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg="$vectorSearch should accept a queryVector component at the FLT_MAX boundary", + ), + VectorSearchTest( + "qv_float32_underflow_nonzero", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1e-50, DOUBLE_ZERO, 1.0], + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg="$vectorSearch should accept a queryVector component that underflows to " + "zero while the vector stays nonzero", + ), +] + +# Property [queryVector Zero Vector Acceptance]: a zero query vector ([0,0,0]) 
is +# accepted by euclidean and dotProduct indexes and returns results. +VECTORSEARCH_QUERY_VECTOR_ZERO_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"qv_zero_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": path, + "queryVector": qv, + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg=f"$vectorSearch should accept a {tid} queryVector and return results", + ) + for tid, path, qv in [ + ("all_euclidean", "ve", [DOUBLE_ZERO, DOUBLE_ZERO, DOUBLE_ZERO]), + ("all_dot_product", "vd", [DOUBLE_ZERO, DOUBLE_ZERO, DOUBLE_ZERO]), + ] +] + +# Property [queryVector Subtype Mismatch Silent Miss]: a BinData query vector +# whose element subtype differs from the indexed float32 returns zero results +# with no error. +VECTORSEARCH_QUERY_VECTOR_SUBTYPE_MISS_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "qv_int8_subtype_cosine", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": Binary.from_vector([1, 0, 0], BinaryVectorDtype.INT8), + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should silently return no results for an int8 BinData " + "queryVector against a float32 cosine index", + ), + VectorSearchTest( + "qv_int8_subtype_euclidean", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "ve", + "queryVector": Binary.from_vector([1, 0, 0], BinaryVectorDtype.INT8), + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should silently return no results for an int8 BinData " + "queryVector against a float32 euclidean index", + ), + VectorSearchTest( + "qv_packed_bit_subtype_euclidean", + raw_res=True, + pipeline=[ + { + 
"$vectorSearch": { + "index": "vs_core_index", + "path": "v8", + "queryVector": Binary.from_vector( + [0b10110010], BinaryVectorDtype.PACKED_BIT, padding=0 + ), + "numCandidates": 10, + "limit": 5, + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$vectorSearch should silently return no results for a packed_bit " + "BinData queryVector against a float32 euclidean index", + ), +] + +VECTORSEARCH_QUERY_VECTOR_ALL_TESTS = ( + VECTORSEARCH_QUERY_VECTOR_EQUIVALENCE_TESTS + + VECTORSEARCH_QUERY_VECTOR_SIGN_TESTS + + VECTORSEARCH_QUERY_VECTOR_FLOAT32_TESTS + + VECTORSEARCH_QUERY_VECTOR_ZERO_TESTS + + VECTORSEARCH_QUERY_VECTOR_SUBTYPE_MISS_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_QUERY_VECTOR_ALL_TESTS)) +def test_vectorSearch_query_vector(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: queryVector accepted forms.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector_errors.py new file mode 100644 index 000000000..605e1a866 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_query_vector_errors.py @@ -0,0 +1,327 @@ +"""Tests for the $vectorSearch stage: queryVector rejections.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary, BinaryVectorDtype + +from 
documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_MIN_SUBNORMAL, + DOUBLE_NEGATIVE_ZERO, + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [queryVector Scalar Type Rejection]: a non-array, non-BinData scalar +# queryVector is rejected as an unexpected vector type with no coercion. +VECTORSEARCH_QUERY_VECTOR_SCALAR_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"query_vector_scalar_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": val, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} scalar queryVector as an unexpected type", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("string", "vec"), + ("object", {"a": 1}), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +# Property [queryVector Element Type Rejection]: an array queryVector containing a +# non-numeric or Decimal128 element is rejected as an unsupported BSON value. 
+VECTORSEARCH_QUERY_VECTOR_ELEMENT_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"query_vector_element_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [val, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a queryVector with a {tid} element as an " + "unsupported BSON value", + ) + for tid, val in [ + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("string", "x"), + ("null", None), + ("object", {"a": 1}), + ("nested_array", [1.0]), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +# Property [queryVector Literal Not Expression]: a queryVector given as a field +# reference, expression object, or variable is treated as a literal value and +# rejected without evaluation or array unwrapping. 
+VECTORSEARCH_QUERY_VECTOR_LITERAL_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"query_vector_literal_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": qv, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should treat a {tid} queryVector as a literal, not " + "evaluate it as an expression", + ) + for tid, qv in [ + ("field_ref", "$embedding"), + ("now_variable", "$$NOW"), + ("literal_expr", {"$literal": [1.0, DOUBLE_ZERO, DOUBLE_ZERO]}), + ("array_element_ref", ["$n", DOUBLE_ZERO, DOUBLE_ZERO]), + ] +] + +# Property [queryVector Binary Subtype Rejection]: a plain Binary queryVector +# whose subtype is not the vector subtype is rejected. +VECTORSEARCH_QUERY_VECTOR_BINARY_SUBTYPE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "query_vector_binary_subtype_zero", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": Binary(b"\x00\x01\x02", 0), + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a subtype-0 Binary queryVector", + ), +] + +# Property [queryVector Dimension Mismatch]: a queryVector whose element count +# differs from the index numDimensions is rejected, including an empty array. 
+VECTORSEARCH_QUERY_VECTOR_DIMENSION_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"query_vector_dimension_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": qv, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a queryVector of length {n} against a " + f"{3}-dimension index", + ) + for tid, qv, n in [ + ("empty", [], 0), + ("too_few_one", [1.0], 1), + ("too_few_two", [1.0, DOUBLE_ZERO], 2), + ("too_many_four", [1.0, DOUBLE_ZERO, DOUBLE_ZERO, DOUBLE_ZERO], 4), + ] +] + +# Property [queryVector Non-Finite Rejection]: a non-finite queryVector element +# is rejected, including a value that overflows float32 to Infinity. +VECTORSEARCH_QUERY_VECTOR_NON_FINITE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"query_vector_non_finite_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [val, DOUBLE_ZERO, 1.0], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} queryVector element as non-finite", + ) + for tid, val in [ + ("nan", FLOAT_NAN), + ("positive_infinity", FLOAT_INFINITY), + ("negative_infinity", FLOAT_NEGATIVE_INFINITY), + ("float32_overflow", 3.5e38), + ] +] + +# Property [queryVector Cosine Zero Vector Rejection]: a zero queryVector, +# including values that float32-narrow to all-zero, is rejected against a cosine +# index. 
+VECTORSEARCH_QUERY_VECTOR_ZERO_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"query_vector_zero_{tid}", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": qv, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} queryVector against a cosine index", + ) + for tid, qv in [ + ("all_zero", [DOUBLE_ZERO, DOUBLE_ZERO, DOUBLE_ZERO]), + ("negative_zero", [DOUBLE_NEGATIVE_ZERO, DOUBLE_ZERO, DOUBLE_ZERO]), + ("float32_underflow", [DOUBLE_MIN_SUBNORMAL, DOUBLE_ZERO, DOUBLE_ZERO]), + ] +] + +# Property [queryVector Packed-Bit Cosine Rejection]: a packed_bit BinData +# queryVector is rejected against a cosine-indexed field because binary vectors +# require euclidean similarity. +VECTORSEARCH_QUERY_VECTOR_PACKED_BIT_COSINE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "query_vector_packed_bit_cosine", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "v8c", + "queryVector": Binary.from_vector( + [0b10110010], BinaryVectorDtype.PACKED_BIT, padding=0 + ), + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a packed_bit queryVector against a cosine index", + ), +] + +# Property [queryVector Packed-Bit Structural Rejection]: a packed_bit BinData +# queryVector with nonzero padding is rejected before the dimension-count and +# similarity checks run. 
+VECTORSEARCH_QUERY_VECTOR_PACKED_BIT_STRUCTURE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "query_vector_packed_bit_nonzero_padding", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "v8", + "queryVector": Binary.from_vector( + [0b10110000], BinaryVectorDtype.PACKED_BIT, padding=3 + ), + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject a packed_bit queryVector with nonzero padding", + ), +] + +VECTORSEARCH_QUERY_VECTOR_ERRORS_ALL_TESTS = ( + VECTORSEARCH_QUERY_VECTOR_SCALAR_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_ELEMENT_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_LITERAL_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_BINARY_SUBTYPE_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_DIMENSION_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_NON_FINITE_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_ZERO_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_PACKED_BIT_COSINE_ERROR_TESTS + + VECTORSEARCH_QUERY_VECTOR_PACKED_BIT_STRUCTURE_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_QUERY_VECTOR_ERRORS_ALL_TESTS)) +def test_vectorSearch_query_vector_errors(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: queryVector rejections.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_required_field_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_required_field_errors.py new file mode 100644 index 000000000..d5dec3aaf --- /dev/null +++ 
b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_required_field_errors.py @@ -0,0 +1,253 @@ +"""Tests for the $vectorSearch stage: required-field and null-as-absent errors.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + EXPRESSION_NOT_OBJECT_ERROR, + UNKNOWN_ERROR, + VECTOR_SEARCH_LIMIT_NOT_NUMBER_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [Required Fields and Null-as-Absent]: each required field, when +# omitted or set to null, surfaces that field's required-field error with null +# treated as field-absent, except limit null (a type error) and filter null (a +# parse error), which are not treated as absent. 
+VECTORSEARCH_REQUIRED_FIELD_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "index_omitted", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require the index field when it is omitted", + ), + VectorSearchTest( + "index_null", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": None, + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should treat index null as field-absent and require the index field", + ), + VectorSearchTest( + "path_omitted", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require the path field when it is omitted", + ), + VectorSearchTest( + "path_null", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": None, + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should treat path null as field-absent and require the path field", + ), + VectorSearchTest( + "query_vector_omitted", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require queryVector when it is omitted", + ), + VectorSearchTest( + "query_vector_null", + 
collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": None, + "numCandidates": 10, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should treat queryVector null as field-absent and require queryVector", + ), + VectorSearchTest( + "limit_omitted", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require the limit field when it is omitted", + ), + VectorSearchTest( + "limit_omitted_enn", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "exact": True, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require the limit field when it is omitted under ENN", + ), + VectorSearchTest( + "limit_null", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": None, + } + } + ], + error_code=VECTOR_SEARCH_LIMIT_NOT_NUMBER_ERROR, + msg="$vectorSearch should treat limit null as a wrong type rather than field-absent", + ), + VectorSearchTest( + "num_candidates_omitted", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require numCandidates for ANN when it is omitted", + ), + VectorSearchTest( + "num_candidates_null", + collection_fixture="vector_search_no_index_collection", + 
pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": None, + "limit": 5, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should treat numCandidates null as field-absent and require it for ANN", + ), + VectorSearchTest( + "filter_null", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": None, + } + } + ], + error_code=EXPRESSION_NOT_OBJECT_ERROR, + msg="$vectorSearch should reject filter null as a non-object, not treat it as omitted", + ), + VectorSearchTest( + "empty_spec", + collection_fixture="vector_search_no_index_collection", + pipeline=[{"$vectorSearch": {}}], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should require the index field for an empty spec", + ), +] + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_REQUIRED_FIELD_ERROR_TESTS)) +def test_vectorSearch_required_field_errors(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: required-field and null-as-absent errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_scoring.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_scoring.py new file mode 100644 index 000000000..fee84bfd3 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_scoring.py @@ -0,0 +1,202 @@ +"""Tests for the $vectorSearch stage: 
score similarity and range.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Gte, + Len, + Lte, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [Score Similarity Function]: the score is computed per the index +# similarity function, so cosine, euclidean, and dotProduct indexes each produce +# their own scores for the same data and query. +VECTORSEARCH_SCORE_SIMILARITY_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "similarity_cosine_scores", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected=[ + {"_id": 1, "score": pytest.approx(1.0)}, + {"_id": 2, "score": pytest.approx(0.9850712418556213)}, + {"_id": 3, "score": pytest.approx(0.9160251617431641)}, + {"_id": 4, "score": pytest.approx(0.6212677955627441)}, + {"_id": 5, "score": pytest.approx(0.5)}, + ], + msg="$vectorSearch should produce cosine similarity scores for a cosine index", + ), + VectorSearchTest( + "similarity_euclidean_scores", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "ve", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected=[ + {"_id": 1, "score": pytest.approx(1.0)}, + {"_id": 2, "score": pytest.approx(0.9259259104728699)}, + {"_id": 3, "score": pytest.approx(0.7575758099555969)}, + {"_id": 4, "score": 
pytest.approx(0.4385964572429657)}, + {"_id": 5, "score": pytest.approx(0.3333333432674408)}, + ], + msg="$vectorSearch should produce euclidean similarity scores for a euclidean index", + ), + VectorSearchTest( + "similarity_dot_product_scores", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vd", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected=[ + {"_id": 1, "score": pytest.approx(1.0)}, + {"_id": 2, "score": pytest.approx(0.8999999761581421)}, + {"_id": 3, "score": pytest.approx(0.800000011920929)}, + {"_id": 4, "score": pytest.approx(0.6000000238418579)}, + {"_id": 5, "score": pytest.approx(0.5)}, + ], + msg="$vectorSearch should produce dotProduct similarity scores for a dotProduct index", + ), +] + +# Property [Score Filter Invariance]: a pre-filter narrows the candidate set but +# leaves a surviving document's vectorSearchScore identical to the unfiltered score. +VECTORSEARCH_SCORE_FILTER_INVARIANCE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "filter_preserves_surviving_scores", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "filter": {"cat": "x"}, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected=[ + {"_id": 1, "score": pytest.approx(1.0)}, + {"_id": 2, "score": pytest.approx(0.9850712418556213)}, + ], + msg="$vectorSearch should keep a surviving document's score identical under a filter", + ), +] + +# Property [searchScore Omitted]: requesting the unpopulated metadata name +# searchScore omits the projected field rather than erroring or populating it, +# because $vectorSearch populates vectorSearchScore, not searchScore. 
+VECTORSEARCH_SEARCHSCORE_OMITTED_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "search_score_metadata_omitted", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 2, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}}, + ], + expected=[{"_id": 1}, {"_id": 2}], + msg="$vectorSearch should omit the searchScore field rather than error or populate it", + ), +] + +# Property [Score Range]: every returned document is assigned a vectorSearchScore +# that falls within the closed interval [0, 1]. +VECTORSEARCH_SCORE_RANGE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "score_range_within_unit_interval", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + {"$project": {"_id": 0, "score": {"$meta": "vectorSearchScore"}}}, + ], + expected={ + "cursor.firstBatch": Len(5), + "cursor.firstBatch.0.score": [Gte(0), Lte(1)], + "cursor.firstBatch.1.score": [Gte(0), Lte(1)], + "cursor.firstBatch.2.score": [Gte(0), Lte(1)], + "cursor.firstBatch.3.score": [Gte(0), Lte(1)], + "cursor.firstBatch.4.score": [Gte(0), Lte(1)], + }, + msg="$vectorSearch should assign every result a score within [0, 1]", + ), +] + +VECTORSEARCH_SCORING_ALL_TESTS = ( + VECTORSEARCH_SCORE_SIMILARITY_TESTS + + VECTORSEARCH_SCORE_FILTER_INVARIANCE_TESTS + + VECTORSEARCH_SEARCHSCORE_OMITTED_TESTS + + VECTORSEARCH_SCORE_RANGE_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_SCORING_ALL_TESTS)) +def test_vectorSearch_scoring(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: score similarity and range.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": 
test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_search_node_preference.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_search_node_preference.py new file mode 100644 index 000000000..9cbd96a68 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_search_node_preference.py @@ -0,0 +1,271 @@ +"""Tests for the $vectorSearch stage: searchNodePreference behavior and errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [searchNodePreference Accepted No Effect]: a searchNodePreference +# specification is accepted and recognized without changing the result set, +# whether it carries extra keys alongside a valid key or is null (treated as +# omitted). 
+# Accepted searchNodePreference forms, one row per case: (id suffix, value, message).
+VECTORSEARCH_SEARCH_NODE_PREFERENCE_TESTS: list[VectorSearchTest] = [
+    VectorSearchTest(
+        f"search_node_preference_{tid}",
+        raw_res=True,
+        pipeline=[
+            {
+                "$vectorSearch": {
+                    "index": "vs_core_index",
+                    "path": "vc",
+                    "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO],
+                    "numCandidates": 10,
+                    "limit": 5,
+                    "searchNodePreference": snp,
+                }
+            },
+        ],
+        # Every accepted form must return the same five documents (_id 1 through 5).
+        expected={
+            "cursor.firstBatch": [Len(5), *(Contains("_id", doc_id) for doc_id in range(1, 6))]
+        },
+        msg=msg,
+    )
+    for tid, snp, msg in [
+        (
+            "key",
+            {"key": "n1"},
+            "$vectorSearch should accept a searchNodePreference key with no effect "
+            "on the result set",
+        ),
+        (
+            "extra_keys",
+            {"key": "n1", "extra": "x"},
+            "$vectorSearch should silently ignore extra keys alongside a valid "
+            "searchNodePreference key",
+        ),
+        (
+            "null",
+            None,
+            "$vectorSearch should treat a null searchNodePreference as omitted and succeed",
+        ),
+    ]
+]
+
+# Property [searchNodePreference Type Rejection]: a searchNodePreference value of
+# any non-document BSON type, including an array, is rejected as not a document
+# with no coercion.
+VECTORSEARCH_SEARCH_NODE_PREFERENCE_TYPE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"search_node_preference_type_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "searchNodePreference": val, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} searchNodePreference value as not a document", + ) + for tid, val in [ + ("string", "n1"), + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("array", []), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +# Property [searchNodePreference Key Required]: a searchNodePreference that omits +# its key, or gives a null key (treated as field-absent, not a type error), is +# rejected because the key is required. 
+VECTORSEARCH_SEARCH_NODE_PREFERENCE_KEY_REQUIRED_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"search_node_preference_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "searchNodePreference": snp, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=msg, + ) + for tid, snp, msg in [ + ( + "key_omitted", + {}, + "$vectorSearch should require the searchNodePreference key when it is omitted", + ), + ( + "key_null", + {"key": None}, + "$vectorSearch should treat a null searchNodePreference key as field-absent " + "and require it", + ), + ] +] + +# Property [searchNodePreference Key Type Rejection]: a searchNodePreference key +# of any non-string BSON type is rejected as a non-string with no coercion. +VECTORSEARCH_SEARCH_NODE_PREFERENCE_KEY_TYPE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"search_node_preference_key_type_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "searchNodePreference": {"key": val}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} searchNodePreference key as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("array", ["a"]), + ("object", {"a": 1}), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +VECTORSEARCH_SEARCH_NODE_PREFERENCE_ALL_TESTS = ( + 
VECTORSEARCH_SEARCH_NODE_PREFERENCE_TESTS + + VECTORSEARCH_SEARCH_NODE_PREFERENCE_TYPE_ERROR_TESTS + + VECTORSEARCH_SEARCH_NODE_PREFERENCE_KEY_REQUIRED_ERROR_TESTS + + VECTORSEARCH_SEARCH_NODE_PREFERENCE_KEY_TYPE_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_SEARCH_NODE_PREFERENCE_ALL_TESTS)) +def test_vectorSearch_search_node_preference(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: searchNodePreference behavior and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stage_basics.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stage_basics.py new file mode 100644 index 000000000..743827a2f --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stage_basics.py @@ -0,0 +1,218 @@ +"""Tests for the $vectorSearch stage: stage-level acceptance and errors.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest +from bson import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + QUERY_METADATA_NOT_AVAILABLE_ERROR, + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants 
import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +# Property [quantization Silently Ignored]: quantization is accepted with no +# validation regardless of value or BSON type and produces results identical to +# omitting it, never surfacing a value or type error. +VECTORSEARCH_QUANTIZATION_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"quantization_{tid}", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "quantization": value, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg=f"$vectorSearch should silently ignore a {tid} quantization value and " + "return the same results as omitting it", + ) + for tid, value in [ + ("scalar", "scalar"), + ("bogus", "bogus"), + ("int32", 1), + ("int64", Int64(2)), + ("double", 1.5), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("bool", True), + ("array", [1]), + ("object", {"a": 1}), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("null", None), + ] +] + +# Property [Unknown Spec Field Silently Ignored]: an unrecognized top-level spec +# field alongside all required fields is silently ignored and the query succeeds +# with results identical to omitting it, rather than raising an unknown-field error. 
+VECTORSEARCH_UNKNOWN_FIELD_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "unknown_field_ignored", + raw_res=True, + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "bogus": 1, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(5), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + Contains("_id", 5), + ] + }, + msg="$vectorSearch should silently ignore an unrecognized top-level spec " + "field and return the same results as omitting it", + ), +] + +# Property [model/query Mutual Exclusivity]: model or query supplied alongside +# queryVector is rejected, because model is autoEmbed-only and exactly one of +# query and queryVector may be present, while both fields are still recognized by +# the parser. +VECTORSEARCH_MODEL_QUERY_EXCLUSIVITY_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "model_with_query_vector", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "model": "voyage-3", + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject model supplied alongside queryVector", + ), + VectorSearchTest( + "query_with_query_vector", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "query": {"text": "hello"}, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject query supplied alongside queryVector", + ), +] + +# Property [Score Metadata Unavailable]: requesting metadata that $vectorSearch +# does not produce (textScore) in a following $project fails, because the stage +# populates 
vectorSearchScore, not text-score metadata. +VECTORSEARCH_SCORE_METADATA_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "score_metadata_text_score_unavailable", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + } + }, + {"$project": {"_id": 1, "score": {"$meta": "textScore"}}}, + ], + error_code=QUERY_METADATA_NOT_AVAILABLE_ERROR, + msg="$vectorSearch should not provide text-score metadata for a following $meta textScore", + ), +] + +VECTORSEARCH_STAGE_BASICS_ALL_TESTS = ( + VECTORSEARCH_QUANTIZATION_TESTS + + VECTORSEARCH_UNKNOWN_FIELD_TESTS + + VECTORSEARCH_MODEL_QUERY_EXCLUSIVITY_ERROR_TESTS + + VECTORSEARCH_SCORE_METADATA_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_STAGE_BASICS_ALL_TESTS)) +def test_vectorSearch_stage_basics(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: stage-level acceptance and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + raw_res=test_case.raw_res, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stored_source.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stored_source.py new file mode 100644 index 000000000..545bcd61a --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/test_vectorSearch_stored_source.py @@ -0,0 +1,251 @@ +"""Tests for the $vectorSearch stage: returnStoredSource behavior and errors.""" + +from __future__ import annotations + +import time +from datetime import datetime, timezone + +import pytest +from bson 
import ( + Code, + Int64, + MaxKey, + MinKey, + ObjectId, + Regex, + Timestamp, +) +from bson.binary import Binary + +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Eq, + NotExists, + PerDoc, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_ZERO, +) + +from .utils.vectorSearch_common import ( + VectorSearchTest, +) + +pytestmark = pytest.mark.requires(search=True) + +_INDEX_READY_TIMEOUT_SECONDS = 120 + +_STORED_SOURCE_CORPUS = [ + { + "_id": 1, + "name": "a", + "title": "T1", + "body": "B1", + "embedding": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + }, + { + "_id": 2, + "name": "b", + "title": "T2", + "body": "B2", + "embedding": [DOUBLE_ZERO, 1.0, DOUBLE_ZERO], + }, + # Doc 3 intentionally lacks the "title" field. 
+    {"_id": 3, "name": "c", "body": "B3", "embedding": [0.9, 0.1, DOUBLE_ZERO]},
+]
+
+
+@pytest.fixture(scope="module")
+def stored_source_vector_search_collection(engine_client, worker_id):
+    """Provide a collection with a READY vectorSearch index configured with storedSource.
+
+    Seeds ``_STORED_SOURCE_CORPUS`` into a per-worker database, creates the
+    ``vs_stored_source_index`` search index, and polls ``$listSearchIndexes``
+    until the index reports READY. The database is dropped when the module's
+    tests finish and also when setup itself fails.
+
+    Yields:
+        The seeded collection.
+
+    Raises:
+        TimeoutError: If the index is not READY within
+            ``_INDEX_READY_TIMEOUT_SECONDS`` seconds.
+    """
+    db_name = f"vs_stored_source_{worker_id}"
+    db = engine_client[db_name]
+    coll = db["stored_source_vectors"]
+    # pytest does not run the code after ``yield`` when setup raises, so setup
+    # lives inside try/finally to drop the database on failure as well.
+    try:
+        db.drop_collection(coll.name)
+        db.create_collection(coll.name)
+        # Insert shallow copies, leaving the module-level corpus dicts untouched.
+        coll.insert_many([dict(doc) for doc in _STORED_SOURCE_CORPUS])
+        db.command(
+            {
+                "createSearchIndexes": coll.name,
+                "indexes": [
+                    {
+                        "name": "vs_stored_source_index",
+                        "type": "vectorSearch",
+                        "definition": {
+                            "storedSource": {"include": ["name"]},
+                            "fields": [
+                                {
+                                    "type": "vector",
+                                    "path": "embedding",
+                                    "numDimensions": 3,
+                                    "similarity": "cosine",
+                                },
+                            ],
+                        },
+                    }
+                ],
+            }
+        )
+        # The collection was just recreated with a single search index, so the
+        # first $listSearchIndexes entry is the one created above.
+        deadline = time.monotonic() + _INDEX_READY_TIMEOUT_SECONDS
+        while time.monotonic() < deadline:
+            indexes = list(coll.aggregate([{"$listSearchIndexes": {}}]))
+            if indexes and indexes[0].get("status") == "READY":
+                break
+            time.sleep(2)
+        else:
+            raise TimeoutError("storedSource vectorSearch index did not reach READY state")
+        yield coll
+    finally:
+        engine_client.drop_database(db_name)
+
+
+# Property [returnStoredSource Full Documents]: against a storedSource-configured
+# index, both returnStoredSource false (the default) and true return the full
+# document with all fields, because the configured stored-source field
+# restriction is non-functional on this target.
+VECTORSEARCH_RETURN_STORED_SOURCE_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"return_stored_source_{tid}", + collection_fixture="stored_source_vector_search_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_stored_source_index", + "path": "embedding", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "returnStoredSource": rss, + } + }, + {"$sort": {"_id": 1}}, + ], + expected=PerDoc( + {"_id": Eq(1), "name": Eq("a"), "title": Eq("T1"), "body": Eq("B1")}, + {"_id": Eq(2), "name": Eq("b"), "title": Eq("T2"), "body": Eq("B2")}, + {"_id": Eq(3), "name": Eq("c"), "title": NotExists(), "body": Eq("B3")}, + ), + msg=f"$vectorSearch should return the full document when returnStoredSource is {tid}", + ) + for tid, rss in [("false", False), ("true", True)] +] + +# Property [returnStoredSource Null As Omitted]: returnStoredSource null is +# treated as field-absent and the query succeeds with full documents as if the +# field were omitted, rather than triggering the not-configured error that +# returnStoredSource true raises on an index without a storedSource configuration. +VECTORSEARCH_RETURN_STORED_SOURCE_NULL_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "return_stored_source_null_omitted", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "returnStoredSource": None, + } + }, + ], + expected=PerDoc( + {"_id": Eq(1)}, {"_id": Eq(2)}, {"_id": Eq(3)}, {"_id": Eq(4)}, {"_id": Eq(5)} + ), + msg="$vectorSearch should treat returnStoredSource null as omitted and succeed " + "on an index without a storedSource configuration", + ), +] + +# Property [returnStoredSource Type Rejection]: a non-boolean returnStoredSource +# value is rejected as not a boolean, with no coercion. 
+VECTORSEARCH_RETURN_STORED_SOURCE_TYPE_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + f"return_stored_source_type_{tid}", + collection_fixture="vector_search_no_index_collection", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "returnStoredSource": val, + } + } + ], + error_code=UNKNOWN_ERROR, + msg=f"$vectorSearch should reject a {tid} returnStoredSource value as a non-boolean", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.0), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ("string", "true"), + ("array", [True]), + ("object", {"a": 1}), + ("objectid", ObjectId("5a9427648b0beebeb69537a5")), + ("datetime", datetime(2020, 1, 1, tzinfo=timezone.utc)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ] +] + +# Property [returnStoredSource Not Configured]: returnStoredSource true against an +# index that has no storedSource configuration is rejected because stored source +# is not configured for that index. 
+VECTORSEARCH_RETURN_STORED_SOURCE_NOT_CONFIGURED_ERROR_TESTS: list[VectorSearchTest] = [ + VectorSearchTest( + "return_stored_source_not_configured", + pipeline=[ + { + "$vectorSearch": { + "index": "vs_core_index", + "path": "vc", + "queryVector": [1.0, DOUBLE_ZERO, DOUBLE_ZERO], + "numCandidates": 10, + "limit": 5, + "returnStoredSource": True, + } + } + ], + error_code=UNKNOWN_ERROR, + msg="$vectorSearch should reject returnStoredSource true when the index has no " + "storedSource configuration", + ), +] + +VECTORSEARCH_STORED_SOURCE_ALL_TESTS = ( + VECTORSEARCH_RETURN_STORED_SOURCE_TESTS + + VECTORSEARCH_RETURN_STORED_SOURCE_NULL_TESTS + + VECTORSEARCH_RETURN_STORED_SOURCE_TYPE_ERROR_TESTS + + VECTORSEARCH_RETURN_STORED_SOURCE_NOT_CONFIGURED_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(VECTORSEARCH_STORED_SOURCE_ALL_TESTS)) +def test_vectorSearch_stored_source(test_case: VectorSearchTest, engine_client, request): + """$vectorSearch: returnStoredSource behavior and errors.""" + coll = request.getfixturevalue(test_case.collection_fixture) + result = execute_command( + coll, + {"aggregate": coll.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/vectorSearch_common.py b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/vectorSearch_common.py new file mode 100644 index 000000000..e2bc13248 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/vectorSearch/utils/vectorSearch_common.py @@ -0,0 +1,44 @@ +"""Shared 
dataclass and pre-filter constants for $vectorSearch stage tests. + +The ``VectorSearchTest`` dataclass tags each case with the fixture and execution +mode it runs under. The ObjectId/UUID constants are stored on the shared corpus +(see conftest.py) and queried back by the filter, parentFilter, and +explainOptions test files, so they live here rather than in any single file.""" + +from __future__ import annotations + +import uuid +from dataclasses import dataclass + +from bson import ObjectId +from bson.binary import Binary, UuidRepresentation + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) + + +@dataclass(frozen=True) +class VectorSearchTest(StageTestCase): + """A $vectorSearch case, tagged with the collection fixture and execution mode it runs under.""" + + collection_fixture: str = "vector_search_collection" + explain: bool = False + raw_res: bool = False + + +# ObjectId and UUID values for the filter pre-filtering tests. The specific +# values are arbitrary; only the A/B partition matters: each is stored on some +# corpus docs and queried back, so a filter on "A" must return exactly the docs +# that stored "A" and none that stored "B". 
+_FILTER_OID_A = ObjectId("5a9427648b0beebeb69537a5") + +_FILTER_OID_B = ObjectId("5a9427648b0beebeb69537b6") + +_FILTER_UUID_A = Binary.from_uuid( + uuid.UUID("11111111-1111-1111-1111-111111111111"), UuidRepresentation.STANDARD +) + +_FILTER_UUID_B = Binary.from_uuid( + uuid.UUID("22222222-2222-2222-2222-222222222222"), UuidRepresentation.STANDARD +) diff --git a/documentdb_tests/framework/error_codes.py b/documentdb_tests/framework/error_codes.py index 86884ce4c..5b9eb63b1 100644 --- a/documentdb_tests/framework/error_codes.py +++ b/documentdb_tests/framework/error_codes.py @@ -6,6 +6,7 @@ BAD_VALUE_ERROR = 2 NO_SUCH_KEY_ERROR = 4 GRAPH_CONTAINS_CYCLE_ERROR = 5 +UNKNOWN_ERROR = 8 FAILED_TO_PARSE_ERROR = 9 UNAUTHORIZED_ERROR = 13 TYPE_MISMATCH_ERROR = 14 @@ -34,6 +35,7 @@ OPERATION_FAILED_ERROR = 96 DOCUMENT_VALIDATION_FAILURE_ERROR = 121 NOT_A_REPLICA_SET_ERROR = 123 +VECTOR_SEARCH_NESTED_ROOT_MISMATCH_ERROR = 125 CAPPED_POSITION_LOST_ERROR = 136 INCOMPATIBLE_COLLATION_VERSION_ERROR = 161 VIEW_DEPTH_LIMIT_ERROR = 165 @@ -506,7 +508,9 @@ PIPELINE_LENGTH_LIMIT_ERROR = 7749501 PERCENTILE_INVALID_P_FIELD_ERROR = 7750301 PERCENTILE_INVALID_P_VALUE_ERROR = 7750303 +VECTOR_SEARCH_LIMIT_NOT_POSITIVE_ERROR = 7912700 ENCRYPTED_FIELD_TRIM_FACTOR_OUT_OF_RANGE_ERROR = 8574000 +VECTOR_SEARCH_LIMIT_NOT_NUMBER_ERROR = 8575100 COUNT_FIELD_ID_RESERVED_ERROR = 9039800 CONVERT_BYTE_ORDER_TYPE_ERROR = 9130001 CONVERT_BYTE_ORDER_VALUE_ERROR = 9130002 diff --git a/documentdb_tests/framework/property_checks.py b/documentdb_tests/framework/property_checks.py index 0ffb575cf..26c53a3d3 100644 --- a/documentdb_tests/framework/property_checks.py +++ b/documentdb_tests/framework/property_checks.py @@ -311,6 +311,23 @@ def __repr__(self) -> str: return f"{type(self).__name__}({self.minimum!r})" +class Lte(Check): + """Assert that the field is less than or equal to a value.""" + + def __init__(self, maximum: Any) -> None: + self.maximum = maximum + + def check(self, value: Any, path: str) -> 
str | None: + if value is _FIELD_ABSENT: + return f"expected '{path}' <= {self.maximum!r}, but field is missing" + if value > self.maximum: + return f"expected '{path}' <= {self.maximum!r}, got {value!r}" + return None + + def __repr__(self) -> str: + return f"{type(self).__name__}({self.maximum!r})" + + class NonEmptyStr(Check): """Assert that the field is a non-empty string. From 0d064072291be9754c965d449f0bd4cbc67b9bb3 Mon Sep 17 00:00:00 2001 From: Daniel Frankcom Date: Fri, 26 Jun 2026 16:17:54 -0700 Subject: [PATCH 3/3] Allow conftest.py in test folders without naming-rule violations Signed-off-by: Daniel Frankcom --- documentdb_tests/framework/test_structure_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentdb_tests/framework/test_structure_validator.py b/documentdb_tests/framework/test_structure_validator.py index 57f811f08..dd40a3f75 100644 --- a/documentdb_tests/framework/test_structure_validator.py +++ b/documentdb_tests/framework/test_structure_validator.py @@ -15,7 +15,7 @@ def validate_python_files_in_tests(tests_dir: Path) -> list[str]: allowed_folders = {"utils", "fixtures", "__pycache__"} for py_file in tests_dir.rglob("*.py"): - if py_file.name == "__init__.py": + if py_file.name in ("__init__.py", "conftest.py"): continue if any(folder in py_file.parts for folder in allowed_folders): continue