diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index d4f394227..ec0404df2 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -61,6 +61,17 @@ jobs: --json-report --json-report-file=${{ github.workspace }}/.test-results/${{ matrix.target.name }}-report.json \ --junitxml=${{ github.workspace }}/.test-results/${{ matrix.target.name }}-results.xml + - name: Dump container logs + if: always() + run: | + # One file per service in the profile + mkdir -p "${{ github.workspace }}/.test-results/container-logs" + for svc in $(docker compose -f dev/compose.yaml --profile ${{ matrix.target.profile }} config --services); do + docker compose -f dev/compose.yaml --profile ${{ matrix.target.profile }} \ + logs --no-color --timestamps "$svc" \ + > "${{ github.workspace }}/.test-results/container-logs/${svc}.log" 2>&1 || true + done + - name: Upload test results if: always() uses: actions/upload-artifact@v7 diff --git a/dev/compose.yaml b/dev/compose.yaml index 4258cecfb..327e81aa8 100644 --- a/dev/compose.yaml +++ b/dev/compose.yaml @@ -27,7 +27,7 @@ # query: # # A service with no `x-test-target` is not a test target and is ignored by the -# registry. +# registry (e.g. the mongot sidecar, which is reached only through its mongod). # # Memory: each mongod caps its WiredTiger cache (--wiredTigerCacheSizeGB). By # default a mongod sizes its cache to ~50% of the host/VM RAM; with several @@ -60,7 +60,26 @@ services: mongo-replset: image: mongo:8.2.4 profiles: ["mongo-replset", "all"] - command: ["--replSet", "rs0", "--bind_ip_all", "--wiredTigerCacheSizeGB", "1.5"] + command: + - "--replSet" + - "rs0" + - "--bind_ip_all" + - "--wiredTigerCacheSizeGB" + - "1.5" + # Point at the mongot search sidecar so this replica set also serves the + # search surfaces. mongot is transparent to all other behavior, so the + # set behaves identically to a plain replica set apart from gaining + # search; it is one target, not two. 
+ - "--setParameter" + - "mongotHost=mongot:27028" + - "--setParameter" + - "searchIndexManagementHostAndPort=mongot:27028" + - "--setParameter" + - "useGrpcForSearch=true" + - "--setParameter" + - "skipAuthenticationToMongot=true" + - "--setParameter" + - "skipAuthenticationToSearchIndexManagementServer=true" ports: - "27018:27017" healthcheck: @@ -71,3 +90,31 @@ services: x-test-target: engine: mongodb query: directConnection=true + + # mongot: the search sidecar for the mongo-replset target. Not a test target + # on its own; the suite reaches it only through mongo-replset. mongot is + # MongoDB Search Community Edition (SSPL, same license as the server). It + # replicates from the replica set as an authenticated sync source and reads + # its password from a file, so the entrypoint writes that file (a fixed + # local-dev secret, matched by the searchCoordinator user the harness creates + # on the replica set) with owner-only permissions before launching. It retries + # the connection until that user exists. + mongot: + image: mongodb/mongodb-community-search:latest + profiles: ["mongo-replset", "all"] + entrypoint: + - "sh" + - "-c" + - > + umask 077 && + mkdir -p /mongot-secrets && + printf '%s' "$$MONGOT_SYNC_PASSWORD" > /mongot-secrets/passwordFile && + exec /mongot-community/mongot --config /mongot-config/mongot.yml + environment: + # Fixed local-dev secret shared with the searchCoordinator user the + # harness provisions on mongo-replset. Not a real credential. + MONGOT_SYNC_PASSWORD: "searchSyncPassword" + # Cap mongot's JVM heap. Unset, the JVM sizes its max heap to ~25% of host RAM. + JAVA_TOOL_OPTIONS: "-Xmx1g" + volumes: + - ./mongot.yml:/mongot-config/mongot.yml:ro diff --git a/dev/mongot.yml b/dev/mongot.yml new file mode 100644 index 000000000..9860f54a2 --- /dev/null +++ b/dev/mongot.yml @@ -0,0 +1,30 @@ +# mongot configuration for the mongo-replset target (dev/compose.yaml service +# "mongot"). 
mongot is MongoDB Search Community Edition (SSPL), the same license +# as the server. It runs alongside the replica set's mongod and serves the +# search and vector search surfaces. +# +# mongot replicates from the mongod replica set as a sync source. It requires an +# authenticated connection (it has no unauthenticated mode), so it logs in as a +# dedicated user holding the searchCoordinator role. The harness creates that +# user; the mongot entrypoint writes its password file (see dev/compose.yaml). +syncSource: + replicaSet: + hostAndPort: "mongo-replset:27017" + username: "searchSyncUser" + passwordFile: "/mongot-secrets/passwordFile" + authSource: "admin" + tls: false +storage: + dataPath: "/var/lib/mongot" +server: + grpc: + # mongod reaches mongot here (see mongotHost / searchIndexManagementHostAndPort + # on the mongo-replset service). Bound on all interfaces so the mongod + # container can connect over the compose network. + address: "0.0.0.0:27028" + tls: + mode: "disabled" +healthCheck: + address: "0.0.0.0:8080" +logging: + verbosity: INFO diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/__init__.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/conftest.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/conftest.py new file mode 100644 index 000000000..963019c77 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/conftest.py @@ -0,0 +1,31 @@ +"""Shared fixtures for $search stage tests. + +The dynamic-mapping ``indexed_collection`` corpus is queried read-only by most +$search test files, so it is built once per package here rather than duplicated +per file. Single-operator corpora (wildcard, equals, in, ...) 
stay inline in +their own test file, since each is used by exactly one file.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + FIXTURE_DOCS, + create_dynamic_search_index, +) +from documentdb_tests.framework import fixtures + + +@pytest.fixture(scope="package") +def indexed_collection(engine_client, worker_id): + """A package-scoped collection populated with the fixture docs and a ready + dynamic search index, shared read-only across the matching tests so the + index is built and polled once rather than per test.""" + db_name = fixtures.generate_database_name("stages_search_shared", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["indexed"] + coll.insert_many(FIXTURE_DOCS) + create_dynamic_search_index(coll) + yield coll + fixtures.cleanup_database(engine_client, db_name) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_autocomplete.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_autocomplete.py new file mode 100644 index 000000000..fe19b334f --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_autocomplete.py @@ -0,0 +1,493 @@ +"""Tests for the $search autocomplete operator.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + create_search_index, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from 
documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +_AUTOCOMPLETE_DOCS = [ + {"_id": 1, "ac": "september"}, # matches sep, sept, and the fuzzy typos + {"_id": 2, "ac": "october"}, # control: shares no probed prefix + {"_id": 3, "ac": "separate"}, # shares the sep prefix but not sept + {"_id": 4, "ac": "quick brown"}, # two tokens for the tokenOrder cases +] + +_AUTOCOMPLETE_INDEX_DEFINITION = { + "mappings": { + "dynamic": False, + "fields": { + "ac": {"type": "autocomplete"}, + }, + } +} + + +@pytest.fixture(scope="module") +def autocomplete_collection(engine_client, worker_id): + """A module-scoped collection with a static search index mapping an + autocomplete-typed field, shared read-only across the autocomplete cases so + the index is built and polled once.""" + db_name = fixtures.generate_database_name("stages_search_autocomplete", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["autocomplete"] + coll.insert_many(_AUTOCOMPLETE_DOCS) + create_search_index(coll, _AUTOCOMPLETE_INDEX_DEFINITION) + yield coll + fixtures.cleanup_database(engine_client, db_name) + + +# Property [Autocomplete Edge-Gram Prefix Matching]: on an autocomplete-mapped +# path the query matches stored tokens by edge-gram prefix. 
+SEARCH_AUTOCOMPLETE_PREFIX_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_prefix_short", + pipeline=[ + {"$search": {"autocomplete": {"path": "ac", "query": "sep"}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 3)]}, + msg="$search autocomplete should match every stored token sharing the query prefix", + ), + StageTestCase( + "autocomplete_score_boost", + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "sep", + "score": {"boost": {"value": 2.0}}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 3)]}, + msg="$search autocomplete should accept a score modifier and still return its matches", + ), + StageTestCase( + "autocomplete_prefix_longer", + pipeline=[ + {"$search": {"autocomplete": {"path": "ac", "query": "sept"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search autocomplete should match only the tokens extending the longer query prefix", + ), +] + +# Property [Autocomplete Fuzzy Matching]: autocomplete accepts a fuzzy.maxEdits of +# 1 or 2 and matches a query within that many edits of a stored token. +SEARCH_AUTOCOMPLETE_FUZZY_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_fuzzy_max_edits_1", + # "septemer" is one deletion (the "b") away from the stored "september". + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "septemer", + "fuzzy": {"maxEdits": 1}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search autocomplete should match a query within one edit at fuzzy.maxEdits 1", + ), + StageTestCase( + "autocomplete_fuzzy_max_edits_2", + # "septmer" is two deletions (the "e" and the "b") from the stored "september". 
+ pipeline=[ + { + "$search": { + "autocomplete": {"path": "ac", "query": "septmer", "fuzzy": {"maxEdits": 2}} + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search autocomplete should match a query within two edits at fuzzy.maxEdits 2", + ), +] + +# Property [Autocomplete Token Order]: tokenOrder "any" matches the query terms in +# any order while "sequential" requires them in the stored order. +SEARCH_AUTOCOMPLETE_TOKEN_ORDER_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_token_order_any", + pipeline=[ + { + "$search": { + "autocomplete": {"path": "ac", "query": "brown quick", "tokenOrder": "any"} + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 4)]}, + msg="$search autocomplete tokenOrder any should match the query terms in any order", + ), + StageTestCase( + "autocomplete_token_order_sequential_in_order", + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "quick brown", + "tokenOrder": "sequential", + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 4)]}, + msg="$search autocomplete tokenOrder sequential should match terms in the stored order", + ), + StageTestCase( + "autocomplete_token_order_sequential_reversed", + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "brown quick", + "tokenOrder": "sequential", + } + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search autocomplete tokenOrder sequential should not match terms out of order", + ), +] + +# Property [Autocomplete Query Array OR]: autocomplete.query accepts an array of +# strings, matching the union of the documents matched by each element prefix. 
+SEARCH_AUTOCOMPLETE_QUERY_ARRAY_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_query_array_or", + pipeline=[ + {"$search": {"autocomplete": {"path": "ac", "query": ["sept", "octo"]}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$search autocomplete should match the union of a multi-element query array's prefixes", + ), +] + +# Property [Autocomplete Fuzzy prefixLength And maxExpansions]: autocomplete.fuzzy +# accepts prefixLength and maxExpansions, where prefixLength locks a +# code-point-counted prefix from edits (a typo inside the locked prefix does not +# match while prefixLength 0 still allows the fuzzy match) and maxExpansions is +# accepted across its 1..1000 bound. +SEARCH_AUTOCOMPLETE_FUZZY_PREFIX_EXPANSION_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_fuzzy_prefix_unlocked", + # "xeptember" is one substitution (x for s) at code point 0 from the stored + # "september"; prefixLength 0 locks nothing so the edit is allowed. + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "xeptember", + "fuzzy": {"maxEdits": 1, "prefixLength": 0}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search autocomplete should allow a fuzzy edit outside the prefix when " + "prefixLength is 0", + ), + StageTestCase( + "autocomplete_fuzzy_prefix_locked", + # prefixLength 2 locks code points 0 and 1, so the substitution at code + # point 0 falls inside the locked prefix and cannot match. 
+ pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "xeptember", + "fuzzy": {"maxEdits": 1, "prefixLength": 2}, + } + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search autocomplete should lock the prefix from edits so a locked-prefix typo " + "does not match", + ), + *[ + StageTestCase( + f"autocomplete_fuzzy_max_expansions_{label}", + # "septemer" is one deletion from the stored "september", matching only it + # regardless of the maxExpansions bound. + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "septemer", + "fuzzy": {"maxEdits": 1, "maxExpansions": val}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg=f"$search autocomplete should accept fuzzy.maxExpansions at the {label} bound " + "and still match", + ) + for label, val in [("lower", 1), ("upper", 1000)] + ], +] + +SEARCH_AUTOCOMPLETE_TESTS = ( + SEARCH_AUTOCOMPLETE_PREFIX_TESTS + + SEARCH_AUTOCOMPLETE_FUZZY_TESTS + + SEARCH_AUTOCOMPLETE_TOKEN_ORDER_TESTS + + SEARCH_AUTOCOMPLETE_QUERY_ARRAY_TESTS + + SEARCH_AUTOCOMPLETE_FUZZY_PREFIX_EXPANSION_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_AUTOCOMPLETE_TESTS)) +def test_search_autocomplete_cases(autocomplete_collection, test_case: StageTestCase): + """Test $search autocomplete edge-gram prefix, fuzzy, tokenOrder, and query-array matching.""" + result = execute_command( + autocomplete_collection, + {"aggregate": autocomplete_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) + + +# Property [Autocomplete Path Mapping Required]: autocomplete requires the queried +# path to be mapped as an autocomplete field. 
+SEARCH_AUTOCOMPLETE_PATH_MAPPING_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_path_not_autocomplete_mapped", + pipeline=[ + {"$search": {"autocomplete": {"path": "title", "query": "sep"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject a path with no autocomplete index field definition", + ), +] + +# Property [Autocomplete fuzzy.maxEdits Range]: autocomplete.fuzzy.maxEdits must +# be 1 or 2, so any value outside that range is rejected. +SEARCH_AUTOCOMPLETE_FUZZY_MAX_EDITS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + f"autocomplete_fuzzy_max_edits_{val}", + pipeline=[ + { + "$search": { + "autocomplete": {"path": "ac", "query": "sep", "fuzzy": {"maxEdits": val}} + } + }, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search autocomplete should reject a fuzzy.maxEdits of {val}, which is not 1 or 2", + ) + for val in [0, 3] +] + +# Property [Autocomplete tokenOrder Enum]: autocomplete.tokenOrder must be one of +# "any" or "sequential", so any other value is rejected. +SEARCH_AUTOCOMPLETE_TOKEN_ORDER_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_token_order_bogus", + pipeline=[ + {"$search": {"autocomplete": {"path": "ac", "query": "sep", "tokenOrder": "bogus"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject a tokenOrder outside the [any, sequential] enum", + ), +] + +# Property [Autocomplete query Validation]: autocomplete.query is required and +# must be a non-empty string or array of non-null strings. 
+SEARCH_AUTOCOMPLETE_QUERY_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_query_missing", + pipeline=[{"$search": {"autocomplete": {"path": "ac"}}}], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject an operator missing the required query", + ), + StageTestCase( + "autocomplete_query_empty_string", + pipeline=[{"$search": {"autocomplete": {"path": "ac", "query": ""}}}], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject an empty-string query", + ), + StageTestCase( + "autocomplete_query_empty_array", + pipeline=[{"$search": {"autocomplete": {"path": "ac", "query": []}}}], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject an empty-array query", + ), + *[ + StageTestCase( + f"autocomplete_query_non_string_{tid}", + pipeline=[{"$search": {"autocomplete": {"path": "ac", "query": val}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search autocomplete should reject a {tid} query as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"q": "sep"}), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + StageTestCase( + "autocomplete_query_array_element_null", + pipeline=[{"$search": {"autocomplete": {"path": "ac", "query": ["sep", None]}}}], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject a null query-array element", + ), + StageTestCase( + "autocomplete_query_array_element_non_string", + pipeline=[{"$search": {"autocomplete": {"path": "ac", "query": ["sep", 1]}}}], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject a non-string query-array element", + ), +] + +# Property [Autocomplete path 
Validation]: autocomplete.path is required and +# string-only, unlike the document and array forms text accepts. +SEARCH_AUTOCOMPLETE_PATH_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_path_missing", + pipeline=[{"$search": {"autocomplete": {"query": "sep"}}}], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject an operator missing the required path", + ), + *[ + StageTestCase( + f"autocomplete_path_{tid}", + pipeline=[{"$search": {"autocomplete": {"path": val, "query": "sep"}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search autocomplete should reject a {tid} path as a non-string type", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"value": "ac"}), + ("array", ["ac"]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +# Property [Autocomplete fuzzy.prefixLength And maxExpansions Bounds]: +# autocomplete.fuzzy.prefixLength must be non-negative and maxExpansions must fall +# within 1..1000, so a negative prefixLength and a maxExpansions outside those +# bounds are rejected. 
+SEARCH_AUTOCOMPLETE_FUZZY_BOUNDS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "autocomplete_fuzzy_prefix_length_negative", + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "sep", + "fuzzy": {"maxEdits": 1, "prefixLength": -1}, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search autocomplete should reject a negative fuzzy.prefixLength", + ), + *[ + StageTestCase( + f"autocomplete_fuzzy_max_expansions_{label}", + pipeline=[ + { + "$search": { + "autocomplete": { + "path": "ac", + "query": "sep", + "fuzzy": {"maxEdits": 1, "maxExpansions": val}, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search autocomplete should reject a fuzzy.maxExpansions of {val} outside the " + "bounds 1 to 1000", + ) + for label, val in [("zero", 0), ("over_max", 1001)] + ], +] + +SEARCH_AUTOCOMPLETE_ERROR_TESTS = ( + SEARCH_AUTOCOMPLETE_PATH_MAPPING_ERROR_TESTS + + SEARCH_AUTOCOMPLETE_FUZZY_MAX_EDITS_ERROR_TESTS + + SEARCH_AUTOCOMPLETE_TOKEN_ORDER_ERROR_TESTS + + SEARCH_AUTOCOMPLETE_QUERY_ERROR_TESTS + + SEARCH_AUTOCOMPLETE_PATH_ERROR_TESTS + + SEARCH_AUTOCOMPLETE_FUZZY_BOUNDS_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_AUTOCOMPLETE_ERROR_TESTS)) +def test_search_autocomplete_errors(autocomplete_collection, test_case: StageTestCase): + """Test $search autocomplete rejects unmapped paths and invalid query/path/fuzzy/tokenOrder.""" + result = execute_command( + autocomplete_collection, + {"aggregate": autocomplete_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_compound.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_compound.py new file mode 100644 index 000000000..d5c9c4aee --- /dev/null +++ 
b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_compound.py @@ -0,0 +1,463 @@ +"""Tests for the $search compound operator.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [Compound Clause Composition]: the must, should, mustNot, and filter +# clauses compose into one clause set, so the matched documents are the must/filter +# intersection minus the mustNot matches, with should only optional. 
+SEARCH_COMPOUND_COMPOSITION_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_must_intersects", + pipeline=[ + { + "$search": { + "compound": { + "must": [ + {"text": {"query": "quick", "path": "title"}}, + {"text": {"query": "rabbit", "path": "title"}}, + ] + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 3)]}, + msg="$search compound should intersect multiple must clauses, matching only the " + "document satisfying every clause", + ), + StageTestCase( + "compound_must_not_excludes", + pipeline=[ + { + "$search": { + "compound": { + "must": [{"text": {"query": "quick", "path": "title"}}], + "mustNot": [{"text": {"query": "rabbit", "path": "title"}}], + } + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 4)]}, + msg="$search compound should exclude the mustNot matches from the must matches", + ), + StageTestCase( + "compound_filter_selects", + pipeline=[ + {"$search": {"compound": {"filter": [{"text": {"query": "quick", "path": "title"}}]}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound should select the documents matching a filter clause", + ), + StageTestCase( + "compound_all_clause_types", + pipeline=[ + { + "$search": { + "compound": { + "must": [{"text": {"query": "quick", "path": "title"}}], + "should": [{"text": {"query": "brown", "path": "title"}}], + "mustNot": [{"text": {"query": "rabbit", "path": "title"}}], + "filter": [{"text": {"query": "fox", "path": "title"}}], + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search compound should compose all four clause types in one clause set, " + "intersecting must and filter and removing the mustNot matches", + ), +] + +# Property [Compound Should-Only Matching]: when a compound has no must or filter +# clause, the should clauses become a required OR (minimumShouldMatch defaults to 1), 
+# so the matched set is the union of the should clauses. +SEARCH_COMPOUND_SHOULD_ONLY_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_should_optional_or", + pipeline=[ + { + "$search": { + "compound": { + "should": [ + {"text": {"query": "quick", "path": "title"}}, + {"text": {"query": "turtle", "path": "title"}}, + ] + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(4), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound should match the union of its should clauses when no must " + "clause is present", + ), +] + +# Property [Compound minimumShouldMatch]: minimumShouldMatch accepts 0 through the +# number of should clauses and requires at least that many should clauses to match +# alongside the must clause. +SEARCH_COMPOUND_MIN_SHOULD_MATCH_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_min_should_match_0", + pipeline=[ + { + "$search": { + "compound": { + "must": [{"text": {"query": "quick", "path": "title"}}], + "should": [ + {"text": {"query": "brown", "path": "title"}}, + {"text": {"query": "rabbit", "path": "title"}}, + ], + "minimumShouldMatch": 0, + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound with minimumShouldMatch 0 should require no should clause, " + "matching every must document", + ), + StageTestCase( + "compound_min_should_match_1", + pipeline=[ + { + "$search": { + "compound": { + "must": [{"text": {"query": "quick", "path": "title"}}], + "should": [ + {"text": {"query": "brown", "path": "title"}}, + {"text": {"query": "rabbit", "path": "title"}}, + ], + "minimumShouldMatch": 1, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 3)]}, + msg="$search compound with minimumShouldMatch 1 should require at least one should " + "clause to match alongside the must clause", + ), + 
StageTestCase( + "compound_min_should_match_all", + pipeline=[ + { + "$search": { + "compound": { + "must": [{"text": {"query": "quick", "path": "title"}}], + "should": [ + {"text": {"query": "brown", "path": "title"}}, + {"text": {"query": "rabbit", "path": "title"}}, + ], + "minimumShouldMatch": 2, + } + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search compound with minimumShouldMatch equal to the should-clause count should " + "require every should clause to match", + ), +] + +# Property [Compound Score And Nesting]: a clause-level score boost, a +# compound-level score, and a compound nested at least three levels deep all +# execute and return the matched documents. +SEARCH_COMPOUND_SCORE_NESTING_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_clause_level_score_boost", + pipeline=[ + { + "$search": { + "compound": { + "must": [ + { + "text": { + "query": "quick", + "path": "title", + "score": {"boost": {"value": 2.0}}, + } + } + ] + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound should accept a clause-level score boost and still return the " + "matches", + ), + StageTestCase( + "compound_level_score", + pipeline=[ + { + "$search": { + "compound": { + "must": [{"text": {"query": "quick", "path": "title"}}], + "score": {"boost": {"value": 2.0}}, + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound should accept a compound-level score and still return the matches", + ), + StageTestCase( + "compound_nested_three_levels", + pipeline=[ + { + "$search": { + "compound": { + "must": [ + { + "compound": { + "must": [ + { + "compound": { + "must": [ + { + "text": { + "query": "quick", + "path": "title", + } + } + ] + } + } + ] + } + } + ] + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + 
Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound should execute a compound nested at least three levels deep", + ), +] + +# Property [Compound Single-Document Clause]: a compound clause accepts a single +# operator document in place of a one-element array, matching identically to the +# array-wrapped form. +SEARCH_COMPOUND_SINGLE_DOC_CLAUSE_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_single_doc_must", + pipeline=[ + {"$search": {"compound": {"must": {"text": {"query": "quick", "path": "title"}}}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search compound should accept a single operator document as a must clause, " + "matching identically to the array-wrapped form", + ), +] + +SEARCH_COMPOUND_TESTS = ( + SEARCH_COMPOUND_COMPOSITION_TESTS + + SEARCH_COMPOUND_SHOULD_ONLY_TESTS + + SEARCH_COMPOUND_MIN_SHOULD_MATCH_TESTS + + SEARCH_COMPOUND_SCORE_NESTING_TESTS + + SEARCH_COMPOUND_SINGLE_DOC_CLAUSE_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_COMPOUND_TESTS)) +def test_search_compound_cases(indexed_collection, test_case: StageTestCase): + """Test $search compound clause composition, should-only, minimumShouldMatch, scoring.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) + + +# Property [Compound minimumShouldMatch Bounds]: compound.minimumShouldMatch must +# be between 0 and the number of should clauses, so a negative value or one +# greater than the should-clause count is rejected. 
SEARCH_COMPOUND_MIN_SHOULD_MATCH_ERROR_TESTS: list[StageTestCase] = [
    # Each row: case id, out-of-range minimumShouldMatch, failure message. Every
    # case uses a compound with exactly one should clause, so the valid range
    # being probed is 0..1.
    StageTestCase(
        case_id,
        pipeline=[
            {
                "$search": {
                    "compound": {
                        "should": [{"text": {"query": "quick", "path": "title"}}],
                        "minimumShouldMatch": out_of_range,
                    }
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg=reason,
    )
    for case_id, out_of_range, reason in (
        (
            "min_should_match_negative",
            -1,
            "$search should reject a negative compound.minimumShouldMatch",
        ),
        (
            "min_should_match_over_clause_count",
            2,
            "$search should reject a compound.minimumShouldMatch greater than the "
            "should-clause count",
        ),
    )
]

# Property [Compound minimumShouldMatch Type]: compound.minimumShouldMatch must
# be an integer, so a non-integer type is rejected and null is treated as the default.
SEARCH_COMPOUND_MIN_SHOULD_MATCH_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"min_should_match_type_{type_name}",
        pipeline=[
            {
                "$search": {
                    "compound": {
                        "should": [{"text": {"query": "quick", "path": "title"}}],
                        "minimumShouldMatch": non_integer,
                    }
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$search should reject a {type_name} compound.minimumShouldMatch as a non-integer",
    )
    # Insertion order of the mapping fixes the order of the generated cases.
    for type_name, non_integer in {
        "string": "1",
        "double": 1.5,
        "bool": True,
        "object": {"a": 1},
        "array": [1],
        "objectid": ObjectId("0123456789abcdef01234567"),
        "datetime": datetime.datetime(2020, 1, 1),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
        "decimal128": DECIMAL128_ONE_AND_HALF,
    }.items()
]

# Property [Compound Requires A Clause]: a compound with none of the four clause
# types present is rejected as it composes no clauses.
+SEARCH_COMPOUND_EMPTY_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_empty", + pipeline=[{"$search": {"compound": {}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject an empty compound with no clause type present", + ), +] + +# Property [Compound Clause Array Non-Empty]: a present compound clause array must +# contain at least one clause, so an empty array in any clause slot is rejected. +SEARCH_COMPOUND_EMPTY_CLAUSE_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "compound_empty_must", + pipeline=[{"$search": {"compound": {"must": []}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject an empty compound must clause array", + ), +] + +SEARCH_COMPOUND_ERROR_TESTS = ( + SEARCH_COMPOUND_MIN_SHOULD_MATCH_ERROR_TESTS + + SEARCH_COMPOUND_MIN_SHOULD_MATCH_TYPE_ERROR_TESTS + + SEARCH_COMPOUND_EMPTY_ERROR_TESTS + + SEARCH_COMPOUND_EMPTY_CLAUSE_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_COMPOUND_ERROR_TESTS)) +def test_search_compound_errors(indexed_collection, test_case: StageTestCase): + """Test $search compound minimumShouldMatch and empty-clause validation errors.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_count.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_count.py new file mode 100644 index 000000000..f5fef1cad --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_count.py @@ -0,0 +1,296 @@ +"""Tests for the $search count option and count metadata.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from 
documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Eq, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + FLOAT_INFINITY, + FLOAT_NEGATIVE_INFINITY, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [Count Option Default]: an empty count document is accepted (the +# default lowerBound behavior) and the search still returns its matches. +SEARCH_COUNT_OPTION_TESTS: list[StageTestCase] = [ + StageTestCase( + "count_empty_document", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}, "count": {}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept an empty count document and still return its matches", + ), +] + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_COUNT_OPTION_TESTS)) +def test_search_count_option_cases(indexed_collection, test_case: StageTestCase): + """Test $search accepts the count option document.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) + + +# Property [Count Metadata]: a count option exposes the result count through +# $$SEARCH_META.count under the requested type key, and accepts count.threshold +# for both count types. 
SEARCH_COUNT_TESTS: list[StageTestCase] = [
    # Each row: case id, the count option document sent to $search, the key
    # expected under $$SEARCH_META.count, and the failure message. All cases
    # run the same text query and expect a count of 3.
    StageTestCase(
        case_id,
        pipeline=[
            {"$search": {"text": {"query": "quick", "path": "title"}, "count": count_option}},
            {"$limit": 1},
            {"$project": {"_id": 0, "count": "$$SEARCH_META.count"}},
        ],
        expected={"count": {count_key: Eq(Int64(3))}},
        msg=reason,
    )
    for case_id, count_option, count_key, reason in (
        (
            "count_total",
            {"type": "total"},
            "total",
            "$search should expose an exact total result count through $$SEARCH_META",
        ),
        (
            "count_lower_bound",
            {"type": "lowerBound"},
            "lowerBound",
            "$search should expose a lowerBound result count through $$SEARCH_META",
        ),
        (
            "count_total_threshold",
            {"type": "total", "threshold": 10_000},
            "total",
            "$search should accept count.threshold for a total count and still expose "
            "the exact count",
        ),
        (
            "count_lower_bound_threshold",
            {"type": "lowerBound", "threshold": 10_000},
            "lowerBound",
            "$search should accept count.threshold for a lowerBound count and still "
            "expose the exact count",
        ),
        (
            "count_type_null_default",
            {"type": None},
            "lowerBound",
            "$search should treat a null count.type as the unset default, exposing a "
            "lowerBound count",
        ),
        (
            "count_threshold_null_default",
            {"type": "total", "threshold": None},
            "total",
            "$search should treat a null count.threshold as the unset default and "
            "still expose the exact count",
        ),
    )
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_COUNT_TESTS))
def test_search_count_metadata(indexed_collection, test_case: StageTestCase):
    """Run each count case and check the count projected from $$SEARCH_META.count."""
    command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    response = execute_command(indexed_collection, command)
    assertResult(response, expected=test_case.expected, msg=test_case.msg)


# Property [Count Type Enum]: count.type must be one of the recognized count-type
# strings, so an unrecognized string or a non-string type is rejected (null is
# the default).
SEARCH_COUNT_TYPE_ENUM_ERROR_TESTS: list[StageTestCase] = [
    # Each row: case id, the rejected count.type value, failure message. The
    # unrecognized-string case comes first, followed by one case per non-string
    # BSON type.
    StageTestCase(
        case_id,
        pipeline=[
            {
                "$search": {
                    "text": {"query": "quick", "path": "title"},
                    "count": {"type": rejected_type},
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg=reason,
    )
    for case_id, rejected_type, reason in (
        (
            "count_type_bogus",
            "bogus",
            "$search should reject a count.type outside the recognized set of count types",
        ),
        *(
            (
                f"count_type_type_{type_name}",
                non_string,
                f"$search should reject a {type_name} count.type as a non-string",
            )
            for type_name, non_string in {
                "int32": 1,
                "int64": Int64(1),
                "double": 1.5,
                "bool": True,
                "object": {"a": 1},
                "array": ["total"],
                "objectid": ObjectId("0123456789abcdef01234567"),
                "datetime": datetime.datetime(2020, 1, 1),
                "timestamp": Timestamp(1, 1),
                "binary": Binary(b"\x01\x02\x03"),
                "regex": Regex(".*", "i"),
                "code": Code("function(){}"),
                "minkey": MinKey(),
                "maxkey": MaxKey(),
                "decimal128": DECIMAL128_ONE_AND_HALF,
            }.items()
        ),
    )
]

# Property [Count threshold Value And Type]: count.threshold must be a
# non-negative integer (null is the default).
+SEARCH_COUNT_THRESHOLD_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "count_threshold_negative", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}, "count": {"threshold": -1}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a negative count.threshold", + ), + StageTestCase( + "count_threshold_positive_infinity", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "count": {"threshold": FLOAT_INFINITY}, + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject an infinite count.threshold as not fitting in a 32-bit " + "integer", + ), + StageTestCase( + "count_threshold_negative_infinity", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "count": {"threshold": FLOAT_NEGATIVE_INFINITY}, + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a negative-infinite count.threshold as not fitting in a " + "32-bit integer", + ), + *[ + StageTestCase( + f"count_threshold_type_{tid}", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "count": {"threshold": val}, + } + }, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} count.threshold as a non-integer", + ) + for tid, val in [ + ("string", "10"), + ("double", 1.5), + ("bool", True), + ("object", {"a": 1}), + ("array", [10]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +SEARCH_COUNT_ERROR_TESTS = SEARCH_COUNT_TYPE_ENUM_ERROR_TESTS + SEARCH_COUNT_THRESHOLD_ERROR_TESTS + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_COUNT_ERROR_TESTS)) +def test_search_count_errors(indexed_collection, test_case: 
StageTestCase): + """Test $search count type/enum and threshold validation errors.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_equals.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_equals.py new file mode 100644 index 000000000..472643068 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_equals.py @@ -0,0 +1,409 @@ +"""Tests for the $search equals operator.""" + +from __future__ import annotations + +import datetime +import uuid + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + create_search_index, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_FROM_INT64_MAX, + DOUBLE_MAX_SAFE_INTEGER, + DOUBLE_NEGATIVE_ZERO, + DOUBLE_PRECISION_LOSS, + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, + INT64_MAX, +) + +pytestmark = pytest.mark.requires(search=True) + + +_EQUALS_OBJECT_ID = ObjectId("0123456789abcdef01234567") + +_EQUALS_UUID = uuid.UUID("12345678-1234-4567-8901-123456789abc") + +_EQUALS_DATE = 
datetime.datetime(2020, 1, 1) + +_EQUALS_DOCS = [ + {"_id": 1, "b": True}, + {"_id": 2, "b": False}, + {"_id": 3, "oid": _EQUALS_OBJECT_ID}, + {"_id": 4, "dt": _EQUALS_DATE}, + {"_id": 5, "s": "apple"}, # stored on a token-mapped path + {"_id": 6, "nullf": None}, + {"_id": 7, "u": Binary.from_uuid(_EQUALS_UUID)}, + {"_id": 8, "num": 20}, # int32 + {"_id": 9, "num": Int64(20)}, # int64 + {"_id": 10, "num": 20.0}, # double + {"_id": 11, "big": Int64(DOUBLE_PRECISION_LOSS)}, # 2^53+1 (no exact double) + {"_id": 12, "big": float(DOUBLE_MAX_SAFE_INTEGER)}, # 2^53 + {"_id": 13, "imax": INT64_MAX}, # int64-max + {"_id": 14, "imax": DOUBLE_FROM_INT64_MAX}, # int64-max's double approximation + {"_id": 15, "zero": DOUBLE_ZERO}, + {"_id": 16, "zero": DOUBLE_NEGATIVE_ZERO}, + {"_id": 17, "nf": FLOAT_NAN}, + {"_id": 18, "nf": FLOAT_INFINITY}, + {"_id": 19, "nf": FLOAT_NEGATIVE_INFINITY}, +] + +_EQUALS_INDEX_DEFINITION = { + "mappings": { + "dynamic": False, + "fields": { + "b": {"type": "boolean"}, + "oid": {"type": "objectId"}, + "dt": {"type": "date"}, + "s": {"type": "token"}, + "txt": {"type": "string"}, + "nf": {"type": "number"}, + "nullf": {"type": "token"}, + "u": {"type": "uuid"}, + "num": {"type": "number"}, + "big": {"type": "number"}, + "imax": {"type": "number"}, + "zero": {"type": "number"}, + }, + } +} + + +@pytest.fixture(scope="module") +def equals_collection(engine_client, worker_id): + """A module-scoped collection with a static search index mapping a field of + each equals-supported value type (boolean, objectId, date, token string, null, + uuid, and number), shared read-only across the equals cases so the index is + built and polled once.""" + db_name = fixtures.generate_database_name("stages_search_equals", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["equals"] + coll.insert_many(_EQUALS_DOCS) + create_search_index(coll, _EQUALS_INDEX_DEFINITION) + yield coll + 
fixtures.cleanup_database(engine_client, db_name) + + +# Property [Equals Value-Type Match]: equals returns the document storing a value +# exactly equal to the queried value, for each supported value type. +SEARCH_EQUALS_VALUE_TYPE_TESTS: list[StageTestCase] = [ + StageTestCase( + "equals_bool_true", + pipeline=[{"$search": {"equals": {"path": "b", "value": True}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search equals should match the document storing the queried boolean true", + ), + StageTestCase( + "equals_score_boost", + pipeline=[ + { + "$search": { + "equals": {"path": "b", "value": True, "score": {"boost": {"value": 2.0}}} + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search equals should accept a score modifier and still return its match", + ), + StageTestCase( + "equals_bool_false", + pipeline=[{"$search": {"equals": {"path": "b", "value": False}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 2)]}, + msg="$search equals should match the document storing the queried boolean false", + ), + StageTestCase( + "equals_object_id", + pipeline=[ + {"$search": {"equals": {"path": "oid", "value": _EQUALS_OBJECT_ID}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 3)]}, + msg="$search equals should match the document storing the queried ObjectId", + ), + StageTestCase( + "equals_date", + pipeline=[ + {"$search": {"equals": {"path": "dt", "value": _EQUALS_DATE}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 4)]}, + msg="$search equals should match the document storing the queried date", + ), + StageTestCase( + "equals_string_token", + pipeline=[ + {"$search": {"equals": {"path": "s", "value": "apple"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 5)]}, + msg="$search equals should match the document storing the queried string on a " + "token-mapped path", + ), + StageTestCase( + "equals_null", + pipeline=[ + {"$search": 
{"equals": {"path": "nullf", "value": None}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 6)]}, + msg="$search equals should match the document storing the queried null value", + ), + StageTestCase( + "equals_uuid", + pipeline=[ + {"$search": {"equals": {"path": "u", "value": Binary.from_uuid(_EQUALS_UUID)}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 7)]}, + msg="$search equals should match the document storing the queried UUID (Binary subtype 4)", + ), +] + +# Property [Equals Lossy Double Numeric Equality]: equals compares numbers in +# double space, so all numeric representations of one value match each other and +# a value with no exact double matches every representation that narrows to the +# same double. +SEARCH_EQUALS_NUMERIC_TESTS: list[StageTestCase] = [ + StageTestCase( + "equals_int_matches_all_representations", + pipeline=[{"$search": {"equals": {"path": "num", "value": 20}}}], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 8), + Contains("_id", 9), + Contains("_id", 10), + ] + }, + msg="$search equals with an int value should match the int32, int64, and double " + "representations of the same integer", + ), + StageTestCase( + "equals_double_matches_all_representations", + pipeline=[ + {"$search": {"equals": {"path": "num", "value": 20.0}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 8), + Contains("_id", 9), + Contains("_id", 10), + ] + }, + msg="$search equals with a double value should match the int32, int64, and double " + "representations of the same integer", + ), + StageTestCase( + "equals_int64_2pow53_plus_1", + pipeline=[ + {"$search": {"equals": {"path": "big", "value": Int64(DOUBLE_PRECISION_LOSS)}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 11), Contains("_id", 12)]}, + msg="$search equals with an int64 having no exact double should match both stored " + "representations that narrow to the same double", + ), + StageTestCase( + 
"equals_double_2pow53", + pipeline=[ + {"$search": {"equals": {"path": "big", "value": float(DOUBLE_MAX_SAFE_INTEGER)}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 11), Contains("_id", 12)]}, + msg="$search equals with a double should match both stored representations that " + "narrow to the same double", + ), + StageTestCase( + "equals_int64_max", + pipeline=[ + {"$search": {"equals": {"path": "imax", "value": INT64_MAX}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 13), Contains("_id", 14)]}, + msg="$search equals with int64-max should match int64-max and its double approximation", + ), +] + +# Property [Equals Negative Zero]: equals treats negative zero as equal to positive +# zero, so a 0.0 or -0.0 query each matches both a stored 0.0 and a stored -0.0. +SEARCH_EQUALS_NEGATIVE_ZERO_TESTS: list[StageTestCase] = [ + StageTestCase( + "equals_zero_double_positive", + pipeline=[ + {"$search": {"equals": {"path": "zero", "value": DOUBLE_ZERO}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 15), Contains("_id", 16)]}, + msg="$search equals with positive-zero double should match both stored 0.0 and -0.0", + ), + StageTestCase( + "equals_zero_double_negative", + pipeline=[ + {"$search": {"equals": {"path": "zero", "value": DOUBLE_NEGATIVE_ZERO}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 15), Contains("_id", 16)]}, + msg="$search equals with negative-zero double should match both stored 0.0 and -0.0", + ), +] + +# Property [Equals Non-Finite No Match]: equals never matches a stored NaN, +inf, +# or -inf, unlike in which matches them. 
SEARCH_EQUALS_NON_FINITE_TESTS: list[StageTestCase] = [
    # Each row: case id, the non-finite value queried on the "nf" path, failure
    # message. Every case expects an empty first batch.
    StageTestCase(
        case_id,
        pipeline=[
            {"$search": {"equals": {"path": "nf", "value": non_finite}}},
        ],
        expected={"cursor.firstBatch": Len(0)},
        msg=reason,
    )
    for case_id, non_finite, reason in (
        ("equals_nan", FLOAT_NAN, "$search equals should never match a stored NaN"),
        (
            "equals_positive_infinity",
            FLOAT_INFINITY,
            "$search equals should never match a stored +inf",
        ),
        (
            "equals_negative_infinity",
            FLOAT_NEGATIVE_INFINITY,
            "$search equals should never match a stored -inf",
        ),
    )
]

# Property [Equals doesNotAffect Option]: equals recognizes a string doesNotAffect
# option (unlike text or near, which reject the field), accepting it and still
# returning its match.
SEARCH_EQUALS_DOES_NOT_AFFECT_TESTS: list[StageTestCase] = [
    StageTestCase(
        "equals_does_not_affect_string",
        pipeline=[
            {
                "$search": {
                    "equals": {"path": "b", "value": True, "doesNotAffect": "score"},
                }
            },
        ],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
        msg=(
            "$search equals should accept a string doesNotAffect option and still "
            "return its match"
        ),
    ),
]

# Success cases from every equals property, in declaration order.
SEARCH_EQUALS_TESTS = [
    *SEARCH_EQUALS_VALUE_TYPE_TESTS,
    *SEARCH_EQUALS_NUMERIC_TESTS,
    *SEARCH_EQUALS_NEGATIVE_ZERO_TESTS,
    *SEARCH_EQUALS_NON_FINITE_TESTS,
    *SEARCH_EQUALS_DOES_NOT_AFFECT_TESTS,
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_EQUALS_TESTS))
def test_search_equals_cases(equals_collection, test_case: StageTestCase):
    """Run each equals success case and check the raw aggregate response."""
    command = {
        "aggregate": equals_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    response = execute_command(equals_collection, command)
    assertResult(response, expected=test_case.expected, msg=test_case.msg, raw_res=True)


# Property [Equals Required Fields]: equals.path and equals.value are both
# required, so a spec omitting either is rejected.
SEARCH_EQUALS_REQUIRED_FIELD_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        pipeline=[{"$search": {"equals": incomplete_spec}}],
        error_code=UNKNOWN_ERROR,
        msg=reason,
    )
    for case_id, incomplete_spec, reason in (
        (
            "equals_path_missing",
            {"value": True},
            "$search equals should reject a spec that omits the required path",
        ),
        (
            "equals_value_missing",
            {"path": "b"},
            "$search equals should reject a spec that omits the required value",
        ),
    )
]

# Property [Equals Value Type Rejection]: equals.value rejects any type outside
# the supported set (bool, objectId, number, string, date, uuid, null).
SEARCH_EQUALS_VALUE_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"equals_value_type_{type_name}",
        pipeline=[{"$search": {"equals": {"path": "num", "value": unsupported}}}],
        error_code=UNKNOWN_ERROR,
        msg=f"$search equals should reject a {type_name} value as an unsupported type",
    )
    # Insertion order of the mapping fixes the order of the generated cases.
    for type_name, unsupported in {
        "decimal128": DECIMAL128_ONE_AND_HALF,
        "timestamp": Timestamp(1, 1),
        "array": [1, 2],
        "object": {"a": 1},
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
    }.items()
]

# Property [Equals Binary Subtype]: a Binary equals.value is accepted only as a
# UUID (subtype 4), so a Binary of any other subtype is rejected.
SEARCH_EQUALS_BINARY_SUBTYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "equals_binary_non_uuid",
        pipeline=[
            {
                "$search": {
                    "equals": {"path": "u", "value": Binary(b"\x01\x02\x03")},
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg="$search equals should reject a Binary value that is not UUID subtype 4",
    ),
]

# Property [Equals Analyzed Path]: a string equals.value requires a token-mapped
# path.
+SEARCH_EQUALS_ANALYZED_PATH_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "equals_string_analyzed_path", + pipeline=[ + {"$search": {"equals": {"path": "txt", "value": "quick brown"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search equals should reject a string value against an analyzed non-token path", + ), +] + +SEARCH_EQUALS_ERROR_TESTS = ( + SEARCH_EQUALS_REQUIRED_FIELD_ERROR_TESTS + + SEARCH_EQUALS_VALUE_TYPE_ERROR_TESTS + + SEARCH_EQUALS_BINARY_SUBTYPE_ERROR_TESTS + + SEARCH_EQUALS_ANALYZED_PATH_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_EQUALS_ERROR_TESTS)) +def test_search_equals_errors(equals_collection, test_case: StageTestCase): + """Test $search equals rejects unsupported value types, non-UUID Binary, and analyzed paths.""" + result = execute_command( + equals_collection, + {"aggregate": equals_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_exists.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_exists.py new file mode 100644 index 000000000..7a8a15420 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_exists.py @@ -0,0 +1,177 @@ +"""Tests for the $search exists operator.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from 
documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [Exists Field Presence]: exists selects only the documents where the +# named field is present, and inside a compound mustNot clause selects the +# complement (the absent-field documents). +SEARCH_EXISTS_PRESENCE_TESTS: list[StageTestCase] = [ + StageTestCase( + "exists_field_present", + pipeline=[{"$search": {"exists": {"path": "body"}}}], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$search exists should select only the documents where the named field is present", + ), + StageTestCase( + "exists_score_boost", + pipeline=[ + {"$search": {"exists": {"path": "body", "score": {"boost": {"value": 2.0}}}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$search exists should accept a score modifier and still return its matches", + ), + StageTestCase( + "exists_nonexistent_field", + pipeline=[{"$search": {"exists": {"path": "nope"}}}], + expected={"cursor.firstBatch": Len(0)}, + msg="$search exists should match nothing for a field no document carries", + ), + StageTestCase( + "exists_compound_must_not_complement", + pipeline=[ + {"$search": {"compound": {"mustNot": [{"exists": {"path": "body"}}]}}}, + ], + expected={ + "cursor.firstBatch": [Len(16), *[Contains("_id", _id) for _id in list(range(3, 19))]] + }, + msg="$search exists inside a compound mustNot should select the complement of the " + "present-field documents", + ), +] + +# Property [Exists Path No Validation]: an empty or dotted absent path resolves to +# no covered field and returns no documents without field-path validation or error. 
SEARCH_EXISTS_PATH_NO_VALIDATION_TESTS: list[StageTestCase] = [
    # Each row: case id, the path handed to exists, failure message. Every case
    # expects an empty first batch rather than an error.
    StageTestCase(
        case_id,
        pipeline=[{"$search": {"exists": {"path": unresolved_path}}}],
        expected={"cursor.firstBatch": Len(0)},
        msg=reason,
    )
    for case_id, unresolved_path, reason in (
        (
            "exists_path_empty",
            "",
            "$search exists should treat an empty path as an absent field and match "
            "nothing without error",
        ),
        (
            "exists_path_dotted",
            "a.b",
            "$search exists should treat a dotted absent path as an absent field and "
            "match nothing without field-path validation",
        ),
    )
]

# Success cases from both exists properties, in declaration order.
SEARCH_EXISTS_TESTS = [
    *SEARCH_EXISTS_PRESENCE_TESTS,
    *SEARCH_EXISTS_PATH_NO_VALIDATION_TESTS,
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_EXISTS_TESTS))
def test_search_exists_cases(indexed_collection, test_case: StageTestCase):
    """Run each exists success case and check the raw aggregate response."""
    command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    response = execute_command(indexed_collection, command)
    assertResult(response, expected=test_case.expected, msg=test_case.msg, raw_res=True)


# Property [Exists Path Type Rejection]: exists.path is string-only, so a path of
# any non-string type - including the document forms and the array of paths that
# text and wildcard accept - is rejected.
SEARCH_EXISTS_PATH_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"exists_path_type_{type_name}",
        pipeline=[{"$search": {"exists": {"path": non_string_path}}}],
        error_code=UNKNOWN_ERROR,
        msg=f"$search exists should reject a {type_name} path as a non-string type",
    )
    # One case per non-string BSON type; insertion order of the mapping fixes
    # the order of the generated cases.
    for type_name, non_string_path in {
        "object": {"value": "body"},
        "array": ["body"],
        "int32": 1,
        "int64": Int64(1),
        "double": 1.5,
        "bool": True,
        "objectid": ObjectId("0123456789abcdef01234567"),
        "datetime": datetime.datetime(2020, 1, 1),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
        "decimal128": DECIMAL128_ONE_AND_HALF,
    }.items()
]

# Property [Exists Path Required]: a missing or null exists.path is treated as
# absent and produces a spec validation error.
+SEARCH_EXISTS_UNKNOWN_FIELD_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "exists_unknown_field", + pipeline=[{"$search": {"exists": {"path": "body", "bogus": 1}}}], + error_code=UNKNOWN_ERROR, + msg="$search exists should reject an unrecognized sub-field", + ), +] + +SEARCH_EXISTS_ERROR_TESTS = ( + SEARCH_EXISTS_PATH_TYPE_ERROR_TESTS + + SEARCH_EXISTS_PATH_REQUIRED_ERROR_TESTS + + SEARCH_EXISTS_UNKNOWN_FIELD_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_EXISTS_ERROR_TESTS)) +def test_search_exists_errors(indexed_collection, test_case: StageTestCase): + """Test $search exists rejects non-string, missing/null, and unknown-field paths.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_facet.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_facet.py new file mode 100644 index 000000000..9a8ecaffb --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_facet.py @@ -0,0 +1,94 @@ +"""Tests for the $search facet collector.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + create_search_index, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) + +pytestmark = 
pytest.mark.requires(search=True) + + +_FACET_DOCS = [ + {"_id": 1, "title": "the quick brown fox", "cat": "a"}, + {"_id": 2, "title": "slow green turtle", "cat": "b"}, + {"_id": 3, "title": "a quick quick rabbit", "cat": "a"}, +] + +_FACET_INDEX_DEFINITION = { + "mappings": { + "dynamic": False, + "fields": { + "title": {"type": "string"}, + "cat": {"type": "token"}, + }, + } +} + + +@pytest.fixture(scope="module") +def facet_collection(engine_client, worker_id): + """A module-scoped collection with a static search index mapping a + text-analyzed field driving the inner operator and a token-mapped field that + is facetable, shared read-only across the facet cases so the index is built + and polled once.""" + db_name = fixtures.generate_database_name("stages_search_facet", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["facet"] + coll.insert_many(_FACET_DOCS) + create_search_index(coll, _FACET_INDEX_DEFINITION) + yield coll + fixtures.cleanup_database(engine_client, db_name) + + +# Property [Facet Collector Recognition]: the facet collector is recognized in the +# operator slot and executed, returning the documents selected by its inner search +# operator. 
+SEARCH_FACET_RECOGNITION_TESTS: list[StageTestCase] = [ + StageTestCase( + "facet_collector_executes", + pipeline=[ + { + "$search": { + "facet": { + "operator": {"text": {"query": "quick", "path": "title"}}, + "facets": {"catF": {"type": "string", "path": "cat"}}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 3)]}, + msg="$search should recognize the facet collector and execute it, returning the " + "documents selected by its inner operator", + ), +] + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_FACET_RECOGNITION_TESTS)) +def test_search_facet_recognition(facet_collection, test_case: StageTestCase): + """Test $search recognizes and executes the facet collector.""" + result = execute_command( + facet_collection, + {"aggregate": facet_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_geo.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_geo.py new file mode 100644 index 000000000..240a18baa --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_geo.py @@ -0,0 +1,578 @@ +"""Tests for the $search geoWithin and geoShape operators.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + create_search_index, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from 
documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DOUBLE_ZERO, +) + +pytestmark = pytest.mark.requires(search=True) + + +_GEO_DOCS = [ + { + "_id": 1, + "loc": {"type": "Point", "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO]}, + "shaped": {"type": "Point", "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO]}, + }, + { + "_id": 2, + "loc": {"type": "Point", "coordinates": [1.0, 1.0]}, + "shaped": {"type": "Point", "coordinates": [1.0, 1.0]}, + }, + { + "_id": 3, + "loc": {"type": "Point", "coordinates": [5.0, 5.0]}, + "shaped": {"type": "Point", "coordinates": [5.0, 5.0]}, + }, + { + "_id": 4, + "loc": {"type": "Point", "coordinates": [10.0, 10.0]}, + "shaped": {"type": "Point", "coordinates": [10.0, 10.0]}, + }, + { + "_id": 5, + "loc": {"type": "Point", "coordinates": [-3.0, -3.0]}, + "shaped": {"type": "Point", "coordinates": [-3.0, -3.0]}, + }, +] + +_GEO_INDEX_DEFINITION = { + "mappings": { + "dynamic": False, + "fields": { + "loc": {"type": "geo"}, + "shaped": {"type": "geo", "indexShapes": True}, + }, + } +} + + +@pytest.fixture(scope="module") +def geo_collection(engine_client, worker_id): + """A module-scoped collection with a static search index mapping a geo-typed + field, shared read-only across the geoWithin cases so the index is built and + polled once.""" + db_name = fixtures.generate_database_name("stages_search_geo", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["geo"] + coll.insert_many(_GEO_DOCS) + create_search_index(coll, _GEO_INDEX_DEFINITION) + yield coll + fixtures.cleanup_database(engine_client, db_name) + + +# Property [GeoWithin Region Matching]: geoWithin selects exactly the documents +# whose stored point lies inside the requested box or circle region. 
SEARCH_GEO_WITHIN_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        # Every case queries the "loc" path; only the region keys differ.
        pipeline=[{"$search": {"geoWithin": {"path": "loc", **region}}}],
        # The batch must hold exactly the listed _ids, in any order.
        expected={
            "cursor.firstBatch": [
                Len(len(matched_ids)),
                *[Contains("_id", doc_id) for doc_id in matched_ids],
            ]
        },
        msg=failure_msg,
    )
    for case_id, region, matched_ids, failure_msg in [
        (
            "geo_within_box",
            {
                "box": {
                    "bottomLeft": {"type": "Point", "coordinates": [-1.0, -1.0]},
                    "topRight": {"type": "Point", "coordinates": [6.0, 6.0]},
                },
            },
            (1, 2, 3),
            "$search geoWithin should match the documents whose point lies inside the box region",
        ),
        (
            "geo_within_score_boost",
            {
                "box": {
                    "bottomLeft": {"type": "Point", "coordinates": [-1.0, -1.0]},
                    "topRight": {"type": "Point", "coordinates": [6.0, 6.0]},
                },
                "score": {"boost": {"value": 2.0}},
            },
            (1, 2, 3),
            "$search geoWithin should accept a score modifier and still return its matches",
        ),
        (
            "geo_within_circle",
            {
                "circle": {
                    "center": {
                        "type": "Point",
                        "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO],
                    },
                    "radius": 200_000,
                },
            },
            (1, 2),
            "$search geoWithin should match the documents whose point lies inside the circle "
            "region",
        ),
        (
            "geo_within_geometry_polygon",
            {
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [
                        [
                            [-1.0, -1.0],
                            [-1.0, 3.0],
                            [3.0, 3.0],
                            [3.0, -1.0],
                            [-1.0, -1.0],
                        ]
                    ],
                },
            },
            (1, 2),
            "$search geoWithin should match the documents whose point lies inside the geometry "
            "polygon region",
        ),
    ]
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_GEO_WITHIN_TESTS))
def test_search_geo_within_cases(geo_collection, test_case: StageTestCase):
    """Test $search geoWithin box, circle, and geometry region matching over a geo path."""
    command = {
        "aggregate": geo_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    response = execute_command(geo_collection, command)
    assertResult(
        response,
        expected=test_case.expected,
        msg=test_case.msg,
        raw_res=True,
    )


def _geo_shape_match_stage(relation, **extra):
    """Build a $search geoShape stage on ``shaped`` against the shared query polygon.

    A fresh polygon is built on every call so no test case shares mutable
    state with another. ``extra`` keys (such as ``score``) are appended after
    ``geometry``.
    """
    polygon = {
        "type": "Polygon",
        "coordinates": [
            [
                [-1.0, -1.0],
                [-1.0, 3.0],
                [3.0, 3.0],
                [3.0, -1.0],
                [-1.0, -1.0],
            ]
        ],
    }
    return {
        "$search": {
            "geoShape": {"path": "shaped", "relation": relation, "geometry": polygon, **extra}
        }
    }


# Property [GeoShape Region Matching]: over a geo path indexed with
# indexShapes=true, geoShape selects exactly the documents whose stored point
# satisfies the requested relation to the geometry.
SEARCH_GEO_SHAPE_MATCH_TESTS: list[StageTestCase] = [
    StageTestCase(
        "geo_shape_within_polygon",
        pipeline=[_geo_shape_match_stage("within")],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]},
        msg="$search geoShape within should match the points inside the polygon and exclude "
        "those outside it",
    ),
    StageTestCase(
        "geo_shape_score_boost",
        pipeline=[_geo_shape_match_stage("within", score={"boost": {"value": 2.0}})],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]},
        msg="$search geoShape should accept a score modifier and still return its matches",
    ),
    StageTestCase(
        "geo_shape_intersects_polygon",
        pipeline=[_geo_shape_match_stage("intersects")],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]},
        msg="$search geoShape intersects should match the points that intersect the polygon",
    ),
    StageTestCase(
        "geo_shape_disjoint_polygon",
        pipeline=[_geo_shape_match_stage("disjoint")],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 3),
                Contains("_id", 4),
                Contains("_id", 5),
            ]
        },
        msg="$search geoShape disjoint should match the points that do not intersect the polygon",
    ),
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_GEO_SHAPE_MATCH_TESTS))
def test_search_geo_shape_cases(geo_collection, test_case: StageTestCase):
    """Test $search geoShape relation matching over the indexShapes geo path."""
    command = {
        "aggregate": geo_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    response = execute_command(geo_collection, command)
    assertResult(
        response,
        expected=test_case.expected,
        msg=test_case.msg,
        raw_res=True,
    )


# Property [GeoWithin Coordinate Validation]: geoWithin validates shape
# coordinates before the index check.
SEARCH_GEO_WITHIN_COORDINATE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        # Every case queries the "loc" path; only the invalid shape differs.
        pipeline=[{"$search": {"geoWithin": {"path": "loc", **invalid_shape}}}],
        error_code=UNKNOWN_ERROR,
        msg=failure_msg,
    )
    for case_id, invalid_shape, failure_msg in [
        (
            "geo_within_invalid_latitude",
            {
                "box": {
                    "bottomLeft": {"type": "Point", "coordinates": [DOUBLE_ZERO, 91.0]},
                    "topRight": {"type": "Point", "coordinates": [6.0, 6.0]},
                },
            },
            "$search geoWithin should reject a latitude outside the valid range",
        ),
        (
            "geo_within_invalid_longitude",
            {
                "box": {
                    "bottomLeft": {
                        "type": "Point",
                        "coordinates": [181.0, DOUBLE_ZERO],
                    },
                    "topRight": {"type": "Point", "coordinates": [6.0, 6.0]},
                },
            },
            "$search geoWithin should reject a longitude outside the valid range",
        ),
        (
            "geo_within_negative_radius",
            {
                "circle": {
                    "center": {
                        "type": "Point",
                        "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO],
                    },
                    "radius": -1,
                },
            },
            "$search geoWithin should reject a negative circle radius",
        ),
        (
            "geo_within_coordinate_too_few_numbers",
            {
                "box": {
                    "bottomLeft": {"type": "Point", "coordinates": [DOUBLE_ZERO]},
                    "topRight": {"type": "Point", "coordinates": [6.0, 6.0]},
                },
            },
            "$search geoWithin should reject a coordinate with fewer than two numbers",
        ),
    ]
]

# Property [GeoWithin Shape Required]: a geoWithin with no shape key produces an
# error.
SEARCH_GEO_WITHIN_SHAPE_REQUIRED_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "geo_within_no_shape",
        # Only the path is supplied: no box, circle, or geometry key.
        pipeline=[{"$search": {"geoWithin": {"path": "loc"}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search geoWithin should reject a spec with no shape key",
    ),
]

# Coordinate-validation cases first, then the missing-shape case.
SEARCH_GEO_WITHIN_ERROR_TESTS = [
    *SEARCH_GEO_WITHIN_COORDINATE_ERROR_TESTS,
    *SEARCH_GEO_WITHIN_SHAPE_REQUIRED_ERROR_TESTS,
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_GEO_WITHIN_ERROR_TESTS))
def test_search_geo_within_errors(geo_collection, test_case: StageTestCase):
    """Run one invalid geoWithin spec and assert the command fails with the expected code.

    Covers invalid coordinates and a spec with no shape key.
    """
    command = {
        "aggregate": geo_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    response = execute_command(geo_collection, command)
    assertResult(response, error_code=test_case.error_code, msg=test_case.msg)


# Property [GeoShape Geometry Validation]: geoShape validates the relation enum
# and geometry shape before the index check.
SEARCH_GEO_SHAPE_GEOMETRY_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        # These cases target "shaped", the path indexed with indexShapes=true.
        # The plain geo path "loc" is rejected by geoShape even for a valid
        # relation and geometry (see geo_shape_path_not_index_shapes), and only
        # the error code is asserted, so on "loc" these cases would pass
        # without relation/geometry validation ever being exercised.
        pipeline=[
            {
                "$search": {
                    "geoShape": {
                        "path": "shaped",
                        "relation": relation,
                        "geometry": geometry,
                    }
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg=failure_msg,
    )
    for case_id, relation, geometry, failure_msg in [
        (
            "geo_shape_invalid_relation",
            # Valid closed polygon; only the relation is wrong.
            "bogus",
            {
                "type": "Polygon",
                "coordinates": [
                    [
                        [DOUBLE_ZERO, DOUBLE_ZERO],
                        [DOUBLE_ZERO, 5.0],
                        [5.0, 5.0],
                        [5.0, DOUBLE_ZERO],
                        [DOUBLE_ZERO, DOUBLE_ZERO],
                    ]
                ],
            },
            "$search geoShape should reject a relation outside the allowed set",
        ),
        (
            "geo_shape_polygon_too_few_positions",
            # Valid relation; the ring has only three positions.
            "intersects",
            {
                "type": "Polygon",
                "coordinates": [
                    [
                        [DOUBLE_ZERO, DOUBLE_ZERO],
                        [DOUBLE_ZERO, 5.0],
                        [DOUBLE_ZERO, DOUBLE_ZERO],
                    ]
                ],
            },
            "$search geoShape should reject a polygon with fewer than four positions",
        ),
        (
            "geo_shape_within_with_point",
            # Relation and geometry are each well-formed; the combination is not allowed.
            "within",
            {
                "type": "Point",
                "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO],
            },
            "$search geoShape should reject the within relation applied to a Point geometry",
        ),
    ]
]

# Property [GeoShape IndexShapes Requirement]: geoShape against a geo path not
# indexed with indexShapes=true produces an error.
+SEARCH_GEO_SHAPE_INDEX_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "geo_shape_path_not_index_shapes", + pipeline=[ + { + "$search": { + "geoShape": { + "path": "loc", + "relation": "intersects", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [DOUBLE_ZERO, DOUBLE_ZERO], + [DOUBLE_ZERO, 5.0], + [5.0, 5.0], + [5.0, DOUBLE_ZERO], + [DOUBLE_ZERO, DOUBLE_ZERO], + ] + ], + }, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search geoShape should reject a geo path not indexed with indexShapes=true", + ), +] + +SEARCH_GEO_SHAPE_ERROR_TESTS = ( + SEARCH_GEO_SHAPE_GEOMETRY_ERROR_TESTS + SEARCH_GEO_SHAPE_INDEX_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_GEO_SHAPE_ERROR_TESTS)) +def test_search_geo_shape_errors(geo_collection, test_case: StageTestCase): + """Test $search geoShape rejects invalid geometry and a non-indexShapes geo path.""" + result = execute_command( + geo_collection, + {"aggregate": geo_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_highlight.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_highlight.py new file mode 100644 index 000000000..600afa7ff --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_highlight.py @@ -0,0 +1,253 @@ +"""Tests for the $search highlight option and output.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from 
documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Eq, + Gt, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [Highlight Path Forms]: highlight.path accepts a {wildcard} document +# and an array of paths (the string form is owned by the searchHighlights output +# property). +SEARCH_HIGHLIGHT_PATH_FORM_TESTS: list[StageTestCase] = [ + StageTestCase( + "highlight_path_wildcard", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "highlight": {"path": {"wildcard": "*"}}, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept a {wildcard} highlight.path and still return its matches", + ), + StageTestCase( + "highlight_path_array", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "highlight": {"path": ["title", "body"]}, + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept an array highlight.path and still return its matches", + ), +] + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_HIGHLIGHT_PATH_FORM_TESTS)) +def test_search_highlight_path_cases(indexed_collection, test_case: StageTestCase): + """Test $search highlight path forms.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) + + +# Property [SearchHighlights Output]: with highlight enabled, {$meta: +# "searchHighlights"} projects per-path entries 
that split matched tokens into +# "hit" spans and surrounding context into "text" spans, and a multi-byte matched +# token is highlighted intact as a single hit span with no offset corruption. +SEARCH_HIGHLIGHT_TESTS: list[StageTestCase] = [ + StageTestCase( + "highlight_ascii_hit_and_text_spans", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "highlight": {"path": "title"}, + } + }, + {"$limit": 1}, + {"$project": {"_id": 0, "hl": {"$meta": "searchHighlights"}}}, + ], + expected={ + "hl.0.path": Eq("title"), + "hl.0.score": Gt(0), + "hl.0.texts": [ + Contains("type", "hit"), + Contains("type", "text"), + Contains("value", "quick"), + ], + }, + msg="$search should tag matched tokens as hit spans and surrounding context as " + "text spans", + ), + StageTestCase( + "highlight_multibyte_intact_span", + pipeline=[ + { + "$search": { + "text": {"query": "résumé", "path": "title"}, + "highlight": {"path": "title"}, + } + }, + {"$limit": 1}, + {"$project": {"_id": 0, "hl": {"$meta": "searchHighlights"}}}, + ], + expected={ + "hl.0.path": Eq("title"), + "hl.0.score": Gt(0), + "hl.0.texts": [Contains("type", "hit"), Contains("value", "résumé")], + }, + msg="$search should highlight a multi-byte token intact as a single hit span " + "without offset corruption", + ), +] + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_HIGHLIGHT_TESTS)) +def test_search_highlights(indexed_collection, test_case: StageTestCase): + """Test $search projects searchHighlights spans tagging hits and context.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, expected=test_case.expected, msg=test_case.msg) + + +# Property [Highlight Sub-field Validation]: highlight requires a path, rejects a +# path of an unaccepted type (anything but a string, document, or array of paths), +# and rejects an unknown sub-field. 
SEARCH_HIGHLIGHT_SUBFIELD_ERROR_TESTS: list[StageTestCase] = (
    # 1. The highlight document has no path at all.
    [
        StageTestCase(
            "highlight_missing_path",
            pipeline=[
                {"$search": {"text": {"query": "quick", "path": "title"}, "highlight": {}}},
            ],
            error_code=UNKNOWN_ERROR,
            msg="$search should reject a highlight document missing the required path",
        ),
    ]
    # 2. The path has a type other than string, document, or array.
    + [
        StageTestCase(
            f"highlight_path_{type_label}",
            pipeline=[
                {
                    "$search": {
                        "text": {"query": "quick", "path": "title"},
                        "highlight": {"path": bad_path},
                    }
                },
            ],
            error_code=UNKNOWN_ERROR,
            msg=f"$search should reject a {type_label} highlight.path as neither a string, "
            "document, nor array",
        )
        for type_label, bad_path in {
            "int32": 1,
            "int64": Int64(1),
            "double": 1.5,
            "bool": True,
            "objectid": ObjectId("0123456789abcdef01234567"),
            "datetime": datetime.datetime(2020, 1, 1),
            "timestamp": Timestamp(1, 1),
            "binary": Binary(b"\x01\x02\x03"),
            "regex": Regex(".*", "i"),
            "code": Code("function(){}"),
            "minkey": MinKey(),
            "maxkey": MaxKey(),
            "decimal128": DECIMAL128_ONE_AND_HALF,
        }.items()
    ]
    # 3. A valid path accompanied by an unrecognized sub-field.
    + [
        StageTestCase(
            "highlight_unknown_subfield",
            pipeline=[
                {
                    "$search": {
                        "text": {"query": "quick", "path": "title"},
                        "highlight": {"path": "title", "bogus": 1},
                    }
                },
            ],
            error_code=UNKNOWN_ERROR,
            msg="$search should reject an unknown highlight sub-field",
        ),
    ]
)

# Property [Highlight Integer Bounds]: highlight.maxCharsToExamine and
# highlight.maxNumPassages must each be positive - a tighter bound than the
# non-negative phrase.slop, which accepts zero.
+SEARCH_HIGHLIGHT_INTEGER_BOUNDS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + f"highlight_{opt_id}_{tid}", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title"}, + "highlight": {"path": "title", opt: val}, + } + }, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} highlight.{opt} as non-positive", + ) + for opt, opt_id in [ + ("maxCharsToExamine", "max_chars"), + ("maxNumPassages", "max_passages"), + ] + for tid, val in [("zero", 0), ("negative", -1)] +] + +SEARCH_HIGHLIGHT_ERROR_TESTS = ( + SEARCH_HIGHLIGHT_SUBFIELD_ERROR_TESTS + SEARCH_HIGHLIGHT_INTEGER_BOUNDS_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_HIGHLIGHT_ERROR_TESTS)) +def test_search_highlight_errors(indexed_collection, test_case: StageTestCase): + """Test $search highlight subfield and integer-bounds validation errors.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_in.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_in.py new file mode 100644 index 000000000..1bf902e2e --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_in.py @@ -0,0 +1,485 @@ +"""Tests for the $search in operator.""" + +from __future__ import annotations + +import datetime +import uuid + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + QUERY_CLAUSE_CAP, + create_search_index, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import 
fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + DOUBLE_MAX_SAFE_INTEGER, + DOUBLE_NEGATIVE_ZERO, + DOUBLE_PRECISION_LOSS, + DOUBLE_ZERO, + FLOAT_INFINITY, + FLOAT_NAN, + FLOAT_NEGATIVE_INFINITY, +) + +pytestmark = pytest.mark.requires(search=True) + + +_IN_OBJECT_ID = ObjectId("0123456789abcdef01234567") + +_IN_UUID = uuid.UUID("12345678-1234-4567-8901-123456789abc") + +_IN_DATE = datetime.datetime(2020, 1, 1) + +_IN_DOCS = [ + {"_id": 1, "num": 20}, # int32 + {"_id": 2, "num": Int64(20)}, # int64 + {"_id": 3, "num": 20.0}, # double + {"_id": 4, "num": 5}, # int32, distinct value + {"_id": 5, "big": Int64(DOUBLE_PRECISION_LOSS)}, # 2^53+1 (no exact double) + {"_id": 6, "big": float(DOUBLE_MAX_SAFE_INTEGER)}, # 2^53 + {"_id": 7, "zero": DOUBLE_ZERO}, + {"_id": 8, "zero": DOUBLE_NEGATIVE_ZERO}, + {"_id": 9, "nf": FLOAT_NAN}, + {"_id": 10, "nf": FLOAT_INFINITY}, + {"_id": 11, "nf": FLOAT_NEGATIVE_INFINITY}, + {"_id": 12, "b": True}, + {"_id": 13, "oid": _IN_OBJECT_ID}, + {"_id": 14, "dt": _IN_DATE}, + {"_id": 15, "s": "apple"}, # stored on a token-mapped path + {"_id": 16, "u": Binary.from_uuid(_IN_UUID)}, +] + +_IN_INDEX_DEFINITION = { + "mappings": { + "dynamic": False, + "fields": { + "num": {"type": "number"}, + "big": {"type": "number"}, + "zero": {"type": "number"}, + "nf": {"type": "number"}, + "b": {"type": "boolean"}, + "oid": {"type": "objectId"}, + "dt": {"type": "date"}, + "s": {"type": "token"}, + "u": {"type": "uuid"}, + }, + } +} + + +@pytest.fixture(scope="module") +def in_collection(engine_client, worker_id): + """A module-scoped collection with a static search 
index mapping four + number-typed fields (mixed numeric representations, a lossy-double pair, a + signed-zero pair, and the non-finite doubles) plus one field of each other + supported value type (boolean, objectId, date, token string, and uuid), + shared read-only across the in cases so the index is built and polled once.""" + db_name = fixtures.generate_database_name("stages_search_in", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["in_op"] + coll.insert_many(_IN_DOCS) + create_search_index(coll, _IN_INDEX_DEFINITION) + yield coll + fixtures.cleanup_database(engine_client, db_name) + + +# Property [In Value Or Array]: in.value accepts both a single scalar and an +# array, each selecting the documents storing a value equal to a listed value. +SEARCH_IN_VALUE_OR_ARRAY_TESTS: list[StageTestCase] = [ + StageTestCase( + "in_single_scalar", + pipeline=[{"$search": {"in": {"path": "num", "value": 20}}}], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + ] + }, + msg="$search in should accept a single scalar value and match the documents storing it", + ), + StageTestCase( + "in_score_boost", + pipeline=[ + {"$search": {"in": {"path": "num", "value": 20, "score": {"boost": {"value": 2.0}}}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + ] + }, + msg="$search in should accept a score modifier and still return its matches", + ), + StageTestCase( + "in_array_single_element", + pipeline=[{"$search": {"in": {"path": "num", "value": [20]}}}], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + ] + }, + msg="$search in should accept an array value and match the documents storing a " + "listed value", + ), +] + +# Property [In Value-Type Acceptance]: like equals, in matches each supported +# non-numeric value type, 
selecting the document storing a listed value. +SEARCH_IN_VALUE_TYPE_ACCEPTANCE_TESTS: list[StageTestCase] = [ + StageTestCase( + "in_value_type_bool", + pipeline=[{"$search": {"in": {"path": "b", "value": [True]}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 12)]}, + msg="$search in should match the document storing a listed boolean value", + ), + StageTestCase( + "in_value_type_objectid", + pipeline=[{"$search": {"in": {"path": "oid", "value": [_IN_OBJECT_ID]}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 13)]}, + msg="$search in should match the document storing a listed ObjectId value", + ), + StageTestCase( + "in_value_type_date", + pipeline=[{"$search": {"in": {"path": "dt", "value": [_IN_DATE]}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 14)]}, + msg="$search in should match the document storing a listed date value", + ), + StageTestCase( + "in_value_type_string", + pipeline=[{"$search": {"in": {"path": "s", "value": ["apple"]}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 15)]}, + msg="$search in should match the document storing a listed string value on a " + "token-mapped path", + ), + StageTestCase( + "in_value_type_uuid", + pipeline=[{"$search": {"in": {"path": "u", "value": [Binary.from_uuid(_IN_UUID)]}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 16)]}, + msg="$search in should match the document storing a listed UUID value (Binary subtype 4)", + ), +] + +# Property [In Mixed-Numeric Homogeneity]: in treats int32/int64/double as one +# type, so a mixed-numeric array is accepted and matches every stored numeric +# representation that narrows to a listed value's double, with -0.0 equal to 0.0. 
SEARCH_IN_MIXED_NUMERIC_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        pipeline=[
            {"$search": {"in": {"path": path, "value": listed_values}}},
        ],
        # The batch must hold exactly the listed _ids, in any order.
        expected={
            "cursor.firstBatch": [
                Len(len(matched_ids)),
                *[Contains("_id", doc_id) for doc_id in matched_ids],
            ]
        },
        msg=failure_msg,
    )
    for case_id, path, listed_values, matched_ids, failure_msg in [
        (
            "in_mixed_distinct_values",
            "num",
            [5, 20.0],
            (1, 2, 3, 4),
            "$search in should accept a mixed-numeric array and match the union of its "
            "listed values",
        ),
        (
            "in_mixed_int64_and_double",
            "num",
            [Int64(20), 20.0],
            (1, 2, 3),
            "$search in should treat int64 and double as one type in a mixed array and match "
            "every numeric representation of the listed value",
        ),
        (
            "in_lossy_double_equality",
            "big",
            [Int64(DOUBLE_PRECISION_LOSS)],
            (5, 6),
            "$search in should compare in double space, matching both stored representations "
            "that narrow to the same double",
        ),
        (
            "in_negative_zero_positive",
            "zero",
            [DOUBLE_ZERO],
            (7, 8),
            "$search in with positive-zero should match both stored 0.0 and -0.0",
        ),
        (
            "in_negative_zero_negative",
            "zero",
            [DOUBLE_NEGATIVE_ZERO],
            (7, 8),
            "$search in with negative-zero should match both stored 0.0 and -0.0",
        ),
    ]
]

# Property [In No Clause Cap]: in imposes no clause cap, so query arrays sized at
# and one past the text.query clause cap are both accepted.
+SEARCH_IN_CLAUSE_CAP_TESTS: list[StageTestCase] = [
+    # Both arrays lead with 20 and are padded with consecutive integers starting
+    # at 1000. The expected ids are the same three in both cases, so the padding
+    # is presumably chosen to match no stored document (confirm against the
+    # fixture). The case ids imply QUERY_CLAUSE_CAP == 1024 (confirm against the
+    # constant).
+    StageTestCase(
+        "in_clause_count_1024",
+        pipeline=[
+            {
+                "$search": {
+                    "in": {
+                        "path": "num",
+                        # 1 + (QUERY_CLAUSE_CAP - 1) elements: exactly at the cap.
+                        "value": [20] + list(range(1000, 1000 + QUERY_CLAUSE_CAP - 1)),
+                    }
+                }
+            },
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+            ]
+        },
+        msg="$search in should accept a value array sized at the text.query clause cap "
+        "with no clause cap of its own",
+    ),
+    StageTestCase(
+        "in_clause_count_1025",
+        pipeline=[
+            {
+                "$search": {
+                    "in": {
+                        "path": "num",
+                        # 1 + QUERY_CLAUSE_CAP elements: one past the cap.
+                        "value": [20] + list(range(1000, 1000 + QUERY_CLAUSE_CAP)),
+                    }
+                }
+            },
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+            ]
+        },
+        msg="$search in should accept a value array one past the text.query clause cap",
+    ),
+]
+
+# Property [In Non-Finite Doubles]: in matches a stored NaN, +inf, or -inf when
+# that non-finite double is listed, the opposite of equals which never matches
+# them.
+SEARCH_IN_NON_FINITE_TESTS: list[StageTestCase] = [
+    # One case per non-finite double; each expects exactly one hit (Len(1)) on
+    # the "nf" path, with a distinct _id per value.
+    StageTestCase(
+        "in_nan",
+        pipeline=[
+            {"$search": {"in": {"path": "nf", "value": [FLOAT_NAN]}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 9)]},
+        msg="$search in should match a stored NaN, unlike equals",
+    ),
+    StageTestCase(
+        "in_positive_infinity",
+        pipeline=[
+            {"$search": {"in": {"path": "nf", "value": [FLOAT_INFINITY]}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 10)]},
+        msg="$search in should match a stored +inf, unlike equals",
+    ),
+    StageTestCase(
+        "in_negative_infinity",
+        pipeline=[
+            {"$search": {"in": {"path": "nf", "value": [FLOAT_NEGATIVE_INFINITY]}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 11)]},
+        msg="$search in should match a stored -inf, unlike equals",
+    ),
+]
+
+# Property [In doesNotAffect Option]: in recognizes a string doesNotAffect option
+# (unlike text or near, which reject the field), accepting it and still returning
+# its matches.
+SEARCH_IN_DOES_NOT_AFFECT_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "in_does_not_affect_string",
+        pipeline=[{"$search": {"in": {"path": "b", "value": [True], "doesNotAffect": "score"}}}],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 12)]},
+        msg="$search in should accept a string doesNotAffect option and still return its matches",
+    ),
+]
+
+# All success-path tables, concatenated in declaration order for one
+# parametrized test.
+SEARCH_IN_TESTS = (
+    SEARCH_IN_VALUE_OR_ARRAY_TESTS
+    + SEARCH_IN_VALUE_TYPE_ACCEPTANCE_TESTS
+    + SEARCH_IN_MIXED_NUMERIC_TESTS
+    + SEARCH_IN_CLAUSE_CAP_TESTS
+    + SEARCH_IN_NON_FINITE_TESTS
+    + SEARCH_IN_DOES_NOT_AFFECT_TESTS
+)
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_IN_TESTS))
+def test_search_in_cases(in_collection, test_case: StageTestCase):
+    """Test $search in matching semantics across value types, numeric edge cases, and options."""
+    result = execute_command(
+        in_collection,
+        {"aggregate": in_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    # NOTE(review): raw_res=True presumably makes assertResult check the raw
+    # reply, which is what the "cursor.firstBatch" keys in the tables address.
+    # Confirm against assertResult.
+    assertResult(
+        result,
+        expected=test_case.expected,
+        msg=test_case.msg,
+        raw_res=True,
+    )
+
+
+# Property [In Required Fields]: in.path and in.value are both required, so a
+# spec omitting either is rejected.
+SEARCH_IN_REQUIRED_FIELD_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "in_path_missing",
+        pipeline=[{"$search": {"in": {"value": [20]}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject a spec that omits the required path",
+    ),
+    StageTestCase(
+        "in_value_missing",
+        pipeline=[{"$search": {"in": {"path": "num"}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject a spec that omits the required value",
+    ),
+]
+
+# Property [In doesNotAffect Type]: the doesNotAffect option must be a string, so
+# a non-string is rejected.
+# NOTE(review): every rejection in these tables is asserted with UNKNOWN_ERROR,
+# so the tests pin only that the command fails, not which validation fired.
+# Presumably the engine reports search validation failures under one generic
+# code; confirm.
+SEARCH_IN_DOES_NOT_AFFECT_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "in_does_not_affect_non_string",
+        pipeline=[{"$search": {"in": {"path": "num", "value": [20], "doesNotAffect": 1}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject a non-string doesNotAffect option",
+    ),
+]
+
+# Property [In Empty Value Array]: an empty in.value array is rejected as it lists
+# no value to match.
+SEARCH_IN_EMPTY_VALUE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "in_empty_value_array",
+        pipeline=[{"$search": {"in": {"path": "num", "value": []}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject an empty value array",
+    ),
+]
+
+# Property [In Null Element]: a null element in the in.value array is rejected,
+# unlike equals which accepts null.
+SEARCH_IN_NULL_ELEMENT_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "in_null_element",
+        pipeline=[{"$search": {"in": {"path": "num", "value": [None]}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject a null element in the value array",
+    ),
+]
+
+# Property [In Element Homogeneity]: every in.value element must share one type
+# category, so an array mixing distinct categories is rejected (numeric subtypes
+# count as one category).
+SEARCH_IN_MIXED_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "in_mixed_number_string",
+        pipeline=[
+            {"$search": {"in": {"path": "num", "value": [20, "apple"]}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject an array mixing a number and a string",
+    ),
+]
+
+# Property [In Element Value Type Dispatch]: in dispatches each in.value array
+# element through the same value-type validator as equals.value, so a Binary
+# element is accepted only as a UUID (subtype 4).
+SEARCH_IN_VALUE_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    # Generated cases: one single-element array per unsupported BSON type, all
+    # sent against the "num" path.
+    *[
+        StageTestCase(
+            f"in_value_type_{tid}",
+            pipeline=[
+                {"$search": {"in": {"path": "num", "value": [val]}}},
+            ],
+            error_code=UNKNOWN_ERROR,
+            msg=f"$search in should route a {tid} element through the value-type validator "
+            "and reject it",
+        )
+        for tid, val in [
+            ("decimal128", DECIMAL128_ONE_AND_HALF),
+            ("timestamp", Timestamp(1, 1)),
+            ("object", {"a": 1}),
+            ("nested_array", [1, 2]),
+            ("regex", Regex(".*", "i")),
+            ("code", Code("function(){}")),
+            ("minkey", MinKey()),
+            ("maxkey", MaxKey()),
+        ]
+    ],
+    StageTestCase(
+        "in_value_type_binary_non_uuid",
+        # Binary(...) with no subtype argument uses the generic subtype 0, so it
+        # is a Binary that is not a UUID.
+        pipeline=[
+            {"$search": {"in": {"path": "u", "value": [Binary(b"\x01\x02\x03")]}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search in should reject a Binary element that is not UUID subtype 4",
+    ),
+]
+
+# All rejection tables, concatenated in declaration order for one parametrized
+# test.
+SEARCH_IN_ERROR_TESTS = (
+    SEARCH_IN_REQUIRED_FIELD_ERROR_TESTS
+    + SEARCH_IN_DOES_NOT_AFFECT_ERROR_TESTS
+    + SEARCH_IN_EMPTY_VALUE_ERROR_TESTS
+    + SEARCH_IN_NULL_ELEMENT_ERROR_TESTS
+    + SEARCH_IN_MIXED_TYPE_ERROR_TESTS
+    + SEARCH_IN_VALUE_TYPE_ERROR_TESTS
+)
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_IN_ERROR_TESTS))
+def test_search_in_errors(in_collection, test_case: StageTestCase):
+    """Test $search in rejects missing fields, a bad doesNotAffect, and invalid value arrays."""
+    result = execute_command(
+        in_collection,
+        {"aggregate": in_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_near.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_near.py
new file mode 100644
index 000000000..a3d4190e9
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_near.py
@@ -0,0 +1,443 @@
+"""Tests for the $search near operator."""
+
+from __future__ 
import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import (
+    create_search_index,
+)
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework import fixtures
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Eq,
+    Gt,
+    Len,
+    PerDoc,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+    DOUBLE_FROM_INT64_MAX,
+    DOUBLE_MAX,
+    DOUBLE_NEGATIVE_ZERO,
+    DOUBLE_ZERO,
+    FLOAT_INFINITY,
+    FLOAT_NAN,
+    INT64_MAX,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Seed documents, three per queried path: num (ids 1-3), dt (ids 4-6),
+# loc (ids 7-9) and big (ids 10-12). Each document carries only one of the
+# indexed fields.
+_NEAR_DOCS = [
+    {"_id": 1, "num": 0},
+    {"_id": 2, "num": 10},
+    {"_id": 3, "num": 40},
+    {"_id": 4, "dt": datetime.datetime(2020, 1, 1)},
+    {"_id": 5, "dt": datetime.datetime(2020, 1, 11)},
+    {"_id": 6, "dt": datetime.datetime(2020, 2, 10)},
+    {"_id": 7, "loc": {"type": "Point", "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO]}},
+    {"_id": 8, "loc": {"type": "Point", "coordinates": [DOUBLE_ZERO, 0.1]}},
+    {"_id": 9, "loc": {"type": "Point", "coordinates": [DOUBLE_ZERO, 1.0]}},
+    {"_id": 10, "big": INT64_MAX},
+    {"_id": 11, "big": DOUBLE_FROM_INT64_MAX},
+    {"_id": 12, "big": 5},
+]
+
+# Static mapping (dynamic: False): only the four fields listed here are indexed.
+_NEAR_INDEX_DEFINITION = {
+    "mappings": {
+        "dynamic": False,
+        "fields": {
+            "num": {"type": "number"},
+            "dt": {"type": "date"},
+            "loc": {"type": "geo"},
+            "big": {"type": "number"},
+        },
+    }
+}
+
+
+@pytest.fixture(scope="module")
+def near_collection(engine_client, worker_id):
+    """A module-scoped collection with a static search index mapping a numeric, a
+    date, a geo, and a second numeric (cross-type) field, shared read-only across
+    the near cases so the index is built and polled once."""
+    db_name = fixtures.generate_database_name("stages_search_near", worker_id)
+    # Clean up before seeding as well as after the yield, so leftovers from an
+    # earlier run cannot leak into this module.
+    fixtures.cleanup_database(engine_client, db_name)
+    db = engine_client[db_name]
+    coll = db["near_op"]
+    coll.insert_many(_NEAR_DOCS)
+    create_search_index(coll, _NEAR_INDEX_DEFINITION)
+    yield coll
+    fixtures.cleanup_database(engine_client, db_name)
+
+
+# Property [Near Proximity Ordering]: near orders the documents on the queried
+# path by ascending distance from a numeric, date, or geo origin (closest first),
+# scoring a document at the exact origin 1.0 and scaling the rest by
+# pivot/(pivot+distance).
+SEARCH_NEAR_PROXIMITY_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "near_numeric_proximity",
+        # Stored num values 10, 0 and 40 lie 0, 10 and 30 from the origin, so
+        # with pivot 10 the formula gives 1.0, 0.5 and 0.25.
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": 10}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected=PerDoc(
+            {"_id": Eq(2), "score": Eq(1.0)},
+            {"_id": Eq(1), "score": Eq(0.5)},
+            {"_id": Eq(3), "score": Eq(0.25)},
+        ),
+        msg="$search near should order numeric results by proximity and score them by "
+        "pivot/(pivot+distance)",
+    ),
+    StageTestCase(
+        "near_score_boost",
+        # Only the order and positivity (Gt(0)) are asserted here; the boosted
+        # score magnitudes are not pinned.
+        pipeline=[
+            {
+                "$search": {
+                    "near": {
+                        "path": "num",
+                        "origin": 10,
+                        "pivot": 10,
+                        "score": {"boost": {"value": 2.0}},
+                    }
+                }
+            },
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected=PerDoc(
+            {"_id": Eq(2), "score": Gt(0)},
+            {"_id": Eq(1), "score": Gt(0)},
+            {"_id": Eq(3), "score": Gt(0)},
+        ),
+        msg="$search near should accept a score modifier and still order its matches by "
+        "proximity",
+    ),
+    StageTestCase(
+        "near_date_proximity",
+        # The pivot is 10 days expressed in milliseconds, the distance unit for a
+        # date origin, so the docs 10 and 30 days away score 0.5 and 0.25.
+        pipeline=[
+            {
+                "$search": {
+                    "near": {
+                        "path": "dt",
+                        "origin": datetime.datetime(2020, 1, 11),
+                        "pivot": 864_000_000,
+                    }
+                }
+            },
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected=PerDoc(
+            {"_id": Eq(5), "score": Eq(1.0)},
+            {"_id": Eq(4), "score": Eq(0.5)},
+            {"_id": Eq(6), "score": Eq(0.25)},
+        ),
+        msg="$search near should order date results by proximity and score them by "
+        "pivot/(pivot+distance)",
+    ),
+    StageTestCase(
+        "near_geo_proximity",
+        # Only the exact-origin point gets a pinned score; the two farther points
+        # are checked for order and a positive score.
+        pipeline=[
+            {
+                "$search": {
+                    "near": {
+                        "path": "loc",
+                        "origin": {"type": "Point", "coordinates": [DOUBLE_ZERO, DOUBLE_ZERO]},
+                        "pivot": 10_000,
+                    }
+                }
+            },
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected=PerDoc(
+            {"_id": Eq(7), "score": Eq(1.0)},
+            {"_id": Eq(8), "score": Gt(0)},
+            {"_id": Eq(9), "score": Gt(0)},
+        ),
+        msg="$search near should order geo results by geodesic proximity and score a "
+        "document at the exact origin 1.0",
+    ),
+]
+
+# Property [Near Pivot Acceptance]: any positive finite pivot is accepted and
+# tunes the proximity falloff.
+SEARCH_NEAR_PIVOT_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "near_pivot_fractional",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": 0.5}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected=PerDoc(
+            {"_id": Eq(2), "score": Eq(1.0)},
+            {"_id": Eq(1), "score": Gt(0)},
+            {"_id": Eq(3), "score": Gt(0)},
+        ),
+        msg="$search near should accept a fractional pivot and still score the "
+        "exact-origin document 1.0",
+    ),
+    StageTestCase(
+        "near_pivot_very_large",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": DOUBLE_MAX}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        # NOTE(review): a plain dict rather than PerDoc. Per the msg it is meant
+        # to hold for every returned document; confirm how assertResult applies it.
+        expected={"score": Eq(1.0)},
+        msg="$search near should accept a very large pivot, saturating every score to 1.0",
+    ),
+    StageTestCase(
+        "near_pivot_very_small",
+        # A 1e-300 pivot underflows pivot/(pivot+distance) to 0.0 for any nonzero
+        # distance, so only the exact-origin document keeps a 1.0 score.
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": 1e-300}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        # The two zero-score entries omit _id, so their relative order is not
+        # pinned.
+        expected=PerDoc(
+            {"_id": Eq(2), "score": Eq(1.0)},
+            {"score": Eq(DOUBLE_ZERO)},
+            {"score": Eq(DOUBLE_ZERO)},
+        ),
+        msg="$search near should accept a very small pivot, scoring only the "
+        "exact-origin document 1.0 and decaying the rest to 0.0",
+    ),
+]
+
+# Property [Near Numeric Origin Boundaries]: non-finite and extreme numeric
+# origins execute without error.
+SEARCH_NEAR_ORIGIN_BOUNDARY_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "near_origin_nan",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": FLOAT_NAN, "pivot": 10}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected={"score": Eq(DOUBLE_ZERO)},
+        msg="$search near should execute a NaN origin with no error, scoring every document 0.0",
+    ),
+    StageTestCase(
+        "near_origin_infinite",
+        # The sign of an infinite-magnitude origin is erased by the absolute
+        # distance, so a single infinite-origin case covers both signs.
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": FLOAT_INFINITY, "pivot": 10}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected={"score": Eq(DOUBLE_ZERO)},
+        msg="$search near should execute an infinite origin with no error, scoring every "
+        "document 0.0",
+    ),
+    StageTestCase(
+        "near_origin_double_max",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": DOUBLE_MAX, "pivot": 10}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        expected={"score": Eq(DOUBLE_ZERO)},
+        msg="$search near should execute a DBL_MAX origin with no error, scoring every "
+        "document 0.0",
+    ),
+    StageTestCase(
+        "near_origin_int64_max_cross_type",
+        pipeline=[
+            {"$search": {"near": {"path": "big", "origin": INT64_MAX, "pivot": 1}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        # The first two entries omit _id: ids 10 and 11 both score 1.0, so their
+        # relative order is left unasserted.
+        expected=PerDoc(
+            {"score": Eq(1.0)},
+            {"score": Eq(1.0)},
+            {"_id": Eq(12), "score": Gt(0)},
+        ),
+        msg="$search near should score both the int64-max document and its double "
+        "approximation 1.0 for an int64-max origin",
+    ),
+    StageTestCase(
+        "near_origin_negative_zero",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": DOUBLE_NEGATIVE_ZERO, "pivot": 10}}},
+            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
+        ],
+        # Stored num values 0, 10 and 40 lie 0, 10 and 40 from the origin. The
+        # farthest score (10/50 by the formula) is only checked to be positive.
+        expected=PerDoc(
+            {"_id": Eq(1), "score": Eq(1.0)},
+            {"_id": Eq(2), "score": Eq(0.5)},
+            {"_id": Eq(3), "score": Gt(0)},
+        ),
+        msg="$search near should treat a -0.0 origin identically to 0.0, scoring the "
+        "zero-valued document 1.0",
+    ),
+]
+
+# All success-path tables that project _id and searchScore.
+SEARCH_NEAR_CASES_TESTS = (
+    SEARCH_NEAR_PROXIMITY_TESTS + SEARCH_NEAR_PIVOT_TESTS + SEARCH_NEAR_ORIGIN_BOUNDARY_TESTS
+)
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_NEAR_CASES_TESTS))
+def test_search_near_cases(near_collection, test_case: StageTestCase):
+    """Test $search near proximity ordering, scoring, pivot, and origin boundaries."""
+    result = execute_command(
+        near_collection,
+        {"aggregate": near_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    # NOTE(review): no raw_res here, unlike the silent-no-match test below. The
+    # PerDoc / {"score": ...} expectations presumably apply to the returned
+    # documents rather than the raw reply; confirm against assertResult.
+    assertResult(result, expected=test_case.expected, msg=test_case.msg)
+
+
+# Property [Near Type-Mismatched Origin Silent No-Match]: a numeric origin against
+# a date path returns no documents and no error, unlike the string, null, or
+# Decimal128 origins that fail.
+SEARCH_NEAR_SILENT_NO_MATCH_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "near_numeric_origin_date_path",
+        pipeline=[
+            {"$search": {"near": {"path": "dt", "origin": 10, "pivot": 10}}},
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search near should return no documents and no error for a numeric origin "
+        "on a date path",
+    ),
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_NEAR_SILENT_NO_MATCH_TESTS))
+def test_search_near_silent_no_match(near_collection, test_case: StageTestCase):
+    """Test $search near returns a silent empty result for a type-mismatched origin."""
+    result = execute_command(
+        near_collection,
+        {"aggregate": near_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    # raw_res=True goes with the "cursor.firstBatch" key used in the table above.
+    assertResult(
+        result,
+        expected=test_case.expected,
+        msg=test_case.msg,
+        raw_res=True,
+    )
+
+
+# Property [Near Pivot Validation]: near.pivot must be a positive finite number,
+# so a non-positive, non-finite, or non-number pivot is rejected.
+SEARCH_NEAR_PIVOT_ERROR_TESTS: list[StageTestCase] = [
+    # Every case keeps path "num" and origin 10 and varies only the pivot, so
+    # the rejection is attributable to the pivot.
+    StageTestCase(
+        "near_pivot_zero",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": 0}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a pivot of zero as not positive",
+    ),
+    StageTestCase(
+        "near_pivot_negative",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": -1}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a negative pivot as not positive",
+    ),
+    StageTestCase(
+        "near_pivot_nan",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": FLOAT_NAN}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a NaN pivot as not finite",
+    ),
+    StageTestCase(
+        "near_pivot_infinity",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": FLOAT_INFINITY}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject an infinite pivot as not finite",
+    ),
+    StageTestCase(
+        "near_pivot_non_number",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": 10, "pivot": "ten"}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a non-number pivot",
+    ),
+]
+
+# Property [Near Origin Type Match]: a near.origin of any type outside the
+# supported set (number, date, geo Point) is rejected, unlike a type-mismatched
+# numeric origin which silently matches nothing.
+SEARCH_NEAR_ORIGIN_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    # Generated cases: one per unsupported origin type, each with a valid path
+    # ("num") and pivot (10) so only the origin differs.
+    StageTestCase(
+        f"near_origin_{tid}",
+        pipeline=[
+            {"$search": {"near": {"path": "num", "origin": val, "pivot": 10}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search near should reject a {tid} origin as an unsupported type",
+    )
+    for tid, val in [
+        ("string", "ten"),
+        ("bool", True),
+        ("array", [1, 2]),
+        ("objectid", ObjectId("0123456789abcdef01234567")),
+        ("timestamp", Timestamp(1, 1)),
+        ("binary", Binary(b"\x01\x02\x03")),
+        ("regex", Regex(".*", "i")),
+        ("code", Code("function(){}")),
+        ("minkey", MinKey()),
+        ("maxkey", MaxKey()),
+        ("decimal128", DECIMAL128_ONE_AND_HALF),
+        ("null", None),
+    ]
+]
+
+# Property [Near Required Fields]: near.path, near.origin, and near.pivot are all
+# required, so a spec omitting any one of them is rejected.
+SEARCH_NEAR_REQUIRED_FIELD_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "near_path_missing",
+        pipeline=[{"$search": {"near": {"origin": 10, "pivot": 10}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a spec that omits the required path",
+    ),
+    StageTestCase(
+        "near_origin_missing",
+        pipeline=[{"$search": {"near": {"path": "num", "pivot": 10}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a spec that omits the required origin",
+    ),
+    StageTestCase(
+        "near_pivot_missing",
+        pipeline=[{"$search": {"near": {"path": "num", "origin": 10}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search near should reject a spec that omits the required pivot",
+    ),
+]
+
+# All rejection tables, concatenated in declaration order for one parametrized
+# test.
+SEARCH_NEAR_ERROR_TESTS = (
+    SEARCH_NEAR_PIVOT_ERROR_TESTS
+    + SEARCH_NEAR_ORIGIN_TYPE_ERROR_TESTS
+    + SEARCH_NEAR_REQUIRED_FIELD_ERROR_TESTS
+)
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_NEAR_ERROR_TESTS))
+def test_search_near_errors(near_collection, test_case: StageTestCase):
+    """Test $search near rejects invalid pivots, unsupported origin types, and missing fields."""
+    result = execute_command(
+        near_collection,
+        {"aggregate": near_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_option_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_option_errors.py
new file mode 100644
index 000000000..454332c7c
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_option_errors.py
@@ -0,0 +1,270 @@
+"""Tests for $search cross-cutting stage option validation errors."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+    FLOAT_INFINITY,
+    FLOAT_NAN,
+    INT32_MAX,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Property [Document-Typed Option Non-Document]: the count, highlight, and sort
+# options must each be a document (a null value is treated as omitted).
+SEARCH_DOCUMENT_OPTION_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    # Cross product: 3 option names x 15 non-document values = 45 generated
+    # cases. None is not in the value list.
+    StageTestCase(
+        f"{opt}_type_{tid}",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, opt: val}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a {tid} {opt} option as a non-document",
+    )
+    for opt in ("count", "highlight", "sort")
+    for tid, val in [
+        ("string", "x"),
+        ("int32", 1),
+        ("int64", Int64(1)),
+        ("double", 1.5),
+        ("bool", True),
+        ("array", [{}]),
+        ("objectid", ObjectId("0123456789abcdef01234567")),
+        ("datetime", datetime.datetime(2020, 1, 1)),
+        ("timestamp", Timestamp(1, 1)),
+        ("binary", Binary(b"\x01\x02\x03")),
+        ("regex", Regex(".*", "i")),
+        ("code", Code("function(){}")),
+        ("minkey", MinKey()),
+        ("maxkey", MaxKey()),
+        ("decimal128", DECIMAL128_ONE_AND_HALF),
+    ]
+]
+
+# Property [phrase.slop Bound And Shared Integer Coercion]: phrase.slop has a
+# non-negative bound (it accepts zero, unlike the positive-bound highlight integer
+# sub-fields) and exercises the shared $search integer parser, which rejects a
+# fractional, non-finite, or out-of-32-bit-range numeric value.
+SEARCH_INTEGER_COERCION_ERROR_TESTS: list[StageTestCase] = [
+    # Every case sends the same phrase query and varies only slop, using numeric
+    # values that are out of range or not whole.
+    StageTestCase(
+        "slop_negative",
+        pipeline=[
+            {"$search": {"phrase": {"query": "quick brown", "path": "title", "slop": -1}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a negative phrase.slop",
+    ),
+    StageTestCase(
+        "slop_fractional_double",
+        pipeline=[
+            {"$search": {"phrase": {"query": "quick brown", "path": "title", "slop": 1.5}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a fractional-double phrase.slop as a non-integer",
+    ),
+    StageTestCase(
+        "slop_nan",
+        pipeline=[
+            {"$search": {"phrase": {"query": "quick brown", "path": "title", "slop": FLOAT_NAN}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a NaN phrase.slop as a non-integer",
+    ),
+    StageTestCase(
+        "slop_positive_infinity",
+        pipeline=[
+            {
+                "$search": {
+                    "phrase": {"query": "quick brown", "path": "title", "slop": FLOAT_INFINITY}
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject an infinite phrase.slop as not fitting in a 32-bit integer",
+    ),
+    StageTestCase(
+        "slop_int64_over_int32",
+        pipeline=[
+            {
+                "$search": {
+                    "phrase": {
+                        "query": "quick brown",
+                        "path": "title",
+                        # INT32_MAX + 1 is the smallest value past the int32
+                        # range; Int64 forces the 64-bit BSON encoding.
+                        "slop": Int64(INT32_MAX + 1),
+                    }
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject an int64 phrase.slop past the 32-bit integer range",
+    ),
+]
+
+# Property [Shared Integer Type Rejection]: the shared $search integer parser,
+# exercised here through phrase.slop, accepts only whole numbers, so any
+# non-numeric BSON type (plus Decimal128) is rejected.
+SEARCH_INTEGER_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    # Generated cases: one per listed type (13 in total), each substituted for
+    # phrase.slop in an otherwise identical query.
+    StageTestCase(
+        f"slop_type_{tid}",
+        pipeline=[
+            {"$search": {"phrase": {"query": "quick brown", "path": "title", "slop": val}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a {tid} phrase.slop as a non-integer",
+    )
+    for tid, val in [
+        ("string", "1"),
+        ("bool", True),
+        ("object", {"a": 1}),
+        ("array", [1]),
+        ("objectid", ObjectId("0123456789abcdef01234567")),
+        ("datetime", datetime.datetime(2020, 1, 1)),
+        ("timestamp", Timestamp(1, 1)),
+        ("binary", Binary(b"\x01\x02\x03")),
+        ("regex", Regex(".*", "i")),
+        ("code", Code("function(){}")),
+        ("minkey", MinKey()),
+        ("maxkey", MaxKey()),
+        ("decimal128", DECIMAL128_ONE_AND_HALF),
+    ]
+]
+
+# Property [Unknown And Case-Variant Option]: an unknown top-level option field is
+# rejected, and option names are matched exactly (case-sensitive and not
+# whitespace-trimmed).
+SEARCH_UNKNOWN_OPTION_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "option_unknown_field",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "bogus": 1}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject an unknown top-level option field",
+    ),
+    # Near-miss spellings of the "index" option: a capitalized name and one with
+    # a trailing space, each given an otherwise plausible string value.
+    *[
+        StageTestCase(
+            f"option_name_{tid}",
+            pipeline=[
+                {"$search": {"text": {"query": "quick", "path": "title"}, name: value}},
+            ],
+            error_code=UNKNOWN_ERROR,
+            msg=f"$search should reject the {tid} option name as an unrecognized option",
+        )
+        for tid, name, value in [
+            ("capitalized_index", "Index", "default"),
+            ("trailing_space_index", "index ", "default"),
+        ]
+    ],
+]
+
+# Property [searchNodePreference Validation]: searchNodePreference must be a
+# document carrying a required string key, so a non-document, a missing key, and
+# a non-string key are each rejected.
+SEARCH_NODE_PREFERENCE_ERROR_TESTS: list[StageTestCase] = [
+    # Three shapes in increasing validity: not a document, a document without
+    # "key", and a document whose "key" is not a string.
+    StageTestCase(
+        "search_node_preference_non_document",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "searchNodePreference": "primary",
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a non-document searchNodePreference option",
+    ),
+    StageTestCase(
+        "search_node_preference_key_missing",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "searchNodePreference": {}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a searchNodePreference that omits the required key",
+    ),
+    StageTestCase(
+        "search_node_preference_key_non_string",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "searchNodePreference": {"key": 1},
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a non-string searchNodePreference.key",
+    ),
+]
+
+# Property [returnScope Validation]: returnScope must be a document carrying a
+# required path, and a non-empty returnScope additionally requires
+# returnStoredSource to be enabled.
+SEARCH_RETURN_SCOPE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "return_scope_non_document",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "returnScope": True}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a non-document returnScope option",
+    ),
+    StageTestCase(
+        "return_scope_path_missing",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "returnScope": {}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a returnScope that omits the required path",
+    ),
+    StageTestCase(
+        "return_scope_requires_stored_source",
+        # The returnScope itself is well-formed; the spec sets no
+        # returnStoredSource, which is the condition under test.
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "returnScope": {"path": "title"},
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a non-empty returnScope when returnStoredSource is not "
+        "enabled",
+    ),
+]
+
+# All option-validation rejection tables, concatenated in declaration order for
+# one parametrized test.
+SEARCH_OPTION_GENERAL_ERROR_TESTS = (
+    SEARCH_DOCUMENT_OPTION_TYPE_ERROR_TESTS
+    + SEARCH_INTEGER_COERCION_ERROR_TESTS
+    + SEARCH_INTEGER_TYPE_ERROR_TESTS
+    + SEARCH_UNKNOWN_OPTION_ERROR_TESTS
+    + SEARCH_NODE_PREFERENCE_ERROR_TESTS
+    + SEARCH_RETURN_SCOPE_ERROR_TESTS
+)
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_OPTION_GENERAL_ERROR_TESTS))
+def test_search_option_errors(indexed_collection, test_case: StageTestCase):
+    """Test $search cross-cutting option validation: type, integer coercion, unknown options,
+    searchNodePreference, and returnScope."""
+    result = execute_command(
+        indexed_collection,
+        {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_options.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_options.py
new file mode 100644
index 000000000..e957b519e
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_options.py
@@ -0,0 +1,446 @@
+"""Tests for $search stage options (index, sort, tracking) and view resolution."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import (
+    SEARCH_INDEX_NAME,
+)
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Contains,
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Property [Index Option]: the index option names the search index to query, so
+# a name no index has returns nothing silently, and any string is accepted with
+# no validation error.
+SEARCH_INDEX_OPTION_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "index_named_existing",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "index": SEARCH_INDEX_NAME}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search should query the index named by a non-empty string index option",
+    ),
+    # The remaining cases name indexes that are expected not to exist, so each
+    # asserts an empty batch (Len(0)) rather than an error.
+    StageTestCase(
+        "index_nonexistent_silent",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "index": "no_such_index"}},
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search should return no documents and no error for a nonexistent index name",
+    ),
+    StageTestCase(
+        "index_name_1000_chars",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "index": "a" * 1_000}},
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search should accept a 1000-character index name with no length validation",
+    ),
+    StageTestCase(
+        "index_name_special_chars",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "index": "name with spaces!@#$%",
+                }
+            },
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search should accept a special-character index name with no charset validation",
+    ),
+    StageTestCase(
+        "index_name_unicode",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    # Latin-1 accented letters plus one non-BMP code point.
+                    "index": "\u00edndax\u00f1\u00e9\U0001f600",
+                }
+            },
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search should accept a Unicode index name with no charset validation",
+    ),
+]
+
+# Property [Sort Option]: a sort option document is accepted as a tiebreaker and
+# the search still returns its matches.
+SEARCH_SORT_OPTION_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "sort": sort_spec}},
+        ],
+        # Whatever the sort document, the same three documents are expected back.
+        expected={"cursor.firstBatch": [Len(3), *(Contains("_id", i) for i in (1, 3, 4))]},
+        msg=f"$search should accept {accepted} and still return its matches",
+    )
+    for case_id, sort_spec, accepted in [
+        ("sort_ascending", {"_id": 1}, "a single ascending sort key"),
+        ("sort_descending", {"_id": -1}, "a single descending sort key"),
+        (
+            "sort_meta_search_score",
+            {"sc": {"$meta": "searchScore"}},
+            "a $meta searchScore sort key",
+        ),
+        ("sort_multi_key", {"_id": 1, "title": 1}, "a multi-key sort tiebreaker"),
+        (
+            "sort_meta_then_key",
+            {"sc": {"$meta": "searchScore"}, "_id": 1},
+            "a $meta key with a multi-key tiebreaker",
+        ),
+    ]
+]
+
+# Property [Tracking Option]: the tracking option is a recognized stage option
+# and is accepted so the search still returns its matches.
+SEARCH_TRACKING_OPTION_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "tracking_recognized",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "tracking": {"searchTerms": "quick"},
+                }
+            },
+        ],
+        expected={"cursor.firstBatch": [Len(3), *(Contains("_id", i) for i in (1, 3, 4))]},
+        msg="$search should accept the tracking option and still return its matches",
+    ),
+]
+
+# Property [Concurrent Option]: the concurrent option is a recognized boolean stage
+# option, so both true and false are accepted with no coercion and the search
+# still returns its matches.
+SEARCH_CONCURRENT_OPTION_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"concurrent_{label}",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "concurrent": flag}},
+        ],
+        expected={"cursor.firstBatch": [Len(3), *(Contains("_id", i) for i in (1, 3, 4))]},
+        msg=f"$search should accept a {label} concurrent option and still return its matches",
+    )
+    for label, flag in {"true": True, "false": False}.items()
+]
+
+# Property [searchNodePreference Option]: searchNodePreference is a recognized
+# stage option taking a document with a string key, so a valid preference is
+# accepted and the search still returns its matches.
+SEARCH_NODE_PREFERENCE_OPTION_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "search_node_preference_recognized",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "searchNodePreference": {"key": "primary"},
+                }
+            },
+        ],
+        expected={"cursor.firstBatch": [Len(3), *(Contains("_id", i) for i in (1, 3, 4))]},
+        msg="$search should accept a searchNodePreference document and still return its matches",
+    ),
+]
+
+# Every accepted-option case, in declaration order, for the parametrized test below.
+SEARCH_OPTION_TESTS = [
+    *SEARCH_INDEX_OPTION_TESTS,
+    *SEARCH_SORT_OPTION_TESTS,
+    *SEARCH_TRACKING_OPTION_TESTS,
+    *SEARCH_CONCURRENT_OPTION_TESTS,
+    *SEARCH_NODE_PREFERENCE_OPTION_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_OPTION_TESTS))
+def test_search_options_cases(indexed_collection, test_case: StageTestCase):
+    """Run one accepted-option case and check the returned first batch.
+
+    Covers the index, sort, tracking, concurrent, and searchNodePreference
+    stage options collected in SEARCH_OPTION_TESTS.
+    """
+    command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, command)
+    assertResult(result, expected=test_case.expected, msg=test_case.msg, raw_res=True)
+
+
+# Property [View Index Resolution]: a $search against a view resolves the
+# underlying collection's search index and returns the underlying collection's
+# matching documents.
+@pytest.mark.aggregate
+def test_search_view_resolves_underlying_index(indexed_collection):
+    """Check that $search run against a view returns the source collection's matches.
+
+    Creates a view with an empty pipeline over the indexed collection, runs a
+    text search through it, and drops the view again before asserting.
+    """
+    database = indexed_collection.database
+    view_name = "view_over_indexed"
+    database.command({"create": view_name, "viewOn": indexed_collection.name, "pipeline": []})
+    try:
+        view = database[view_name]
+        search_stage = {"$search": {"text": {"query": "quick", "path": "title"}}}
+        result = execute_command(
+            view,
+            {"aggregate": view.name, "pipeline": [search_stage], "cursor": {}},
+        )
+    finally:
+        # Drop the view whether or not the aggregate call raised.
+        database.drop_collection(view_name)
+    assertResult(
+        result,
+        expected={"cursor.firstBatch": [Len(3), *(Contains("_id", i) for i in (1, 3, 4))]},
+        msg="$search over a view should resolve the underlying collection's index for its matches",
+        raw_res=True,
+    )
+
+
+# Property [Index Option Type And Value]: the index option must be a non-empty
+# string (a null index is treated as the default).
+SEARCH_INDEX_OPTION_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"index_type_{tid}",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "index": bad_index}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a {tid} index option as a non-string",
+    )
+    # One representative value per non-string BSON type, keyed by its case-id suffix.
+    for tid, bad_index in {
+        "int32": 1,
+        "int64": Int64(1),
+        "double": 1.5,
+        "bool": True,
+        "object": {"name": "default"},
+        "array": ["default"],
+        "objectid": ObjectId("0123456789abcdef01234567"),
+        "datetime": datetime.datetime(2020, 1, 1),
+        "timestamp": Timestamp(1, 1),
+        "binary": Binary(b"\x01\x02\x03"),
+        "regex": Regex(".*", "i"),
+        "code": Code("function(){}"),
+        "minkey": MinKey(),
+        "maxkey": MaxKey(),
+        "decimal128": DECIMAL128_ONE_AND_HALF,
+    }.items()
+] + [
+    StageTestCase(
+        "index_empty_string",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "index": ""}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject an empty-string index option",
+    ),
+]
+
+# Property [Sort Value Validation]: sort requires at least one field, rejects a
+# direction other than 1 or -1, and rejects a $meta sort key other than
+# searchScore.
+SEARCH_SORT_VALUE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "sort": bad_sort}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject {rejected}",
+    )
+    for case_id, bad_sort, rejected in [
+        ("sort_empty", {}, "a sort document with no sort field"),
+        ("sort_bad_direction", {"n": 2}, "a sort direction other than 1 or -1"),
+        (
+            "sort_bad_meta_key",
+            {"n": {"$meta": "bogus"}},
+            "a $meta sort key other than searchScore",
+        ),
+    ]
+]
+
+# Property [Concurrent Option Type]: the concurrent option must be a boolean, so a
+# value of any non-boolean BSON type is rejected with no coercion. A null concurrent
+# is treated as the default (a success), so it is excluded.
+SEARCH_CONCURRENT_OPTION_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"concurrent_type_{tid}",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, "concurrent": bad_flag}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a {tid} concurrent option as a non-boolean",
+    )
+    # One representative value per non-boolean BSON type, keyed by its case-id suffix.
+    for tid, bad_flag in {
+        "string": "true",
+        "int32": 1,
+        "int64": Int64(1),
+        "double": 1.5,
+        "object": {"a": 1},
+        "array": [True],
+        "objectid": ObjectId("0123456789abcdef01234567"),
+        "datetime": datetime.datetime(2020, 1, 1),
+        "timestamp": Timestamp(1, 1),
+        "binary": Binary(b"\x01\x02\x03"),
+        "regex": Regex(".*", "i"),
+        "code": Code("function(){}"),
+        "minkey": MinKey(),
+        "maxkey": MaxKey(),
+        "decimal128": DECIMAL128_ONE_AND_HALF,
+    }.items()
+]
+
+# Every rejected-option case, in declaration order, for the parametrized test below.
+SEARCH_OPTION_ERROR_TESTS = [
+    *SEARCH_INDEX_OPTION_ERROR_TESTS,
+    *SEARCH_SORT_VALUE_ERROR_TESTS,
+    *SEARCH_CONCURRENT_OPTION_ERROR_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_OPTION_ERROR_TESTS))
+def test_search_options_errors(indexed_collection, test_case: StageTestCase):
+    """Run one rejected index, sort, or concurrent option case and check its error code."""
+    command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_pagination.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_pagination.py
new file mode 100644
index 000000000..33254eea4
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_pagination.py
@@ -0,0 +1,184 @@
+"""Tests for the $search searchAfter and searchBefore pagination options."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Eq,
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+def _search_ids(collection, spec: dict) -> list:
+    """Return the _id of every document the $search spec matches, in result order."""
+    command = {
+        "aggregate": collection.name,
+        # Project down to _id so only the identifiers come back.
+        "pipeline": [{"$search": spec}, {"$project": {"_id": 1}}],
+        "cursor": {},
+    }
+    first_batch = execute_command(collection, command)["cursor"]["firstBatch"]
+    return [doc["_id"] for doc in first_batch]
+
+
+def _sequence_token(collection, spec: dict, position: int) -> str:
+    """Return, as a string, the searchSequenceToken of the result at ``position``."""
+    command = {
+        "aggregate": collection.name,
+        "pipeline": [
+            {"$search": spec},
+            # Surface each result's sequence token under the "tok" field.
+            {"$project": {"tok": {"$meta": "searchSequenceToken"}}},
+        ],
+        "cursor": {},
+    }
+    first_batch = execute_command(collection, command)["cursor"]["firstBatch"]
+    return str(first_batch[position]["tok"])
+
+
+def _expected_id_order(ids: list) -> dict:
+    """Build an assertResult ``expected`` dict pinning firstBatch to exactly these
+    _id values, in this order."""
+    expected: dict = {"cursor.firstBatch": Len(len(ids))}
+    # One positional check per result, keyed by its index in the batch.
+    expected.update({f"cursor.firstBatch.{i}._id": Eq(_id) for i, _id in enumerate(ids)})
+    return expected
+
+
+# Property [searchAfter Pagination]: searchAfter resumes the result stream
+# immediately after the result whose searchSequenceToken is supplied, so paging
+# from the first result's token yields exactly the remaining results in order.
+@pytest.mark.aggregate
+def test_search_after_pages_to_following_results(indexed_collection):
+    """Check that searchAfter with the first result's token yields the remaining results."""
+    spec = {"text": {"query": "quick", "path": "title"}}
+    # Capture the unpaginated order first, then the token of its first entry.
+    full_ids = _search_ids(indexed_collection, spec)
+    first_token = _sequence_token(indexed_collection, spec, 0)
+    paged_pipeline = [
+        {"$search": dict(spec, searchAfter=first_token)},
+        {"$project": {"_id": 1}},
+    ]
+    result = execute_command(
+        indexed_collection,
+        {"aggregate": indexed_collection.name, "pipeline": paged_pipeline, "cursor": {}},
+    )
+    remaining_ids = full_ids[1:]
+    assertResult(
+        result,
+        expected=_expected_id_order(remaining_ids),
+        msg="$search searchAfter should resume immediately after the first result's token",
+        raw_res=True,
+    )
+
+
+# Property [searchBefore Pagination]: searchBefore returns the results preceding
+# the result whose searchSequenceToken is supplied, in reverse result order, so
+# paging from the last result's token yields the earlier results reversed.
+@pytest.mark.aggregate
+def test_search_before_pages_to_preceding_results(indexed_collection):
+    """Check that searchBefore with the last result's token yields the earlier results reversed."""
+    spec = {"text": {"query": "quick", "path": "title"}}
+    # Capture the unpaginated order first, then the token of its last entry.
+    full_ids = _search_ids(indexed_collection, spec)
+    last_token = _sequence_token(indexed_collection, spec, len(full_ids) - 1)
+    paged_pipeline = [
+        {"$search": dict(spec, searchBefore=last_token)},
+        {"$project": {"_id": 1}},
+    ]
+    result = execute_command(
+        indexed_collection,
+        {"aggregate": indexed_collection.name, "pipeline": paged_pipeline, "cursor": {}},
+    )
+    preceding_ids_reversed = full_ids[:-1][::-1]
+    assertResult(
+        result,
+        expected=_expected_id_order(preceding_ids_reversed),
+        msg="$search searchBefore should return the results preceding the last result's token "
+        "in reverse order",
+        raw_res=True,
+    )
+
+
+# Property [Pagination Token Type]: searchAfter and searchBefore are string-only
+# pagination tokens, so a value of any non-string BSON type is rejected with no
+# coercion. A null token is treated as omitted (a default), so it is excluded.
+SEARCH_PAGINATION_TOKEN_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"{option}_type_{tid}",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, option: bad_token}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a {tid} {option} token as a non-string",
+    )
+    for option in ("searchAfter", "searchBefore")
+    # One representative value per non-string BSON type, keyed by its case-id suffix.
+    for tid, bad_token in {
+        "int32": 1,
+        "int64": Int64(1),
+        "double": 1.5,
+        "bool": True,
+        "object": {"a": 1},
+        "array": ["abc"],
+        "objectid": ObjectId("0123456789abcdef01234567"),
+        "datetime": datetime.datetime(2020, 1, 1),
+        "timestamp": Timestamp(1, 1),
+        "binary": Binary(b"\x01\x02\x03"),
+        "regex": Regex(".*", "i"),
+        "code": Code("function(){}"),
+        "minkey": MinKey(),
+        "maxkey": MaxKey(),
+        "decimal128": DECIMAL128_ONE_AND_HALF,
+    }.items()
+]
+
+# Property [Pagination Token Format]: a non-empty string that is not a valid
+# encoded sequence token is rejected as a malformed token value.
+SEARCH_PAGINATION_TOKEN_FORMAT_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"{option}_bad_format",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}, option: "not_a_token"}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a malformed {option} string as an invalid token value",
+    )
+    for option in ("searchAfter", "searchBefore")
+]
+
+# Type errors first, then format errors, for the parametrized test below.
+SEARCH_PAGINATION_ERROR_TESTS = [
+    *SEARCH_PAGINATION_TOKEN_TYPE_ERROR_TESTS,
+    *SEARCH_PAGINATION_TOKEN_FORMAT_ERROR_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_PAGINATION_ERROR_TESTS))
+def test_search_pagination_errors(indexed_collection, test_case: StageTestCase):
+    """Run one bad searchAfter/searchBefore token case and check its error code."""
+    command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_phrase.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_phrase.py
new file mode 100644
index 000000000..162d2402a
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_phrase.py
@@ -0,0 +1,224 @@
+"""Tests for the $search phrase operator."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Contains,
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Property [phrase slop Proximity]: phrase.slop bounds how far apart the query
+# terms may sit and still match: slop 0 requires strict adjacency and a positive
+# integer permits that many intervening positions.
+SEARCH_PHRASE_SLOP_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        # Every case searches the "title" path; only the query and the extra
+        # phrase options (slop, score) vary.
+        pipeline=[{"$search": {"phrase": {"query": query, "path": "title", **options}}}],
+        expected={"cursor.firstBatch": expected_batch},
+        msg=msg,
+    )
+    for case_id, query, options, expected_batch, msg in [
+        (
+            "phrase_slop_0_adjacent",
+            "quick brown",
+            {"slop": 0},
+            [Len(1), Contains("_id", 1)],
+            "$search phrase with slop 0 should match adjacent query terms",
+        ),
+        (
+            "phrase_score_boost",
+            "quick brown",
+            {"slop": 0, "score": {"boost": {"value": 2.0}}},
+            [Len(1), Contains("_id", 1)],
+            "$search phrase should accept a score modifier and still return its matches",
+        ),
+        (
+            "phrase_slop_0_excludes_gap",
+            "quick fox",
+            {"slop": 0},
+            Len(0),
+            "$search phrase with slop 0 should require strict adjacency and exclude a "
+            "document with an intervening token",
+        ),
+        (
+            "phrase_slop_positive_int",
+            "quick fox",
+            {"slop": 1},
+            [Len(1), Contains("_id", 1)],
+            "$search phrase with a positive integer slop should permit the intervening token",
+        ),
+        (
+            "phrase_slop_whole_double",
+            "quick fox",
+            {"slop": 2.0},
+            [Len(1), Contains("_id", 1)],
+            "$search phrase should accept a whole-number double slop as the proximity bound",
+        ),
+        (
+            "phrase_slop_large",
+            "quick fox",
+            {"slop": 1_000_000},
+            [Len(1), Contains("_id", 1)],
+            "$search phrase should accept a very large slop as the proximity bound",
+        ),
+    ]
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_PHRASE_SLOP_TESTS))
+def test_search_phrase_slop_cases(indexed_collection, test_case: StageTestCase):
+    """Run one phrase slop/score case and check the returned first batch."""
+    command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, command)
+    assertResult(result, expected=test_case.expected, msg=test_case.msg, raw_res=True)
+
+
+# Property [phrase query Validation]: phrase.query is required and must be a
+# string or array of non-null strings.
+SEARCH_PHRASE_QUERY_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "phrase_query_missing",
+        pipeline=[{"$search": {"phrase": {"path": "title"}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search phrase should reject an operator missing the required query",
+    ),
+    *[
+        StageTestCase(
+            f"phrase_query_non_string_{tid}",
+            pipeline=[{"$search": {"phrase": {"query": bad_query, "path": "title"}}}],
+            error_code=UNKNOWN_ERROR,
+            msg=f"$search phrase should reject a {tid} query as a non-string",
+        )
+        for tid, bad_query in {
+            "int32": 1,
+            "int64": Int64(1),
+            "double": 1.5,
+            "bool": True,
+            "object": {"q": "quick"},
+            "objectid": ObjectId("0123456789abcdef01234567"),
+            "datetime": datetime.datetime(2020, 1, 1),
+            "timestamp": Timestamp(1, 1),
+            "binary": Binary(b"\x01\x02\x03"),
+            "regex": Regex(".*", "i"),
+            "code": Code("function(){}"),
+            "minkey": MinKey(),
+            "maxkey": MaxKey(),
+            "decimal128": DECIMAL128_ONE_AND_HALF,
+        }.items()
+    ],
+    # An array query is only valid when every element is a string.
+    *[
+        StageTestCase(
+            f"phrase_query_array_element_{suffix}",
+            pipeline=[
+                {"$search": {"phrase": {"query": ["quick brown", element], "path": "title"}}},
+            ],
+            error_code=UNKNOWN_ERROR,
+            msg=f"$search phrase should reject a {described} query-array element",
+        )
+        for suffix, element, described in [
+            ("null", None, "null"),
+            ("non_string", 1, "non-string"),
+        ]
+    ],
+]
+
+# Property [phrase path Validation]: phrase.path is required and must be a string,
+# document, or array of paths.
+SEARCH_PHRASE_PATH_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "phrase_path_missing",
+        pipeline=[{"$search": {"phrase": {"query": "quick brown"}}}],
+        error_code=UNKNOWN_ERROR,
+        msg="$search phrase should reject an operator missing the required path",
+    ),
+    *[
+        StageTestCase(
+            f"phrase_path_{tid}",
+            pipeline=[{"$search": {"phrase": {"query": "quick brown", "path": bad_path}}}],
+            error_code=UNKNOWN_ERROR,
+            msg=f"$search phrase should reject a {tid} path as neither a string, document, "
+            "nor array",
+        )
+        for tid, bad_path in {
+            "int32": 1,
+            "int64": Int64(1),
+            "double": 1.5,
+            "bool": True,
+            "objectid": ObjectId("0123456789abcdef01234567"),
+            "datetime": datetime.datetime(2020, 1, 1),
+            "timestamp": Timestamp(1, 1),
+            "binary": Binary(b"\x01\x02\x03"),
+            "regex": Regex(".*", "i"),
+            "code": Code("function(){}"),
+            "minkey": MinKey(),
+            "maxkey": MaxKey(),
+            "decimal128": DECIMAL128_ONE_AND_HALF,
+        }.items()
+    ],
+]
+
+# Property [phrase fuzzy Rejection]: phrase does not accept a fuzzy sub-field.
+SEARCH_PHRASE_FUZZY_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "phrase_fuzzy_unrecognized",
+        pipeline=[
+            {"$search": {"phrase": {"query": "quick brown", "path": "title", "fuzzy": {}}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a phrase.fuzzy sub-field as unrecognized",
+    ),
+]
+
+SEARCH_PHRASE_ERROR_TESTS = (
+    SEARCH_PHRASE_QUERY_ERROR_TESTS
+    + SEARCH_PHRASE_PATH_ERROR_TESTS
+    + SEARCH_PHRASE_FUZZY_ERROR_TESTS
+)
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_PHRASE_ERROR_TESTS))
+def test_search_phrase_errors(indexed_collection, test_case: StageTestCase):
+    """Test $search phrase rejects bad query/path values and an unsupported fuzzy sub-field."""
+    result = execute_command(
+        indexed_collection,
+        {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}},
+    )
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_range.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_range.py
new file mode 100644
index 000000000..097c319eb
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_range.py
@@ -0,0 +1,683 @@
+"""Tests for the $search range operator."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import (
+    create_search_index,
+)
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework import fixtures
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Contains,
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+    DOUBLE_MAX,
+    DOUBLE_MIN_SUBNORMAL,
+    DOUBLE_NEGATIVE_ZERO,
+    FLOAT_INFINITY,
+    FLOAT_NAN,
+    FLOAT_NEGATIVE_INFINITY,
+    INT32_MIN,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Seed documents: _id 1-5 carry a number, 6-9 a date, 10-13 a token string, and
+# 14-16 an ObjectId, so each range case targets one field family.
+_RANGE_DOCS = [
+    {"_id": 1, "num": -5},
+    {"_id": 2, "num": 0},
+    {"_id": 3, "num": 5},
+    {"_id": 4, "num": 10},
+    {"_id": 5, "num": 20},
+    {"_id": 6, "dt": datetime.datetime(1960, 1, 1)},  # pre-epoch
+    {"_id": 7, "dt": datetime.datetime(1970, 1, 1)},  # epoch
+    {"_id": 8, "dt": datetime.datetime(2020, 1, 1, 0, 0, 0, 123000)},  # sub-second
+    {"_id": 9, "dt": datetime.datetime(9999, 12, 31)},  # far future
+    {"_id": 10, "tok": "mango"},  # lowercase token
+    {"_id": 11, "tok": "Mango"},  # capitalized token, sorts before lowercase
+    {"_id": 12, "tok": ""},  # empty-string token, sorts first
+    {"_id": 13, "tok": "papaya"},  # token sorting after mango
+    {"_id": 14, "oid": ObjectId("000000000000000000000001")},
+    {"_id": 15, "oid": ObjectId("000000000000000000000002")},
+    {"_id": 16, "oid": ObjectId("000000000000000000000003")},
+]
+
+# Static (non-dynamic) mapping: only the four fields listed here are indexed.
+_RANGE_INDEX_DEFINITION = {
+    "mappings": {
+        "dynamic": False,
+        "fields": {
+            "num": {"type": "number"},
+            "dt": {"type": "date"},
+            "tok": {"type": "token"},
+            "oid": {"type": "objectId"},
+        },
+    }
+}
+
+
+@pytest.fixture(scope="module")
+def range_collection(engine_client, worker_id):
+    """Yield a module-scoped collection backed by a static search index.
+
+    The index maps the numeric, date, token, and objectId fields of
+    _RANGE_DOCS. The collection is shared read-only across the range cases so
+    the index is built and polled once. The database is cleaned up on
+    teardown, including when seeding or index creation fails before the
+    collection is yielded.
+    """
+    db_name = fixtures.generate_database_name("stages_search_range", worker_id)
+    # Start from a clean slate in case an earlier run left the database behind.
+    fixtures.cleanup_database(engine_client, db_name)
+    try:
+        db = engine_client[db_name]
+        coll = db["range_op"]
+        coll.insert_many(_RANGE_DOCS)
+        create_search_index(coll, _RANGE_INDEX_DEFINITION)
+        yield coll
+    finally:
+        # pytest does not run the code after `yield` when setup raises, so the
+        # cleanup lives in `finally` to avoid leaving a half-built database.
+        fixtures.cleanup_database(engine_client, db_name)
+
+
+# Property [Range Numeric Bound Type And Value Acceptance]: range accepts int32,
+# int64, and double numeric bounds (matching identically for the same value)
+# across the full numeric range with no error.
+SEARCH_RANGE_NUMERIC_BOUND_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "numeric_bound_int32",
+        pipeline=[{"$search": {"range": {"path": "num", "lte": 5}}}],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+            ]
+        },
+        msg="$search range should apply an int32 numeric bound",
+    ),
+    StageTestCase(
+        "range_score_boost",
+        pipeline=[
+            {"$search": {"range": {"path": "num", "lte": 5, "score": {"boost": {"value": 2.0}}}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+            ]
+        },
+        msg="$search range should accept a score modifier and still return its matches",
+    ),
+    StageTestCase(
+        "numeric_bound_int64",
+        pipeline=[
+            {"$search": {"range": {"path": "num", "lte": Int64(5)}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+            ]
+        },
+        msg="$search range should apply an int64 numeric bound identically to int32",
+    ),
+    StageTestCase(
+        "numeric_bound_double",
+        pipeline=[{"$search": {"range": {"path": "num", "lte": 5.0}}}],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+            ]
+        },
+        msg="$search range should apply a double numeric bound identically to int32",
+    ),
+    StageTestCase(
+        "numeric_bound_negative",
+        pipeline=[{"$search": {"range": {"path": "num", "lte": -5}}}],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search range should apply a negative numeric bound",
+    ),
+    StageTestCase(
+        "numeric_bound_zero",
+        pipeline=[{"$search": {"range": {"path": "num", "lt": 0}}}],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search range should apply a zero numeric bound",
+    ),
+    StageTestCase(
+        "numeric_bound_negative_zero",
+        pipeline=[
+            {"$search": {"range": {"path": "num", "lt": DOUBLE_NEGATIVE_ZERO}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search range should treat a -0.0 bound identically to a 0 bound",
+    ),
+    StageTestCase(
+        "numeric_bound_subnormal",
+        # The smallest positive subnormal double is a tiny positive bound, so it
+        # excludes the 0 and -5 docs that a 0 bound would otherwise admit.
+        pipeline=[
+            {"$search": {"range": {"path": "num", "gte": DOUBLE_MIN_SUBNORMAL}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 3),
+                Contains("_id", 4),
+                Contains("_id", 5),
+            ]
+        },
+        msg="$search range should apply the smallest subnormal double as a tiny positive bound",
+    ),
+    StageTestCase(
+        "numeric_bound_int32_min",
+        pipeline=[
+            {"$search": {"range": {"path": "num", "gte": INT32_MIN}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(5),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+                Contains("_id", 4),
+                Contains("_id", 5),
+            ]
+        },
+        msg="$search range should apply an int32-min bound, admitting every greater document",
+    ),
+    StageTestCase(
+        "numeric_bound_double_max",
+        pipeline=[
+            {"$search": {"range": {"path": "num", "gt": DOUBLE_MAX}}},
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search range should apply a DBL_MAX bound, matching nothing in a finite sample",
+    ),
+]
+
+# Property [Range Datetime Bound Full Precision]: a datetime bound on a date path
+# honors full millisecond precision.
+SEARCH_RANGE_DATE_BOUND_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": {"range": {"path": "dt", bound_op: bound}}}],
+        # Exactly the listed _ids are expected, in any order.
+        expected={"cursor.firstBatch": [Len(len(ids)), *(Contains("_id", i) for i in ids)]},
+        msg=msg,
+    )
+    for case_id, bound_op, bound, ids, msg in [
+        (
+            "date_bound_epoch",
+            "gte",
+            datetime.datetime(1970, 1, 1),
+            (7, 8, 9),
+            "$search range should apply an epoch bound, excluding the pre-epoch document",
+        ),
+        (
+            "date_bound_pre_epoch",
+            "lt",
+            datetime.datetime(1970, 1, 1),
+            (6,),
+            "$search range should select the pre-epoch document below an epoch bound",
+        ),
+        (
+            "date_bound_far_future",
+            "gte",
+            datetime.datetime(9999, 12, 31),
+            (9,),
+            "$search range should apply a far-future year-9999 bound",
+        ),
+        (
+            "date_bound_millisecond_exact",
+            "gte",
+            datetime.datetime(2020, 1, 1, 0, 0, 0, 123000),
+            (8, 9),
+            "$search range should select the millisecond-precision document at its exact bound",
+        ),
+        (
+            "date_bound_millisecond_after",
+            "gte",
+            # One millisecond past the stored sub-second time excludes that document,
+            # so the bound's millisecond component is honored rather than truncated.
+            datetime.datetime(2020, 1, 1, 0, 0, 0, 124000),
+            (9,),
+            "$search range should honor the millisecond component of a datetime bound",
+        ),
+    ]
+]
+
+# Property [Range Inclusive And Exclusive Bounds]: gte and lte include the
+# boundary value while gt and lt exclude it, and an inclusive degenerate interval
+# (gte equal to lte) matches the boundary value.
+SEARCH_RANGE_INCLUSIVE_EXCLUSIVE_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": {"range": {"path": "num", **bounds}}}],
+        expected={"cursor.firstBatch": [Len(len(ids)), *(Contains("_id", i) for i in ids)]},
+        msg=msg,
+    )
+    for case_id, bounds, ids, msg in [
+        (
+            "inclusive_gte",
+            {"gte": 10},
+            (4, 5),
+            "$search range gte should include the boundary value",
+        ),
+        (
+            "exclusive_gt",
+            {"gt": 10},
+            (5,),
+            "$search range gt should exclude the boundary value",
+        ),
+        (
+            "inclusive_lte",
+            {"lte": 10},
+            (1, 2, 3, 4),
+            "$search range lte should include the boundary value",
+        ),
+        (
+            "exclusive_lt",
+            {"lt": 10},
+            (1, 2, 3),
+            "$search range lt should exclude the boundary value",
+        ),
+        (
+            "degenerate_inclusive_interval",
+            {"gte": 10, "lte": 10},
+            (4,),
+            "$search range should match the boundary value for an inclusive degenerate interval",
+        ),
+    ]
+]
+
+# Property [Range Non-Finite Bounds]: a +inf bound matches no documents, a -inf
+# bound matches every document, and a NaN bound matches no documents, all with no
+# error.
+SEARCH_RANGE_NON_FINITE_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"non_finite_{suffix}",
+        pipeline=[
+            {"$search": {"range": {"path": "num", "gte": bound}}},
+        ],
+        expected={"cursor.firstBatch": expected_batch},
+        msg=f"$search range should match {outcome}",
+    )
+    for suffix, bound, expected_batch, outcome in [
+        ("positive_infinity", FLOAT_INFINITY, Len(0), "no documents for a +inf bound"),
+        (
+            "negative_infinity",
+            FLOAT_NEGATIVE_INFINITY,
+            [Len(5), *(Contains("_id", i) for i in (1, 2, 3, 4, 5))],
+            "every document for a -inf bound",
+        ),
+        ("nan", FLOAT_NAN, Len(0), "no documents for a NaN bound"),
+    ]
+]
+
+# Property [Range Type-Mismatched Bound Silent No-Match]: a string bound on a
+# numeric path and a numeric bound on a token/string path each return no documents
+# with no error.
+SEARCH_RANGE_TYPE_MISMATCH_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": {"range": {"path": path, "gte": bound}}}],
+        expected={"cursor.firstBatch": Len(0)},
+        msg=f"$search range should return no documents and no error for {mismatch}",
+    )
+    for case_id, path, bound, mismatch in [
+        (
+            "mismatch_string_bound_numeric_path",
+            "num",
+            "5",
+            "a string bound on a numeric path",
+        ),
+        (
+            "mismatch_numeric_bound_token_path",
+            "tok",
+            5,
+            "a numeric bound on a token path",
+        ),
+    ]
+]
+
+# Property [Range Lexicographic Case-Sensitive Order]: over a token-mapped path,
+# string range bounds compare in raw code-point order with no case folding.
SEARCH_RANGE_LEXICOGRAPHIC_ORDER_TESTS: list[StageTestCase] = [
    StageTestCase(
        "lexicographic_capital_in_upper_range",
        pipeline=[{"$search": {"range": {"path": "tok", "gte": "A", "lte": "z"}}}],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 10),
                Contains("_id", 11),
                Contains("_id", 13),
            ]
        },
        msg="$search range should include a capitalized token within an A-to-z code-point range",
    ),
    StageTestCase(
        "lexicographic_capital_excluded_below_lowercase",
        pipeline=[{"$search": {"range": {"path": "tok", "gte": "m", "lte": "z"}}}],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 10), Contains("_id", 13)]},
        msg="$search range should exclude a capitalized token below a lowercase lower bound, "
        "confirming case sensitivity",
    ),
]

# Property [Range String Inclusive And Exclusive Bounds]: gte and lte include a
# string bound while gt and lt exclude it, and an inclusive degenerate interval
# matches the boundary token.
SEARCH_RANGE_STRING_INCLUSIVE_EXCLUSIVE_TESTS: list[StageTestCase] = [
    StageTestCase(
        "string_gte_lte_include_bounds",
        pipeline=[{"$search": {"range": {"path": "tok", "gte": "mango", "lte": "papaya"}}}],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 10), Contains("_id", 13)]},
        msg="$search range gte and lte should include both string boundary tokens",
    ),
    StageTestCase(
        "string_gt_excludes_lower",
        pipeline=[{"$search": {"range": {"path": "tok", "gt": "mango", "lte": "papaya"}}}],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 13)]},
        msg="$search range gt should exclude the lower string boundary token",
    ),
    StageTestCase(
        "string_lt_excludes_upper",
        pipeline=[{"$search": {"range": {"path": "tok", "gte": "mango", "lt": "papaya"}}}],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 10)]},
        msg="$search range lt should exclude the upper string boundary token",
    ),
    StageTestCase(
        "string_degenerate_inclusive",
        pipeline=[{"$search": {"range": {"path": "tok", "gte": "mango", "lte": "mango"}}}],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 10)]},
        msg="$search range should match the boundary token for an inclusive degenerate string "
        "interval",
    ),
]

# Property [Range Empty-String Bounds]: an empty-string lower bound matches every
# stored token while an empty-string upper bound matches only the stored
# empty-string token.
SEARCH_RANGE_EMPTY_STRING_BOUND_TESTS: list[StageTestCase] = [
    StageTestCase(
        "empty_string_gte_matches_all",
        pipeline=[{"$search": {"range": {"path": "tok", "gte": ""}}}],
        expected={
            "cursor.firstBatch": [
                Len(4),
                Contains("_id", 10),
                Contains("_id", 11),
                Contains("_id", 12),
                Contains("_id", 13),
            ]
        },
        msg="$search range should match every stored token for an empty-string lower bound",
    ),
    StageTestCase(
        "empty_string_lte_matches_empty_only",
        pipeline=[{"$search": {"range": {"path": "tok", "lte": ""}}}],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 12)]},
        msg="$search range should match only the empty-string token for an empty-string upper "
        "bound",
    ),
]

# Property [Range ObjectId Bounds]: ObjectId is a supported bound type, so over an
# objectId-mapped path the bounds order by ObjectId value, with gte/lte inclusive
# and gt/lt exclusive.
SEARCH_RANGE_OBJECTID_BOUND_TESTS: list[StageTestCase] = [
    # All three cases share the same boundary ObjectId (…0002) and differ only
    # in the comparison operator applied to it.
    StageTestCase(
        "objectid_gte",
        pipeline=[
            {"$search": {"range": {"path": "oid", "gte": ObjectId("000000000000000000000002")}}},
        ],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 15), Contains("_id", 16)]},
        msg="$search range gte should include the boundary ObjectId and every greater one",
    ),
    StageTestCase(
        "objectid_gt",
        pipeline=[
            {"$search": {"range": {"path": "oid", "gt": ObjectId("000000000000000000000002")}}},
        ],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 16)]},
        msg="$search range gt should exclude the boundary ObjectId",
    ),
    StageTestCase(
        "objectid_lte",
        pipeline=[
            {"$search": {"range": {"path": "oid", "lte": ObjectId("000000000000000000000002")}}},
        ],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 14), Contains("_id", 15)]},
        msg="$search range lte should include the boundary ObjectId and every lesser one",
    ),
]

# Property [Range doesNotAffect Option]: range recognizes a string doesNotAffect
# option (unlike text or near, which reject the field), accepting it and still
# returning its matches.
SEARCH_RANGE_DOES_NOT_AFFECT_TESTS: list[StageTestCase] = [
    StageTestCase(
        "range_does_not_affect_string",
        pipeline=[{"$search": {"range": {"path": "num", "gte": 10, "doesNotAffect": "score"}}}],
        expected={"cursor.firstBatch": [Len(2), Contains("_id", 4), Contains("_id", 5)]},
        msg="$search range should accept a string doesNotAffect option and still return its "
        "matches",
    ),
]

# Every passing range case list, flattened in the order listed, for parametrization.
SEARCH_RANGE_TESTS = [
    *SEARCH_RANGE_NUMERIC_BOUND_TESTS,
    *SEARCH_RANGE_DATE_BOUND_TESTS,
    *SEARCH_RANGE_INCLUSIVE_EXCLUSIVE_TESTS,
    *SEARCH_RANGE_NON_FINITE_TESTS,
    *SEARCH_RANGE_TYPE_MISMATCH_TESTS,
    *SEARCH_RANGE_LEXICOGRAPHIC_ORDER_TESTS,
    *SEARCH_RANGE_STRING_INCLUSIVE_EXCLUSIVE_TESTS,
    *SEARCH_RANGE_EMPTY_STRING_BOUND_TESTS,
    *SEARCH_RANGE_OBJECTID_BOUND_TESTS,
    *SEARCH_RANGE_DOES_NOT_AFFECT_TESTS,
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_RANGE_TESTS))
def test_search_range_cases(range_collection, test_case: StageTestCase):
    """Run one successful $search range case and check its matched documents.

    The case's pipeline is sent as an aggregate command against
    ``range_collection``. The expectations address the reply through its
    ``cursor.firstBatch`` path, so the result is asserted with ``raw_res=True``.
    """
    aggregate_command = {
        "aggregate": range_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(range_collection, aggregate_command)
    assertResult(reply, expected=test_case.expected, msg=test_case.msg, raw_res=True)


# Property [Range Bound Unsupported Type]: number, string, date, and ObjectId are
# the supported bound types, so a bound of any other type is rejected regardless
# of the path type.
SEARCH_RANGE_BOUND_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"bound_type_{tid}",
        pipeline=[{"$search": {"range": {"path": "num", "gte": val}}}],
        error_code=UNKNOWN_ERROR,
        msg=f"$search range should reject a {tid} bound as an unsupported type",
    )
    # One case per (type label, sample value); the label feeds the test id and message.
    for tid, val in (
        ("bool", True),
        ("object", {"a": 1}),
        ("array", [1, 2]),
        ("timestamp", Timestamp(1, 1)),
        ("binary", Binary(b"\x01\x02\x03")),
        ("regex", Regex(".*", "i")),
        ("code", Code("function(){}")),
        ("minkey", MinKey()),
        ("maxkey", MaxKey()),
        ("decimal128", DECIMAL128_ONE_AND_HALF),
        ("null", None),
    )
]

# Property [Range String Bound On Date Path]: a string bound on a date path is
# rejected as needing a token index, unlike a string bound on a numeric path
# which returns no documents with no error.
SEARCH_RANGE_STRING_BOUND_DATE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "string_bound_date_path",
        pipeline=[{"$search": {"range": {"path": "dt", "gte": "2020-01-01"}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search range should reject a string bound on a date path as needing a token index",
    ),
]

# Property [Range Required Path]: range.path is required, so a spec omitting it
# (even with a valid bound) is rejected.
SEARCH_RANGE_REQUIRED_PATH_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "range_path_missing",
        pipeline=[{"$search": {"range": {"gte": 5}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search range should reject a spec that omits the required path",
    ),
]

# Property [Range Requires A Bound]: a range specifying none of lt/lte/gt/gte is
# rejected.
SEARCH_RANGE_NO_BOUND_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "no_bound",
        pipeline=[{"$search": {"range": {"path": "num"}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search range should reject a spec that specifies no bound",
    ),
]

# Property [Range Single Bound Per Direction]: specifying both bounds for one
# direction (gt and gte, or lt and lte) is rejected.
SEARCH_RANGE_DUPLICATE_BOUND_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "duplicate_lower_bound",
        pipeline=[{"$search": {"range": {"path": "num", "gt": 0, "gte": 0}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search range should reject specifying both gt and gte",
    ),
    StageTestCase(
        "duplicate_upper_bound",
        pipeline=[{"$search": {"range": {"path": "num", "lt": 10, "lte": 10}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search range should reject specifying both lt and lte",
    ),
]

# Property [Range Interval Validity]: an inverted interval (gte greater than lte)
# and an exclusive degenerate interval (gt equal to lt) are rejected.
+SEARCH_RANGE_INTERVAL_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "inverted_interval", + pipeline=[ + {"$search": {"range": {"path": "num", "gte": 10, "lte": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search range should reject an inverted interval where gte exceeds lte", + ), + StageTestCase( + "exclusive_degenerate_interval", + pipeline=[ + {"$search": {"range": {"path": "num", "gt": 5, "lt": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search range should reject an exclusive degenerate interval where gt equals lt", + ), +] + +SEARCH_RANGE_ERROR_TESTS = ( + SEARCH_RANGE_BOUND_TYPE_ERROR_TESTS + + SEARCH_RANGE_STRING_BOUND_DATE_ERROR_TESTS + + SEARCH_RANGE_REQUIRED_PATH_ERROR_TESTS + + SEARCH_RANGE_NO_BOUND_ERROR_TESTS + + SEARCH_RANGE_DUPLICATE_BOUND_ERROR_TESTS + + SEARCH_RANGE_INTERVAL_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_RANGE_ERROR_TESTS)) +def test_search_range_errors(range_collection, test_case: StageTestCase): + """Test $search range rejects unsupported bound types and invalid bound intervals.""" + result = execute_command( + range_collection, + {"aggregate": range_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_regex.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_regex.py new file mode 100644 index 000000000..7f3e40d35 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_regex.py @@ -0,0 +1,319 @@ +"""Tests for the $search regex operator.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) 
from documentdb_tests.framework.assertions import assertResult
from documentdb_tests.framework.error_codes import (
    UNKNOWN_ERROR,
)
from documentdb_tests.framework.executor import execute_command
from documentdb_tests.framework.parametrize import pytest_params
from documentdb_tests.framework.property_checks import (
    Contains,
    Len,
)
from documentdb_tests.framework.test_constants import (
    DECIMAL128_ONE_AND_HALF,
    REGEX_PATTERN_LIMIT_BYTES,
)

pytestmark = pytest.mark.requires(search=True)


# Property [Regex Analyzed Path Matching]: with allowAnalyzedField true, the regex
# operator matches against an analyzed path's tokens, returning the documents
# whose token satisfies the pattern.
SEARCH_REGEX_MATCHING_TESTS: list[StageTestCase] = [
    StageTestCase(
        "regex_analyzed_prefix",
        pipeline=[
            {"$search": {"regex": {"query": "qu.*", "path": "title", "allowAnalyzedField": True}}},
        ],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 1),
                Contains("_id", 3),
                Contains("_id", 4),
            ]
        },
        msg="$search regex should match analyzed-path tokens against the pattern when "
        "allowAnalyzedField is true",
    ),
    StageTestCase(
        "regex_score_boost",
        pipeline=[
            {
                "$search": {
                    "regex": {
                        "query": "qu.*",
                        "path": "title",
                        "allowAnalyzedField": True,
                        "score": {"boost": {"value": 2.0}},
                    }
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 1),
                Contains("_id", 3),
                Contains("_id", 4),
            ]
        },
        msg="$search regex should accept a score modifier and still return its matches",
    ),
    StageTestCase(
        "regex_analyzed_distinct_token",
        pipeline=[
            {"$search": {"regex": {"query": "tur.*", "path": "title", "allowAnalyzedField": True}}},
        ],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 2)]},
        msg="$search regex should match only the documents whose analyzed token satisfies "
        "the pattern",
    ),
]

# Pattern lengths straddling REGEX_PATTERN_LIMIT_BYTES, plus one far larger length.
_REGEX_PATTERN_LENGTHS = (
    REGEX_PATTERN_LIMIT_BYTES - 1,
    REGEX_PATTERN_LIMIT_BYTES,
    REGEX_PATTERN_LIMIT_BYTES + 1,
    100_000,
)

# Property [Regex Pattern Length]: the regex operator imposes no byte-based
# pattern-length limit.
SEARCH_REGEX_PATTERN_LENGTH_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"regex_pattern_length_{n}",
        pipeline=[
            {"$search": {"regex": {"query": "a" * n, "path": "title", "allowAnalyzedField": True}}},
        ],
        expected={"cursor.firstBatch": Len(0)},
        msg=f"$search regex should accept a {n}-byte pattern with no byte-based length limit",
    )
    for n in _REGEX_PATTERN_LENGTHS
]

# Property [Regex Query Array OR]: regex.query accepts an array of patterns, matching
# the union of the documents matched by each element pattern.
SEARCH_REGEX_QUERY_ARRAY_TESTS: list[StageTestCase] = [
    StageTestCase(
        "regex_query_array_or",
        pipeline=[
            {
                "$search": {
                    "regex": {
                        "query": ["qu.*", "tur.*"],
                        "path": "title",
                        "allowAnalyzedField": True,
                    }
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(4),
                Contains("_id", 1),
                Contains("_id", 2),
                Contains("_id", 3),
                Contains("_id", 4),
            ]
        },
        msg="$search regex should match the union of a multi-element query array's patterns",
    ),
]

# Every passing regex case list, flattened in the order listed, for parametrization.
SEARCH_REGEX_TESTS = [
    *SEARCH_REGEX_MATCHING_TESTS,
    *SEARCH_REGEX_PATTERN_LENGTH_TESTS,
    *SEARCH_REGEX_QUERY_ARRAY_TESTS,
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_REGEX_TESTS))
def test_search_regex_cases(indexed_collection, test_case: StageTestCase):
    """Run one successful $search regex case and check its matched documents.

    Covers the matching, pattern-length, and query-array case lists. The
    expectations address the reply through its ``cursor.firstBatch`` path, so
    the result is asserted with ``raw_res=True``.
    """
    aggregate_command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(indexed_collection, aggregate_command)
    assertResult(reply, expected=test_case.expected, msg=test_case.msg, raw_res=True)


# Property [Regex Analyzed Path Rejection]: regex rejects a path that resolves to
# an analyzed (non-keyword) field unless allowAnalyzedField is true.
SEARCH_REGEX_ANALYZED_PATH_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "regex_analyzed_path_no_flag",
        pipeline=[{"$search": {"regex": {"query": "qu.*", "path": "title"}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search regex should reject an analyzed path when allowAnalyzedField is omitted",
    ),
]

# (type label, sample value) pairs supplied as a non-string regex.query.
_REGEX_QUERY_NON_STRING_VALUES = (
    ("int32", 1),
    ("int64", Int64(1)),
    ("double", 1.5),
    ("bool", True),
    ("object", {"q": "qu.*"}),
    ("objectid", ObjectId("0123456789abcdef01234567")),
    ("datetime", datetime.datetime(2020, 1, 1)),
    ("timestamp", Timestamp(1, 1)),
    ("binary", Binary(b"\x01\x02\x03")),
    ("regex", Regex(".*", "i")),
    ("code", Code("function(){}")),
    ("minkey", MinKey()),
    ("maxkey", MaxKey()),
    ("decimal128", DECIMAL128_ONE_AND_HALF),
)

# Property [Regex query Validation]: regex.query is required and must be a string
# or a non-empty array of non-null strings.
SEARCH_REGEX_QUERY_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "regex_query_missing",
        pipeline=[{"$search": {"regex": {"path": "title", "allowAnalyzedField": True}}}],
        error_code=UNKNOWN_ERROR,
        msg="$search regex should reject an operator missing the required query",
    ),
    *(
        StageTestCase(
            f"regex_query_non_string_{tid}",
            pipeline=[
                {"$search": {"regex": {"query": val, "path": "title", "allowAnalyzedField": True}}},
            ],
            error_code=UNKNOWN_ERROR,
            msg=f"$search regex should reject a {tid} query as a non-string",
        )
        for tid, val in _REGEX_QUERY_NON_STRING_VALUES
    ),
    StageTestCase(
        "regex_query_empty_array",
        pipeline=[
            {"$search": {"regex": {"query": [], "path": "title", "allowAnalyzedField": True}}}
        ],
        error_code=UNKNOWN_ERROR,
        msg="$search regex should reject an empty-array query",
    ),
    StageTestCase(
        "regex_query_array_element_null",
        pipeline=[
            {
                "$search": {
                    "regex": {"query": ["qu.*", None], "path": "title", "allowAnalyzedField": True}
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg="$search regex should reject a null query-array element",
    ),
    StageTestCase(
        "regex_query_array_element_non_string",
        pipeline=[
            {
                "$search": {
                    "regex": {"query": ["qu.*", 1], "path": "title", "allowAnalyzedField": True}
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg="$search regex should reject a non-string query-array element",
    ),
]

# (type label, sample value) pairs supplied as a non-boolean allowAnalyzedField.
_REGEX_ALLOW_ANALYZED_NON_BOOL_VALUES = (
    ("string", "true"),
    ("int32", 1),
    ("int64", Int64(1)),
    ("double", 1.5),
    ("object", {"a": 1}),
    ("array", [True]),
    ("objectid", ObjectId("0123456789abcdef01234567")),
    ("datetime", datetime.datetime(2020, 1, 1)),
    ("timestamp", Timestamp(1, 1)),
    ("binary", Binary(b"\x01\x02\x03")),
    ("regex", Regex(".*", "i")),
    ("code", Code("function(){}")),
    ("minkey", MinKey()),
    ("maxkey", MaxKey()),
    ("decimal128", DECIMAL128_ONE_AND_HALF),
)

# Property [Regex allowAnalyzedField Type]: allowAnalyzedField must be a boolean.
SEARCH_REGEX_ALLOW_ANALYZED_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"regex_allow_analyzed_{tid}",
        pipeline=[
            {"$search": {"regex": {"query": "qu.*", "path": "title", "allowAnalyzedField": val}}},
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$search regex should reject a {tid} allowAnalyzedField as a non-boolean",
    )
    for tid, val in _REGEX_ALLOW_ANALYZED_NON_BOOL_VALUES
]

# Property [Regex path Validation]: regex.path is required and must be a string,
# document, or array of paths.
+SEARCH_REGEX_PATH_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "regex_path_missing", + pipeline=[{"$search": {"regex": {"query": "qu.*", "allowAnalyzedField": True}}}], + error_code=UNKNOWN_ERROR, + msg="$search regex should reject an operator missing the required path", + ), + *[ + StageTestCase( + f"regex_path_{tid}", + pipeline=[ + {"$search": {"regex": {"query": "qu.*", "path": val, "allowAnalyzedField": True}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search regex should reject a {tid} path as neither a string, document, " + "nor array", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +SEARCH_REGEX_ERROR_TESTS = ( + SEARCH_REGEX_ANALYZED_PATH_ERROR_TESTS + + SEARCH_REGEX_QUERY_ERROR_TESTS + + SEARCH_REGEX_ALLOW_ANALYZED_TYPE_ERROR_TESTS + + SEARCH_REGEX_PATH_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_REGEX_ERROR_TESTS)) +def test_search_regex_errors(indexed_collection, test_case: StageTestCase): + """Test $search regex rejects analyzed paths and bad query/allowAnalyzedField/path values.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_score.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_score.py new file mode 100644 index 000000000..fb3915e5e --- /dev/null +++ 
b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_score.py @@ -0,0 +1,306 @@ +"""Tests for $search score ordering and scoreDetails.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + BSON_FIELD_NOT_BOOL_ERROR, + QUERY_METADATA_NOT_AVAILABLE_ERROR, + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Eq, + Exists, + Gt, + Len, + NonEmptyStr, + PerDoc, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [SearchScore Ordering]: results are returned in descending searchScore +# order so the document with the highest matching-term frequency ranks first, and +# {$meta: "searchScore"} projects a positive float for every result. 
SEARCH_SCORE_TESTS: list[StageTestCase] = [
    StageTestCase(
        "score_term_frequency_ordering",
        pipeline=[
            {"$search": {"text": {"query": "quick", "path": "title"}}},
            {"$project": {"_id": 1, "score": {"$meta": "searchScore"}}},
        ],
        # Expected result order is _id 3, then 4, then 1, each with a positive score.
        expected=PerDoc(
            {"_id": Eq(3), "score": Gt(0)},
            {"_id": Eq(4), "score": Gt(0)},
            {"_id": Eq(1), "score": Gt(0)},
        ),
        msg="$search should order results by descending searchScore, ranking the "
        "highest term-frequency document first",
    ),
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SCORE_TESTS))
def test_search_score_ordering(indexed_collection, test_case: StageTestCase):
    """Run one searchScore case and check the per-document order and scores."""
    aggregate_command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(indexed_collection, aggregate_command)
    assertResult(reply, expected=test_case.expected, msg=test_case.msg)


# Property [ScoreDetails Output]: with scoreDetails enabled, {$meta: "scoreDetails"}
# projects a recursive object with a positive value, a populated description
# string, and a populated details array.
SEARCH_SCORE_DETAILS_TESTS: list[StageTestCase] = [
    StageTestCase(
        "score_details_shape",
        pipeline=[
            {"$search": {"text": {"query": "quick", "path": "title"}, "scoreDetails": True}},
            {"$limit": 1},
            {"$project": {"_id": 0, "sd": {"$meta": "scoreDetails"}}},
        ],
        expected={
            "sd.value": Gt(0),
            "sd.description": NonEmptyStr(),
            # Index 0 existing means the details array has at least one entry.
            "sd.details.0": Exists(),
        },
        msg="$search should project a recursive scoreDetails object with a value, "
        "description, and details array",
    ),
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SCORE_DETAILS_TESTS))
def test_search_score_details(indexed_collection, test_case: StageTestCase):
    """Run one scoreDetails case and check the projected value, description, and details."""
    aggregate_command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(indexed_collection, aggregate_command)
    assertResult(reply, expected=test_case.expected, msg=test_case.msg)


# Property [ScoreDetails Unavailable]: projecting {$meta: "scoreDetails"} without
# enabling scoreDetails on the $search stage is rejected because the metadata is
# not computed.
SEARCH_SCORE_DETAILS_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        "score_details_not_enabled",
        # The $search stage omits scoreDetails, yet $project asks for that metadata.
        pipeline=[
            {"$search": {"text": {"query": "quick", "path": "title"}}},
            {"$project": {"_id": 1, "sd": {"$meta": "scoreDetails"}}},
        ],
        error_code=QUERY_METADATA_NOT_AVAILABLE_ERROR,
        msg="$search should reject a scoreDetails metadata projection when scoreDetails "
        "is not enabled",
    ),
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SCORE_DETAILS_ERROR_TESTS))
def test_search_score_details_unavailable(indexed_collection, test_case: StageTestCase):
    """Run one case projecting scoreDetails without enabling it and expect its error code."""
    aggregate_command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(indexed_collection, aggregate_command)
    assertResult(reply, error_code=test_case.error_code, msg=test_case.msg)


# Property [ScoreDetails Boolean Type]: the scoreDetails option is strictly
# boolean with no coercion, and a null is not treated as a missing value.
SEARCH_SCORE_DETAILS_BOOL_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"score_details_bool_type_{tid}",
        pipeline=[
            {"$search": {"text": {"query": "quick", "path": "title"}, "scoreDetails": val}},
        ],
        error_code=BSON_FIELD_NOT_BOOL_ERROR,
        msg=f"$search should reject a {tid} scoreDetails as a non-boolean",
    )
    # One case per (type label, sample value); the label feeds the test id and message.
    for tid, val in (
        ("string", "true"),
        ("int32", 1),
        ("int64", Int64(1)),
        ("double", 1.5),
        ("object", {"a": 1}),
        ("array", [True]),
        ("objectid", ObjectId("0123456789abcdef01234567")),
        ("datetime", datetime.datetime(2020, 1, 1)),
        ("timestamp", Timestamp(1, 1)),
        ("binary", Binary(b"\x01\x02\x03")),
        ("regex", Regex(".*", "i")),
        ("code", Code("function(){}")),
        ("minkey", MinKey()),
        ("maxkey", MaxKey()),
        ("decimal128", DECIMAL128_ONE_AND_HALF),
        ("null", None),
    )
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SCORE_DETAILS_BOOL_TYPE_ERROR_TESTS))
def test_search_score_details_bool_error(indexed_collection, test_case: StageTestCase):
    """Run one non-boolean scoreDetails case and expect its error code."""
    aggregate_command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(indexed_collection, aggregate_command)
    assertResult(reply, error_code=test_case.error_code, msg=test_case.msg)


# Property [Operator Score Modifier]: an operator score modifier accepts exactly
# one of boost, constant, or function. Each alters scoring without changing the
# matched set, so the search still returns its matches. This is a shared operator
# option (every operator accepts it), so it is covered once here rather than per
# operator.
SEARCH_SCORE_MODIFIER_TESTS: list[StageTestCase] = [
    # The three cases differ only in the score modifier variant; each expects
    # the same three matches (_id 1, 3, and 4).
    StageTestCase(
        "score_modifier_boost",
        pipeline=[
            {
                "$search": {
                    "text": {
                        "query": "quick",
                        "path": "title",
                        "score": {"boost": {"value": 2.0}},
                    }
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 1),
                Contains("_id", 3),
                Contains("_id", 4),
            ]
        },
        msg="$search should accept a boost score modifier and still return its matches",
    ),
    StageTestCase(
        "score_modifier_constant",
        pipeline=[
            {
                "$search": {
                    "text": {
                        "query": "quick",
                        "path": "title",
                        "score": {"constant": {"value": 5.0}},
                    }
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 1),
                Contains("_id", 3),
                Contains("_id", 4),
            ]
        },
        msg="$search should accept a constant score modifier and still return its matches",
    ),
    StageTestCase(
        "score_modifier_function",
        pipeline=[
            {
                "$search": {
                    "text": {
                        "query": "quick",
                        "path": "title",
                        "score": {"function": {"score": "relevance"}},
                    }
                }
            },
        ],
        expected={
            "cursor.firstBatch": [
                Len(3),
                Contains("_id", 1),
                Contains("_id", 3),
                Contains("_id", 4),
            ]
        },
        msg="$search should accept a function score modifier and still return its matches",
    ),
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SCORE_MODIFIER_TESTS))
def test_search_score_modifier(indexed_collection, test_case: StageTestCase):
    """Run one score-modifier case and check the matched documents are unchanged.

    The expectations address the reply through its ``cursor.firstBatch`` path,
    so the result is asserted with ``raw_res=True``.
    """
    aggregate_command = {
        "aggregate": indexed_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    reply = execute_command(indexed_collection, aggregate_command)
    assertResult(reply, expected=test_case.expected, msg=test_case.msg, raw_res=True)


# Property [Operator Score Modifier Validity]: a score modifier must name exactly
# one of boost/constant/function, so an empty modifier and one naming more than
# one variant are each rejected.
+SEARCH_SCORE_MODIFIER_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"score_modifier_{case}",
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title", "score": score}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject {reason}",
+    )
+    # Each entry is (case id suffix, invalid score document, rejection reason).
+    for case, score, reason in [
+        ("empty", {}, "an empty score modifier naming no variant"),
+        (
+            "multiple_variants",
+            {"boost": {"value": 2.0}, "constant": {"value": 5.0}},
+            "a score modifier naming more than one variant",
+        ),
+    ]
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SCORE_MODIFIER_ERROR_TESTS))
+def test_search_score_modifier_errors(indexed_collection, test_case: StageTestCase):
+    """Check that $search rejects a score modifier naming zero or several variants."""
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_span.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_span.py
new file mode 100644
index 000000000..204b630c5
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_span.py
@@ -0,0 +1,544 @@
+"""Tests for the $search span operator."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Contains,
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+def _span_stage(span: dict) -> list:
+    """Wrap a span operator document in a single-stage $search pipeline."""
+    return [{"$search": {"span": span}}]
+
+
+def _title_term(query: str) -> dict:
+    """Build a span term clause for ``query`` on the title path."""
+    return {"term": {"path": "title", "query": query}}
+
+
+def _ordered_near(first: str, second: str, slop: int) -> dict:
+    """Build an in-order span near clause over two title terms."""
+    return {
+        "near": {
+            "clauses": [_title_term(first), _title_term(second)],
+            "slop": slop,
+            "inOrder": True,
+        }
+    }
+
+
+def _contains_in_quick_brown(little: str, span_to_return: str) -> dict:
+    """Build a span contains clause whose big span is the adjacent quick-brown pair."""
+    return {
+        "contains": {
+            "little": _title_term(little),
+            "big": _ordered_near("quick", "brown", 0),
+            "spanToReturn": span_to_return,
+        }
+    }
+
+
+# Property [Span Sub-operator Execution]: span sub-operators execute positional
+# and proximity matches against an analyzed path.
+SEARCH_SPAN_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "span_term_token_match",
+        pipeline=_span_stage(_title_term("quick")),
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search span term should execute a token match over the analyzed path",
+    ),
+    StageTestCase(
+        "span_first_positional",
+        # `first` caps the span's end position, so only a token at the very
+        # start of the field qualifies: doc 4's "$quick" tokenizes to "quick"
+        # at position 0, whereas docs 1 and 3 hold "quick" later in the field.
+        pipeline=_span_stage({"first": {"operator": _title_term("quick"), "endPositionLte": 1}}),
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 4)]},
+        msg="$search span first should match only spans ending within the position bound",
+    ),
+    StageTestCase(
+        "span_near_adjacent",
+        pipeline=_span_stage(_ordered_near("quick", "brown", 0)),
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search span near with slop 0 should match adjacent in-order spans",
+    ),
+    StageTestCase(
+        "span_near_slop_excludes_gap",
+        pipeline=_span_stage(_ordered_near("quick", "fox", 0)),
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search span near with slop 0 should exclude spans separated by an "
+        "intervening token",
+    ),
+    StageTestCase(
+        "span_near_slop_permits_gap",
+        pipeline=_span_stage(_ordered_near("quick", "fox", 1)),
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search span near should permit an intervening token within the slop bound",
+    ),
+    StageTestCase(
+        "span_near_order_enforced",
+        pipeline=_span_stage(_ordered_near("brown", "quick", 0)),
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search span near with inOrder should not match clauses in the reversed order",
+    ),
+    StageTestCase(
+        "span_or_union",
+        pipeline=_span_stage({"or": {"clauses": [_title_term("quick"), _title_term("turtle")]}}),
+        expected={
+            "cursor.firstBatch": [
+                Len(4),
+                Contains("_id", 1),
+                Contains("_id", 2),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search span or should match the union of its clause spans",
+    ),
+    StageTestCase(
+        "span_subtract_excludes_overlap",
+        # Include spans overlapping an exclude span are dropped, so subtracting
+        # a token's spans from themselves must leave no match at all.
+        pipeline=_span_stage(
+            {"subtract": {"include": _title_term("quick"), "exclude": _title_term("quick")}}
+        ),
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search span subtract should remove include spans that overlap the exclude spans",
+    ),
+    StageTestCase(
+        "span_contains_inner",
+        # The little "quick" span lies inside the big quick-brown span, so
+        # spanToReturn inner yields the document carrying that containment.
+        pipeline=_span_stage(_contains_in_quick_brown("quick", "inner")),
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search span contains spanToReturn inner should return the document whose little "
+        "span is contained by the big span",
+    ),
+    StageTestCase(
+        "span_contains_outer",
+        pipeline=_span_stage(_contains_in_quick_brown("quick", "outer")),
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search span contains spanToReturn outer should return the document whose big span "
+        "contains the little span",
+    ),
+    StageTestCase(
+        "span_contains_excludes_non_contained",
+        # The little "turtle" span never falls inside the big quick-brown span,
+        # so the containment constraint filters out every document.
+        pipeline=_span_stage(_contains_in_quick_brown("turtle", "inner")),
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search span contains should exclude a document whose little span is not contained "
+        "by the big span",
+    ),
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SPAN_TESTS))
+def test_search_span_cases(indexed_collection, test_case: StageTestCase):
+    """Check the match set produced by each span sub-operator."""
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, expected=test_case.expected, msg=test_case.msg, raw_res=True)
+
+
+# Property [Span Operator Required]: a span document with no sub-operator key
+# produces a spec validation error.
+SEARCH_SPAN_OPERATOR_REQUIRED_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "span_empty",
+        pipeline=_span_stage({}),
+        error_code=UNKNOWN_ERROR,
+        msg="$search span should reject an empty document with no sub-operator",
+    ),
+]
+
+# Property [Span Sub-operator Required Sub-fields]: a span sub-operator with a
+# required sub-field omitted is rejected.
+SEARCH_SPAN_SUBOPERATOR_REQUIRED_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": {"span": {operator: fields}}}],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search span {operator} should reject {omission}",
+    )
+    # Each entry is (case id, span sub-operator, the sub-operator document with
+    # one required field left out, description of what is missing).
+    for case_id, operator, fields, omission in [
+        ("term_missing_path", "term", {"query": "quick"}, "a missing path"),
+        ("term_missing_query", "term", {"path": "title"}, "a missing query"),
+        ("first_missing_operator", "first", {"endPositionLte": 1}, "a missing operator"),
+        ("near_missing_clauses", "near", {"slop": 0}, "missing clauses"),
+        ("or_missing_clauses", "or", {}, "missing clauses"),
+        (
+            "subtract_missing_include",
+            "subtract",
+            {"exclude": {"term": {"path": "title", "query": "quick"}}},
+            "a missing include",
+        ),
+        (
+            "subtract_missing_exclude",
+            "subtract",
+            {"include": {"term": {"path": "title", "query": "quick"}}},
+            "a missing exclude",
+        ),
+        (
+            "contains_missing_little",
+            "contains",
+            {"big": {"term": {"path": "title", "query": "quick"}}, "spanToReturn": "inner"},
+            "a missing little",
+        ),
+        (
+            "contains_missing_span_to_return",
+            "contains",
+            {
+                "little": {"term": {"path": "title", "query": "quick"}},
+                "big": {"term": {"path": "title", "query": "brown"}},
+            },
+            "a missing spanToReturn",
+        ),
+        (
+            "contains_missing_big",
+            "contains",
+            {"little": {"term": {"path": "title", "query": "quick"}}, "spanToReturn": "inner"},
+            "a missing big",
+        ),
+    ]
+]
+
+# Property [Span Contains spanToReturn Enum]: span.contains.spanToReturn accepts
+# only the values inner or outer, so a value outside that set is rejected.
+SEARCH_SPAN_CONTAINS_ENUM_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "contains_span_to_return_bogus",
+        pipeline=[
+            {
+                "$search": {
+                    "span": {
+                        "contains": {
+                            "little": {"term": {"path": "title", "query": "quick"}},
+                            "big": {"term": {"path": "title", "query": "brown"}},
+                            # Otherwise complete clause; only this value is invalid.
+                            "spanToReturn": "bogus",
+                        }
+                    }
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search span contains should reject a spanToReturn outside the [inner, outer] enum",
+    ),
+]
+
+# Property [Span Near Sub-field Type Rejection]: a near sub-operator's non-integer
+# slop or non-boolean inOrder is rejected with no coercion.
+SEARCH_SPAN_NEAR_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"near_{field.lower()}_type_{tid}",
+        pipeline=[
+            {
+                "$search": {
+                    "span": {
+                        "near": {
+                            "clauses": [{"term": {"path": "title", "query": "quick"}}],
+                            field: val,
+                        }
+                    }
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search span near should reject a {tid} {field} as a {violation}",
+    )
+    # Outer loop: the near field under test and how a wrong type is described.
+    # All slop cases are generated first, then all inOrder cases.
+    for field, violation, rejected_values in [
+        (
+            "slop",
+            "non-integer",
+            [
+                ("string", "1"),
+                ("double", 1.5),
+                ("bool", True),
+                ("object", {"a": 1}),
+                ("array", [1]),
+                ("objectid", ObjectId("0123456789abcdef01234567")),
+                ("datetime", datetime.datetime(2020, 1, 1)),
+                ("timestamp", Timestamp(1, 1)),
+                ("binary", Binary(b"\x01\x02\x03")),
+                ("regex", Regex(".*", "i")),
+                ("code", Code("function(){}")),
+                ("minkey", MinKey()),
+                ("maxkey", MaxKey()),
+                ("decimal128", DECIMAL128_ONE_AND_HALF),
+            ],
+        ),
+        (
+            "inOrder",
+            "non-boolean",
+            [
+                ("string", "true"),
+                ("int32", 1),
+                ("int64", Int64(1)),
+                ("double", 1.5),
+                ("object", {"a": 1}),
+                ("array", [True]),
+                ("objectid", ObjectId("0123456789abcdef01234567")),
+                ("datetime", datetime.datetime(2020, 1, 1)),
+                ("timestamp", Timestamp(1, 1)),
+                ("binary", Binary(b"\x01\x02\x03")),
+                ("regex", Regex(".*", "i")),
+                ("code", Code("function(){}")),
+                ("minkey", MinKey()),
+                ("maxkey", MaxKey()),
+                ("decimal128", DECIMAL128_ONE_AND_HALF),
+            ],
+        ),
+    ]
+    for tid, val in rejected_values
+]
+
+# All span error cases run through one parametrized test, in property order.
+SEARCH_SPAN_ERROR_TESTS = [
+    *SEARCH_SPAN_OPERATOR_REQUIRED_ERROR_TESTS,
+    *SEARCH_SPAN_SUBOPERATOR_REQUIRED_ERROR_TESTS,
+    *SEARCH_SPAN_CONTAINS_ENUM_ERROR_TESTS,
+    *SEARCH_SPAN_NEAR_TYPE_ERROR_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SPAN_ERROR_TESTS))
+def test_search_span_errors(indexed_collection, test_case: StageTestCase):
+    """Check that $search span rejects each malformed span document.
+
+    Covers an empty span, missing required sub-fields, an out-of-enum
+    spanToReturn, and mistyped near slop/inOrder values.
+    """
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_spec_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_spec_errors.py
new file mode 100644
index 000000000..ec8d03dec
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_spec_errors.py
@@ -0,0 +1,164 @@
+"""Tests for $search stage spec and operator structural errors."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import (
+    SEARCH_INDEX_NAME,
+)
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Property [Null Sub-field As Missing]: a null operator value or null required
+# text sub-field (path/query) is treated as missing and hits the same
+# downstream required-field error as omitting it entirely.
+SEARCH_NULL_MISSING_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": spec}],
+        error_code=UNKNOWN_ERROR,
+        msg=msg,
+    )
+    # Each entry is (case id, $search spec carrying one null, failure message).
+    for case_id, spec, msg in [
+        (
+            "null_missing_operator",
+            {"text": None},
+            "$search should treat a null operator value as a missing operator and reject it",
+        ),
+        (
+            "null_missing_text_query",
+            {"text": {"query": None, "path": "title"}},
+            "$search should treat a null text.query as missing and reject the required query",
+        ),
+        (
+            "null_missing_text_path",
+            {"text": {"query": "quick", "path": None}},
+            "$search should treat a null text.path as missing and reject the required path",
+        ),
+    ]
+]
+
+# Property [Operator Slot Missing]: a spec containing no recognized search
+# operator (empty, options-only, or an unknown operator key) is rejected.
+SEARCH_OPERATOR_MISSING_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": spec}],
+        error_code=UNKNOWN_ERROR,
+        msg=msg,
+    )
+    # Each entry is (case id, $search spec with no usable operator, failure message).
+    for case_id, spec, msg in [
+        (
+            "operator_missing_empty_spec",
+            {},
+            "$search should reject an empty spec that contains no operator",
+        ),
+        (
+            "operator_missing_options_only",
+            {"index": SEARCH_INDEX_NAME},
+            "$search should reject an options-only spec that contains no operator",
+        ),
+        (
+            "operator_missing_unknown_key",
+            {"bogus": {"query": "quick", "path": "title"}},
+            "$search should reject an unknown operator key as no recognized operator",
+        ),
+    ]
+]
+
+# Property [Operator Slot Duplicate]: a spec containing more than one search
+# operator is rejected.
+SEARCH_OPERATOR_DUPLICATE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "operator_duplicate_two",
+        pipeline=[
+            {
+                # Two operator keys (text and exists) share a single spec.
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "exists": {"path": "title"},
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject a spec containing two search operators",
+    ),
+]
+
+# Property [Operator Value Type]: a recognized operator whose value is not a
+# document is rejected (a null value is owned by the null-as-missing property
+# above).
+SEARCH_OPERATOR_VALUE_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"operator_value_{tid}",
+        pipeline=[{"$search": {"text": val}}],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject a {tid} operator value as a non-document",
+    )
+    # Type label -> sample non-document value; insertion order fixes case order.
+    for tid, val in {
+        "string": "quick",
+        "int32": 1,
+        "int64": Int64(1),
+        "double": 1.5,
+        "bool": True,
+        "array": ["quick"],
+        "objectid": ObjectId("0123456789abcdef01234567"),
+        "datetime": datetime.datetime(2020, 1, 1),
+        "timestamp": Timestamp(1, 1),
+        "binary": Binary(b"\x01\x02\x03"),
+        "regex": Regex(".*", "i"),
+        "code": Code("function(){}"),
+        "minkey": MinKey(),
+        "maxkey": MaxKey(),
+        "decimal128": DECIMAL128_ONE_AND_HALF,
+    }.items()
+]
+
+# Property [Operator Name Exact Match]: operator names are matched exactly
+# (case-sensitive and not whitespace-trimmed).
+SEARCH_OPERATOR_NAME_CASE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"operator_name_{tid}",
+        pipeline=[
+            {"$search": {name: {"query": "quick", "path": "title"}}},
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg=f"$search should reject the {tid} operator name as an unrecognized operator",
+    )
+    # Variant label -> near-miss spelling of the "text" operator name.
+    for tid, name in {
+        "capitalized": "Text",
+        "trailing_space": "text ",
+        "leading_space": " text",
+    }.items()
+]
+
+# All spec error cases run through one parametrized test, in property order.
+SEARCH_SPEC_ERROR_TESTS = [
+    *SEARCH_NULL_MISSING_ERROR_TESTS,
+    *SEARCH_OPERATOR_MISSING_ERROR_TESTS,
+    *SEARCH_OPERATOR_DUPLICATE_ERROR_TESTS,
+    *SEARCH_OPERATOR_VALUE_TYPE_ERROR_TESTS,
+    *SEARCH_OPERATOR_NAME_CASE_ERROR_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SPEC_ERROR_TESTS))
+def test_search_spec_errors(indexed_collection, test_case: StageTestCase):
+    """Check that $search rejects null-as-missing sub-fields and malformed operator slots."""
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_stage_basics.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_stage_basics.py
new file mode 100644
index 000000000..b56bd32b1
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_stage_basics.py
@@ -0,0 +1,170 @@
+"""Tests for $search stage value typing and silent-empty behavior."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import (
+    FIXTURE_DOCS,
+    create_dynamic_search_index,
+)
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    EXPRESSION_NOT_OBJECT_ERROR,
+    FAILED_TO_PARSE_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ZERO,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Property [Stage Value Array Type]: a $search stage value that is an array is
+# rejected with the array-specific parse error, a distinct code path from the
+# non-object scalar value.
+SEARCH_STAGE_VALUE_ARRAY_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        pipeline=[{"$search": stage_value}],
+        error_code=FAILED_TO_PARSE_ERROR,
+        msg=msg,
+    )
+    # Each entry is (case id, array used as the stage value, failure message).
+    for case_id, stage_value, msg in [
+        (
+            "stage_value_empty_array",
+            [],
+            "$search should reject an empty-array stage value with the array-specific parse error",
+        ),
+        (
+            "stage_value_array_of_object",
+            [{"text": {"query": "quick", "path": "title"}}],
+            "$search should reject an array stage value even when it wraps a valid operator object",
+        ),
+        (
+            "stage_value_array_of_scalar",
+            [1],
+            "$search should reject an array stage value of a scalar element with the array error",
+        ),
+    ]
+]
+
+# Property [Stage Value Scalar Type]: a $search stage value that is any scalar or
+# null is rejected as a non-object, and null is not treated as a missing argument.
+SEARCH_STAGE_VALUE_SCALAR_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"stage_value_{tid}",
+        pipeline=[{"$search": val}],
+        error_code=EXPRESSION_NOT_OBJECT_ERROR,
+        msg=f"$search should reject a {tid} stage value as a non-object",
+    )
+    # Type label -> sample scalar stage value; insertion order fixes case order.
+    for tid, val in {
+        "string": "quick",
+        "int32": 1,
+        "int64": Int64(1),
+        "double": 1.5,
+        "decimal128": DECIMAL128_ZERO,
+        "bool": True,
+        "object_id": ObjectId("0123456789abcdef01234567"),
+        "datetime": datetime.datetime(2020, 1, 1),
+        "timestamp": Timestamp(1, 1),
+        "binary": Binary(b"\x00"),
+        "regex": Regex("a"),
+        "code": Code("x"),
+        "min_key": MinKey(),
+        "max_key": MaxKey(),
+    }.items()
+] + [
+    # Null is listed on its own because its message also states that it is not
+    # handled as a missing argument.
+    StageTestCase(
+        "stage_value_null",
+        pipeline=[{"$search": None}],
+        error_code=EXPRESSION_NOT_OBJECT_ERROR,
+        msg="$search should reject a null stage value as a non-object, not treat it as missing",
+    ),
+]
+
+# Array cases first, then scalar/null cases, all run by one parametrized test.
+SEARCH_STAGE_VALUE_ERROR_TESTS = [
+    *SEARCH_STAGE_VALUE_ARRAY_ERROR_TESTS,
+    *SEARCH_STAGE_VALUE_SCALAR_ERROR_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_STAGE_VALUE_ERROR_TESTS))
+def test_search_stage_value_type_errors(indexed_collection, test_case: StageTestCase):
+    """Check that a non-object $search stage value fails with the case's error code."""
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
+
+
+# Property [Silent Empty Result]: a $search against a nonexistent collection or a
+# collection with no search index returns no documents and no error.
+SEARCH_SILENT_EMPTY_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        case_id,
+        docs=docs,
+        pipeline=[
+            {"$search": {"text": {"query": "quick", "path": "title"}}},
+        ],
+        expected={"cursor.firstBatch": Len(0)},
+        msg=f"$search on a {target} should return no documents without error",
+    )
+    # Each entry is (case id, documents to seed or None, description of the target).
+    for case_id, docs, target in [
+        ("empty_nonexistent_collection", None, "nonexistent collection"),
+        ("empty_no_search_index", FIXTURE_DOCS, "collection with no search index"),
+    ]
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_SILENT_EMPTY_TESTS))
+def test_search_silent_empty_cases(collection, test_case: StageTestCase):
+    """Check that $search yields an empty batch, not an error, when nothing is indexed."""
+    # Cases without docs skip the insert, so nothing is written before the query.
+    if test_case.docs:
+        collection.insert_many(test_case.docs)
+    aggregate_command = {
+        "aggregate": collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(collection, aggregate_command)
+    assertResult(result, expected=test_case.expected, msg=test_case.msg, raw_res=True)
+
+
+# Property [Empty Indexed Collection]: a $search against a collection that has a
+# search index but no documents returns no documents and no error.
+@pytest.mark.aggregate
+def test_search_empty_indexed_collection(collection):
+    """Check that $search on an indexed collection holding no documents returns nothing."""
+    # Create the collection explicitly, with no documents inserted, before the
+    # search index is built on it.
+    collection.database.create_collection(collection.name)
+    create_dynamic_search_index(collection)
+    result = execute_command(
+        collection,
+        {
+            "aggregate": collection.name,
+            "pipeline": [{"$search": {"text": {"query": "quick", "path": "title"}}}],
+            "cursor": {},
+        },
+    )
+    assertResult(
+        result,
+        expected={"cursor.firstBatch": Len(0)},
+        msg="$search on an empty indexed collection should return no documents without error",
+        raw_res=True,
+    )
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_stored_source.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_stored_source.py
new file mode 100644
index 000000000..74540ac40
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_stored_source.py
@@ -0,0 +1,194 @@
+"""Tests for the $search returnStoredSource option and behavior."""
+
+from __future__ import annotations
+
+import datetime
+
+import pytest
+from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp
+
+from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import (
+    create_search_index,
+)
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework import fixtures
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.error_codes import (
+    BSON_FIELD_NOT_BOOL_ERROR,
+    UNKNOWN_ERROR,
+)
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Contains,
+    Len,
+)
+from documentdb_tests.framework.test_constants import (
+    DECIMAL128_ONE_AND_HALF,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Seed documents for the stored-source index; "quick" appears in the title of
+# docs 1 and 3 and only in the body of doc 2.
+_STORED_SOURCE_DOCS = [
+    {"_id": 1, "title": "the quick brown fox", "body": "lazy dog"},
+    {"_id": 2, "title": "slow green turtle", "body": "quick nap"},
+    {"_id": 3, "title": "a quick quick rabbit", "body": "fast"},
+]
+
+# Dynamic mappings with storedSource limited to the title field.
+_STORED_SOURCE_INDEX_DEFINITION = {
+    "mappings": {"dynamic": True},
+    "storedSource": {"include": ["title"]},
+}
+
+
+@pytest.fixture(scope="module")
+def stored_source_collection(engine_client, worker_id):
+    """Yield a module-scoped collection backed by a storedSource search index.
+
+    The index stores only the title field. The collection is shared read-only
+    across the returnStoredSource cases so the index is built and polled once.
+    The database is dropped before setup and again on teardown.
+    """
+    db_name = fixtures.generate_database_name("stages_search_stored_source", worker_id)
+    fixtures.cleanup_database(engine_client, db_name)
+    # pytest skips the code after ``yield`` when a fixture raises during setup,
+    # so the cleanup lives in ``finally`` to also drop the database when
+    # seeding or index creation fails.
+    try:
+        db = engine_client[db_name]
+        coll = db["stored_source"]
+        coll.insert_many(_STORED_SOURCE_DOCS)
+        create_search_index(coll, _STORED_SOURCE_INDEX_DEFINITION)
+        yield coll
+    finally:
+        fixtures.cleanup_database(engine_client, db_name)
+
+
+# Property [ReturnStoredSource False Acceptance]: returnStoredSource accepts a
+# boolean false with no coercion and the search still returns its matches (true
+# is owned by the stored-source return property).
+SEARCH_RETURN_STORED_SOURCE_FALSE_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "return_stored_source_false",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    # Explicit boolean false; the match set below must be unaffected.
+                    "returnStoredSource": False,
+                }
+            },
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search should accept returnStoredSource false and still return its matches",
+    ),
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_RETURN_STORED_SOURCE_FALSE_TESTS))
+def test_search_return_stored_source_false_cases(indexed_collection, test_case: StageTestCase):
+    """Check that $search with returnStoredSource false still returns its matches."""
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, expected=test_case.expected, msg=test_case.msg, raw_res=True)
+
+
+# Property [Stored Source Return]: returnStoredSource true against a
+# storedSource-configured index returns the stored-source documents, exposing
+# only the configured stored fields and omitting the unstored fields.
+SEARCH_STORED_SOURCE_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "return_stored_source_projects_stored_fields",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "returnStoredSource": True,
+                }
+            },
+        ],
+        # Full-document expectation: each match carries only _id and title.
+        expected=[
+            {"_id": 1, "title": "the quick brown fox"},
+            {"_id": 3, "title": "a quick quick rabbit"},
+        ],
+        msg="$search returnStoredSource true should return the stored-source documents "
+        "exposing only the configured stored fields",
+    ),
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_STORED_SOURCE_TESTS))
+def test_search_return_stored_source(stored_source_collection, test_case: StageTestCase):
+    """Check that returnStoredSource true returns the stored-source documents."""
+    aggregate_command = {
+        "aggregate": stored_source_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(stored_source_collection, aggregate_command)
+    # Documents are compared without regard to the order they come back in.
+    assertResult(result, expected=test_case.expected, msg=test_case.msg, ignore_doc_order=True)
+
+
+# Property [ReturnStoredSource Without Configured Source]: returnStoredSource true
+# against an index that does not configure storedSource is rejected.
+SEARCH_RETURN_STORED_SOURCE_CONFIG_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "return_stored_source_unconfigured",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "returnStoredSource": True,
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search should reject returnStoredSource true against an index with no "
+        "storedSource configured",
+    ),
+]
+
+# Property [ReturnStoredSource Boolean Type]: the returnStoredSource option is
+# strictly boolean with no coercion, and a null is not treated as a missing value.
+SEARCH_RETURN_STORED_SOURCE_BOOL_TYPE_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"return_stored_source_bool_type_{tid}",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {"query": "quick", "path": "title"},
+                    "returnStoredSource": val,
+                }
+            },
+        ],
+        error_code=BSON_FIELD_NOT_BOOL_ERROR,
+        msg=f"$search should reject a {tid} returnStoredSource as a non-boolean",
+    )
+    # Type label -> sample non-boolean value; insertion order fixes case order.
+    for tid, val in {
+        "string": "true",
+        "int32": 1,
+        "int64": Int64(1),
+        "double": 1.5,
+        "object": {"a": 1},
+        "array": [True],
+        "objectid": ObjectId("0123456789abcdef01234567"),
+        "datetime": datetime.datetime(2020, 1, 1),
+        "timestamp": Timestamp(1, 1),
+        "binary": Binary(b"\x01\x02\x03"),
+        "regex": Regex(".*", "i"),
+        "code": Code("function(){}"),
+        "minkey": MinKey(),
+        "maxkey": MaxKey(),
+        "decimal128": DECIMAL128_ONE_AND_HALF,
+        "null": None,
+    }.items()
+]
+
+# Config error first, then the type errors, all run by one parametrized test.
+SEARCH_STORED_SOURCE_ERROR_TESTS = [
+    *SEARCH_RETURN_STORED_SOURCE_CONFIG_ERROR_TESTS,
+    *SEARCH_RETURN_STORED_SOURCE_BOOL_TYPE_ERROR_TESTS,
+]
+
+
+@pytest.mark.aggregate
+@pytest.mark.parametrize("test_case", pytest_params(SEARCH_STORED_SOURCE_ERROR_TESTS))
+def test_search_stored_source_errors(indexed_collection, test_case: StageTestCase):
+    """Check returnStoredSource failures: unconfigured storedSource and non-boolean values."""
+    aggregate_command = {
+        "aggregate": indexed_collection.name,
+        "pipeline": test_case.pipeline,
+        "cursor": {},
+    }
+    result = execute_command(indexed_collection, aggregate_command)
+    assertResult(result, error_code=test_case.error_code, msg=test_case.msg)
diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_analysis.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_analysis.py
new file mode 100644
index 000000000..14ffcd3cb
--- /dev/null
+++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_analysis.py
@@ -0,0 +1,394 @@
+"""Tests for $search default analyzer tokenization and case folding."""
+
+from __future__ import annotations
+
+import pytest
+
+from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import (
+    StageTestCase,
+)
+from documentdb_tests.framework.assertions import assertResult
+from documentdb_tests.framework.executor import execute_command
+from documentdb_tests.framework.parametrize import pytest_params
+from documentdb_tests.framework.property_checks import (
+    Contains,
+    Len,
+)
+
+pytestmark = pytest.mark.requires(search=True)
+
+
+# Property [ASCII Case Folding]: ASCII letter case is folded during analysis, so a
+# query token matches a stored token of any letter case.
+SEARCH_ASCII_CASE_FOLD_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        f"case_fold_{casing}",
+        pipeline=[
+            {"$search": {"text": {"query": query, "path": "title"}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg=f"$search should match {described} query against a stored lowercase token",
+    )
+    # Each entry is (case id suffix, query spelling, how the message describes it).
+    for casing, query, described in [
+        ("upper", "QUICK", "an all-uppercase"),
+        ("mixed", "QuIcK", "a mixed-case"),
+    ]
+]
+
+# Property [Single-Character Token]: the analyzer preserves a single-character
+# token so a one-character query matches its stored one-character form.
+SEARCH_SINGLE_CHAR_TOKEN_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "single_char_token",
+        pipeline=[
+            {"$search": {"text": {"query": "x", "path": "title"}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 6)]},
+        msg="$search should match a single-character query against a stored single-character token",
+    ),
+]
+
+# Property [Non-ASCII Case Folding]: simple 1:1 case folding is applied to
+# non-ASCII cased scripts, so a query matches a stored token of the opposite case.
+SEARCH_NON_ASCII_CASE_FOLD_TESTS: list[StageTestCase] = [ + StageTestCase( + "fold_greek", + pipeline=[ + {"$search": {"text": {"query": "ΣΙΓΜΑ", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 7)]}, + msg="$search should fold uppercase Greek to match a stored lowercase Greek token", + ), + StageTestCase( + "fold_cyrillic", + pipeline=[ + {"$search": {"text": {"query": "ДЕНЬ", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 8)]}, + msg="$search should fold uppercase Cyrillic to match a stored lowercase Cyrillic token", + ), + StageTestCase( + "fold_supplementary_plane", + # Deseret capital long I (U+10400) folds to small letter long I (U+10428). + pipeline=[ + {"$search": {"text": {"query": "\U00010400", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 9)]}, + msg="$search should fold an uppercase supplementary-plane letter to match its " + "stored lowercase form", + ), +] + +# Property [No-Match Tokenization]: a query that produces no token matching a +# stored token returns no documents without error. 
+SEARCH_NO_MATCH_TOKEN_TESTS: list[StageTestCase] = [ + StageTestCase( + "no_match_whitespace", + pipeline=[ + {"$search": {"text": {"query": " ", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for a whitespace-only query", + ), + StageTestCase( + "no_match_punctuation", + pipeline=[ + {"$search": {"text": {"query": "!!!", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for a punctuation-only query", + ), + StageTestCase( + "no_match_null_byte", + pipeline=[ + {"$search": {"text": {"query": "\x00", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for a lone null-byte query", + ), + StageTestCase( + "no_match_cjk", + pipeline=[ + {"$search": {"text": {"query": "日本語", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for a CJK run with no stored token", + ), + StageTestCase( + "no_match_emoji", + pipeline=[ + {"$search": {"text": {"query": "😀😀", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for an emoji run with no stored token", + ), + StageTestCase( + "no_match_accented", + pipeline=[ + {"$search": {"text": {"query": "çàü", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for an accented run with no stored token", + ), + StageTestCase( + "no_match_zwsp", + # Zero-width space (U+200B) carries no token content. 
+ pipeline=[ + {"$search": {"text": {"query": "\u200b", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for a zero-width-space-only query", + ), + StageTestCase( + "no_match_digits", + pipeline=[ + {"$search": {"text": {"query": "12345", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should match nothing for a pure-digit query with no stored digit token", + ), +] + +# Property [No Diacritic/NFC/NFD/Ligature/Locale Normalization]: the default +# analyzer leaves canonically- or compatibility-equivalent forms as distinct +# tokens, so a query matches only its own stored form and never an equivalent one. +SEARCH_NO_NORMALIZATION_TESTS: list[StageTestCase] = [ + StageTestCase( + "norm_diacritic_plain", + pipeline=[ + {"$search": {"text": {"query": "resume", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 10)]}, + msg="$search should match a plain-ASCII query only against its undecorated stored token", + ), + StageTestCase( + "norm_diacritic_accented", + pipeline=[ + {"$search": {"text": {"query": "r\u00e9sum\u00e9", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 5)]}, + msg="$search should match an accented query only against its accented stored token", + ), + StageTestCase( + "norm_nfc_precomposed", + pipeline=[ + {"$search": {"text": {"query": "caf\u00e9", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 11)]}, + msg="$search should match a precomposed query against its precomposed stored token", + ), + StageTestCase( + "norm_nfd_combining", + # "cafe" followed by combining acute accent (U+0301). 
+ pipeline=[ + {"$search": {"text": {"query": "cafe\u0301", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should not normalize a combining-form query to match a precomposed token", + ), + StageTestCase( + "norm_ligature_stored", + pipeline=[ + {"$search": {"text": {"query": "\ufb01le", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 12)]}, + msg="$search should match a ligature query against its ligature stored token", + ), + StageTestCase( + "norm_ligature_decomposed", + pipeline=[ + {"$search": {"text": {"query": "file", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should not decompose a ligature token to match a plain-letter query", + ), + StageTestCase( + "norm_german_stored", + pipeline=[ + {"$search": {"text": {"query": "stra\u00dfe", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 13)]}, + msg="$search should match an eszett query against its eszett stored token", + ), + StageTestCase( + "norm_german_lower_expansion", + pipeline=[ + {"$search": {"text": {"query": "strasse", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should not expand eszett to ss to match a lowercase query", + ), + StageTestCase( + "norm_german_upper_expansion", + pipeline=[ + {"$search": {"text": {"query": "STRASSE", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should not expand eszett to ss to match an uppercase query", + ), + StageTestCase( + "norm_turkish_stored", + pipeline=[ + {"$search": {"text": {"query": "\u0131rmak", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 14)]}, + msg="$search should match a dotless-i query against its dotless-i stored token", + ), + StageTestCase( + "norm_turkish_upper", + pipeline=[ + {"$search": {"text": {"query": "IRMAK", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search 
should not locale-fold an uppercase I to match a dotless-i token", + ), + StageTestCase( + "norm_turkish_ascii_lower", + pipeline=[ + {"$search": {"text": {"query": "irmak", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should treat ASCII dotted i as distinct from a dotless-i token", + ), +] + +# Property [ASCII Fold Range Edges]: ASCII case folding applies precisely at the +# A-Z range boundaries, with no off-by-one at the first or last letter of the range. +SEARCH_ASCII_EDGE_FOLD_TESTS: list[StageTestCase] = [ + StageTestCase( + "edge_fold_first_letter", + pipeline=[{"$search": {"text": {"query": "A", "path": "title"}}}], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 3), Contains("_id", 15)]}, + msg="$search should fold an uppercase A at the range start to match a stored lowercase a", + ), + StageTestCase( + "edge_fold_last_letter", + pipeline=[{"$search": {"text": {"query": "Z", "path": "title"}}}], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 15)]}, + msg="$search should fold an uppercase Z at the range end to match a stored lowercase z", + ), +] + +# Property [Whitespace and Control Token Boundaries]: Unicode whitespace +# categories, control characters, and backslash each act as a token boundary +# identically to ASCII space, splitting a query into separate tokens, while a +# backslash-only query yields no token and matches nothing. 
+SEARCH_TOKEN_BOUNDARY_TESTS: list[StageTestCase] = [ + *[ + StageTestCase( + f"boundary_{name}", + pipeline=[ + {"$search": {"text": {"query": f"word{sep}joined", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 16)]}, + msg=f"$search should treat {desc} as a token boundary, matching the " + "two-token document and not the one-token document", + ) + for name, sep, desc in [ + ("nbsp", "\u00a0", "a no-break space (U+00A0)"), + ("en_space", "\u2000", "an en space (U+2000)"), + ("tab", "\t", "a tab"), + ("newline", "\n", "a newline"), + ("control_0001", "\u0001", "a control character (U+0001)"), + ("control_001f", "\u001f", "a control character (U+001F)"), + ("backslash", "\\", "a backslash"), + ] + ], + StageTestCase( + "boundary_backslash_only", + pipeline=[ + {"$search": {"text": {"query": "\\", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should produce no token for a backslash-only query and match nothing", + ), +] + +# Property [Zero-Width Token Boundaries]: a zero-width space splits a token while +# an embedded BOM or ZWJ is retained inside the token (matching neither the split +# nor the joined form), and a leading BOM is stripped so the remaining token still +# matches. +SEARCH_ZERO_WIDTH_BOUNDARY_TESTS: list[StageTestCase] = [ + StageTestCase( + "zwsp_boundary", + # Zero-width space (U+200B) acts as a token boundary. + pipeline=[ + {"$search": {"text": {"query": "word\u200bjoined", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 16)]}, + msg="$search should treat a zero-width space as a token boundary, matching the " + "two-token document and not the one-token document", + ), + StageTestCase( + "bom_embedded_retained", + # BOM (U+FEFF) embedded mid-token is retained inside the token. 
+ pipeline=[ + {"$search": {"text": {"query": "word\ufeffjoined", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should retain an embedded BOM inside the token, matching neither the " + "split nor the joined form", + ), + StageTestCase( + "zwj_embedded_retained", + # ZWJ (U+200D) embedded mid-token is retained inside the token. + pipeline=[ + {"$search": {"text": {"query": "word\u200djoined", "path": "title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should retain an embedded ZWJ inside the token, matching neither the " + "split nor the joined form", + ), + StageTestCase( + "bom_leading_stripped", + # A leading BOM (U+FEFF) is stripped, leaving the joined token intact. + pipeline=[ + {"$search": {"text": {"query": "\ufeffwordjoined", "path": "title"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 17)]}, + msg="$search should strip a leading BOM so the remaining one-token form still matches", + ), +] + +SEARCH_TEXT_ANALYSIS_TESTS = ( + SEARCH_ASCII_CASE_FOLD_TESTS + + SEARCH_SINGLE_CHAR_TOKEN_TESTS + + SEARCH_NON_ASCII_CASE_FOLD_TESTS + + SEARCH_NO_MATCH_TOKEN_TESTS + + SEARCH_NO_NORMALIZATION_TESTS + + SEARCH_ASCII_EDGE_FOLD_TESTS + + SEARCH_TOKEN_BOUNDARY_TESTS + + SEARCH_ZERO_WIDTH_BOUNDARY_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_TEXT_ANALYSIS_TESTS)) +def test_search_text_analysis_cases(indexed_collection, test_case: StageTestCase): + """Test $search default analyzer tokenization and case folding.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_basics.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_basics.py 
new file mode 100644 index 000000000..7e458bf87 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_basics.py @@ -0,0 +1,231 @@ +"""Tests for $search text operator core matching behavior.""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [Index-Covered Matching]: a text operator returns exactly the +# documents whose covered path contains the query token, and a path no document +# covers matches nothing. +SEARCH_MATCHING_TESTS: list[StageTestCase] = [ + StageTestCase( + "matching_covered_title", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should return the documents whose covered path contains the query token", + ), + StageTestCase( + "matching_covered_body", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "body"}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 2)]}, + msg="$search should match only on the specific covered path named in the operator", + ), + StageTestCase( + "matching_uncovered_path", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "nope"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should return no documents for a path no document covers", + ), +] + +# Property [Literal Spec Values]: $search spec values are interpreted as literal +# data, never as field paths, system variables, or expressions. 
+SEARCH_LITERAL_SPEC_TESTS: list[StageTestCase] = [ + StageTestCase( + "literal_path_empty", + pipeline=[{"$search": {"text": {"query": "quick", "path": ""}}}], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should treat an empty path as literal data and match nothing without error", + ), + StageTestCase( + "literal_path_field_ref", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "$title"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should treat a $-prefixed path as literal data, not a field reference", + ), + StageTestCase( + "literal_path_dotted", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "a.b"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should treat a dotted path as literal data with no field-path validation", + ), + StageTestCase( + "literal_path_null_byte", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "ti\x00tle"}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should treat a null-byte path as literal data and match nothing without error", + ), + StageTestCase( + "literal_query_dollar", + pipeline=[ + {"$search": {"text": {"query": "$quick", "path": "title"}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should match a $-prefixed query as literal text, not as a field reference", + ), +] + +# Property [Null Sub-field As Default]: a null document- or string-typed +# sub-field (index, count, highlight, sort, text.fuzzy) is treated as +# missing/default. 
+SEARCH_NULL_DEFAULT_TESTS: list[StageTestCase] = [ + StageTestCase( + "null_default_index", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}, "index": None}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should treat a null index as the default index and still match", + ), + StageTestCase( + "null_default_count", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}, "count": None}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should treat a null count as omitted and still match", + ), + StageTestCase( + "null_default_highlight", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}, "highlight": None}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should treat a null highlight as omitted and still match", + ), + StageTestCase( + "null_default_sort", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}, "sort": None}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should treat a null sort as omitted and still match", + ), + StageTestCase( + "null_default_fuzzy", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "fuzzy": None}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should treat a null text.fuzzy as omitted and still match", + ), +] + +# Property [Multi-Term OR]: a multi-term query array matches documents containing +# any of its terms. 
+SEARCH_MULTI_TERM_OR_TESTS: list[StageTestCase] = [ + StageTestCase( + "multi_term_or", + pipeline=[ + {"$search": {"text": {"query": ["quick", "turtle"], "path": "title"}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(4), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should match documents containing any term in a multi-term query array", + ), +] + +SEARCH_TEXT_BASICS_TESTS = ( + SEARCH_MATCHING_TESTS + + SEARCH_LITERAL_SPEC_TESTS + + SEARCH_NULL_DEFAULT_TESTS + + SEARCH_MULTI_TERM_OR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_TEXT_BASICS_TESTS)) +def test_search_text_basics_cases(indexed_collection, test_case: StageTestCase): + """Test $search core text matching over an indexed collection.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_errors.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_errors.py new file mode 100644 index 000000000..c7d27e432 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_errors.py @@ -0,0 +1,398 @@ +"""Tests for $search text operator and fuzzy validation errors.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + QUERY_CLAUSE_CAP, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from 
documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, + STRING_SIZE_LIMIT_BYTES, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [text query Validation]: text.query is required and must be a +# non-empty string or array of non-null strings. +SEARCH_TEXT_QUERY_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "text_query_missing", + pipeline=[{"$search": {"text": {"path": "title"}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject a text operator missing the required query", + ), + StageTestCase( + "text_query_empty_string", + pipeline=[{"$search": {"text": {"query": "", "path": "title"}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject an empty-string text.query", + ), + StageTestCase( + "text_query_empty_array", + pipeline=[{"$search": {"text": {"query": [], "path": "title"}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject an empty-array text.query", + ), + *[ + StageTestCase( + f"text_query_non_string_{tid}", + pipeline=[ + {"$search": {"text": {"query": val, "path": "title"}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} text.query as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"q": "quick"}), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + StageTestCase( + "text_query_array_element_null", + pipeline=[ + {"$search": {"text": {"query": ["quick", None], "path": "title"}}}, + ], + 
error_code=UNKNOWN_ERROR, + msg="$search should reject a null query-array element", + ), + StageTestCase( + "text_query_array_element_non_string", + pipeline=[ + {"$search": {"text": {"query": ["quick", 1], "path": "title"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a non-string query-array element", + ), +] + +# Property [text path Validation]: text.path is required and must be a string, +# document, or non-empty array; an empty path document is rejected, and a +# multi named by a path document must be a configured multi-analyzer. +SEARCH_TEXT_PATH_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "text_path_missing", + pipeline=[{"$search": {"text": {"query": "quick"}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject a text operator missing the required path", + ), + StageTestCase( + "text_path_empty_array", + pipeline=[{"$search": {"text": {"query": "quick", "path": []}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject an empty-array text.path", + ), + *[ + StageTestCase( + f"text_path_non_string_non_document_{tid}", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": val}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} text.path as neither a string nor a document", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + StageTestCase( + "text_path_object_no_value", + pipeline=[{"$search": {"text": {"query": "quick", "path": {}}}}], + error_code=UNKNOWN_ERROR, + msg="$search should reject a text.path document with no value field", + ), + StageTestCase( + "text_path_object_absent_multi", + pipeline=[ + {"$search": {"text": {"query":
"quick", "path": {"value": "title", "multi": "nope"}}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a text.path document referencing an absent " + "multi-analyzer config", + ), +] + +# Property [text matchCriteria Validation]: text.matchCriteria must be the string +# "all" or "any". +SEARCH_TEXT_MATCH_CRITERIA_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "text_match_criteria_bad_enum", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "matchCriteria": "none"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a matchCriteria outside the set [all, any]", + ), + *[ + StageTestCase( + f"text_match_criteria_type_{tid}", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "matchCriteria": val}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} matchCriteria as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"a": 1}), + ("array", ["all"]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +# Property [text synonyms Validation]: text.synonyms must name a configured +# synonym mapping. +SEARCH_TEXT_SYNONYMS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "text_synonyms_unknown_name", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "synonyms": "nope"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a text.synonyms referencing an unknown synonym mapping name", + ), +] + +# Property [text fuzzy Validation]: text.fuzzy must be a document and rejects an +# unknown sub-field (a null fuzzy is treated as the default). 
+SEARCH_TEXT_FUZZY_ERROR_TESTS: list[StageTestCase] = [ + *[ + StageTestCase( + f"text_fuzzy_type_{tid}", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "fuzzy": val}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} text.fuzzy as a non-document", + ) + for tid, val in [ + ("string", "x"), + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("array", [{}]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + StageTestCase( + "text_fuzzy_unknown_subfield", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "fuzzy": {"bogus": 1}}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject an unknown text.fuzzy sub-field", + ), +] + +# Property [text score Validation]: text.score must be a document (a null score +# is treated as the default). 
+SEARCH_TEXT_SCORE_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + f"text_score_type_{tid}", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "score": val}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a {tid} text.score as a non-document", + ) + for tid, val in [ + ("string", "x"), + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("array", [{}]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] +] + +# Property [Query/Token Size]: a query that resolves to more than the clause cap is +# rejected, whether from a long query array or from a single string the analyzer +# splits into many sub-tokens, so the cap is clause-count based, not a byte-size limit. +SEARCH_QUERY_TOKEN_SIZE_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "query_array_over_clause_cap", + pipeline=[ + { + "$search": { + "text": { + "query": ["quick"] + [f"nomatch{i}" for i in range(QUERY_CLAUSE_CAP)], + "path": "title", + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a query array one past the inclusive clause cap", + ), + # The run spans the BSON string size limit; it is still rejected with the + # clause-count error rather than a byte-size error. 
+ StageTestCase( + "query_single_byte_run_over_cap", + pipeline=[ + {"$search": {"text": {"query": "a" * STRING_SIZE_LIMIT_BYTES, "path": "title"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a multi-megabyte single-character run the analyzer " + "splits into more sub-tokens than the clause cap", + ), + StageTestCase( + "query_multi_byte_run_over_cap", + pipeline=[ + { + "$search": { + "text": {"query": "\u00e9" * (STRING_SIZE_LIMIT_BYTES // 2), "path": "title"} + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a multi-byte run at the same byte size, showing the " + "cap is clause-count based and not byte based", + ), +] + +# Property [Fuzzy maxEdits Enum]: text.fuzzy.maxEdits accepts only 1 or 2, so any +# other integer value is rejected. +SEARCH_FUZZY_MAX_EDITS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + f"fuzzy_max_edits_{tid}", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "fuzzy": {"maxEdits": val}}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a fuzzy.maxEdits of {tid} outside the set 1 or 2", + ) + for tid, val in [ + ("zero", 0), + ("three", 3), + ] +] + +# Property [Fuzzy prefixLength Lower Bound]: a negative text.fuzzy.prefixLength is +# rejected. +SEARCH_FUZZY_PREFIX_LENGTH_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "fuzzy_prefix_length_negative", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title", "fuzzy": {"prefixLength": -1}} + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search should reject a negative fuzzy.prefixLength", + ), +] + +# Property [Fuzzy maxExpansions Bounds]: text.fuzzy.maxExpansions must fall within +# 1..1000, so a value outside those bounds is rejected. 
+SEARCH_FUZZY_MAX_EXPANSIONS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + f"fuzzy_max_expansions_{tid}", + pipeline=[ + { + "$search": { + "text": {"query": "quick", "path": "title", "fuzzy": {"maxExpansions": val}} + } + }, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search should reject a fuzzy.maxExpansions of {tid} outside the bounds 1 to 1000", + ) + for tid, val in [ + ("zero", 0), + ("over_max", 1001), + ] +] + +SEARCH_TEXT_ERROR_TESTS = ( + SEARCH_TEXT_QUERY_ERROR_TESTS + + SEARCH_TEXT_PATH_ERROR_TESTS + + SEARCH_TEXT_MATCH_CRITERIA_ERROR_TESTS + + SEARCH_TEXT_SYNONYMS_ERROR_TESTS + + SEARCH_TEXT_FUZZY_ERROR_TESTS + + SEARCH_TEXT_SCORE_ERROR_TESTS + + SEARCH_QUERY_TOKEN_SIZE_ERROR_TESTS + + SEARCH_FUZZY_MAX_EDITS_ERROR_TESTS + + SEARCH_FUZZY_PREFIX_LENGTH_ERROR_TESTS + + SEARCH_FUZZY_MAX_EXPANSIONS_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_TEXT_ERROR_TESTS)) +def test_search_text_errors(indexed_collection, test_case: StageTestCase): + """Test $search text operator and fuzzy validation errors.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_operator.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_operator.py new file mode 100644 index 000000000..60d993a85 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_operator.py @@ -0,0 +1,328 @@ +"""Tests for $search text operator options (path forms, matchCriteria, score, fuzzy).""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + QUERY_CLAUSE_CAP, +) +from 
documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [text Path Forms]: the text operator accepts a {value} document, a +# {wildcard} document, and an array of paths, each resolving to the covered +# field(s) it names. +SEARCH_TEXT_PATH_FORMS_TESTS: list[StageTestCase] = [ + StageTestCase( + "path_value_document", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": {"value": "title"}}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept a {value} path document resolving to the named field", + ), + StageTestCase( + "path_wildcard_document", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": {"wildcard": "*"}}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(4), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept a {wildcard} path document spanning every covered field", + ), + StageTestCase( + "path_array", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": ["title", "body"]}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(4), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept an array of paths and match across all of them", + ), +] + +# Property [text matchCriteria]: matchCriteria "all" requires every query term to +# be present (AND) while "any" requires only one (OR). 
+SEARCH_TEXT_MATCH_CRITERIA_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "match_criteria_all",
+        pipeline=[
+            {
+                "$search": {  # "all": every term of "quick brown" must be present (AND)
+                    "text": {"query": "quick brown", "path": "title", "matchCriteria": "all"}
+                }
+            },
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search should match only documents containing every term when matchCriteria is all",
+    ),
+    StageTestCase(
+        "match_criteria_any",
+        pipeline=[
+            {
+                "$search": {  # "any": one term of "quick brown" suffices (OR)
+                    "text": {"query": "quick brown", "path": "title", "matchCriteria": "any"}
+                }
+            },
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search should match documents containing any term when matchCriteria is any",
+    ),
+]
+
+# Property [text score Document]: the text operator accepts a score document and
+# still returns the matched documents.
+SEARCH_TEXT_SCORE_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "score_document",
+        pipeline=[
+            {
+                "$search": {
+                    "text": {
+                        "query": "quick",
+                        "path": "title",
+                        "score": {"boost": {"value": 2.0}},  # only membership is asserted
+                    }
+                }
+            },
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search should accept a text.score document and still return the matches",
+    ),
+]
+
+# Property [text query Array Cap]: a query array of the maximum 1024 elements
+# (inclusive) is accepted and matches as a multi-term OR.
+SEARCH_TEXT_QUERY_ARRAY_CAP_TESTS: list[StageTestCase] = [ + StageTestCase( + "query_array_max_clauses", + pipeline=[ + { + "$search": { + "text": { + "query": ["quick"] + [f"nomatch{i}" for i in range(QUERY_CLAUSE_CAP - 1)], + "path": "title", + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept a query array sized at the inclusive clause cap", + ), +] + +# Property [text Fuzzy Matching]: the text operator accepts a fuzzy document and +# matches within a code-point-based edit distance applied independently per +# query-array element. +SEARCH_TEXT_FUZZY_TESTS: list[StageTestCase] = [ + StageTestCase( + "fuzzy_empty_document", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title", "fuzzy": {}}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should accept an empty fuzzy document and still match the exact term", + ), + StageTestCase( + "fuzzy_max_edits_1_codepoint", + # cafe -> café is one code-point edit (é ↔ e), not two bytes. + pipeline=[ + {"$search": {"text": {"query": "cafe", "path": "title", "fuzzy": {"maxEdits": 1}}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 11)]}, + msg="$search should treat an accent difference as one code-point edit at maxEdits 1", + ), + StageTestCase( + "fuzzy_max_edits_2_codepoint", + # resume -> résumé is two code-point edits. 
+ pipeline=[ + {"$search": {"text": {"query": "resume", "path": "title", "fuzzy": {"maxEdits": 2}}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 5), Contains("_id", 10)]}, + msg="$search should match within two code-point edits at maxEdits 2", + ), + StageTestCase( + "fuzzy_max_expansions_min", + pipeline=[ + { + "$search": { + "text": { + "query": "cafe", + "path": "title", + "fuzzy": {"maxEdits": 1, "maxExpansions": 1}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 11)]}, + msg="$search should accept fuzzy.maxExpansions at the lower bound and still match", + ), + StageTestCase( + "fuzzy_max_expansions_max", + pipeline=[ + { + "$search": { + "text": { + "query": "cafe", + "path": "title", + "fuzzy": {"maxEdits": 1, "maxExpansions": 1000}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 11)]}, + msg="$search should accept fuzzy.maxExpansions at the upper bound and still match", + ), + StageTestCase( + "fuzzy_per_element_array", + pipeline=[ + { + "$search": { + "text": { + "query": ["quik", "turtl"], + "path": "title", + "fuzzy": {"maxEdits": 1}, + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(4), + Contains("_id", 1), + Contains("_id", 2), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search should apply fuzzy matching independently to each query-array element", + ), +] + +# Property [text Fuzzy prefixLength]: fuzzy.prefixLength locks a code-point-counted +# prefix from edits, so a typo within that prefix does not match. +SEARCH_TEXT_FUZZY_PREFIX_TESTS: list[StageTestCase] = [ + StageTestCase( + "fuzzy_prefix_unlocked", + # éfoy -> éfox is one edit at code point 3; prefixLength 0 locks nothing. 
+ pipeline=[ + { + "$search": { + "text": { + "query": "\u00e9foy", + "path": "title", + "fuzzy": {"maxEdits": 1, "prefixLength": 0}, + } + } + }, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 18)]}, + msg="$search should allow a fuzzy edit outside the prefix when prefixLength is 0", + ), + StageTestCase( + "fuzzy_prefix_locked_codepoint", + # prefixLength 4 locks all four code points of éfox (5 bytes), so the typo + # at code point 3 falls inside the locked prefix; byte counting would lock + # only éfo (3 code points) and still allow the edit. + pipeline=[ + { + "$search": { + "text": { + "query": "\u00e9foy", + "path": "title", + "fuzzy": {"maxEdits": 1, "prefixLength": 4}, + } + } + }, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search should count prefixLength in code points so a locked-prefix typo does " + "not match", + ), +] + +SEARCH_TEXT_OPERATOR_TESTS = ( + SEARCH_TEXT_PATH_FORMS_TESTS + + SEARCH_TEXT_MATCH_CRITERIA_TESTS + + SEARCH_TEXT_SCORE_TESTS + + SEARCH_TEXT_QUERY_ARRAY_CAP_TESTS + + SEARCH_TEXT_FUZZY_TESTS + + SEARCH_TEXT_FUZZY_PREFIX_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_TEXT_OPERATOR_TESTS)) +def test_search_text_operator_cases(indexed_collection, test_case: StageTestCase): + """Test $search text operator path forms, matchCriteria, score, fuzzy.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_query_operators.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_query_operators.py new file mode 100644 index 000000000..a1cc2d131 --- /dev/null +++ 
b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_text_query_operators.py @@ -0,0 +1,359 @@ +"""Tests for the $search queryString, term, and moreLikeThis operators.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +# Property [QueryString Lucene Syntax]: the queryString operator parses its query +# as Lucene syntax, so boolean operators and field-scoped terms take effect. 
+SEARCH_QUERY_STRING_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "query_string_boolean_and",
+        pipeline=[  # Lucene boolean AND: both terms must be present
+            {"$search": {"queryString": {"query": "quick AND brown", "defaultPath": "title"}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search queryString should require every term of a boolean AND query to be present",
+    ),
+    StageTestCase(
+        "query_string_score_boost",
+        pipeline=[
+            {
+                "$search": {
+                    "queryString": {
+                        "query": "quick AND brown",
+                        "defaultPath": "title",
+                        "score": {"boost": {"value": 2.0}},  # only membership is asserted
+                    }
+                }
+            },
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
+        msg="$search queryString should accept a score modifier and still return its matches",
+    ),
+    StageTestCase(
+        "query_string_field_scoped",
+        pipeline=[  # the "body:" prefix scopes the term to body, not defaultPath "title"
+            {"$search": {"queryString": {"query": "body:quick", "defaultPath": "title"}}},
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 2)]},
+        msg="$search queryString should restrict a field-scoped term to the named field",
+    ),
+]
+
+# Property [Term Token Match]: the deprecated term operator executes a token
+# match against the index, returning the documents whose covered path contains
+# the query token.
+SEARCH_TERM_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "term_token_match_title",
+        pipeline=[
+            {"$search": {"term": {"path": "title", "query": "quick"}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),
+                Contains("_id", 1),
+                Contains("_id", 3),
+                Contains("_id", 4),
+            ]
+        },
+        msg="$search term should execute a token match, returning the documents whose "
+        "covered path contains the query token",
+    ),
+]
+
+# Property [MoreLikeThis Similarity]: moreLikeThis analyzes the example text in
+# like: {<field>: <text>} and returns the documents whose covered field shares its
+# significant terms.
+SEARCH_MORE_LIKE_THIS_TESTS: list[StageTestCase] = [ + StageTestCase( + "more_like_this_shared_terms", + pipeline=[ + {"$search": {"moreLikeThis": {"like": {"title": "the quick brown fox"}}}}, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search moreLikeThis should return the documents whose covered field shares the " + "example text's significant terms", + ), + StageTestCase( + "more_like_this_score_boost", + pipeline=[ + { + "$search": { + "moreLikeThis": { + "like": {"title": "the quick brown fox"}, + "score": {"boost": {"value": 2.0}}, + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search moreLikeThis should accept a score modifier and still return its matches", + ), + StageTestCase( + "more_like_this_array_of_docs", + pipeline=[ + { + "$search": { + "moreLikeThis": { + "like": [ + {"title": "the quick brown fox"}, + {"title": "quick rabbit"}, + ] + } + } + }, + ], + expected={ + "cursor.firstBatch": [ + Len(3), + Contains("_id", 1), + Contains("_id", 3), + Contains("_id", 4), + ] + }, + msg="$search moreLikeThis should accept an array of example documents and return the " + "documents sharing their significant terms", + ), +] + +SEARCH_TEXT_QUERY_OPERATOR_TESTS = ( + SEARCH_QUERY_STRING_TESTS + SEARCH_TERM_TESTS + SEARCH_MORE_LIKE_THIS_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_TEXT_QUERY_OPERATOR_TESTS)) +def test_search_text_query_operators_cases(indexed_collection, test_case: StageTestCase): + """Test $search queryString, term, and moreLikeThis operators.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult( + result, + expected=test_case.expected, + msg=test_case.msg, + raw_res=True, + ) + + +# Property 
[queryString Validation]: queryString requires query and defaultPath, +# both string-only (neither accepts the array form that term.query accepts). +SEARCH_QUERY_STRING_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "query_string_missing_query", + pipeline=[{"$search": {"queryString": {"defaultPath": "title"}}}], + error_code=UNKNOWN_ERROR, + msg="$search queryString should reject an operator missing the required query", + ), + StageTestCase( + "query_string_missing_default_path", + pipeline=[{"$search": {"queryString": {"query": "quick"}}}], + error_code=UNKNOWN_ERROR, + msg="$search queryString should reject an operator missing the required defaultPath", + ), + *[ + StageTestCase( + f"query_string_query_non_string_{tid}", + pipeline=[{"$search": {"queryString": {"query": val, "defaultPath": "title"}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search queryString should reject a {tid} query as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"q": "quick"}), + ("array", ["quick"]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + *[ + StageTestCase( + f"query_string_default_path_non_string_{tid}", + pipeline=[{"$search": {"queryString": {"query": "quick", "defaultPath": val}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search queryString should reject a {tid} defaultPath as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"value": "title"}), + ("array", ["title"]), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", 
Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +# Property [term Validation]: term requires path and query; term.path is a string, +# document, or array of paths, and term.query is a string or array of strings. +SEARCH_TERM_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "term_missing_path", + pipeline=[{"$search": {"term": {"query": "quick"}}}], + error_code=UNKNOWN_ERROR, + msg="$search term should reject an operator missing the required path", + ), + StageTestCase( + "term_missing_query", + pipeline=[{"$search": {"term": {"path": "title"}}}], + error_code=UNKNOWN_ERROR, + msg="$search term should reject an operator missing the required query", + ), + *[ + StageTestCase( + f"term_path_{tid}", + pipeline=[{"$search": {"term": {"path": val, "query": "quick"}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search term should reject a {tid} path as neither a string, document, " + "nor array", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + *[ + StageTestCase( + f"term_query_non_string_{tid}", + pipeline=[{"$search": {"term": {"path": "title", "query": val}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search term should reject a {tid} query as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"q": "quick"}), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", 
Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +# Property [moreLikeThis Validation]: moreLikeThis requires like, which must be a +# document or array of documents. +SEARCH_MORE_LIKE_THIS_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "more_like_this_missing_like", + pipeline=[{"$search": {"moreLikeThis": {}}}], + error_code=UNKNOWN_ERROR, + msg="$search moreLikeThis should reject an operator missing the required like", + ), + *[ + StageTestCase( + f"more_like_this_like_{tid}", + pipeline=[{"$search": {"moreLikeThis": {"like": val}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search moreLikeThis should reject a {tid} like as neither a document nor array", + ) + for tid, val in [ + ("string", "quick"), + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], +] + +SEARCH_TEXT_QUERY_OPERATOR_ERROR_TESTS = ( + SEARCH_QUERY_STRING_ERROR_TESTS + SEARCH_TERM_ERROR_TESTS + SEARCH_MORE_LIKE_THIS_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_TEXT_QUERY_OPERATOR_ERROR_TESTS)) +def test_search_text_query_operators_errors(indexed_collection, test_case: StageTestCase): + """Test $search queryString, term, and moreLikeThis required-field and type validation.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git 
a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_unsupported_operators.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_unsupported_operators.py new file mode 100644 index 000000000..81c266a1d --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_unsupported_operators.py @@ -0,0 +1,263 @@ +"""Tests for $search recognized-but-unsupported operator validation (hierarchy, vector).""" + +from __future__ import annotations + +import pytest + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params + +pytestmark = pytest.mark.requires(search=True) + + +# Property [EmbeddedDocument/HasAncestor/HasRoot Validation]: each +# hierarchical-relationship operator rejects a spec that targets a non-embedded +# path or omits its required sub-field with a validation error. 
+SEARCH_HIERARCHY_OP_ERROR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "embedded_document_non_subfield_path",
+        pipeline=[
+            {
+                "$search": {
+                    "embeddedDocument": {
+                        "path": "title",  # NOTE(review): assumes title is a plain string field
+                        "operator": {"text": {"query": "quick", "path": "title"}},
+                    }
+                }
+            },
+        ],
+        error_code=UNKNOWN_ERROR,
+        msg="$search embeddedDocument should reject a path not mapped as an "
+        "embeddedDocuments field",
+    ),
+    StageTestCase(
+        "has_ancestor_missing_ancestor_path",
+        pipeline=[{"$search": {"hasAncestor": {}}}],  # empty spec: no ancestorPath supplied
+        error_code=UNKNOWN_ERROR,
+        msg="$search hasAncestor should reject a spec missing the required ancestorPath",
+    ),
+    StageTestCase(
+        "has_root_missing_operator",
+        pipeline=[{"$search": {"hasRoot": {}}}],  # empty spec: no operator supplied
+        error_code=UNKNOWN_ERROR,
+        msg="$search hasRoot should reject a spec missing the required operator",
+    ),
+]
+
+# Property [Vector-Search Operator Validation]: the vectorSearch and knnBeta
+# operators are recognized but reject a non-vector index and any malformed or
+# missing required field with a validation error.
+SEARCH_VECTOR_OP_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "vector_search_non_vector_index", + pipeline=[ + { + "$search": { + "vectorSearch": { + "path": "title", + "queryVector": [0.1, 0.2, 0.3], + "numCandidates": 10, + "limit": 5, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a path not indexed as vector", + ), + StageTestCase( + "vector_search_index_inside_operator", + pipeline=[ + { + "$search": { + "vectorSearch": { + "path": "title", + "queryVector": [0.1, 0.2, 0.3], + "numCandidates": 10, + "limit": 5, + "index": "default", + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject an index field placed inside the operator", + ), + StageTestCase( + "vector_search_non_array_query_vector", + pipeline=[ + { + "$search": { + "vectorSearch": { + "path": "title", + "queryVector": "not_an_array", + "numCandidates": 10, + "limit": 5, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a non-array queryVector", + ), + StageTestCase( + "vector_search_string_query_vector", + pipeline=[ + { + "$search": { + "vectorSearch": { + "path": "title", + "queryVector": ["a", "b", "c"], + "numCandidates": 10, + "limit": 5, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a queryVector array of strings", + ), + StageTestCase( + "vector_search_empty_query_vector", + pipeline=[ + { + "$search": { + "vectorSearch": { + "path": "title", + "queryVector": [], + "numCandidates": 10, + "limit": 5, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject an empty queryVector", + ), + StageTestCase( + "vector_search_missing_query_vector", + pipeline=[ + {"$search": {"vectorSearch": {"path": "title", "numCandidates": 10, "limit": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a spec missing both query and queryVector", + ), + StageTestCase( + 
"vector_search_missing_path", + pipeline=[ + { + "$search": { + "vectorSearch": { + "queryVector": [0.1, 0.2, 0.3], + "numCandidates": 10, + "limit": 5, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a spec missing the required path", + ), + StageTestCase( + "vector_search_missing_num_candidates", + pipeline=[ + { + "$search": { + "vectorSearch": {"path": "title", "queryVector": [0.1, 0.2, 0.3], "limit": 5} + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a spec missing the required numCandidates", + ), + StageTestCase( + "vector_search_missing_limit", + pipeline=[ + { + "$search": { + "vectorSearch": { + "path": "title", + "queryVector": [0.1, 0.2, 0.3], + "numCandidates": 10, + } + } + }, + ], + error_code=UNKNOWN_ERROR, + msg="$search vectorSearch should reject a spec missing the required limit", + ), + StageTestCase( + "knn_beta_non_knn_vector_index", + pipeline=[ + {"$search": {"knnBeta": {"path": "title", "vector": [0.1, 0.2, 0.3], "k": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search knnBeta should reject a path not indexed as knnVector", + ), + StageTestCase( + "knn_beta_non_array_vector", + pipeline=[ + {"$search": {"knnBeta": {"path": "title", "vector": "not_an_array", "k": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search knnBeta should reject a non-array vector", + ), + StageTestCase( + "knn_beta_empty_vector", + pipeline=[ + {"$search": {"knnBeta": {"path": "title", "vector": [], "k": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search knnBeta should reject an empty vector", + ), + StageTestCase( + "knn_beta_string_vector", + pipeline=[ + {"$search": {"knnBeta": {"path": "title", "vector": ["a", "b", "c"], "k": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search knnBeta should reject a vector array of strings", + ), + StageTestCase( + "knn_beta_missing_vector", + pipeline=[{"$search": {"knnBeta": {"path": "title", "k": 5}}}], + error_code=UNKNOWN_ERROR, + 
msg="$search knnBeta should reject a spec missing the required vector", + ), + StageTestCase( + "knn_beta_missing_k", + pipeline=[ + {"$search": {"knnBeta": {"path": "title", "vector": [0.1, 0.2, 0.3]}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search knnBeta should reject a spec missing the required k", + ), + StageTestCase( + "knn_beta_missing_path", + pipeline=[ + {"$search": {"knnBeta": {"vector": [0.1, 0.2, 0.3], "k": 5}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search knnBeta should reject a spec missing the required path", + ), +] + +SEARCH_UNSUPPORTED_OP_ERROR_TESTS = SEARCH_HIERARCHY_OP_ERROR_TESTS + SEARCH_VECTOR_OP_ERROR_TESTS + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_UNSUPPORTED_OP_ERROR_TESTS)) +def test_search_unsupported_operator_errors(indexed_collection, test_case: StageTestCase): + """Test $search recognized-but-unsupported hierarchy and vector operators reject their specs.""" + result = execute_command( + indexed_collection, + {"aggregate": indexed_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_wildcard.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_wildcard.py new file mode 100644 index 000000000..82ea3b3fd --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_search_wildcard.py @@ -0,0 +1,457 @@ +"""Tests for the $search wildcard operator.""" + +from __future__ import annotations + +import datetime + +import pytest +from bson import Binary, Code, Int64, MaxKey, MinKey, ObjectId, Regex, Timestamp + +from documentdb_tests.compatibility.tests.core.operator.stages.search.utils.search_common import ( + create_search_index, +) +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from 
documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + UNKNOWN_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params +from documentdb_tests.framework.property_checks import ( + Contains, + Len, +) +from documentdb_tests.framework.test_constants import ( + DECIMAL128_ONE_AND_HALF, +) + +pytestmark = pytest.mark.requires(search=True) + + +_WILDCARD_DOCS = [ + {"_id": 1, "kw": "quick", "std": "the quick brown fox", "tok": "quick"}, + {"_id": 2, "kw": "quack"}, + {"_id": 3, "kw": "axb"}, + {"_id": 4, "kw": "AXB"}, + {"_id": 5, "kw": "a*b"}, # literal asterisk + {"_id": 6, "kw": "a?b"}, # literal question mark + {"_id": 7, "kw": "ab"}, # zero characters between a and b + {"_id": 8, "kw": "axxb"}, # two characters between a and b +] + +_WILDCARD_INDEX_DEFINITION = { + "mappings": { + "dynamic": False, + "fields": { + "kw": {"type": "string", "analyzer": "lucene.keyword"}, + "std": {"type": "string"}, + "tok": {"type": "token"}, + }, + } +} + + +@pytest.fixture(scope="module") +def wildcard_collection(engine_client, worker_id): + """A module-scoped collection with a static search index mapping a + keyword-analyzed, a standard-analyzed, and a token-typed field, shared + read-only across the wildcard cases so the index is built and polled once.""" + db_name = fixtures.generate_database_name("stages_search_wildcard", worker_id) + fixtures.cleanup_database(engine_client, db_name) + db = engine_client[db_name] + coll = db["wildcard"] + coll.insert_many(_WILDCARD_DOCS) + create_search_index(coll, _WILDCARD_INDEX_DEFINITION) + yield coll + fixtures.cleanup_database(engine_client, db_name) + + +# Property [Wildcard Field-Type Matching]: wildcard term-searches a +# keyword-analyzed path without a flag and a standard-analyzed path with +# allowAnalyzedField true, but never matches a 
token-typed field even with the +# flag (it passes the analyzed-field guard yet is not term-searchable). +SEARCH_WILDCARD_FIELD_TYPE_TESTS: list[StageTestCase] = [ + StageTestCase( + "wildcard_keyword_no_flag", + pipeline=[ + {"$search": {"wildcard": {"query": "qu*", "path": "kw"}}}, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$search wildcard should match a keyword-analyzed path with no allowAnalyzedField flag", + ), + StageTestCase( + "wildcard_score_boost", + pipeline=[ + { + "$search": { + "wildcard": {"query": "qu*", "path": "kw", "score": {"boost": {"value": 2.0}}} + } + }, + ], + expected={"cursor.firstBatch": [Len(2), Contains("_id", 1), Contains("_id", 2)]}, + msg="$search wildcard should accept a score modifier and still return its matches", + ), + StageTestCase( + "wildcard_standard_with_flag", + pipeline=[ + {"$search": {"wildcard": {"query": "qu*", "path": "std", "allowAnalyzedField": True}}}, + ], + expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]}, + msg="$search wildcard should match a standard-analyzed path when allowAnalyzedField " + "is true", + ), + StageTestCase( + "wildcard_token_matches_nothing", + pipeline=[ + {"$search": {"wildcard": {"query": "qu*", "path": "tok", "allowAnalyzedField": True}}}, + ], + expected={"cursor.firstBatch": Len(0)}, + msg="$search wildcard should match nothing on a token-typed field even with " + "allowAnalyzedField true", + ), +] + +# Property [Wildcard Special Characters]: `*` matches zero-or-more characters, +# `?` matches exactly one character, and a backslash-escaped `\*`/`\?` matches a +# literal `*`/`?` character rather than acting as a wildcard. 
+SEARCH_WILDCARD_SPECIAL_CHAR_TESTS: list[StageTestCase] = [
+    StageTestCase(
+        "wildcard_star_zero_or_more",
+        pipeline=[
+            {"$search": {"wildcard": {"query": "a*b", "path": "kw"}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(5),  # "AXB" (_id 4) is the only a-to-b value in _WILDCARD_DOCS left out
+                Contains("_id", 3),  # kw "axb"
+                Contains("_id", 5),  # kw "a*b"
+                Contains("_id", 6),  # kw "a?b"
+                Contains("_id", 7),  # kw "ab" (zero characters between a and b)
+                Contains("_id", 8),  # kw "axxb" (two characters between a and b)
+            ]
+        },
+        msg="$search wildcard `*` should match zero-or-more characters, including the "
+        "zero-character and multi-character tokens",
+    ),
+    StageTestCase(
+        "wildcard_question_exactly_one",
+        pipeline=[
+            {"$search": {"wildcard": {"query": "a?b", "path": "kw"}}},
+        ],
+        expected={
+            "cursor.firstBatch": [
+                Len(3),  # "ab" (_id 7) and "axxb" (_id 8) must not match
+                Contains("_id", 3),  # kw "axb"
+                Contains("_id", 5),  # kw "a*b"
+                Contains("_id", 6),  # kw "a?b"
+            ]
+        },
+        msg="$search wildcard `?` should match exactly one character, excluding the "
+        "zero-character and two-character tokens",
+    ),
+    StageTestCase(
+        "wildcard_escaped_star_literal",
+        pipeline=[
+            {"$search": {"wildcard": {"query": "a\\*b", "path": "kw"}}},  # sends a\*b
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 5)]},  # only kw "a*b"
+        msg="$search wildcard should match a literal `*` for an escaped `\\*`, not as a wildcard",
+    ),
+    StageTestCase(
+        "wildcard_escaped_question_literal",
+        pipeline=[
+            {"$search": {"wildcard": {"query": "a\\?b", "path": "kw"}}},  # sends a\?b
+        ],
+        expected={"cursor.firstBatch": [Len(1), Contains("_id", 6)]},  # only kw "a?b"
+        msg="$search wildcard should match a literal `?` for an escaped `\\?`, not as a wildcard",
+    ),
+]
+
+# Property [Wildcard Keyword Case Sensitivity]: matching on a keyword path is
+# case-sensitive.
SEARCH_WILDCARD_CASE_TESTS: list[StageTestCase] = [
    # Uppercase pattern against the keyword path; only _id 4 is expected back.
    StageTestCase(
        "wildcard_keyword_case_sensitive",
        pipeline=[{"$search": {"wildcard": {"query": "A*B", "path": "kw"}}}],
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 4)]},
        msg=(
            "$search wildcard on a keyword path should be case-sensitive, matching only the "
            "uppercase-stored token"
        ),
    ),
]

# Property [Wildcard Query Array OR]: a query array matches the union of the
# documents matched by each element pattern.
SEARCH_WILDCARD_QUERY_ARRAY_TESTS: list[StageTestCase] = [
    # Two patterns in one query array; the batch must hold _ids 1, 2 and 3.
    StageTestCase(
        "wildcard_query_array_or",
        pipeline=[{"$search": {"wildcard": {"query": ["qu*", "axb"], "path": "kw"}}}],
        expected={
            "cursor.firstBatch": [
                Len(3),
                *[Contains("_id", doc_id) for doc_id in (1, 2, 3)],
            ]
        },
        msg="$search wildcard should match the union of a multi-element query array's patterns",
    ),
]

# Property [Wildcard Path Forms]: the path accepts a {value} document, a
# {wildcard} document, and an array of paths in addition to a bare string, each
# resolving to the covered field(s) it names.
SEARCH_WILDCARD_PATH_FORMS_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        pipeline=[{"$search": {"wildcard": operator}}],
        # Every path form is expected to return exactly the document with _id 1.
        expected={"cursor.firstBatch": [Len(1), Contains("_id", 1)]},
        msg=msg,
    )
    # Each row: (case id, wildcard operator document, message).
    for case_id, operator, msg in [
        (
            "wildcard_path_value_document",
            {"query": "quick", "path": {"value": "kw"}},
            "$search wildcard should accept a {value} path document resolving to the named field",
        ),
        (
            "wildcard_path_wildcard_document",
            {"query": "quick", "path": {"wildcard": "*"}, "allowAnalyzedField": True},
            "$search wildcard should accept a {wildcard} path document spanning every covered "
            "field",
        ),
        (
            "wildcard_path_array",
            {"query": "quick", "path": ["kw"]},
            "$search wildcard should accept an array of paths resolving to the named field",
        ),
    ]
]

# All success cases, in property order, fed to one parametrized test.
SEARCH_WILDCARD_TESTS = [
    *SEARCH_WILDCARD_FIELD_TYPE_TESTS,
    *SEARCH_WILDCARD_SPECIAL_CHAR_TESTS,
    *SEARCH_WILDCARD_CASE_TESTS,
    *SEARCH_WILDCARD_QUERY_ARRAY_TESTS,
    *SEARCH_WILDCARD_PATH_FORMS_TESTS,
]


@pytest.mark.aggregate
@pytest.mark.parametrize("test_case", pytest_params(SEARCH_WILDCARD_TESTS))
def test_search_wildcard_cases(wildcard_collection, test_case: StageTestCase):
    """Run each $search wildcard success case and check the raw aggregate response."""
    aggregate_command = {
        "aggregate": wildcard_collection.name,
        "pipeline": test_case.pipeline,
        "cursor": {},
    }
    result = execute_command(wildcard_collection, aggregate_command)
    # raw_res=True is required because the expectations address "cursor.firstBatch".
    assertResult(result, expected=test_case.expected, msg=test_case.msg, raw_res=True)


# Property [Wildcard Analyzed Path Rejection]: wildcard rejects a path that
# resolves to a non-keyword analyzed field when allowAnalyzedField is not set,
# including an empty or dotted path that resolves to no keyword field.
SEARCH_WILDCARD_ANALYZED_PATH_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        pipeline=[{"$search": {"wildcard": {"query": "qu*", "path": path}}}],
        error_code=UNKNOWN_ERROR,
        msg=msg,
    )
    # Each row: (case id, path sent without allowAnalyzedField, message).
    for case_id, path, msg in [
        (
            "wildcard_standard_path_no_flag",
            "std",
            "$search wildcard should reject a standard-analyzed path without allowAnalyzedField",
        ),
        (
            "wildcard_empty_path_no_flag",
            "",
            "$search wildcard should reject an empty path that resolves to no keyword field",
        ),
        (
            "wildcard_dotted_path_no_flag",
            "a.b",
            "$search wildcard should reject a dotted path that resolves to no keyword field",
        ),
    ]
]

# Property [Wildcard allowAnalyzedField Type]: allowAnalyzedField must be a
# boolean (a null value is treated as the default).
SEARCH_WILDCARD_ALLOW_ANALYZED_TYPE_ERROR_TESTS: list[StageTestCase] = [
    StageTestCase(
        f"wildcard_allow_analyzed_{type_name}",
        pipeline=[
            {
                "$search": {
                    "wildcard": {"query": "qu*", "path": "kw", "allowAnalyzedField": bad_value}
                }
            },
        ],
        error_code=UNKNOWN_ERROR,
        msg=f"$search wildcard should reject a {type_name} allowAnalyzedField as a non-boolean",
    )
    # One case per non-boolean value, keyed by the type label used in the id and message.
    for type_name, bad_value in {
        "string": "true",
        "int32": 1,
        "int64": Int64(1),
        "double": 1.5,
        "object": {"a": 1},
        "array": [True],
        "objectid": ObjectId("0123456789abcdef01234567"),
        "datetime": datetime.datetime(2020, 1, 1),
        "timestamp": Timestamp(1, 1),
        "binary": Binary(b"\x01\x02\x03"),
        "regex": Regex(".*", "i"),
        "code": Code("function(){}"),
        "minkey": MinKey(),
        "maxkey": MaxKey(),
        "decimal128": DECIMAL128_ONE_AND_HALF,
    }.items()
]

# Property [Wildcard query Validation]: wildcard.query is required and must be a
# non-empty string or array of non-null strings.
+SEARCH_WILDCARD_QUERY_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "wildcard_query_missing", + pipeline=[{"$search": {"wildcard": {"path": "kw"}}}], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject an operator missing the required query", + ), + StageTestCase( + "wildcard_query_null", + pipeline=[ + {"$search": {"wildcard": {"query": None, "path": "kw"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject a null query treated as missing", + ), + StageTestCase( + "wildcard_query_empty_string", + pipeline=[{"$search": {"wildcard": {"query": "", "path": "kw"}}}], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject an empty-string query", + ), + StageTestCase( + "wildcard_query_empty_array", + pipeline=[{"$search": {"wildcard": {"query": [], "path": "kw"}}}], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject an empty-array query", + ), + *[ + StageTestCase( + f"wildcard_query_non_string_{tid}", + pipeline=[ + {"$search": {"wildcard": {"query": val, "path": "kw"}}}, + ], + error_code=UNKNOWN_ERROR, + msg=f"$search wildcard should reject a {tid} query as a non-string", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("object", {"q": "quick"}), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + StageTestCase( + "wildcard_query_array_element_null", + pipeline=[ + {"$search": {"wildcard": {"query": ["qu*", None], "path": "kw"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject a null query-array element", + ), + StageTestCase( + "wildcard_query_array_element_non_string", + pipeline=[ + {"$search": {"wildcard": {"query": ["qu*", 1], 
"path": "kw"}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject a non-string query-array element", + ), +] + +# Property [Wildcard path Validation]: wildcard.path is required and must be a +# string, document, or array of paths. +SEARCH_WILDCARD_PATH_ERROR_TESTS: list[StageTestCase] = [ + *[ + StageTestCase( + f"wildcard_path_{tid}", + pipeline=[{"$search": {"wildcard": {"query": "qu*", "path": val}}}], + error_code=UNKNOWN_ERROR, + msg=f"$search wildcard should reject a {tid} path as neither a document, string, " + "nor array", + ) + for tid, val in [ + ("int32", 1), + ("int64", Int64(1)), + ("double", 1.5), + ("bool", True), + ("objectid", ObjectId("0123456789abcdef01234567")), + ("datetime", datetime.datetime(2020, 1, 1)), + ("timestamp", Timestamp(1, 1)), + ("binary", Binary(b"\x01\x02\x03")), + ("regex", Regex(".*", "i")), + ("code", Code("function(){}")), + ("minkey", MinKey()), + ("maxkey", MaxKey()), + ("decimal128", DECIMAL128_ONE_AND_HALF), + ] + ], + StageTestCase( + "wildcard_path_null", + pipeline=[ + {"$search": {"wildcard": {"query": "qu*", "path": None}}}, + ], + error_code=UNKNOWN_ERROR, + msg="$search wildcard should reject a null path treated as missing", + ), +] + +SEARCH_WILDCARD_ERROR_TESTS = ( + SEARCH_WILDCARD_ANALYZED_PATH_ERROR_TESTS + + SEARCH_WILDCARD_ALLOW_ANALYZED_TYPE_ERROR_TESTS + + SEARCH_WILDCARD_QUERY_ERROR_TESTS + + SEARCH_WILDCARD_PATH_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_WILDCARD_ERROR_TESTS)) +def test_search_wildcard_errors(wildcard_collection, test_case: StageTestCase): + """Test $search wildcard rejects analyzed paths and bad allowAnalyzedField/query/path values.""" + result = execute_command( + wildcard_collection, + {"aggregate": wildcard_collection.name, "pipeline": test_case.pipeline, "cursor": {}}, + ) + assertResult(result, error_code=test_case.error_code, msg=test_case.msg) diff --git 
a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_smoke_search.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_smoke_search.py index 045c4b3b1..20727273e 100644 --- a/documentdb_tests/compatibility/tests/core/operator/stages/search/test_smoke_search.py +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/test_smoke_search.py @@ -9,10 +9,9 @@ from documentdb_tests.framework.assertions import assertSuccessPartial from documentdb_tests.framework.executor import execute_command -pytestmark = pytest.mark.smoke +pytestmark = [pytest.mark.smoke, pytest.mark.requires(search=True)] -@pytest.mark.skip(reason="Requires Atlas Search configuration - not available on standard MongoDB") def test_smoke_search(collection): """Test basic $search stage behavior.""" collection.insert_many([{"_id": 1, "title": "test document"}]) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/utils/__init__.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/search/utils/search_common.py b/documentdb_tests/compatibility/tests/core/operator/stages/search/utils/search_common.py new file mode 100644 index 000000000..e1be36848 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/search/utils/search_common.py @@ -0,0 +1,67 @@ +"""Shared plumbing and corpus for $search stage tests. + +$search tests cannot use the per-case declarative ``docs``/``indexes`` model: a +search index is heavyweight (created, then polled until queryable), so the +corpus and index are built once per package by the package-scoped +``indexed_collection`` fixture (see conftest.py) rather than per test case. 
This +module holds the create/poll helpers, the shared option constants, and the +dynamic-mapping corpus that fixture populates.""" + +from __future__ import annotations + +import time +from typing import Any + +from pymongo.collection import Collection +from pymongo.operations import SearchIndexModel + +SEARCH_INDEX_NAME = "default" +INDEX_READY_TIMEOUT_SECONDS = 120 + +# Maximum number of query clauses the text operator accepts (inclusive); 'in' has no such cap. +QUERY_CLAUSE_CAP = 1024 + +# Shared corpus for the dynamic-mapping index. Docs 6-18 carry tokens probed only +# by specific analyzer, normalization, token-boundary, and fuzzy cases; none +# contains `quick`/`turtle`, so they do not perturb the matching, scoring, or +# count assertions in the other files. +FIXTURE_DOCS = [ + {"_id": 1, "title": "the quick brown fox", "body": "lazy dog"}, + {"_id": 2, "title": "slow green turtle", "body": "quick nap"}, # `quick` in body + {"_id": 3, "title": "a quick quick quick rabbit"}, # repeats `quick` for the top score + {"_id": 4, "title": "$quick literal dollar"}, # leading `$` matched as literal text + {"_id": 5, "title": "mon résumé est prêt"}, # multi-byte token for highlight spans + {"_id": 6, "title": "x"}, # single-character token + {"_id": 7, "title": "σιγμα"}, # lowercase Greek + {"_id": 8, "title": "день"}, # lowercase Cyrillic + {"_id": 9, "title": "\U00010428"}, # Deseret small letter long I (U+10428) + {"_id": 10, "title": "resume"}, # plain ASCII, distinct from doc 5's résumé + {"_id": 11, "title": "caf\u00e9"}, # precomposed é (U+00E9) + {"_id": 12, "title": "\ufb01le"}, # ligature fi (U+FB01) + le + {"_id": 13, "title": "stra\u00dfe"}, # German eszett (U+00DF) + {"_id": 14, "title": "\u0131rmak"}, # Turkish dotless i (U+0131) + {"_id": 15, "title": "a z"}, # ASCII range-edge letters as separate tokens + {"_id": 16, "title": "word joined"}, # two tokens: word, joined + {"_id": 17, "title": "wordjoined"}, # one token + {"_id": 18, "title": "\u00e9fox"}, 
# 4 code points / 5 bytes (é is 2 bytes) +] + + +def create_search_index(collection: Collection, definition: dict[str, Any]) -> None: + """Create a search index from a definition and poll until it is queryable.""" + collection.create_search_index(SearchIndexModel(definition=definition, name=SEARCH_INDEX_NAME)) + deadline = time.monotonic() + INDEX_READY_TIMEOUT_SECONDS + while time.monotonic() < deadline: + indexes = list(collection.list_search_indexes(SEARCH_INDEX_NAME)) + if indexes and indexes[0].get("queryable"): + return + time.sleep(1) + raise RuntimeError( + f"search index {SEARCH_INDEX_NAME!r} did not become queryable within " + f"{INDEX_READY_TIMEOUT_SECONDS}s" + ) + + +def create_dynamic_search_index(collection: Collection) -> None: + """Create a dynamic-mapping search index and poll until it is queryable.""" + create_search_index(collection, {"mappings": {"dynamic": True}}) diff --git a/documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_search.py b/documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_search.py new file mode 100644 index 000000000..dc30c4c75 --- /dev/null +++ b/documentdb_tests/compatibility/tests/core/operator/stages/test_stages_position_search.py @@ -0,0 +1,177 @@ +"""Tests for $search pipeline position constraints and stage combinations.""" + +from __future__ import annotations + +import time + +import pytest +from pymongo.operations import SearchIndexModel + +from documentdb_tests.compatibility.tests.core.operator.stages.utils.stage_test_case import ( + StageTestCase, +) +from documentdb_tests.framework import fixtures +from documentdb_tests.framework.assertions import assertResult +from documentdb_tests.framework.error_codes import ( + FACET_PIPELINE_INVALID_STAGE_ERROR, + NOT_FIRST_STAGE_ERROR, +) +from documentdb_tests.framework.executor import execute_command +from documentdb_tests.framework.parametrize import pytest_params + +pytestmark = pytest.mark.requires(search=True) + 
_POSITION_DOCS = [
    {"_id": 1, "title": "quick brown fox"},
    {"_id": 2, "title": "lazy sleeping dog"},
    {"_id": 3, "title": "green sea turtle"},
]

# Name of the search index built on the position collection, and how long the
# fixture waits for it to report itself queryable.
_POSITION_INDEX_NAME = "default"
_POSITION_INDEX_READY_TIMEOUT_SECONDS = 120


@pytest.fixture(scope="module")
def position_collection(engine_client, worker_id):
    """Yield a module-scoped collection with a ready dynamic search index.

    The collection is shared read-only across the placement cases so the index
    is built and polled once rather than per test. It carries the fixed name
    ``position`` so the $unionWith/$lookup sub-pipeline cases can reference it
    as their source.

    Raises:
        RuntimeError: if the index does not report ``queryable`` before the
            timeout elapses.
    """
    db_name = fixtures.generate_database_name("stages_search_position", worker_id)
    fixtures.cleanup_database(engine_client, db_name)
    # try/finally rather than code after the yield: pytest skips a yield
    # fixture's teardown when setup raises, which would leave the database
    # behind on an index-creation failure or readiness timeout.
    try:
        coll = engine_client[db_name]["position"]
        coll.insert_many(_POSITION_DOCS)
        coll.create_search_index(
            SearchIndexModel(definition={"mappings": {"dynamic": True}}, name=_POSITION_INDEX_NAME)
        )
        deadline = time.monotonic() + _POSITION_INDEX_READY_TIMEOUT_SECONDS
        while time.monotonic() < deadline:
            indexes = list(coll.list_search_indexes(_POSITION_INDEX_NAME))
            if indexes and indexes[0].get("queryable"):
                break
            time.sleep(1)
        else:
            # Loop ended without a break: the deadline passed first.
            raise RuntimeError(
                f"search index {_POSITION_INDEX_NAME!r} did not become queryable within "
                f"{_POSITION_INDEX_READY_TIMEOUT_SECONDS}s"
            )
        yield coll
    finally:
        fixtures.cleanup_database(engine_client, db_name)


# Property [Sub-pipeline Placement]: the first-stage-only rule is enforced per
# pipeline, so $search is allowed as the first stage of a $unionWith or $lookup
# sub-pipeline and may be followed by other stages within that sub-pipeline.
SEARCH_SUBPIPELINE_PLACEMENT_TESTS: list[StageTestCase] = [
    StageTestCase(
        case_id,
        # Every case has the same outer shape: a title $search, an _id-only
        # $project, then the combining stage that carries the sub-pipeline.
        pipeline=[
            {"$search": {"text": {"query": outer_term, "path": "title"}}},
            {"$project": {"_id": 1}},
            combining_stage,
        ],
        expected=expected,
        msg=msg,
    )
    # Each row: (case id, outer search term, combining stage, expected docs, message).
    for case_id, outer_term, combining_stage, expected, msg in [
        (
            "placement_unionwith_subpipeline",
            "quick",
            {
                "$unionWith": {
                    "coll": "position",
                    "pipeline": [
                        {"$search": {"text": {"query": "turtle", "path": "title"}}},
                        {"$project": {"_id": 1}},
                    ],
                }
            },
            [{"_id": 1}, {"_id": 3}],
            "$search should be allowed as the first stage of a $unionWith sub-pipeline, "
            "unioning the sub-pipeline matches with the main-pipeline matches",
        ),
        (
            "placement_lookup_subpipeline",
            "quick",
            {
                "$lookup": {
                    "from": "position",
                    "pipeline": [
                        {"$search": {"text": {"query": "turtle", "path": "title"}}},
                        {"$project": {"_id": 1}},
                    ],
                    "as": "joined",
                }
            },
            [{"_id": 1, "joined": [{"_id": 3}]}],
            "$search should be allowed as the first stage of a $lookup sub-pipeline and "
            "attach the sub-pipeline matches to each joined row",
        ),
        (
            "placement_subpipeline_trailing_stages",
            "dog",
            {
                "$unionWith": {
                    "coll": "position",
                    "pipeline": [
                        {"$search": {"text": {"query": "quick", "path": "title"}}},
                        {"$match": {"_id": {"$gt": 0}}},
                        {"$project": {"_id": 1}},
                    ],
                }
            },
            [{"_id": 2}, {"_id": 1}],
            "$search as the first stage of a sub-pipeline should permit trailing stages "
            "($match, $project) after it within that sub-pipeline",
        ),
    ]
]

# Property [Stage Placement Errors]: $search anywhere other than the first stage
# of its pipeline is rejected with NOT_FIRST_STAGE_ERROR, and $search nested
# inside a $facet stage is rejected with FACET_PIPELINE_INVALID_STAGE_ERROR.
+SEARCH_STAGE_PLACEMENT_ERROR_TESTS: list[StageTestCase] = [ + StageTestCase( + "not_first_after_match", + pipeline=[ + {"$match": {"_id": 1}}, + {"$search": {"text": {"query": "quick", "path": "title"}}}, + ], + error_code=NOT_FIRST_STAGE_ERROR, + msg="$search should be rejected when it follows another stage in the pipeline", + ), + StageTestCase( + "not_first_second_search", + pipeline=[ + {"$search": {"text": {"query": "quick", "path": "title"}}}, + {"$search": {"text": {"query": "quick", "path": "title"}}}, + ], + error_code=NOT_FIRST_STAGE_ERROR, + msg="$search should be rejected when a second $search follows the first stage", + ), + StageTestCase( + "in_facet", + pipeline=[ + {"$facet": {"results": [{"$search": {"text": {"query": "quick", "path": "title"}}}]}} + ], + error_code=FACET_PIPELINE_INVALID_STAGE_ERROR, + msg="$search should be rejected when nested inside a $facet stage", + ), +] + +SEARCH_POSITION_TESTS: list[StageTestCase] = ( + SEARCH_SUBPIPELINE_PLACEMENT_TESTS + SEARCH_STAGE_PLACEMENT_ERROR_TESTS +) + + +@pytest.mark.aggregate +@pytest.mark.parametrize("test_case", pytest_params(SEARCH_POSITION_TESTS)) +def test_search_position(position_collection, test_case: StageTestCase): + """Test $search pipeline position constraints, sub-pipeline combinations, and rejections.""" + result = execute_command( + position_collection, + { + "aggregate": position_collection.name, + "pipeline": test_case.pipeline, + "cursor": {}, + }, + ) + assertResult( + result, + expected=test_case.expected, + error_code=test_case.error_code, + msg=test_case.msg, + ignore_doc_order=True, + ) diff --git a/documentdb_tests/framework/engine_registry.py b/documentdb_tests/framework/engine_registry.py index 774ab7810..1e2bf1664 100644 --- a/documentdb_tests/framework/engine_registry.py +++ b/documentdb_tests/framework/engine_registry.py @@ -102,6 +102,15 @@ def _is_reachable(connection_string: str) -> bool: # replSetInitiate error code when the set is already initiated (e.g. 
a race # between concurrent callers); treated as success. _ALREADY_INITIALIZED = 23 +# createUser error code when the user already exists (idempotent re-runs). +_USER_ALREADY_EXISTS = 51003 + +# The user mongot authenticates as to replicate from a search-enabled mongod. +# Its name and password are a fixed local-dev secret matched by the mongot +# sidecar's config (see dev/mongot.yml and the mongot service in +# dev/compose.yaml); it is not a real credential. +_SEARCH_SYNC_USER = "searchSyncUser" +_SEARCH_SYNC_PASSWORD = "searchSyncPassword" def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: @@ -120,12 +129,14 @@ def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: that already initiated it (AlreadyInitialized) is tolerated. After initiating, it waits up to ``timeout_s`` for a primary to be elected - so callers can write immediately. + so callers can write immediately. A search-enabled mongod additionally has + the searchCoordinator user mongot needs provisioned once it is primary. """ client: MongoClient = MongoClient(connection_string, serverSelectionTimeoutMS=5000) try: try: client.admin.command("replSetGetStatus") + _ensure_search_user(client) # Idempotent; a no-op off a search target. return # Already initiated. except OperationFailure as exc: if exc.code != _NOT_YET_INITIALIZED: @@ -140,6 +151,7 @@ def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: deadline = time.monotonic() + timeout_s while time.monotonic() < deadline: if client.admin.command("hello").get("isWritablePrimary"): + _ensure_search_user(client) return time.sleep(0.5) raise TimeoutError( @@ -149,6 +161,29 @@ def ensure_initiated(connection_string: str, timeout_s: float = 30.0) -> None: client.close() +def _ensure_search_user(client: MongoClient) -> None: + """Provision the searchCoordinator user a search-enabled mongod needs. + + A search target points at a mongot sidecar (a non-empty ``mongotHost``). 
+ mongot replicates from this mongod as an authenticated sync source, so it + needs a user with the searchCoordinator role to log in as. This creates that + user (idempotently) once the server is primary. It is a no-op on a target + without a mongot sidecar. + """ + if not client.admin.command({"getParameter": 1, "mongotHost": 1}).get("mongotHost"): + return # Not a search target. + try: + client.admin.command( + "createUser", + _SEARCH_SYNC_USER, + pwd=_SEARCH_SYNC_PASSWORD, + roles=[{"role": "searchCoordinator", "db": "admin"}], + ) + except OperationFailure as exc: + if exc.code != _USER_ALREADY_EXISTS: + raise + + def live_targets(compose_path: Path = COMPOSE_PATH) -> list[Target]: """Return the declared targets that are currently reachable.""" return [t for t in load_targets(compose_path) if _is_reachable(t.connection_string)] diff --git a/documentdb_tests/framework/preconditions.py b/documentdb_tests/framework/preconditions.py index 8c3bf2534..75cf1c0e1 100644 --- a/documentdb_tests/framework/preconditions.py +++ b/documentdb_tests/framework/preconditions.py @@ -56,12 +56,17 @@ "unforced_compact": "compact succeeds without force", "reindex": "reIndex is permitted", "local_rename": "renaming into the unreplicated local database is permitted", + "search": "search and vector search surfaces are available", "replication": "replication commands are available (applyOps, oplog access)", } # The capabilities each (engine, topology) target has. To add an engine or # topology, add an entry here; every test then gates correctly. _CAPABILITIES_BY_PROFILE: dict[tuple[str, str], frozenset[str]] = { + # A replica set, wired to a mongot search sidecar so it also serves the + # search surfaces (see dev/compose.yaml). mongot is transparent to all other + # behavior, so this is a replica set that additionally has the search + # capability, not a distinct topology. 
("mongodb", "replica_set"): frozenset( { "change_streams", @@ -71,6 +76,7 @@ "cluster_time", "cluster_read_concern", "quorum_write_concern", + "search", "replication", } ), diff --git a/documentdb_tests/framework/test_structure_validator.py b/documentdb_tests/framework/test_structure_validator.py index 57f811f08..dd40a3f75 100644 --- a/documentdb_tests/framework/test_structure_validator.py +++ b/documentdb_tests/framework/test_structure_validator.py @@ -15,7 +15,7 @@ def validate_python_files_in_tests(tests_dir: Path) -> list[str]: allowed_folders = {"utils", "fixtures", "__pycache__"} for py_file in tests_dir.rglob("*.py"): - if py_file.name == "__init__.py": + if py_file.name in ("__init__.py", "conftest.py"): continue if any(folder in py_file.parts for folder in allowed_folders): continue diff --git a/documentdb_tests/pytest.ini b/documentdb_tests/pytest.ini index b2cd6c4a1..0f577b18f 100644 --- a/documentdb_tests/pytest.ini +++ b/documentdb_tests/pytest.ini @@ -50,6 +50,7 @@ markers = # Timeout for tests (seconds) timeout = 300 +timeout_method = thread # Parallel execution settings # Use with: pytest -n auto