From 366ffc3228381b16f017cf6d45999d75c4a9d86a Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Thu, 11 Jun 2026 09:02:00 +0900 Subject: [PATCH] fix(querier): wait for store-gateway ACTIVE in querier ring view in store-gateway limits integration tests TestQuerierWithStoreGatewayDataBytesLimits intermittently fails with HTTP 500 instead of the expected 422 (#7606, arm64 CI). The decoded (gzipped) 500 response body from the failing run is the querier-local ring error: expanding series: failed to get store-gateway replication set owning the block <block ID>: at least 1 healthy replica required, could only find 0 - unhealthy instances: 172.18.0.8:9095 i.e. the ring lookup failed before any store-gateway RPC was made. The store-gateway registers in the ring as JOINING (already owning tokens) and switches to ACTIVE only after its initial blocks sync, while the querier's BlocksRead ring operation only selects ACTIVE instances and its consul watch is rate-limited (1 rps by default). So the existing waits (ring tokens registered, blocks loaded on the store-gateway) can all pass while the querier's view of the store-gateway ring still says JOINING, and the first query 500s. The hypothesis originally filed on the issue - that the bytes-limit error loses its 422/ResourceExhausted coding in the vendored Thanos refetch ("series size exceeded expected size; refetching") path - was falsified during investigation: those log lines belong to an earlier, passing test in the same CI job; the failing query never reached store-gateway limiter code at all; and all 10 vendored limiter consumption sites (including the refetch recursion) re-code the error as ResourceExhausted, which the querier maps to a 422 LimitError (#5286). Fix the race in the tests by waiting until the querier sees the store-gateway ACTIVE in its store-gateway ring view before querying (same idiom as backward_compatibility_test.go, #5975). 
Apply the same wait to the sibling TestQuerierWithBlocksStorageLimits, which has the identical vulnerable shape (every query expected to hit a 422 limit against a freshly started store-gateway). Same root cause as #7605, which is fixed separately for TestQuerierWithBlocksStorageOnMissingBlocksFromStorage in a non-overlapping PR. Co-Authored-By: Claude Fable 5 Signed-off-by: Sandy Chen --- CHANGELOG.md | 1 + integration/querier_test.go | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0756d48d25..9e7cf316a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ * [BUGFIX] Query Frontend: Fix native histogram responses not being handled correctly in `minTime()` sort ordering for split_by_interval merge. #7555 * [BUGFIX] Distributor: Release the push worker pool goroutines on shutdown by stopping the async executor during the stopping phase when `-distributor.num-push-workers` is set. #7602 * [BUGFIX] Querier: Fix unbounded resource leak in the bucket-scan blocks finder (used when the bucket index is disabled). Per-tenant metadata fetchers, their Prometheus registries, and on-disk meta caches are now evicted once a tenant is no longer active, instead of being retained for the lifetime of the process. #7573 +* [BUGFIX] Querier: Fix flake in integration tests TestQuerierWithStoreGatewayDataBytesLimits and TestQuerierWithBlocksStorageLimits by waiting for the querier to see the store-gateway ACTIVE in the ring before querying. 
#7614 ## 1.21.0 2026-04-24 diff --git a/integration/querier_test.go b/integration/querier_test.go index 2bba87703f..f5683414b6 100644 --- a/integration/querier_test.go +++ b/integration/querier_test.go @@ -474,6 +474,14 @@ func TestQuerierWithBlocksStorageLimits(t *testing.T) { require.NoError(t, storeGateway.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total")) require.NoError(t, storeGateway.WaitSumMetrics(e2e.Equals(1), "cortex_bucket_store_blocks_loaded")) + // Wait until the store-gateway is ACTIVE in the querier's view of the store-gateway ring. The + // store-gateway registers JOINING (with tokens) and switches to ACTIVE only after the initial + // blocks sync, so the waits above can pass while the querier would still fail queries with + // "at least 1 healthy replica required, could only find 0" (HTTP 500) instead of the expected 422. + require.NoError(t, querier.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_ring_members"}, e2e.WithLabelMatchers( + labels.MustNewMatcher(labels.MatchEqual, "name", "store-gateway-client"), + labels.MustNewMatcher(labels.MatchEqual, "state", "ACTIVE")))) + // Query back the series. c, err = e2ecortex.NewClient("", querier.HTTPEndpoint(), "", "", "user-1") require.NoError(t, err) @@ -571,6 +579,14 @@ func TestQuerierWithStoreGatewayDataBytesLimits(t *testing.T) { require.NoError(t, storeGateway.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total")) require.NoError(t, storeGateway.WaitSumMetrics(e2e.Equals(1), "cortex_bucket_store_blocks_loaded")) + // Wait until the store-gateway is ACTIVE in the querier's view of the store-gateway ring. The + // store-gateway registers JOINING (with tokens) and switches to ACTIVE only after the initial + // blocks sync, so the waits above can pass while the querier would still fail queries with + // "at least 1 healthy replica required, could only find 0" (HTTP 500) instead of the expected 422. 
+ require.NoError(t, querier.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_ring_members"}, e2e.WithLabelMatchers( + labels.MustNewMatcher(labels.MatchEqual, "name", "store-gateway-client"), + labels.MustNewMatcher(labels.MatchEqual, "state", "ACTIVE")))) + // Query back the series. c, err = e2ecortex.NewClient("", querier.HTTPEndpoint(), "", "", "user-1") require.NoError(t, err)