From 9d770477916c404c55ba24d78453436ecfcc7395 Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Thu, 11 Jun 2026 09:06:52 +0900 Subject: [PATCH] fix(compactor): de-flake sharded-ownership compactor tests TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning and its partition twin seed one shared bucket.ClientMock with 100 tenants (~3,208/~3,707 registered testify expectations). Every bucket operation from both compactors and both blocks cleaners holds the mock's single mutex through an O(expectations) reflective scan (~38% of CPU in two independent profiles), so the first compaction cycle alone takes ~20s under -race on an idle machine and 6-7x longer on starved CI runners. Each per-tenant meta sync additionally runs under a deadline equal to -compactor.compaction-interval (5s in these tests, pkg/compactor/compactor.go:1085), so starved syncs turn whole runs into failures and CompactionRunsCompleted can stay pinned at 0 longer than ANY poll budget: in CI run 26632776611 the partition test failed its 60s poll AND the non-partition sibling burned its full 120s poll, in both attempts. The ring itself was ACTIVE and stable before the poll started (starting() waits for both), correcting the issue's original ring-convergence framing. Fix (test-only): - Cut numUsers from 100 to 20 in both tests (~25x less quadratic mock-matching work; the per-user ownership assertions are tenant-count independent). - Align the partition test's run-completion poll budget with its non-partition sibling (60s -> 120s) as a seatbelt. - Set WaitActiveInstanceTimeout=30s in both tests (same hardening as #7503): prepareConfig's 5s ACTIVE ceiling is within reach of the observed CI starvation envelope. The shuffle-sharding twins are deliberately untouched: their live arm64 failure in the same run is an ownership-exclusivity violation (compactor_test.go:1318), a different failure class that needs its own root-causing. Before/after on Apple Silicon, -race: 21.00s/24.01s -> 3.57s/3.55s. Fixes #7607 Co-Authored-By: Claude Fable 5 Signed-off-by: Sandy Chen --- CHANGELOG.md | 1 + pkg/compactor/compactor_paritioning_test.go | 5 +++-- pkg/compactor/compactor_test.go | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0756d48d25..b14be0cbb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ * [BUGFIX] Query Frontend: Fix native histogram responses not being handled correctly in `minTime()` sort ordering for split_by_interval merge. #7555 * [BUGFIX] Distributor: Release the push worker pool goroutines on shutdown by stopping the async executor during the stopping phase when `-distributor.num-push-workers` is set. #7602 * [BUGFIX] Querier: Fix unbounded resource leak in the bucket-scan blocks finder (used when the bucket index is disabled). Per-tenant metadata fetchers, their Prometheus registries, and on-disk meta caches are now evicted once a tenant is no longer active, instead of being retained for the lifetime of the process. #7573 +* [BUGFIX] Compactor: Fix flake in `TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning` and its partition twin: shrink the 100-tenant mock-bucket fixture, whose quadratic testify-mock matching could starve the first compaction run past the poll budget on loaded CI runners, and align the partition test's run-completion poll timeout with its non-partition sibling. #7617 ## 1.21.0 2026-04-24 diff --git a/pkg/compactor/compactor_paritioning_test.go b/pkg/compactor/compactor_paritioning_test.go index 0002a131b5..4398a1da36 100644 --- a/pkg/compactor/compactor_paritioning_test.go +++ b/pkg/compactor/compactor_paritioning_test.go @@ -1142,7 +1142,7 @@ func TestPartitionCompactor_ShouldCompactAllUsersOnShardingEnabledButOnlyOneInst func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning(t *testing.T) { - numUsers := 100 + numUsers := 20 // Setup user IDs userIDs := make([]string, 0, numUsers) @@ -1192,6 +1192,7 @@ func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEn cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) cfg.ShardingRing.WaitStabilityMinDuration = time.Second cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.WaitActiveInstanceTimeout = 30 * time.Second // Give the ring ACTIVE wait headroom on slow CI (#7503). cfg.ShardingRing.KVStore.Mock = kvstore c, _, tsdbPlanner, l, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) @@ -1214,7 +1215,7 @@ func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEn // Wait until a run has been completed on each compactor for _, c := range compactors { - cortex_testutil.Poll(t, 60*time.Second, true, func() any { + cortex_testutil.Poll(t, 120*time.Second, true, func() any { return prom_testutil.ToFloat64(c.CompactionRunsCompleted) >= 1 }) } diff --git a/pkg/compactor/compactor_test.go b/pkg/compactor/compactor_test.go index d1d4460383..15b6c1adf1 100644 --- a/pkg/compactor/compactor_test.go +++ b/pkg/compactor/compactor_test.go @@ -1091,7 +1091,7 @@ func TestCompactor_ShouldCompactAllUsersOnShardingEnabledButOnlyOneInstanceRunni func TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning(t *testing.T) { t.Parallel() - numUsers := 100 + numUsers := 20 // Setup user IDs userIDs := make([]string, 0, numUsers) @@ -1139,6 +1139,7 @@ func TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndM cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) cfg.ShardingRing.WaitStabilityMinDuration = time.Second cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.WaitActiveInstanceTimeout = 30 * time.Second // Give the ring ACTIVE wait headroom on slow CI (#7503). cfg.ShardingRing.KVStore.Mock = kvstore c, _, tsdbPlanner, l, _ := prepare(t, cfg, bucketClient, nil)