
Commit cf724d7

Authored by Steve Simpson
Fix TestAlertmanager_StateReplicationWithSharding. (#4078)
* Fix TestAlertmanager_StateReplicationWithSharding.

  The test had two issues:

  1. The most common issue was that replicating the silence sometimes took a small amount of time, as it happens asynchronously. Replacing Equal with Eventually when checking the replication metrics is enough to make the test reliable.

  2. The second issue was rarer: when a particular shard does not have any users assigned to it, _and_ the test randomly picks _that_ shard to write the silence to, the test would panic (in rand.Intn). Because the alertmanagersMtx was still held, the test then hung while trying to shut down the alertmanagers.

  Signed-off-by: Steve Simpson <[email protected]>

* Review comments.

  Signed-off-by: Steve Simpson <[email protected]>
1 parent 03c092f commit cf724d7
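
The core of the first fix is swapping a one-shot Equal assertion for testify's polling Eventually when checking metrics that are updated by asynchronous replication. Below is a minimal, self-contained sketch of that pattern only; it is not the test code itself, and the atomic counter is a hypothetical stand-in for the cortex_alertmanager_silences gauge.

package alertmanager_test

import (
	"sync/atomic"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

// TestEventuallyPattern shows the Equal -> Eventually change in isolation.
// The counter stands in for a gauge that only reaches the expected value
// once asynchronous replication has completed.
func TestEventuallyPattern(t *testing.T) {
	const replicationFactor = 3
	var silences atomic.Int64

	// Simulate replication finishing a short time after the silence is written.
	go func() {
		time.Sleep(200 * time.Millisecond)
		silences.Store(replicationFactor)
	}()

	// A plain assert.Equal here would race with the goroutine above and fail
	// intermittently; Eventually polls until the condition holds or the 5s
	// deadline expires.
	assert.Eventually(t, func() bool {
		return silences.Load() == int64(replicationFactor)
	}, 5*time.Second, 100*time.Millisecond)
}

The 5s wait and 100ms tick match the values used in the diff below.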

File tree

1 file changed (+30, -7 lines)


pkg/alertmanager/multitenant_test.go

+30 -7
@@ -1049,8 +1049,6 @@ func TestAlertmanager_ReplicasPosition(t *testing.T) {
 }
 
 func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
-	t.Skip("Flaky test under investigation")
-
 	tc := []struct {
 		name              string
 		replicationFactor int
@@ -1161,10 +1159,22 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
 
 			// With sharding enabled, we propagate messages over gRPC instead of using a gossip over TCP.
 			// 1. First, get a random multitenant instance
-			multitenantAM := instances[rand.Intn(len(instances))]
+			// We must pick an instance which actually has a user configured.
+			var multitenantAM *MultitenantAlertmanager
+			for {
+				multitenantAM = instances[rand.Intn(len(instances))]
+
+				multitenantAM.alertmanagersMtx.Lock()
+				amount := len(multitenantAM.alertmanagers)
+				multitenantAM.alertmanagersMtx.Unlock()
+				if amount > 0 {
+					break
+				}
+			}
 
 			// 2. Then, get a random user that exists in that particular alertmanager instance.
 			multitenantAM.alertmanagersMtx.Lock()
+			require.Greater(t, len(multitenantAM.alertmanagers), 0)
 			k := rand.Intn(len(multitenantAM.alertmanagers))
 			var userID string
 			for u := range multitenantAM.alertmanagers {
@@ -1202,18 +1212,26 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
 				require.Regexp(t, regexp.MustCompile(`{"silenceID":".+"}`), string(body))
 			}
 
-			metrics := registries.BuildMetricFamiliesPerUser()
-
 			// If sharding is not enabled, we never propagate any messages amongst replicas in this way, and we can stop here.
 			if !tt.withSharding {
+				metrics := registries.BuildMetricFamiliesPerUser()
+
 				assert.Equal(t, float64(1), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
 				assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
 				assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
 				return
 			}
 
+			var metrics util.MetricFamiliesPerUser
+
 			// 5. Then, make sure it is propagated successfully.
-			assert.Equal(t, float64(tt.replicationFactor), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
+			// Replication is asynchronous, so we may have to wait a short period of time.
+			assert.Eventually(t, func() bool {
+				metrics = registries.BuildMetricFamiliesPerUser()
+				return (float64(tt.replicationFactor) == metrics.GetSumOfGauges("cortex_alertmanager_silences") &&
+					float64(tt.replicationFactor) == metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
+			}, 5*time.Second, 100*time.Millisecond)
+
 			assert.Equal(t, float64(tt.replicationFactor), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
 			assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
 
@@ -1224,7 +1242,12 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
 			// For RF=3 1 -> 2 -> 4 = Total 6 merges
 			nFanOut := tt.replicationFactor - 1
 			nMerges := nFanOut + (nFanOut * nFanOut)
-			assert.Equal(t, float64(nMerges), metrics.GetSumOfCounters("cortex_alertmanager_partial_state_merges_total"))
+
+			assert.Eventually(t, func() bool {
+				metrics = registries.BuildMetricFamiliesPerUser()
+				return float64(nMerges) == metrics.GetSumOfCounters("cortex_alertmanager_partial_state_merges_total")
+			}, 5*time.Second, 100*time.Millisecond)
+
 			assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_partial_state_merges_failed_total"))
 		})
 	}
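
For the second issue, the hang (rather than a clean failure) comes from rand.Intn panicking while alertmanagersMtx is held: math/rand's Intn panics when its argument is not positive, and a panic does not release a mutex that was locked without a deferred unlock. The following is a small standalone sketch of that mechanism, with hypothetical names, not the test code itself.

package main

import (
	"fmt"
	"math/rand"
	"sync"
)

func main() {
	var mtx sync.Mutex
	alertmanagers := map[string]struct{}{} // an empty shard: no per-tenant alertmanagers

	func() {
		defer func() { recover() }() // contain the panic, roughly as the test framework does

		mtx.Lock()
		_ = rand.Intn(len(alertmanagers)) // panics: Intn requires n > 0
		mtx.Unlock()                      // never reached, so the mutex stays locked
	}()

	// Any later Lock (e.g. during alertmanager shutdown) would now block forever.
	// TryLock makes that visible without hanging this example.
	fmt.Println("mutex still free:", mtx.TryLock()) // prints false
}

The fix in the diff sidesteps this by only picking an instance that has at least one user configured before calling rand.Intn under the lock.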

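As a side note on the final assertion's expected value: the last hunk keeps the existing arithmetic nMerges = nFanOut + nFanOut^2 with nFanOut = replicationFactor - 1, and only wraps the check in Eventually. A quick check of the numbers (the replication factors shown here are illustrative, not necessarily the exact test cases):

package main

import "fmt"

// Recomputes the expected partial-merge count from the formula in the diff:
// nFanOut = RF - 1, nMerges = nFanOut + nFanOut^2.
func main() {
	for _, rf := range []int{2, 3} {
		nFanOut := rf - 1
		nMerges := nFanOut + (nFanOut * nFanOut)
		fmt.Printf("RF=%d: nFanOut=%d nMerges=%d\n", rf, nFanOut, nMerges)
	}
	// RF=3 gives 2 + 4 = 6, matching the "1 -> 2 -> 4 = Total 6 merges" comment.
}
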
0 commit comments