@@ -1205,6 +1205,194 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) {
 }
 }
 
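+// TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers verifies that a
+// newly started alertmanager instance fetches existing silence state from its peers
+// in the ring, rather than waiting for ongoing replication to deliver it.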
+func TestAlertmanager_StateReplicationWithSharding_InitialSyncFromPeers(t *testing.T) {
+	tc := []struct {
+		name              string
+		replicationFactor int
+	}{
+		{
+			name:              "RF = 2",
+			replicationFactor: 2,
+		},
+		{
+			name:              "RF = 3",
+			replicationFactor: 3,
+		},
+	}
+
+	for _, tt := range tc {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			ringStore := consul.NewInMemoryClient(ring.GetCodec())
+			mockStore := prepareInMemoryAlertStore()
+			clientPool := newPassthroughAlertmanagerClientPool()
+			externalURL := flagext.URLValue{}
+			err := externalURL.Set("http://localhost:8080/alertmanager")
+			require.NoError(t, err)
+
+			var instances []*MultitenantAlertmanager
+			var instanceIDs []string
+			registries := util.NewUserRegistries()
+
+			// Create only two users - no need for more for these test cases.
+			for i := 1; i <= 2; i++ {
+				u := fmt.Sprintf("u-%d", i)
+				require.NoError(t, mockStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
+					User:      u,
+					RawConfig: simpleConfigOne,
+					Templates: []*alertspb.TemplateDesc{},
+				}))
+			}
+
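+			// createInstance builds, registers and starts a new sharded alertmanager
+			// instance, waits for the ring to settle, and then syncs the tenant
+			// configurations so the instance picks up the users it owns.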
+			createInstance := func(i int) *MultitenantAlertmanager {
+				instanceID := fmt.Sprintf("alertmanager-%d", i)
+
+				amConfig := mockAlertmanagerConfig(t)
+				amConfig.ExternalURL = externalURL
+				amConfig.ShardingRing.ReplicationFactor = tt.replicationFactor
+				amConfig.ShardingRing.InstanceID = instanceID
+				amConfig.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i)
+
+				// Do not check for ring topology changes or poll on an interval in this test (we sync the alertmanagers explicitly).
+				amConfig.PollInterval = time.Hour
+				amConfig.ShardingRing.RingCheckPeriod = time.Hour
+
+				amConfig.ShardingEnabled = true
+
+				reg := prometheus.NewPedanticRegistry()
+				am, err := createMultitenantAlertmanager(amConfig, nil, nil, mockStore, ringStore, log.NewNopLogger(), reg)
+				require.NoError(t, err)
+
+				clientPool.servers[amConfig.ShardingRing.InstanceAddr+":0"] = am
+				am.alertmanagerClientsPool = clientPool
+
+				require.NoError(t, services.StartAndAwaitRunning(ctx, am))
+				t.Cleanup(func() {
+					services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
+				})
+
+				instances = append(instances, am)
+				instanceIDs = append(instanceIDs, instanceID)
+				registries.AddUserRegistry(instanceID, reg)
+
+				// Make sure the ring is settled.
+				{
+					ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+					defer cancel()
+
+					// The alertmanager is ready to be tested once all instances are ACTIVE and the ring is settled.
+					for _, am := range instances {
+						for _, id := range instanceIDs {
+							require.NoError(t, ring.WaitInstanceState(ctx, am.ring, id, ring.ACTIVE))
+						}
+					}
+				}
+
+				// Now that the ring has settled, sync the configs with the instances.
+				require.NoError(t, am.loadAndSyncConfigs(ctx, reasonRingChange))
+
+				return am
+			}
+
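+			// writeSilence creates a silence for the given user via the API of the
+			// given instance and asserts that the write succeeded.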
+			writeSilence := func(i *MultitenantAlertmanager, userID string) {
+				silence := types.Silence{
+					Matchers: labels.Matchers{
+						{Name: "instance", Value: "prometheus-one"},
+					},
+					Comment:  "Created for a test case.",
+					StartsAt: time.Now(),
+					EndsAt:   time.Now().Add(time.Hour),
+				}
+				data, err := json.Marshal(silence)
+				require.NoError(t, err)
+
+				req := httptest.NewRequest(http.MethodPost, externalURL.String()+"/api/v2/silences", bytes.NewReader(data))
+				req.Header.Set("content-type", "application/json")
+				reqCtx := user.InjectOrgID(req.Context(), userID)
+				{
+					w := httptest.NewRecorder()
+					i.ServeHTTP(w, req.WithContext(reqCtx))
+
+					resp := w.Result()
+					body, _ := ioutil.ReadAll(resp.Body)
+					assert.Equal(t, http.StatusOK, w.Code)
+					require.Regexp(t, regexp.MustCompile(`{"silenceID":".+"}`), string(body))
+				}
+			}
+
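+			// checkSilence lists the silences for the given user on the given
+			// instance and asserts the test silence is present.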
+			checkSilence := func(i *MultitenantAlertmanager, userID string) {
+				req := httptest.NewRequest(http.MethodGet, externalURL.String()+"/api/v2/silences", nil)
+				req.Header.Set("content-type", "application/json")
+				reqCtx := user.InjectOrgID(req.Context(), userID)
+				{
+					w := httptest.NewRecorder()
+					i.ServeHTTP(w, req.WithContext(reqCtx))
+
+					resp := w.Result()
+					body, _ := ioutil.ReadAll(resp.Body)
+					assert.Equal(t, http.StatusOK, w.Code)
+					require.Regexp(t, regexp.MustCompile(`"comment":"Created for a test case."`), string(body))
+				}
+			}
+
+			// 1. Create the first instance and load the user configurations.
+			i1 := createInstance(1)
+
+			// 2. Create a silence in the first alertmanager instance and check we can read it.
+			writeSilence(i1, "u-1")
+			// 2.a. Check the silence was created (paranoia).
+			checkSilence(i1, "u-1")
+			// 2.b. Check the relevant metrics were updated.
+			{
+				metrics := registries.BuildMetricFamiliesPerUser()
+				assert.Equal(t, float64(1), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
+				assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
+				assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
+			}
+
+			// 3. Create a second instance. This should attempt to fetch the silence from the first.
+			i2 := createInstance(2)
+
+			// 3.a. Check the silence was fetched from the first instance successfully.
+			checkSilence(i2, "u-1")
+
+			// 3.b. Check the metrics: we should see the additional silence without any replication activity.
+			{
+				metrics := registries.BuildMetricFamiliesPerUser()
+				assert.Equal(t, float64(2), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
+				assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
+				assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
+			}
+
+			if tt.replicationFactor >= 3 {
+				// 4. When testing RF = 3, create a third instance, to test obtaining state from multiple places.
+				i3 := createInstance(3)
+
+				// 4.a. Check the silence was fetched from one or both of the other instances successfully.
+				checkSilence(i3, "u-1")
+
+				// 4.b. Check the metrics one more time. We should have three replicas of the silence.
+				{
+					metrics := registries.BuildMetricFamiliesPerUser()
+					assert.Equal(t, float64(3), metrics.GetSumOfGauges("cortex_alertmanager_silences"))
+					assert.Equal(t, float64(1), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_total"))
+					assert.Equal(t, float64(0), metrics.GetSumOfCounters("cortex_alertmanager_state_replication_failed_total"))
+				}
+			}
+		})
+	}
+}
+
 // prepareInMemoryAlertStore builds and returns an in-memory alert store.
 func prepareInMemoryAlertStore() alertstore.AlertStore {
 	return bucketclient.NewBucketAlertStore(objstore.NewInMemBucket(), nil, log.NewNopLogger())
@@ -1251,6 +1439,12 @@ func (am *passthroughAlertmanagerClient) UpdateState(ctx context.Context, in *cl
 	return am.server.UpdateState(ctx, in)
 }
 
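+// ReadState passes the request straight through to the wrapped in-process server,
+// standing in for the gRPC call a real replica would make during initial sync.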
+func (am *passthroughAlertmanagerClient) ReadState(ctx context.Context, in *alertmanagerpb.ReadStateRequest, opts ...grpc.CallOption) (*alertmanagerpb.ReadStateResponse, error) {
+	return am.server.ReadState(ctx, in)
+}
+
 func (am *passthroughAlertmanagerClient) HandleRequest(context.Context, *httpgrpc.HTTPRequest, ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
 	return nil, fmt.Errorf("unexpected call to HandleRequest")
 }