@@ -34,6 +34,53 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1
34
34
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000
35
35
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000
36
36
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000
37
+ # HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
38
+ # TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
39
+ cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
40
+ cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
41
+ cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
42
+ # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
43
+ # TYPE cortex_prometheus_notifications_dropped_total counter
44
+ cortex_prometheus_notifications_dropped_total{user="user1"} 1
45
+ cortex_prometheus_notifications_dropped_total{user="user2"} 10
46
+ cortex_prometheus_notifications_dropped_total{user="user3"} 100
47
+ # HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
48
+ # TYPE cortex_prometheus_notifications_errors_total counter
49
+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
50
+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
51
+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
52
+ # HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
53
+ # TYPE cortex_prometheus_notifications_latency_seconds summary
54
+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
55
+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
56
+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
57
+ cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
58
+ cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
59
+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
60
+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
61
+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
62
+ cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
63
+ cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
64
+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
65
+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
66
+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
67
+ cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
68
+ cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
69
+ # HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
70
+ # TYPE cortex_prometheus_notifications_queue_capacity gauge
71
+ cortex_prometheus_notifications_queue_capacity{user="user1"} 1
72
+ cortex_prometheus_notifications_queue_capacity{user="user2"} 10
73
+ cortex_prometheus_notifications_queue_capacity{user="user3"} 100
74
+ # HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
75
+ # TYPE cortex_prometheus_notifications_queue_length gauge
76
+ cortex_prometheus_notifications_queue_length{user="user1"} 1
77
+ cortex_prometheus_notifications_queue_length{user="user2"} 10
78
+ cortex_prometheus_notifications_queue_length{user="user3"} 100
79
+ # HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
80
+ # TYPE cortex_prometheus_notifications_sent_total counter
81
+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
82
+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
83
+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
37
84
# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
38
85
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
39
86
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -153,6 +200,53 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) {
153
200
cortex_prometheus_last_evaluation_samples{user="user1"} 2000
154
201
cortex_prometheus_last_evaluation_samples{user="user2"} 20000
155
202
cortex_prometheus_last_evaluation_samples{user="user3"} 200000
203
+ # HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
204
+ # TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
205
+ cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
206
+ cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
207
+ cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
208
+ # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
209
+ # TYPE cortex_prometheus_notifications_dropped_total counter
210
+ cortex_prometheus_notifications_dropped_total{user="user1"} 1
211
+ cortex_prometheus_notifications_dropped_total{user="user2"} 10
212
+ cortex_prometheus_notifications_dropped_total{user="user3"} 100
213
+ # HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
214
+ # TYPE cortex_prometheus_notifications_errors_total counter
215
+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
216
+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
217
+ cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
218
+ # HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
219
+ # TYPE cortex_prometheus_notifications_latency_seconds summary
220
+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
221
+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
222
+ cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
223
+ cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
224
+ cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
225
+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
226
+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
227
+ cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
228
+ cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
229
+ cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
230
+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
231
+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
232
+ cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
233
+ cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
234
+ cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
235
+ # HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
236
+ # TYPE cortex_prometheus_notifications_queue_capacity gauge
237
+ cortex_prometheus_notifications_queue_capacity{user="user1"} 1
238
+ cortex_prometheus_notifications_queue_capacity{user="user2"} 10
239
+ cortex_prometheus_notifications_queue_capacity{user="user3"} 100
240
+ # HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
241
+ # TYPE cortex_prometheus_notifications_queue_length gauge
242
+ cortex_prometheus_notifications_queue_length{user="user1"} 1
243
+ cortex_prometheus_notifications_queue_length{user="user2"} 10
244
+ cortex_prometheus_notifications_queue_length{user="user3"} 100
245
+ # HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
246
+ # TYPE cortex_prometheus_notifications_sent_total counter
247
+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
248
+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
249
+ cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
156
250
# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
157
251
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
158
252
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -261,22 +355,37 @@ func populateManager(base float64) *prometheus.Registry {
261
355
metrics .groupLastEvalSamples .WithLabelValues ("group_one" ).Add (base * 1000 )
262
356
metrics .groupLastEvalSamples .WithLabelValues ("group_two" ).Add (base * 1000 )
263
357
358
+ metrics .notificationsLatency .WithLabelValues ("alertmanager_1" ).Observe (base )
359
+ metrics .notificationsErrors .WithLabelValues ("alertmanager_1" ).Add (base )
360
+ metrics .notificationsSent .WithLabelValues ("alertmanager_1" ).Add (base )
361
+ metrics .notificationsDropped .Add (base )
362
+ metrics .notificationsQueueLength .Set (base )
363
+ metrics .notificationsQueueCapacity .Set (base )
364
+ metrics .notificationsAlertmanagersDiscovered .Set (base )
264
365
return r
265
366
}
266
367
267
368
// Copied from github.com/prometheus/prometheus/rules/manager.go
369
+ // and github.com/prometheus/prometheus/notifier/notifier.go
268
370
type groupMetrics struct {
269
- evalDuration prometheus.Summary
270
- iterationDuration prometheus.Summary
271
- iterationsMissed * prometheus.CounterVec
272
- iterationsScheduled * prometheus.CounterVec
273
- evalTotal * prometheus.CounterVec
274
- evalFailures * prometheus.CounterVec
275
- groupInterval * prometheus.GaugeVec
276
- groupLastEvalTime * prometheus.GaugeVec
277
- groupLastDuration * prometheus.GaugeVec
278
- groupRules * prometheus.GaugeVec
279
- groupLastEvalSamples * prometheus.GaugeVec
371
+ evalDuration prometheus.Summary
372
+ iterationDuration prometheus.Summary
373
+ iterationsMissed * prometheus.CounterVec
374
+ iterationsScheduled * prometheus.CounterVec
375
+ evalTotal * prometheus.CounterVec
376
+ evalFailures * prometheus.CounterVec
377
+ groupInterval * prometheus.GaugeVec
378
+ groupLastEvalTime * prometheus.GaugeVec
379
+ groupLastDuration * prometheus.GaugeVec
380
+ groupRules * prometheus.GaugeVec
381
+ groupLastEvalSamples * prometheus.GaugeVec
382
+ notificationsLatency * prometheus.SummaryVec
383
+ notificationsErrors * prometheus.CounterVec
384
+ notificationsSent * prometheus.CounterVec
385
+ notificationsDropped prometheus.Counter
386
+ notificationsQueueLength prometheus.Gauge
387
+ notificationsQueueCapacity prometheus.Gauge
388
+ notificationsAlertmanagersDiscovered prometheus.Gauge
280
389
}
281
390
282
391
func newGroupMetrics (r prometheus.Registerer ) * groupMetrics {
@@ -355,8 +464,53 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
355
464
},
356
465
[]string {"rule_group" },
357
466
),
467
+ notificationsLatency : promauto .With (r ).NewSummaryVec (
468
+ prometheus.SummaryOpts {
469
+ Name : "prometheus_notifications_latency_seconds" ,
470
+ Help : "Latency quantiles for sending alert notifications." ,
471
+ Objectives : map [float64 ]float64 {0.5 : 0.05 , 0.9 : 0.01 , 0.99 : 0.001 },
472
+ },
473
+ []string {"alertmanager" },
474
+ ),
475
+ notificationsErrors : promauto .With (r ).NewCounterVec (
476
+ prometheus.CounterOpts {
477
+ Name : "prometheus_notifications_errors_total" ,
478
+ Help : "Total number of errors sending alert notifications." ,
479
+ },
480
+ []string {"alertmanager" },
481
+ ),
482
+ notificationsSent : promauto .With (r ).NewCounterVec (
483
+ prometheus.CounterOpts {
484
+ Name : "prometheus_notifications_sent_total" ,
485
+ Help : "Total number of alerts sent." ,
486
+ },
487
+ []string {"alertmanager" },
488
+ ),
489
+ notificationsDropped : promauto .With (r ).NewCounter (
490
+ prometheus.CounterOpts {
491
+ Name : "prometheus_notifications_dropped_total" ,
492
+ Help : "Total number of alerts dropped due to errors when sending to Alertmanager." ,
493
+ },
494
+ ),
495
+ notificationsQueueLength : promauto .With (r ).NewGauge (
496
+ prometheus.GaugeOpts {
497
+ Name : "prometheus_notifications_queue_length" ,
498
+ Help : "The number of alert notifications in the queue." ,
499
+ },
500
+ ),
501
+ notificationsQueueCapacity : promauto .With (r ).NewGauge (
502
+ prometheus.GaugeOpts {
503
+ Name : "prometheus_notifications_queue_capacity" ,
504
+ Help : "The capacity of the alert notifications queue." ,
505
+ },
506
+ ),
507
+ notificationsAlertmanagersDiscovered : promauto .With (r ).NewGauge (
508
+ prometheus.GaugeOpts {
509
+ Name : "prometheus_notifications_alertmanagers_discovered" ,
510
+ Help : "The number of alertmanagers discovered and active." ,
511
+ },
512
+ ),
358
513
}
359
-
360
514
return m
361
515
}
362
516
0 commit comments