
Commit e34f5e8

roystchiang, rohang98, and alvinlin123 authored
fix leaking notifier in ruler when user is removed (#4718)
* fix leaking notifier in ruler when user is removed

Signed-off-by: Roy Chiang <[email protected]>
Co-authored-by: Rohan Gupta <[email protected]>
Co-authored-by: Alvin Lin <[email protected]>
1 parent 28a8a37 commit e34f5e8

6 files changed: +268 -28 lines changed


CHANGELOG.md

+1
@@ -38,6 +38,7 @@
 * [BUGFIX] Query Frontend: If 'LogQueriesLongerThan' is set to < 0, log all queries as described in the docs. #4633
 * [BUGFIX] Distributor: update defaultReplicationStrategy to not fail with extend-write when a single instance is unhealthy. #4636
 * [BUGFIX] Distributor: Fix race condition on `/series` introduced by #4683. #4716
+* [BUGFIX] Ruler: Fixed leaking notifiers after users are removed #4718
 * [BUGFIX] Distributor: Fix a memory leak in distributor due to the cluster label. #4739
 * [ENHANCEMENT] Compactor: uploading blocks no compaction marks to the global location and introduce a new metric #4729
   * `cortex_bucket_blocks_marked_for_no_compaction_count`: Total number of blocks marked for no compaction in the bucket.

pkg/ruler/manager.go

+22 -10
@@ -107,6 +107,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
         go mngr.Stop()
         delete(r.userManagers, userID)
 
+        r.removeNotifier(userID)
         r.mapper.cleanupUser(userID)
         r.lastReloadSuccessful.DeleteLabelValues(userID)
         r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
@@ -163,33 +164,44 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 // newManager creates a prometheus rule manager wrapped with a user id
 // configured storage, appendable, notifier, and instrumentation
 func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) {
-    notifier, err := r.getOrCreateNotifier(userID)
-    if err != nil {
-        return nil, err
-    }
-
     // Create a new Prometheus registry and register it within
-    // our metrics struct for the provided user.
+    // our metrics struct for the provided user if it doesn't already exist.
     reg := prometheus.NewRegistry()
     r.userManagerMetrics.AddUserRegistry(userID, reg)
 
+    notifier, err := r.getOrCreateNotifier(userID, reg)
+    if err != nil {
+        return nil, err
+    }
+
     return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil
 }
 
-func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) {
+func (r *DefaultMultiTenantManager) removeNotifier(userID string) {
+    r.notifiersMtx.Lock()
+    defer r.notifiersMtx.Unlock()
+
+    if n, ok := r.notifiers[userID]; ok {
+        n.stop()
+    }
+
+    delete(r.notifiers, userID)
+}
+
+func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) {
     r.notifiersMtx.Lock()
     defer r.notifiersMtx.Unlock()
 
     n, ok := r.notifiers[userID]
     if ok {
+        // When there is a stale user, we stop the notifier but do not remove it
+        n.run()
         return n.notifier, nil
     }
 
-    reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry)
-    reg = prometheus.WrapRegistererWithPrefix("cortex_", reg)
     n = newRulerNotifier(&notifier.Options{
         QueueCapacity: r.cfg.NotificationQueueCapacity,
-        Registerer:    reg,
+        Registerer:    userManagerRegistry,
         Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) {
             // Note: The passed-in context comes from the Prometheus notifier
             // and does *not* contain the userID. So it needs to be added to the context
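The n.stop() and n.run() calls above refer to the ruler's thin wrapper around the Prometheus notifier, which lives elsewhere in the ruler package and is not touched by this commit. A minimal sketch of such a wrapper follows, with field and method names assumed from the calls in this hunk rather than copied from the real file:

package ruler

import (
    "sync"

    "github.com/prometheus/prometheus/notifier"
)

// Hypothetical sketch of the rulerNotifier wrapper referenced above; the real
// type also wires up Alertmanager service discovery and config reloads.
type rulerNotifier struct {
    notifier *notifier.Manager
    wg       sync.WaitGroup
}

// run starts the notifier's send loop in the background.
func (rn *rulerNotifier) run() {
    rn.wg.Add(1)
    go func() {
        defer rn.wg.Done()
        // notifier.Manager.Run blocks until Stop is called; the nil channel just
        // means this sketch delivers no service-discovery updates.
        rn.notifier.Run(nil)
    }()
}

// stop shuts the send loop down and waits for it to exit. Before this commit
// nothing called stop() when a tenant was removed, so the goroutine, its alert
// queue, and its registered metrics stayed alive for the life of the process.
func (rn *rulerNotifier) stop() {
    rn.notifier.Stop()
    rn.wg.Wait()
}

The other half of the change is the Registerer swap: the notifier now registers its metrics on the per-tenant registry created in newManager (the one handed to AddUserRegistry), so they are aggregated by the ManagerMetrics collector extended below instead of being registered directly, and permanently, on the root registry with a wrapped user label.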

pkg/ruler/manager_metrics.go

+69
@@ -23,6 +23,14 @@ type ManagerMetrics struct {
     GroupLastDuration    *prometheus.Desc
     GroupRules           *prometheus.Desc
     GroupLastEvalSamples *prometheus.Desc
+
+    NotificationLatency       *prometheus.Desc
+    NotificationErrors        *prometheus.Desc
+    NotificationSent          *prometheus.Desc
+    NotificationDropped       *prometheus.Desc
+    NotificationQueueLength   *prometheus.Desc
+    NotificationQueueCapacity *prometheus.Desc
+    AlertmanagersDiscovered   *prometheus.Desc
 }
 
 // NewManagerMetrics returns a ManagerMetrics struct
@@ -101,6 +109,51 @@ func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics {
             commonLabels,
             nil,
         ),
+
+        // Prometheus' ruler's notification metrics
+        NotificationLatency: prometheus.NewDesc(
+            "cortex_prometheus_notifications_latency_seconds",
+            "Latency quantiles for sending alert notifications.",
+            []string{"user"},
+            nil,
+        ),
+
+        NotificationErrors: prometheus.NewDesc(
+            "cortex_prometheus_notifications_errors_total",
+            "Total number of errors sending alert notifications.",
+            []string{"user", "alertmanager"},
+            nil,
+        ),
+        NotificationSent: prometheus.NewDesc(
+            "cortex_prometheus_notifications_sent_total",
+            "Total number of alerts sent.",
+            []string{"user", "alertmanager"},
+            nil,
+        ),
+        NotificationDropped: prometheus.NewDesc(
+            "cortex_prometheus_notifications_dropped_total",
+            "Total number of alerts dropped due to errors when sending to Alertmanager.",
+            []string{"user"},
+            nil,
+        ),
+        NotificationQueueLength: prometheus.NewDesc(
+            "cortex_prometheus_notifications_queue_length",
+            "The number of alert notifications in the queue.",
+            []string{"user"},
+            nil,
+        ),
+        NotificationQueueCapacity: prometheus.NewDesc(
+            "cortex_prometheus_notifications_queue_capacity",
+            "The capacity of the alert notifications queue.",
+            []string{"user"},
+            nil,
+        ),
+        AlertmanagersDiscovered: prometheus.NewDesc(
+            "cortex_prometheus_notifications_alertmanagers_discovered",
+            "The number of alertmanagers discovered and active.",
+            []string{"user"},
+            nil,
+        ),
     }
 }

@@ -127,6 +180,14 @@ func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) {
     out <- m.GroupLastDuration
     out <- m.GroupRules
     out <- m.GroupLastEvalSamples
+
+    out <- m.NotificationLatency
+    out <- m.NotificationErrors
+    out <- m.NotificationSent
+    out <- m.NotificationDropped
+    out <- m.NotificationQueueLength
+    out <- m.NotificationQueueCapacity
+    out <- m.AlertmanagersDiscovered
 }
 
 // Collect implements the Collector interface
@@ -152,4 +213,12 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
     data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", labels...)
     data.SendSumOfGaugesPerUserWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", labels...)
     data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", labels...)
+
+    data.SendSumOfSummariesPerUser(out, m.NotificationLatency, "prometheus_notifications_latency_seconds")
+    data.SendSumOfCountersPerUserWithLabels(out, m.NotificationErrors, "prometheus_notifications_errors_total", "alertmanager")
+    data.SendSumOfCountersPerUserWithLabels(out, m.NotificationSent, "prometheus_notifications_sent_total", "alertmanager")
+    data.SendSumOfCountersPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total")
+    data.SendSumOfGaugesPerUser(out, m.NotificationQueueLength, "prometheus_notifications_queue_length")
+    data.SendSumOfGaugesPerUser(out, m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity")
+    data.SendSumOfGaugesPerUser(out, m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered")
 }
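The Collect additions reuse the same pattern as the rule-group metrics above: each tenant's notifier metrics live in that tenant's own registry, and Collect sums them per user and re-exports them under the cortex_ prefix with a user label. A stripped-down illustration of that pattern (illustrative only; the real code goes through Cortex's per-user helpers such as SendSumOfCountersPerUser, and the names below are hypothetical):

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    // One registry per tenant, as the ruler keeps via AddUserRegistry.
    userRegs := map[string]*prometheus.Registry{}
    for i, user := range []string{"user1", "user2"} {
        reg := prometheus.NewRegistry()
        dropped := prometheus.NewCounter(prometheus.CounterOpts{
            Name: "prometheus_notifications_dropped_total",
            Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
        })
        reg.MustRegister(dropped)
        dropped.Add(float64((i + 1) * 10))
        userRegs[user] = reg
    }

    // What Collect() conceptually does for one metric: gather each tenant's
    // registry, sum its series, and emit a constant metric with a "user" label
    // under the cortex_ prefix.
    desc := prometheus.NewDesc(
        "cortex_prometheus_notifications_dropped_total",
        "Total number of alerts dropped due to errors when sending to Alertmanager.",
        []string{"user"}, nil,
    )
    for user, reg := range userRegs {
        mfs, err := reg.Gather()
        if err != nil {
            panic(err)
        }
        for _, mf := range mfs {
            if mf.GetName() != "prometheus_notifications_dropped_total" {
                continue
            }
            sum := 0.0
            for _, m := range mf.GetMetric() {
                sum += m.GetCounter().GetValue()
            }
            // In the real Collect this metric is sent on the out channel.
            _ = prometheus.MustNewConstMetric(desc, prometheus.CounterValue, sum, user)
            fmt.Printf("cortex_prometheus_notifications_dropped_total{user=%q} %g\n", user, sum)
        }
    }
}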

pkg/ruler/manager_metrics_test.go

+166 -12
@@ -34,6 +34,53 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1
 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000
 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000
 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000
+# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
+# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
+cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
+cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
+cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
+# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
+# TYPE cortex_prometheus_notifications_dropped_total counter
+cortex_prometheus_notifications_dropped_total{user="user1"} 1
+cortex_prometheus_notifications_dropped_total{user="user2"} 10
+cortex_prometheus_notifications_dropped_total{user="user3"} 100
+# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
+# TYPE cortex_prometheus_notifications_errors_total counter
+cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
+cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
+cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
+# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
+# TYPE cortex_prometheus_notifications_latency_seconds summary
+cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
+cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
+cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
+cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
+cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
+cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
+cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
+cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
+cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
+cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
+cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
+cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
+cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
+cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
+cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
+# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
+# TYPE cortex_prometheus_notifications_queue_capacity gauge
+cortex_prometheus_notifications_queue_capacity{user="user1"} 1
+cortex_prometheus_notifications_queue_capacity{user="user2"} 10
+cortex_prometheus_notifications_queue_capacity{user="user3"} 100
+# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
+# TYPE cortex_prometheus_notifications_queue_length gauge
+cortex_prometheus_notifications_queue_length{user="user1"} 1
+cortex_prometheus_notifications_queue_length{user="user2"} 10
+cortex_prometheus_notifications_queue_length{user="user3"} 100
+# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
+# TYPE cortex_prometheus_notifications_sent_total counter
+cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
+cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
+cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
 # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
 cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -153,6 +200,53 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) {
 cortex_prometheus_last_evaluation_samples{user="user1"} 2000
 cortex_prometheus_last_evaluation_samples{user="user2"} 20000
 cortex_prometheus_last_evaluation_samples{user="user3"} 200000
+# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
+# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
+cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
+cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
+cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
+# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
+# TYPE cortex_prometheus_notifications_dropped_total counter
+cortex_prometheus_notifications_dropped_total{user="user1"} 1
+cortex_prometheus_notifications_dropped_total{user="user2"} 10
+cortex_prometheus_notifications_dropped_total{user="user3"} 100
+# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
+# TYPE cortex_prometheus_notifications_errors_total counter
+cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
+cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
+cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
+# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
+# TYPE cortex_prometheus_notifications_latency_seconds summary
+cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
+cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
+cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
+cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
+cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
+cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
+cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
+cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
+cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
+cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
+cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
+cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
+cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
+cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
+cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
+# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
+# TYPE cortex_prometheus_notifications_queue_capacity gauge
+cortex_prometheus_notifications_queue_capacity{user="user1"} 1
+cortex_prometheus_notifications_queue_capacity{user="user2"} 10
+cortex_prometheus_notifications_queue_capacity{user="user3"} 100
+# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
+# TYPE cortex_prometheus_notifications_queue_length gauge
+cortex_prometheus_notifications_queue_length{user="user1"} 1
+cortex_prometheus_notifications_queue_length{user="user2"} 10
+cortex_prometheus_notifications_queue_length{user="user3"} 100
+# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
+# TYPE cortex_prometheus_notifications_sent_total counter
+cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
+cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
+cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
 # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
 cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -261,22 +355,37 @@ func populateManager(base float64) *prometheus.Registry {
     metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000)
     metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000)
 
+    metrics.notificationsLatency.WithLabelValues("alertmanager_1").Observe(base)
+    metrics.notificationsErrors.WithLabelValues("alertmanager_1").Add(base)
+    metrics.notificationsSent.WithLabelValues("alertmanager_1").Add(base)
+    metrics.notificationsDropped.Add(base)
+    metrics.notificationsQueueLength.Set(base)
+    metrics.notificationsQueueCapacity.Set(base)
+    metrics.notificationsAlertmanagersDiscovered.Set(base)
     return r
 }
 
 // Copied from github.com/prometheus/rules/manager.go
+// and github.com/prometheus/notifier/notifier.go
 type groupMetrics struct {
-    evalDuration         prometheus.Summary
-    iterationDuration    prometheus.Summary
-    iterationsMissed     *prometheus.CounterVec
-    iterationsScheduled  *prometheus.CounterVec
-    evalTotal            *prometheus.CounterVec
-    evalFailures         *prometheus.CounterVec
-    groupInterval        *prometheus.GaugeVec
-    groupLastEvalTime    *prometheus.GaugeVec
-    groupLastDuration    *prometheus.GaugeVec
-    groupRules           *prometheus.GaugeVec
-    groupLastEvalSamples *prometheus.GaugeVec
+    evalDuration                         prometheus.Summary
+    iterationDuration                    prometheus.Summary
+    iterationsMissed                     *prometheus.CounterVec
+    iterationsScheduled                  *prometheus.CounterVec
+    evalTotal                            *prometheus.CounterVec
+    evalFailures                         *prometheus.CounterVec
+    groupInterval                        *prometheus.GaugeVec
+    groupLastEvalTime                    *prometheus.GaugeVec
+    groupLastDuration                    *prometheus.GaugeVec
+    groupRules                           *prometheus.GaugeVec
+    groupLastEvalSamples                 *prometheus.GaugeVec
+    notificationsLatency                 *prometheus.SummaryVec
+    notificationsErrors                  *prometheus.CounterVec
+    notificationsSent                    *prometheus.CounterVec
+    notificationsDropped                 prometheus.Counter
+    notificationsQueueLength             prometheus.Gauge
+    notificationsQueueCapacity           prometheus.Gauge
+    notificationsAlertmanagersDiscovered prometheus.Gauge
 }
 
 func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
@@ -355,8 +464,53 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
         },
         []string{"rule_group"},
     ),
+    notificationsLatency: promauto.With(r).NewSummaryVec(
+        prometheus.SummaryOpts{
+            Name:       "prometheus_notifications_latency_seconds",
+            Help:       "Latency quantiles for sending alert notifications.",
+            Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
+        },
+        []string{"alertmanager"},
+    ),
+    notificationsErrors: promauto.With(r).NewCounterVec(
+        prometheus.CounterOpts{
+            Name: "prometheus_notifications_errors_total",
+            Help: "Latency quantiles for sending alert notifications.",
+        },
+        []string{"alertmanager"},
+    ),
+    notificationsSent: promauto.With(r).NewCounterVec(
+        prometheus.CounterOpts{
+            Name: "prometheus_notifications_sent_total",
+            Help: "Total number of errors sending alert notifications",
+        },
+        []string{"alertmanager"},
+    ),
+    notificationsDropped: promauto.With(r).NewCounter(
+        prometheus.CounterOpts{
+            Name: "prometheus_notifications_dropped_total",
+            Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
+        },
+    ),
+    notificationsQueueLength: promauto.With(r).NewGauge(
+        prometheus.GaugeOpts{
+            Name: "prometheus_notifications_queue_length",
+            Help: "The number of alert notifications in the queue.",
+        },
+    ),
+    notificationsQueueCapacity: promauto.With(r).NewGauge(
+        prometheus.GaugeOpts{
+            Name: "prometheus_notifications_queue_capacity",
+            Help: "The capacity of the alert notifications queue.",
+        },
+    ),
+    notificationsAlertmanagersDiscovered: promauto.With(r).NewGauge(
+        prometheus.GaugeOpts{
+            Name: "prometheus_notifications_alertmanagers_discovered",
+            Help: "The number of alertmanagers discovered and active.",
+        },
+    ),
     }
-
     return m
 }
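The expected-output blocks above are checked against the aggregating collector by the surrounding tests, which this diff only extends with the new notifier series. A rough sketch of that harness follows; names such as AddUserRegistry and expectedMetrics are assumptions inferred from the hunks above, not taken from this diff:

package ruler

import (
    "strings"
    "testing"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/testutil"
    "github.com/stretchr/testify/require"
)

func TestManagerMetricsNotifications(t *testing.T) {
    mainReg := prometheus.NewPedanticRegistry()

    managerMetrics := NewManagerMetrics(false)
    mainReg.MustRegister(managerMetrics)

    // populateManager(base) fills one tenant's registry, including the new
    // notifier series, so user1/user2/user3 report 1/10/100 as in the text above.
    managerMetrics.AddUserRegistry("user1", populateManager(1))
    managerMetrics.AddUserRegistry("user2", populateManager(10))
    managerMetrics.AddUserRegistry("user3", populateManager(100))

    // expectedMetrics is assumed to hold the exposition-format text shown above.
    err := testutil.GatherAndCompare(mainReg, strings.NewReader(expectedMetrics))
    require.NoError(t, err)
}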
