fix leaking notifier in ruler when user is removed #4718

Merged: 15 commits, merged Jun 16, 2022

Changes from all commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -36,6 +36,7 @@
* [BUGFIX] Query Frontend: If 'LogQueriesLongerThan' is set to < 0, log all queries as described in the docs. #4633
* [BUGFIX] Distributor: update defaultReplicationStrategy to not fail with extend-write when a single instance is unhealthy. #4636
* [BUGFIX] Distributor: Fix race condition on `/series` introduced by #4683. #4716
* [BUGFIX] Ruler: Fixed leaking notifiers after users are removed. #4718
* [BUGFIX] Distributor: Fix a memory leak in distributor due to the cluster label. #4739
* [ENHANCEMENT] Compactor: upload blocks' no-compaction marks to the global location and introduce a new metric. #4729
* `cortex_bucket_blocks_marked_for_no_compaction_count`: Total number of blocks marked for no compaction in the bucket.
32 changes: 22 additions & 10 deletions pkg/ruler/manager.go
@@ -107,6 +107,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
go mngr.Stop()
delete(r.userManagers, userID)

r.removeNotifier(userID)
r.mapper.cleanupUser(userID)
r.lastReloadSuccessful.DeleteLabelValues(userID)
r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
@@ -163,33 +164,44 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
// newManager creates a prometheus rule manager wrapped with a user id
// configured storage, appendable, notifier, and instrumentation
func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) {
notifier, err := r.getOrCreateNotifier(userID)
if err != nil {
return nil, err
}

// Create a new Prometheus registry and register it within
// our metrics struct for the provided user.
// our metrics struct for the provided user if it doesn't already exist.
reg := prometheus.NewRegistry()
r.userManagerMetrics.AddUserRegistry(userID, reg)

notifier, err := r.getOrCreateNotifier(userID, reg)
if err != nil {
return nil, err
}

return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil
}

func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) {
func (r *DefaultMultiTenantManager) removeNotifier(userID string) {
r.notifiersMtx.Lock()
defer r.notifiersMtx.Unlock()

if n, ok := r.notifiers[userID]; ok {
n.stop()
}

delete(r.notifiers, userID)
}

func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) {
r.notifiersMtx.Lock()
defer r.notifiersMtx.Unlock()

n, ok := r.notifiers[userID]
if ok {
// When there is a stale user, we stop the notifier but do not remove it
n.run()
return n.notifier, nil
}

reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry)
reg = prometheus.WrapRegistererWithPrefix("cortex_", reg)
n = newRulerNotifier(&notifier.Options{
QueueCapacity: r.cfg.NotificationQueueCapacity,
Registerer: reg,
Registerer: userManagerRegistry,
Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) {
// Note: The passed-in context comes from the Prometheus notifier
// and does *not* contain the userID. So it needs to be added to the context
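The change above is the heart of the fix: `SyncRuleGroups` now calls `removeNotifier` when a user's rule groups disappear, so the per-user notifier is stopped and its map entry deleted instead of lingering forever. The sketch below illustrates the general stop-and-delete pattern the new `removeNotifier` follows; it is simplified Go for illustration only, and the `worker` type and `multiTenant` map are assumptions made for the example, not Cortex types.

```go
package main

import "sync"

// worker stands in for a per-user background component such as the ruler's
// notifier: it owns a goroutine that must be stopped explicitly.
type worker struct{ quit chan struct{} }

func newWorker() *worker {
	w := &worker{quit: make(chan struct{})}
	go func() { <-w.quit }() // real workers would do work until stopped
	return w
}

func (w *worker) stop() { close(w.quit) }

type multiTenant struct {
	mtx     sync.Mutex
	workers map[string]*worker
}

// remove mirrors the shape of removeNotifier above: stop the per-user
// component *and* delete the map entry. Doing only one of the two leaks
// either the goroutine or the map entry once the user is gone.
func (m *multiTenant) remove(userID string) {
	m.mtx.Lock()
	defer m.mtx.Unlock()
	if w, ok := m.workers[userID]; ok {
		w.stop()
	}
	delete(m.workers, userID)
}

func main() {
	m := &multiTenant{workers: map[string]*worker{"user-1": newWorker()}}
	m.remove("user-1") // stops the goroutine and frees the entry for later re-creation
}
```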
69 changes: 69 additions & 0 deletions pkg/ruler/manager_metrics.go
@@ -23,6 +23,14 @@ type ManagerMetrics struct {
GroupLastDuration *prometheus.Desc
GroupRules *prometheus.Desc
GroupLastEvalSamples *prometheus.Desc

NotificationLatency *prometheus.Desc
NotificationErrors *prometheus.Desc
NotificationSent *prometheus.Desc
NotificationDropped *prometheus.Desc
NotificationQueueLength *prometheus.Desc
NotificationQueueCapacity *prometheus.Desc
AlertmanagersDiscovered *prometheus.Desc
}

// NewManagerMetrics returns a ManagerMetrics struct
@@ -101,6 +109,51 @@ func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics {
commonLabels,
nil,
),

// Notification metrics from Prometheus' notifier, re-exported per user by the ruler.
NotificationLatency: prometheus.NewDesc(
"cortex_prometheus_notifications_latency_seconds",
"Latency quantiles for sending alert notifications.",
[]string{"user"},
nil,
),

NotificationErrors: prometheus.NewDesc(
"cortex_prometheus_notifications_errors_total",
"Total number of errors sending alert notifications.",
[]string{"user", "alertmanager"},
nil,
),
NotificationSent: prometheus.NewDesc(
"cortex_prometheus_notifications_sent_total",
"Total number of alerts sent.",
[]string{"user", "alertmanager"},
nil,
),
NotificationDropped: prometheus.NewDesc(
"cortex_prometheus_notifications_dropped_total",
"Total number of alerts dropped due to errors when sending to Alertmanager.",
[]string{"user"},
nil,
),
NotificationQueueLength: prometheus.NewDesc(
"cortex_prometheus_notifications_queue_length",
"The number of alert notifications in the queue.",
[]string{"user"},
nil,
),
NotificationQueueCapacity: prometheus.NewDesc(
"cortex_prometheus_notifications_queue_capacity",
"The capacity of the alert notifications queue.",
[]string{"user"},
nil,
),
AlertmanagersDiscovered: prometheus.NewDesc(
"cortex_prometheus_notifications_alertmanagers_discovered",
"The number of alertmanagers discovered and active.",
[]string{"user"},
nil,
),
}
}

@@ -127,6 +180,14 @@ func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.GroupLastDuration
out <- m.GroupRules
out <- m.GroupLastEvalSamples

out <- m.NotificationLatency
out <- m.NotificationErrors
out <- m.NotificationSent
out <- m.NotificationDropped
out <- m.NotificationQueueLength
out <- m.NotificationQueueCapacity
out <- m.AlertmanagersDiscovered
}

// Collect implements the Collector interface
@@ -152,4 +213,12 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", labels...)
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", labels...)
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", labels...)

data.SendSumOfSummariesPerUser(out, m.NotificationLatency, "prometheus_notifications_latency_seconds")
data.SendSumOfCountersPerUserWithLabels(out, m.NotificationErrors, "prometheus_notifications_errors_total", "alertmanager")
data.SendSumOfCountersPerUserWithLabels(out, m.NotificationSent, "prometheus_notifications_sent_total", "alertmanager")
data.SendSumOfCountersPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total")
data.SendSumOfGaugesPerUser(out, m.NotificationQueueLength, "prometheus_notifications_queue_length")
data.SendSumOfGaugesPerUser(out, m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity")
data.SendSumOfGaugesPerUser(out, m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered")
}
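Because each user's notifier is now registered on that user's own registry, `ManagerMetrics.Collect` (above) sums the raw `prometheus_notifications_*` series per user and re-exports them with the `cortex_` prefix and a `user` label, and the series disappear when the user's registry is removed. Below is a hedged sketch of how one of the new aggregated series could be checked in isolation; it assumes `ManagerMetrics.AddUserRegistry` and the `populateManager` helper behave as in the tests that follow, and uses `GatherAndCompare` from prometheus/client_golang's testutil package.

```go
// Sketch only, not part of the PR. Assumed imports: "strings", "testing",
// "github.com/prometheus/client_golang/prometheus",
// "github.com/prometheus/client_golang/prometheus/testutil",
// "github.com/stretchr/testify/require".
func TestNotificationDroppedMetricSketch(t *testing.T) {
	mainReg := prometheus.NewPedanticRegistry()

	m := NewManagerMetrics(false)
	mainReg.MustRegister(m)

	// populateManager(10) seeds the per-user registry, including
	// prometheus_notifications_dropped_total = 10 (see the helper below).
	m.AddUserRegistry("user1", populateManager(10))

	expected := `
# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
# TYPE cortex_prometheus_notifications_dropped_total counter
cortex_prometheus_notifications_dropped_total{user="user1"} 10
`
	require.NoError(t, testutil.GatherAndCompare(mainReg, strings.NewReader(expected),
		"cortex_prometheus_notifications_dropped_total"))
}
```

The full fixtures in the test file below cover every new series; this only shows the aggregation path for a single metric.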
178 changes: 166 additions & 12 deletions pkg/ruler/manager_metrics_test.go
@@ -34,6 +34,53 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000
# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
# TYPE cortex_prometheus_notifications_dropped_total counter
cortex_prometheus_notifications_dropped_total{user="user1"} 1
cortex_prometheus_notifications_dropped_total{user="user2"} 10
cortex_prometheus_notifications_dropped_total{user="user3"} 100
# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
# TYPE cortex_prometheus_notifications_errors_total counter
cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
# TYPE cortex_prometheus_notifications_latency_seconds summary
cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
# TYPE cortex_prometheus_notifications_queue_capacity gauge
cortex_prometheus_notifications_queue_capacity{user="user1"} 1
cortex_prometheus_notifications_queue_capacity{user="user2"} 10
cortex_prometheus_notifications_queue_capacity{user="user3"} 100
# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
# TYPE cortex_prometheus_notifications_queue_length gauge
cortex_prometheus_notifications_queue_length{user="user1"} 1
cortex_prometheus_notifications_queue_length{user="user2"} 10
cortex_prometheus_notifications_queue_length{user="user3"} 100
# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
# TYPE cortex_prometheus_notifications_sent_total counter
cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -153,6 +200,53 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) {
cortex_prometheus_last_evaluation_samples{user="user1"} 2000
cortex_prometheus_last_evaluation_samples{user="user2"} 20000
cortex_prometheus_last_evaluation_samples{user="user3"} 200000
# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active.
# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge
cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1
cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10
cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100
# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager.
# TYPE cortex_prometheus_notifications_dropped_total counter
cortex_prometheus_notifications_dropped_total{user="user1"} 1
cortex_prometheus_notifications_dropped_total{user="user2"} 10
cortex_prometheus_notifications_dropped_total{user="user3"} 100
# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications.
# TYPE cortex_prometheus_notifications_errors_total counter
cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1
cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10
cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100
# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications.
# TYPE cortex_prometheus_notifications_latency_seconds summary
cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1
cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1
cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1
cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1
cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1
cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10
cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10
cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10
cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10
cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1
cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100
cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100
cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100
cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100
cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1
# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue.
# TYPE cortex_prometheus_notifications_queue_capacity gauge
cortex_prometheus_notifications_queue_capacity{user="user1"} 1
cortex_prometheus_notifications_queue_capacity{user="user2"} 10
cortex_prometheus_notifications_queue_capacity{user="user3"} 100
# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue.
# TYPE cortex_prometheus_notifications_queue_length gauge
cortex_prometheus_notifications_queue_length{user="user1"} 1
cortex_prometheus_notifications_queue_length{user="user2"} 10
cortex_prometheus_notifications_queue_length{user="user3"} 100
# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent.
# TYPE cortex_prometheus_notifications_sent_total counter
cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1
cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10
cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100
# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
@@ -261,22 +355,37 @@ func populateManager(base float64) *prometheus.Registry {
metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000)
metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000)

metrics.notificationsLatency.WithLabelValues("alertmanager_1").Observe(base)
metrics.notificationsErrors.WithLabelValues("alertmanager_1").Add(base)
metrics.notificationsSent.WithLabelValues("alertmanager_1").Add(base)
metrics.notificationsDropped.Add(base)
metrics.notificationsQueueLength.Set(base)
metrics.notificationsQueueCapacity.Set(base)
metrics.notificationsAlertmanagersDiscovered.Set(base)
return r
}

// Copied from github.com/prometheus/rules/manager.go
// and github.com/prometheus/notifier/notifier.go
type groupMetrics struct {
evalDuration prometheus.Summary
iterationDuration prometheus.Summary
iterationsMissed *prometheus.CounterVec
iterationsScheduled *prometheus.CounterVec
evalTotal *prometheus.CounterVec
evalFailures *prometheus.CounterVec
groupInterval *prometheus.GaugeVec
groupLastEvalTime *prometheus.GaugeVec
groupLastDuration *prometheus.GaugeVec
groupRules *prometheus.GaugeVec
groupLastEvalSamples *prometheus.GaugeVec
evalDuration prometheus.Summary
iterationDuration prometheus.Summary
iterationsMissed *prometheus.CounterVec
iterationsScheduled *prometheus.CounterVec
evalTotal *prometheus.CounterVec
evalFailures *prometheus.CounterVec
groupInterval *prometheus.GaugeVec
groupLastEvalTime *prometheus.GaugeVec
groupLastDuration *prometheus.GaugeVec
groupRules *prometheus.GaugeVec
groupLastEvalSamples *prometheus.GaugeVec
notificationsLatency *prometheus.SummaryVec
notificationsErrors *prometheus.CounterVec
notificationsSent *prometheus.CounterVec
notificationsDropped prometheus.Counter
notificationsQueueLength prometheus.Gauge
notificationsQueueCapacity prometheus.Gauge
notificationsAlertmanagersDiscovered prometheus.Gauge
}

func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
@@ -355,8 +464,53 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
},
[]string{"rule_group"},
),
notificationsLatency: promauto.With(r).NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_notifications_latency_seconds",
Help: "Latency quantiles for sending alert notifications.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"alertmanager"},
),
notificationsErrors: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_notifications_errors_total",
Help: "Latency quantiles for sending alert notifications.",
},
[]string{"alertmanager"},
),
notificationsSent: promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_notifications_sent_total",
Help: "Total number of errors sending alert notifications",
},
[]string{"alertmanager"},
),
notificationsDropped: promauto.With(r).NewCounter(
prometheus.CounterOpts{
Name: "prometheus_notifications_dropped_total",
Help: "Total number of alerts dropped due to errors when sending to Alertmanager.",
},
),
notificationsQueueLength: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Name: "prometheus_notifications_queue_length",
Help: "The number of alert notifications in the queue.",
},
),
notificationsQueueCapacity: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Name: "prometheus_notifications_queue_capacity",
Help: "The capacity of the alert notifications queue.",
},
),
notificationsAlertmanagersDiscovered: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Name: "prometheus_notifications_alertmanagers_discovered",
Help: "The number of alertmanagers discovered and active.",
},
),
}

return m
}
