diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d3827e883c..4a155f427e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [ENHANCEMENT] Ruler: Add new ruler metric `cortex_ruler_rule_groups_in_store` that is the total rule groups per tenant in store, which can be used to compare with `cortex_prometheus_rule_group_rules` to count the number of rule groups that are not loaded by a ruler. #5869 +* [ENHANCEMENT] Ruler: Add query statistics metrics when --ruler.query-stats-enabled=true. #6173 ## 1.18.0 in progress diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 0144dbaf31a..c664fa6a960 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -4366,8 +4366,8 @@ ring: # CLI flag: -ruler.disabled-tenants [disabled_tenants: | default = ""] -# Report the wall time for ruler queries to complete as a per user metric and as -# an info level log message. +# Report query statistics for ruler queries to complete as a per user metric and +# as an info level log message. # CLI flag: -ruler.query-stats-enabled [query_stats_enabled: | default = false] diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 21b609a4fd2..6ed72c9831e 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -229,19 +229,25 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun } } -func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc { - if queryTime == nil { - return qf - } +func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, userID string, evalMetrics *RuleEvalMetrics, logger log.Logger) rules.QueryFunc { + queryTime := evalMetrics.RulerQuerySeconds.WithLabelValues(userID) + querySeries := evalMetrics.RulerQuerySeries.WithLabelValues(userID) + querySample := evalMetrics.RulerQuerySamples.WithLabelValues(userID) + queryChunkBytes := evalMetrics.RulerQueryChunkBytes.WithLabelValues(userID) + queryDataBytes := evalMetrics.RulerQueryDataBytes.WithLabelValues(userID) return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queryStats, ctx := stats.ContextWithEmptyStats(ctx) // If we've been passed a counter we want to record the wall time spent executing this request. timer := prometheus.NewTimer(nil) + defer func() { querySeconds := timer.ObserveDuration().Seconds() queryTime.Add(querySeconds) - + querySeries.Add(float64(queryStats.FetchedSeriesCount)) + querySample.Add(float64(queryStats.FetchedSamplesCount)) + queryChunkBytes.Add(float64(queryStats.FetchedChunkBytes)) + queryDataBytes.Add(float64(queryStats.FetchedDataBytes)) // Log ruler query stats. logMessage := []interface{}{ "msg", "query stats", @@ -303,23 +309,24 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors) return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager { - var queryTime prometheus.Counter - if evalMetrics.RulerQuerySeconds != nil { - queryTime = evalMetrics.RulerQuerySeconds.WithLabelValues(userID) - } - failedQueries := evalMetrics.FailedQueriesVec.WithLabelValues(userID) totalQueries := evalMetrics.TotalQueriesVec.WithLabelValues(userID) totalWrites := evalMetrics.TotalWritesVec.WithLabelValues(userID) failedWrites := evalMetrics.FailedWritesVec.WithLabelValues(userID) + var queryFunc rules.QueryFunc engineQueryFunc := EngineQueryFunc(engine, q, overrides, userID, cfg.LookbackDelta) metricsQueryFunc := MetricsQueryFunc(engineQueryFunc, totalQueries, failedQueries) + if cfg.EnableQueryStats { + queryFunc = RecordAndReportRuleQueryMetrics(metricsQueryFunc, userID, evalMetrics, logger) + } else { + queryFunc = metricsQueryFunc + } return rules.NewManager(&rules.ManagerOptions{ Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), Queryable: q, - QueryFunc: RecordAndReportRuleQueryMetrics(metricsQueryFunc, queryTime, logger), + QueryFunc: queryFunc, Context: user.InjectOrgID(ctx, userID), ExternalURL: cfg.ExternalURL.URL, NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 28c261f5496..3a3d6633508 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -23,6 +23,7 @@ import ( "github.com/weaveworks/common/httpgrpc" "github.com/cortexproject/cortex/pkg/cortexpb" + "github.com/cortexproject/cortex/pkg/querier/stats" "github.com/cortexproject/cortex/pkg/util/validation" ) @@ -392,14 +393,23 @@ func TestMetricsQueryFuncErrors(t *testing.T) { } func TestRecordAndReportRuleQueryMetrics(t *testing.T) { - queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"}) + metrics := NewRuleEvalMetrics(Config{EnableQueryStats: true}, prometheus.DefaultRegisterer) mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { + queryStats := stats.FromContext(ctx) + queryStats.AddFetchedSeries(2) + queryStats.AddFetchedSamples(2) + queryStats.AddFetchedChunkBytes(10) + queryStats.AddFetchedDataBytes(14) time.Sleep(1 * time.Second) return promql.Vector{}, nil } - qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger()) + qf := RecordAndReportRuleQueryMetrics(mockFunc, "userID", metrics, log.NewNopLogger()) _, _ = qf(context.Background(), "test", time.Now()) - require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1)) + require.GreaterOrEqual(t, testutil.ToFloat64(metrics.RulerQuerySeconds.WithLabelValues("userID")), float64(1)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQuerySeries.WithLabelValues("userID")), float64(2)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQuerySamples.WithLabelValues("userID")), float64(2)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQueryChunkBytes.WithLabelValues("userID")), float64(10)) + require.Equal(t, testutil.ToFloat64(metrics.RulerQueryDataBytes.WithLabelValues("userID")), float64(14)) } diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index ef211ddd898..93acdc26b17 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -225,11 +225,15 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { } type RuleEvalMetrics struct { - TotalWritesVec *prometheus.CounterVec - FailedWritesVec *prometheus.CounterVec - TotalQueriesVec *prometheus.CounterVec - FailedQueriesVec *prometheus.CounterVec - RulerQuerySeconds *prometheus.CounterVec + TotalWritesVec *prometheus.CounterVec + FailedWritesVec *prometheus.CounterVec + TotalQueriesVec *prometheus.CounterVec + FailedQueriesVec *prometheus.CounterVec + RulerQuerySeconds *prometheus.CounterVec + RulerQuerySeries *prometheus.CounterVec + RulerQuerySamples *prometheus.CounterVec + RulerQueryChunkBytes *prometheus.CounterVec + RulerQueryDataBytes *prometheus.CounterVec } func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics { @@ -256,6 +260,22 @@ func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics Name: "cortex_ruler_query_seconds_total", Help: "Total amount of wall clock time spent processing queries by the ruler.", }, []string{"user"}) + m.RulerQuerySeries = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_fetched_series_total", + Help: "Number of series fetched to execute a query by the ruler.", + }, []string{"user"}) + m.RulerQuerySamples = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_samples_total", + Help: "Number of samples fetched to execute a query by the ruler.", + }, []string{"user"}) + m.RulerQueryChunkBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_fetched_chunks_bytes_total", + Help: "Size of all chunks fetched to execute a query in bytes by the ruler.", + }, []string{"user"}) + m.RulerQueryDataBytes = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ruler_fetched_data_bytes_total", + Help: "Size of all data fetched to execute a query in bytes by the ruler.", + }, []string{"user"}) } return m @@ -270,6 +290,18 @@ func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) { if m.RulerQuerySeconds != nil { m.RulerQuerySeconds.DeleteLabelValues(userID) } + if m.RulerQuerySeries != nil { + m.RulerQuerySeries.DeleteLabelValues(userID) + } + if m.RulerQuerySamples != nil { + m.RulerQuerySamples.DeleteLabelValues(userID) + } + if m.RulerQueryChunkBytes != nil { + m.RulerQueryChunkBytes.DeleteLabelValues(userID) + } + if m.RulerQueryDataBytes != nil { + m.RulerQueryDataBytes.DeleteLabelValues(userID) + } } type RuleGroupMetrics struct { diff --git a/pkg/ruler/manager_metrics_test.go b/pkg/ruler/manager_metrics_test.go index 60b68452ca5..dfc9800ad52 100644 --- a/pkg/ruler/manager_metrics_test.go +++ b/pkg/ruler/manager_metrics_test.go @@ -574,8 +574,16 @@ func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) { m.FailedQueriesVec.WithLabelValues("fake2").Add(10) m.RulerQuerySeconds.WithLabelValues("fake1").Add(10) m.RulerQuerySeconds.WithLabelValues("fake2").Add(10) - - metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total"} + m.RulerQuerySeries.WithLabelValues("fake1").Add(10) + m.RulerQuerySeries.WithLabelValues("fake2").Add(10) + m.RulerQuerySamples.WithLabelValues("fake1").Add(10) + m.RulerQuerySamples.WithLabelValues("fake2").Add(10) + m.RulerQueryChunkBytes.WithLabelValues("fake1").Add(10) + m.RulerQueryChunkBytes.WithLabelValues("fake2").Add(10) + m.RulerQueryDataBytes.WithLabelValues("fake1").Add(10) + m.RulerQueryDataBytes.WithLabelValues("fake2").Add(10) + + metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total", "cortex_ruler_fetched_series_total", "cortex_ruler_samples_total", "cortex_ruler_fetched_chunks_bytes_total", "cortex_ruler_fetched_data_bytes_total"} gm, err := reg.Gather() require.NoError(t, err) mfm, err := util.NewMetricFamilyMap(gm) diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 4f121570cce..a910896ad27 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -217,7 +217,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.") f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.") - f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.") + f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report query statistics for ruler queries to complete as a per user metric and as an info level log message.") f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics") cfg.RingCheckPeriod = 5 * time.Second