Skip to content

Commit 4a9adff

Browse files
author
Tyler Reid
authored
Add a new config and metric for reporting ruler query execution wall time. (#4317)
* Add a new config and metric for reporting ruler query execution wall time. Signed-off-by: Tyler Reid <[email protected]> * Spacing and PR number fixup Signed-off-by: Tyler Reid <[email protected]> * Wrap the defer in a function to make it defer after the return rather than after the if block. Add a unit test to validate we're tracking time correctly. Signed-off-by: Tyler Reid <[email protected]> * Use seconds for our duration rather than nanoseconds Signed-off-by: Tyler Reid <[email protected]> * Review comment fixes Signed-off-by: Tyler Reid <[email protected]> * Update config flag in the config docs Signed-off-by: Tyler Reid <[email protected]> * Pass counter rather than counter vector for metrics query function Signed-off-by: Tyler Reid <[email protected]> * Fix comment in MetricsQueryFunction Signed-off-by: Tyler Reid <[email protected]> * Move query metric and log to separate function. Add log message for ruler query time. Signed-off-by: Tyler Reid <[email protected]> * Update config file and change log to show this a per user metric Signed-off-by: Tyler Reid <[email protected]> * code review fixes Signed-off-by: Tyler Reid <[email protected]> * update log message for ruler query metrics Signed-off-by: Tyler Reid <[email protected]> * Remove append and just use the array for key values in the log messag Signed-off-by: Tyler Reid <[email protected]> * Add query-frontend component to front end log message Signed-off-by: Tyler Reid <[email protected]>
1 parent ed62246 commit 4a9adff

File tree

7 files changed

+68
-4
lines changed

7 files changed

+68
-4
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Changelog
22

33
## master / unreleased
4+
* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317
45

56
* [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit to maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storare combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260
67
* [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262

docs/configuration/config-file-reference.md

+5
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,11 @@ ring:
16171617
# processing will ignore them instead. Subject to sharding.
16181618
# CLI flag: -ruler.disabled-tenants
16191619
[disabled_tenants: <string> | default = ""]
1620+
1621+
# Report the wall time for ruler queries to complete as a per user metric and as
1622+
# an info level log message.
1623+
# CLI flag: -ruler.query-stats-enabled
1624+
[query_stats_enabled: <boolean> | default = false]
16201625
```
16211626
16221627
### `ruler_storage_config`

pkg/frontend/transport/handler.go

+1
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ func (f *Handler) reportQueryStats(r *http.Request, queryString url.Values, quer
173173
// Log stats.
174174
logMessage := append([]interface{}{
175175
"msg", "query stats",
176+
"component", "query-frontend",
176177
"method", r.Method,
177178
"path", r.URL.Path,
178179
"response_time", queryResponseTime,

pkg/ruler/compat.go

+42-3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"time"
77

88
"github.com/go-kit/kit/log"
9+
"github.com/go-kit/kit/log/level"
910
"github.com/prometheus/client_golang/prometheus"
1011
"github.com/prometheus/client_golang/prometheus/promauto"
1112
"github.com/prometheus/prometheus/notifier"
@@ -20,6 +21,7 @@ import (
2021

2122
"github.com/cortexproject/cortex/pkg/cortexpb"
2223
"github.com/cortexproject/cortex/pkg/querier"
24+
util_log "github.com/cortexproject/cortex/pkg/util/log"
2325
)
2426

2527
// Pusher is an ingester server that accepts pushes.
@@ -146,9 +148,7 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules
146148
func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
147149
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
148150
queries.Inc()
149-
150151
result, err := qf(ctx, qs, t)
151-
152152
// We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil.
153153
// It returns promql.ErrStorage, if error should be reported back as 500.
154154
// Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures),
@@ -163,6 +163,33 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
163163
}
164164
}
165165

166+
func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
167+
if queryTime == nil {
168+
return qf
169+
}
170+
171+
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
172+
// If we've been passed a counter we want to record the wall time spent executing this request.
173+
timer := prometheus.NewTimer(nil)
174+
defer func() {
175+
querySeconds := timer.ObserveDuration().Seconds()
176+
queryTime.Add(querySeconds)
177+
178+
// Log ruler query stats.
179+
logMessage := []interface{}{
180+
"msg", "query stats",
181+
"component", "ruler",
182+
"cortex_ruler_query_seconds_total", querySeconds,
183+
"query", qs,
184+
}
185+
level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...)
186+
}()
187+
188+
result, err := qf(ctx, qs, t)
189+
return result, err
190+
}
191+
}
192+
166193
// This interface mimicks rules.Manager API. Interface is used to simplify tests.
167194
type RulesManager interface {
168195
// Starts rules manager. Blocks until Stop is called.
@@ -199,12 +226,24 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
199226
Name: "cortex_ruler_queries_failed_total",
200227
Help: "Number of failed queries by ruler.",
201228
})
229+
var rulerQuerySeconds *prometheus.CounterVec
230+
if cfg.EnableQueryStats {
231+
rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
232+
Name: "cortex_ruler_query_seconds_total",
233+
Help: "Total amount of wall clock time spent processing queries by the ruler.",
234+
}, []string{"user"})
235+
}
202236

203237
return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
238+
var queryTime prometheus.Counter = nil
239+
if rulerQuerySeconds != nil {
240+
queryTime = rulerQuerySeconds.WithLabelValues(userID)
241+
}
242+
204243
return rules.NewManager(&rules.ManagerOptions{
205244
Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
206245
Queryable: q,
207-
QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries),
246+
QueryFunc: RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger),
208247
Context: user.InjectOrgID(ctx, userID),
209248
ExternalURL: cfg.ExternalURL.URL,
210249
NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()),

pkg/ruler/compat_test.go

+14-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"testing"
99
"time"
1010

11+
"github.com/go-kit/kit/log"
1112
"github.com/prometheus/client_golang/prometheus"
1213
"github.com/prometheus/client_golang/prometheus/testutil"
1314
"github.com/prometheus/common/model"
@@ -230,7 +231,6 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
230231
mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
231232
return promql.Vector{}, tc.returnedError
232233
}
233-
234234
qf := MetricsQueryFunc(mockFunc, queries, failures)
235235

236236
_, err := qf(context.Background(), "test", time.Now())
@@ -241,3 +241,16 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
241241
})
242242
}
243243
}
244+
245+
func TestRecordAndReportRuleQueryMetrics(t *testing.T) {
246+
queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"})
247+
248+
mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
249+
time.Sleep(1 * time.Second)
250+
return promql.Vector{}, nil
251+
}
252+
qf := RecordAndReportRuleQueryMetrics(mockFunc, queryTime.WithLabelValues("userID"), log.NewNopLogger())
253+
_, _ = qf(context.Background(), "test", time.Now())
254+
255+
require.GreaterOrEqual(t, testutil.ToFloat64(queryTime.WithLabelValues("userID")), float64(1))
256+
}

pkg/ruler/ruler.go

+4
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ type Config struct {
115115
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`
116116

117117
RingCheckPeriod time.Duration `yaml:"-"`
118+
119+
EnableQueryStats bool `yaml:"query_stats_enabled"`
118120
}
119121

120122
// Validate config and returns error on failure
@@ -173,6 +175,8 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
173175
f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.")
174176
f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")
175177

178+
f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.")
179+
176180
cfg.RingCheckPeriod = 5 * time.Second
177181
}
178182

pkg/ruler/ruler_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func defaultRulerConfig(store rulestore.RuleStore) (Config, func()) {
6060
cfg.Ring.ListenPort = 0
6161
cfg.Ring.InstanceAddr = "localhost"
6262
cfg.Ring.InstanceID = "localhost"
63+
cfg.EnableQueryStats = false
6364

6465
// Create a cleanup function that will be called at the end of the test
6566
cleanup := func() {

0 commit comments

Comments
 (0)