Add a new config and metric for reporting ruler query execution wall time. #4317

Merged · 16 commits · Jul 9, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -22,6 +22,7 @@
* Ensure that a ring store is configured using `-alertmanager.sharding-ring.store`, and set the flags relevant to the chosen store type.
* Enable the feature using `-alertmanager.sharding-enabled`.
* Note the prior addition of a new configuration option `-alertmanager.persist-interval`. This sets the interval between persisting the current alertmanager state (notification log and silences) to object storage. See the [configuration file reference](https://cortexmetrics.io/docs/configuration/configuration-file/#alertmanager_config) for more information.
* [FEATURE] Ruler: Add a new `-ruler.enable-query-stats` flag which, when enabled, reports the `cortex_ruler_query_seconds_total` metric tracking the total wall clock time, in seconds, spent executing queries in the ruler. #4317
* [ENHANCEMENT] Alertmanager: Cleanup persisted state objects from remote storage when a tenant configuration is deleted. #4167
* [ENHANCEMENT] Storage: Added the ability to disable Open Census within GCS client (e.g `-gcs.enable-opencensus=false`). #4219
* [ENHANCEMENT] Etcd: Added username and password to etcd config. #4205
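For context on the CHANGELOG entry above: the new metric is a plain cumulative counter, labelled by tenant, to which each rule query adds its wall clock duration. A minimal sketch of that accumulation, assuming only the metric and label names from this PR (the tenant name and values are illustrative):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	querySeconds := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "cortex_ruler_query_seconds_total",
		Help: "Total amount of wall clock time spent processing queries by the ruler.",
	}, []string{"user"})

	// Two rule queries for the same tenant simply add up.
	querySeconds.WithLabelValues("team-a").Add(0.25)
	querySeconds.WithLabelValues("team-a").Add(0.75)

	// Prints 1: each per-user series is the running sum of query wall time in seconds.
	fmt.Println(testutil.ToFloat64(querySeconds.WithLabelValues("team-a")))
}
```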
4 changes: 4 additions & 0 deletions docs/configuration/config-file-reference.md
@@ -1616,6 +1616,10 @@ ring:
# processing will ignore them instead. Subject to sharding.
# CLI flag: -ruler.disabled-tenants
[disabled_tenants: <string> | default = ""]

# Report the wall time for ruler queries to complete as a metric.
# CLI flag: -ruler.enable-query-stats
[enable_query_stats: <boolean> | default = false]
```

### `ruler_storage_config`
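The `enable_query_stats` option documented above gates whether the metric exists at all: the ruler only registers the per-user counter vec when the flag is set, so no extra series are exported by default. A rough, self-contained sketch of that gating pattern, assuming the flag name, default, and metric definition from this PR (the surrounding program is illustrative):

```go
package main

import (
	"flag"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	enableQueryStats := flag.Bool("ruler.enable-query-stats", false,
		"Report the wall time for ruler queries to complete as a metric.")
	flag.Parse()

	reg := prometheus.NewRegistry()

	// As in DefaultTenantManagerFactory: the counter vec is only created and
	// registered when the feature is enabled; otherwise it stays nil.
	var querySeconds *prometheus.CounterVec
	if *enableQueryStats {
		querySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "cortex_ruler_query_seconds_total",
			Help: "Total amount of wall clock time spent processing queries by the ruler.",
		}, []string{"user"})
	}

	// The query path checks for nil before recording, as MetricsQueryFunc
	// does with its queryTime argument.
	start := time.Now()
	time.Sleep(10 * time.Millisecond) // stand-in for rule query execution
	if querySeconds != nil {
		querySeconds.WithLabelValues("user-1").Add(time.Since(start).Seconds())
	}
}
```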
19 changes: 17 additions & 2 deletions pkg/ruler/compat.go
@@ -143,10 +143,17 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides Rules
}
}

func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, queryTime *prometheus.CounterVec, userID string) rules.QueryFunc {
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
queries.Inc()

var startTime time.Time
// If we've been passed a counter vec we want to record the wall time spent executing this request.
if queryTime != nil {
startTime = time.Now()
defer func() { queryTime.WithLabelValues(userID).Add(time.Since(startTime).Seconds()) }()
Contributor review comment: You can use prometheus.NewTimer() for this purpose, instead of tracking manually.
}

result, err := qf(ctx, qs, t)

// We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil.
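As a rough illustration of the reviewer's `prometheus.NewTimer()` suggestion above: `NewTimer` takes an `Observer`, so the per-user counter has to be adapted with `prometheus.ObserverFunc`. This is a hypothetical sketch, not the code merged in this PR:

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// recordQueryTime is a hypothetical helper showing the NewTimer() approach:
// the CounterVec is wrapped in an ObserverFunc so the timer's observed
// duration (in seconds) is added to the per-user counter.
func recordQueryTime(queryTime *prometheus.CounterVec, userID string, run func()) {
	if queryTime != nil {
		timer := prometheus.NewTimer(prometheus.ObserverFunc(func(seconds float64) {
			queryTime.WithLabelValues(userID).Add(seconds)
		}))
		defer timer.ObserveDuration()
	}
	run()
}

func main() {
	queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "demo_query_seconds_total",
		Help: "Wall clock time spent in the wrapped function.",
	}, []string{"user"})
	recordQueryTime(queryTime, "user-1", func() { time.Sleep(5 * time.Millisecond) })
}
```

Whether this reads more clearly than the explicit `time.Since` in the diff is a matter of taste; both record the same wall clock seconds.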
@@ -199,12 +206,20 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
Name: "cortex_ruler_queries_failed_total",
Help: "Number of failed queries by ruler.",
})
var rulerQuerySeconds *prometheus.CounterVec
if cfg.RulerEnableQueryStats {
rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Name: "cortex_ruler_query_seconds_total",
Help: "Total amount of wall clock time spend processing queries by the ruler.",
}, []string{"user"})
}

return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {

return rules.NewManager(&rules.ManagerOptions{
Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
Queryable: q,
QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries),
QueryFunc: MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries, rulerQuerySeconds, userID),
Context: user.InjectOrgID(ctx, userID),
ExternalURL: cfg.ExternalURL.URL,
NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()),
22 changes: 20 additions & 2 deletions pkg/ruler/compat_test.go
@@ -226,12 +226,12 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
t.Run(name, func(t *testing.T) {
queries := prometheus.NewCounter(prometheus.CounterOpts{})
failures := prometheus.NewCounter(prometheus.CounterOpts{})
queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"})

mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
return promql.Vector{}, tc.returnedError
}

qf := MetricsQueryFunc(mockFunc, queries, failures)
qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID")

_, err := qf(context.Background(), "test", time.Now())
require.Equal(t, tc.returnedError, err)
@@ -241,3 +241,21 @@
})
}
}

func TestMetricsQueryFuncMetrics(t *testing.T) {
queries := prometheus.NewCounter(prometheus.CounterOpts{})
failures := prometheus.NewCounter(prometheus.CounterOpts{})
queryTime := prometheus.NewCounterVec(prometheus.CounterOpts{}, []string{"user"})

mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
time.Sleep(1 * time.Second)
return promql.Vector{}, nil
}
qf := MetricsQueryFunc(mockFunc, queries, failures, queryTime, "userID")

_, _ = qf(context.Background(), "test", time.Now())

require.Equal(t, 1, int(testutil.ToFloat64(queries)))
require.Equal(t, 0, int(testutil.ToFloat64(failures)))
require.LessOrEqual(t, float64(1), testutil.ToFloat64(queryTime.WithLabelValues("userID")))
}
4 changes: 4 additions & 0 deletions pkg/ruler/ruler.go
@@ -115,6 +115,8 @@ type Config struct {
DisabledTenants flagext.StringSliceCSV `yaml:"disabled_tenants"`

RingCheckPeriod time.Duration `yaml:"-"`

RulerEnableQueryStats bool `yaml:"enable_query_stats"`
}

// Validate config and returns error on failure
@@ -173,6 +175,8 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.Var(&cfg.EnabledTenants, "ruler.enabled-tenants", "Comma separated list of tenants whose rules this ruler can evaluate. If specified, only these tenants will be handled by ruler, otherwise this ruler can process rules from all tenants. Subject to sharding.")
f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")

f.BoolVar(&cfg.RulerEnableQueryStats, "ruler.enable-query-stats", false, "Report the wall time for ruler queries to complete as a metric.")

cfg.RingCheckPeriod = 5 * time.Second
}

1 change: 1 addition & 0 deletions pkg/ruler/ruler_test.go
@@ -60,6 +60,7 @@ func defaultRulerConfig(store rulestore.RuleStore) (Config, func()) {
cfg.Ring.ListenPort = 0
cfg.Ring.InstanceAddr = "localhost"
cfg.Ring.InstanceID = "localhost"
cfg.RulerEnableQueryStats = false

// Create a cleanup function that will be called at the end of the test
cleanup := func() {