Skip to content

Commit fbf0677

Browse files
authored
New ruler alerts that ignore user-errors (#4281)
* Move error translate queryable to querier package, to allow reuse. Signed-off-by: Peter Štibraný <[email protected]> * Introduce new metrics for tracking ruler evaluation and append errors. Signed-off-by: Peter Štibraný <[email protected]> * Added PR number. Signed-off-by: Peter Štibraný <[email protected]> * Fix name. Signed-off-by: Peter Štibraný <[email protected]> * Make lint happy. Signed-off-by: Peter Štibraný <[email protected]> * Address review feedback. Signed-off-by: Peter Štibraný <[email protected]>
1 parent ea53377 commit fbf0677

8 files changed

+273
-58
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@
4141
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overridable). When the limit is reached, Dispatcher produces a log message and increases the `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254
4242
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that a single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and increment the `cortex_alertmanager_alerts_insert_limited_total` metric (per-user). These limits can be overridden by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253
4343
* [ENHANCEMENT] Store-gateway: added `-store-gateway.sharding-ring.wait-stability-min-duration` and `-store-gateway.sharding-ring.wait-stability-max-duration` support to store-gateway, to wait for ring stability at startup. #4271
44+
* [ENHANCEMENT] Ruler: added new metrics for tracking the total number of queries and push requests sent to ingester, as well as failed queries and push requests. Failures are only counted for internal errors, not for user-errors like limits or invalid queries. This is in contrast to the existing `cortex_prometheus_rule_evaluation_failures_total`, which is also incremented when a query or appending of samples fails due to user-errors. #4281
45+
* `cortex_ruler_write_requests_total`
46+
* `cortex_ruler_write_requests_failed_total`
47+
* `cortex_ruler_queries_total`
48+
* `cortex_ruler_queries_failed_total`
4449
* [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128
4550
* [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176
4651
* [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184

pkg/api/handlers.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,8 @@ func NewQuerierHandler(
195195

196196
api := v1.NewAPI(
197197
engine,
198-
errorTranslateQueryable{queryable}, // Translate errors to errors expected by API.
199-
nil, // No remote write support.
198+
querier.NewErrorTranslateQueryable(queryable), // Translate errors to errors expected by API.
199+
nil, // No remote write support.
200200
exemplarQueryable,
201201
func(context.Context) v1.TargetRetriever { return &querier.DummyTargetRetriever{} },
202202
func(context.Context) v1.AlertmanagerRetriever { return &querier.DummyAlertmanagerRetriever{} },

pkg/cortex/modules.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -662,7 +662,7 @@ func (t *Cortex) initRuler() (serv services.Service, err error) {
662662
// TODO: Consider wrapping logger to differentiate from querier module logger
663663
queryable, _, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, rulerRegisterer, util_log.Logger)
664664

665-
managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides)
665+
managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides, prometheus.DefaultRegisterer)
666666
manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, prometheus.DefaultRegisterer, util_log.Logger)
667667
if err != nil {
668668
return nil, err

pkg/api/queryable.go pkg/querier/error_translate_queryable.go

+31-21
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package api
1+
package querier
22

33
import (
44
"context"
@@ -13,20 +13,26 @@ import (
1313
"github.com/cortexproject/cortex/pkg/util/validation"
1414
)
1515

16-
func translateError(err error) error {
16+
// TranslateToPromqlAPIError converts error to one of promql.Errors for consumption in PromQL API.
17+
// PromQL API recognizes only a few errors, and converts everything else to HTTP status code 422.
18+
//
19+
// Specifically, it supports:
20+
//
21+
// promql.ErrQueryCanceled, mapped to 503
22+
// promql.ErrQueryTimeout, mapped to 503
23+
// promql.ErrStorage mapped to 500
24+
// anything else is mapped to 422
25+
//
26+
// Querier code produces different kinds of errors, and we want to map them to above-mentioned HTTP status codes correctly.
27+
//
28+
// Details:
29+
// - vendor/github.com/prometheus/prometheus/web/api/v1/api.go, respondError function only accepts *apiError types.
30+
// - translation of error to *apiError happens in vendor/github.com/prometheus/prometheus/web/api/v1/api.go, returnAPIError method.
31+
func TranslateToPromqlAPIError(err error) error {
1732
if err == nil {
1833
return err
1934
}
2035

21-
// vendor/github.com/prometheus/prometheus/web/api/v1/api.go, respondError function only accepts
22-
// *apiError types.
23-
// Translation of error to *apiError happens in vendor/github.com/prometheus/prometheus/web/api/v1/api.go, returnAPIError method.
24-
// It only supports:
25-
// promql.ErrQueryCanceled, mapped to 503
26-
// promql.ErrQueryTimeout, mapped to 503
27-
// promql.ErrStorage mapped to 500
28-
// anything else is mapped to 422
29-
3036
switch errors.Cause(err).(type) {
3137
case promql.ErrStorage, promql.ErrTooManySamples, promql.ErrQueryCanceled, promql.ErrQueryTimeout:
3238
// Don't translate those, just in case we use them internally.
@@ -63,18 +69,22 @@ func translateError(err error) error {
6369
}
6470
}
6571

72+
func NewErrorTranslateQueryable(q storage.SampleAndChunkQueryable) storage.SampleAndChunkQueryable {
73+
return errorTranslateQueryable{q}
74+
}
75+
6676
type errorTranslateQueryable struct {
6777
q storage.SampleAndChunkQueryable
6878
}
6979

7080
func (e errorTranslateQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
7181
q, err := e.q.Querier(ctx, mint, maxt)
72-
return errorTranslateQuerier{q: q}, translateError(err)
82+
return errorTranslateQuerier{q: q}, TranslateToPromqlAPIError(err)
7383
}
7484

7585
func (e errorTranslateQueryable) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
7686
q, err := e.q.ChunkQuerier(ctx, mint, maxt)
77-
return errorTranslateChunkQuerier{q: q}, translateError(err)
87+
return errorTranslateChunkQuerier{q: q}, TranslateToPromqlAPIError(err)
7888
}
7989

8090
type errorTranslateQuerier struct {
@@ -83,16 +93,16 @@ type errorTranslateQuerier struct {
8393

8494
func (e errorTranslateQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
8595
values, warnings, err := e.q.LabelValues(name, matchers...)
86-
return values, warnings, translateError(err)
96+
return values, warnings, TranslateToPromqlAPIError(err)
8797
}
8898

8999
func (e errorTranslateQuerier) LabelNames() ([]string, storage.Warnings, error) {
90100
values, warnings, err := e.q.LabelNames()
91-
return values, warnings, translateError(err)
101+
return values, warnings, TranslateToPromqlAPIError(err)
92102
}
93103

94104
func (e errorTranslateQuerier) Close() error {
95-
return translateError(e.q.Close())
105+
return TranslateToPromqlAPIError(e.q.Close())
96106
}
97107

98108
func (e errorTranslateQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
@@ -106,16 +116,16 @@ type errorTranslateChunkQuerier struct {
106116

107117
func (e errorTranslateChunkQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
108118
values, warnings, err := e.q.LabelValues(name, matchers...)
109-
return values, warnings, translateError(err)
119+
return values, warnings, TranslateToPromqlAPIError(err)
110120
}
111121

112122
func (e errorTranslateChunkQuerier) LabelNames() ([]string, storage.Warnings, error) {
113123
values, warnings, err := e.q.LabelNames()
114-
return values, warnings, translateError(err)
124+
return values, warnings, TranslateToPromqlAPIError(err)
115125
}
116126

117127
func (e errorTranslateChunkQuerier) Close() error {
118-
return translateError(e.q.Close())
128+
return TranslateToPromqlAPIError(e.q.Close())
119129
}
120130

121131
func (e errorTranslateChunkQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.ChunkSeriesSet {
@@ -136,7 +146,7 @@ func (e errorTranslateSeriesSet) At() storage.Series {
136146
}
137147

138148
func (e errorTranslateSeriesSet) Err() error {
139-
return translateError(e.s.Err())
149+
return TranslateToPromqlAPIError(e.s.Err())
140150
}
141151

142152
func (e errorTranslateSeriesSet) Warnings() storage.Warnings {
@@ -156,7 +166,7 @@ func (e errorTranslateChunkSeriesSet) At() storage.ChunkSeries {
156166
}
157167

158168
func (e errorTranslateChunkSeriesSet) Err() error {
159-
return translateError(e.s.Err())
169+
return TranslateToPromqlAPIError(e.s.Err())
160170
}
161171

162172
func (e errorTranslateChunkSeriesSet) Warnings() storage.Warnings {

pkg/api/queryable_test.go pkg/querier/error_translate_queryable_test.go

+23-24
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package api
1+
package querier
22

33
import (
44
"context"
@@ -9,6 +9,7 @@ import (
99
"testing"
1010
"time"
1111

12+
"github.com/go-kit/kit/log"
1213
"github.com/pkg/errors"
1314
"github.com/prometheus/client_golang/prometheus"
1415
"github.com/prometheus/common/route"
@@ -22,8 +23,6 @@ import (
2223
"github.com/weaveworks/common/user"
2324

2425
"github.com/cortexproject/cortex/pkg/chunk"
25-
"github.com/cortexproject/cortex/pkg/querier"
26-
util_log "github.com/cortexproject/cortex/pkg/util/log"
2726
"github.com/cortexproject/cortex/pkg/util/validation"
2827
)
2928

@@ -109,9 +108,9 @@ func TestApiStatusCodes(t *testing.T) {
109108
},
110109
} {
111110
for k, q := range map[string]storage.SampleAndChunkQueryable{
112-
"error from queryable": testQueryable{err: tc.err},
113-
"error from querier": testQueryable{q: testQuerier{err: tc.err}},
114-
"error from seriesset": testQueryable{q: testQuerier{s: testSeriesSet{err: tc.err}}},
111+
"error from queryable": errorTestQueryable{err: tc.err},
112+
"error from querier": errorTestQueryable{q: errorTestQuerier{err: tc.err}},
113+
"error from seriesset": errorTestQueryable{q: errorTestQuerier{s: errorTestSeriesSet{err: tc.err}}},
115114
} {
116115
t.Run(fmt.Sprintf("%s/%d", k, ix), func(t *testing.T) {
117116
r := createPrometheusAPI(errorTranslateQueryable{q: q})
@@ -131,7 +130,7 @@ func TestApiStatusCodes(t *testing.T) {
131130

132131
func createPrometheusAPI(q storage.SampleAndChunkQueryable) *route.Router {
133132
engine := promql.NewEngine(promql.EngineOpts{
134-
Logger: util_log.Logger,
133+
Logger: log.NewNopLogger(),
135134
Reg: nil,
136135
ActiveQueryTracker: nil,
137136
MaxSamples: 100,
@@ -143,17 +142,17 @@ func createPrometheusAPI(q storage.SampleAndChunkQueryable) *route.Router {
143142
q,
144143
nil,
145144
nil,
146-
func(context.Context) v1.TargetRetriever { return &querier.DummyTargetRetriever{} },
147-
func(context.Context) v1.AlertmanagerRetriever { return &querier.DummyAlertmanagerRetriever{} },
145+
func(context.Context) v1.TargetRetriever { return &DummyTargetRetriever{} },
146+
func(context.Context) v1.AlertmanagerRetriever { return &DummyAlertmanagerRetriever{} },
148147
func() config.Config { return config.Config{} },
149148
map[string]string{}, // TODO: include configuration flags
150149
v1.GlobalURLOptions{},
151150
func(f http.HandlerFunc) http.HandlerFunc { return f },
152151
nil, // Only needed for admin APIs.
153152
"", // This is for snapshots, which is disabled when admin APIs are disabled. Hence empty.
154153
false, // Disable admin APIs.
155-
util_log.Logger,
156-
func(context.Context) v1.RulesRetriever { return &querier.DummyRulesRetriever{} },
154+
log.NewNopLogger(),
155+
func(context.Context) v1.RulesRetriever { return &DummyRulesRetriever{} },
157156
0, 0, 0, // Remote read samples and concurrency limit.
158157
regexp.MustCompile(".*"),
159158
func() (v1.RuntimeInfo, error) { return v1.RuntimeInfo{}, errors.New("not implemented") },
@@ -168,62 +167,62 @@ func createPrometheusAPI(q storage.SampleAndChunkQueryable) *route.Router {
168167
return promRouter
169168
}
170169

171-
type testQueryable struct {
170+
type errorTestQueryable struct {
172171
q storage.Querier
173172
err error
174173
}
175174

176-
func (t testQueryable) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
175+
func (t errorTestQueryable) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
177176
return nil, t.err
178177
}
179178

180-
func (t testQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
179+
func (t errorTestQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
181180
if t.q != nil {
182181
return t.q, nil
183182
}
184183
return nil, t.err
185184
}
186185

187-
type testQuerier struct {
186+
type errorTestQuerier struct {
188187
s storage.SeriesSet
189188
err error
190189
}
191190

192-
func (t testQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
191+
func (t errorTestQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
193192
return nil, nil, t.err
194193
}
195194

196-
func (t testQuerier) LabelNames() ([]string, storage.Warnings, error) {
195+
func (t errorTestQuerier) LabelNames() ([]string, storage.Warnings, error) {
197196
return nil, nil, t.err
198197
}
199198

200-
func (t testQuerier) Close() error {
199+
func (t errorTestQuerier) Close() error {
201200
return nil
202201
}
203202

204-
func (t testQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
203+
func (t errorTestQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
205204
if t.s != nil {
206205
return t.s
207206
}
208207
return storage.ErrSeriesSet(t.err)
209208
}
210209

211-
type testSeriesSet struct {
210+
type errorTestSeriesSet struct {
212211
err error
213212
}
214213

215-
func (t testSeriesSet) Next() bool {
214+
func (t errorTestSeriesSet) Next() bool {
216215
return false
217216
}
218217

219-
func (t testSeriesSet) At() storage.Series {
218+
func (t errorTestSeriesSet) At() storage.Series {
220219
return nil
221220
}
222221

223-
func (t testSeriesSet) Err() error {
222+
func (t errorTestSeriesSet) Err() error {
224223
return t.err
225224
}
226225

227-
func (t testSeriesSet) Warnings() storage.Warnings {
226+
func (t errorTestSeriesSet) Warnings() storage.Warnings {
228227
return nil
229228
}

0 commit comments

Comments
 (0)