Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New ruler alerts that ignore user-errors #4281

Merged
merged 6 commits into from
Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overridable). When the limit is reached, Dispatcher produces a log message and increases the `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254
* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control max number of alerts and total size of alerts that a single user can have in Alertmanager's memory. Adding more alerts will fail with a log message and increment the `cortex_alertmanager_alerts_insert_limited_total` metric (per-user). These limits can be overridden by using per-tenant overrides. Current values are tracked in `cortex_alertmanager_alerts_limiter_current_alerts` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253
* [ENHANCEMENT] Store-gateway: added `-store-gateway.sharding-ring.wait-stability-min-duration` and `-store-gateway.sharding-ring.wait-stability-max-duration` support to store-gateway, to wait for ring stability at startup. #4271
* [ENHANCEMENT] Ruler: added new metrics for tracking the total number of queries and push requests sent to ingesters, as well as failed queries and push requests. Failures are only counted for internal errors, not for user errors such as exceeded limits or invalid queries. This is in contrast to the existing `cortex_prometheus_rule_evaluation_failures_total`, which is also incremented when a query or appending samples fails due to user errors. #4281
* `cortex_ruler_write_requests_total`
* `cortex_ruler_write_requests_failed_total`
* `cortex_ruler_queries_total`
* `cortex_ruler_queries_failed_total`
* [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128
* [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176
* [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184
Expand Down
4 changes: 2 additions & 2 deletions pkg/api/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ func NewQuerierHandler(

api := v1.NewAPI(
engine,
errorTranslateQueryable{queryable}, // Translate errors to errors expected by API.
nil, // No remote write support.
querier.NewErrorTranslateQueryable(queryable), // Translate errors to errors expected by API.
nil, // No remote write support.
exemplarQueryable,
func(context.Context) v1.TargetRetriever { return &querier.DummyTargetRetriever{} },
func(context.Context) v1.AlertmanagerRetriever { return &querier.DummyAlertmanagerRetriever{} },
Expand Down
2 changes: 1 addition & 1 deletion pkg/cortex/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,7 @@ func (t *Cortex) initRuler() (serv services.Service, err error) {
// TODO: Consider wrapping logger to differentiate from querier module logger
queryable, _, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, t.TombstonesLoader, rulerRegisterer, util_log.Logger)

managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides)
managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides, prometheus.DefaultRegisterer)
manager, err := ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, prometheus.DefaultRegisterer, util_log.Logger)
if err != nil {
return nil, err
Expand Down
52 changes: 31 additions & 21 deletions pkg/api/queryable.go → pkg/querier/error_translate_queryable.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package api
package querier

import (
"context"
Expand All @@ -13,20 +13,26 @@ import (
"github.com/cortexproject/cortex/pkg/util/validation"
)

func translateError(err error) error {
// TranslateToPromqlAPIError converts an error to one of the promql.Errors for consumption in the PromQL API.
// The PromQL API only recognizes a few errors, and converts everything else to HTTP status code 422.
//
// Specifically, it supports:
//
// promql.ErrQueryCanceled, mapped to 503
// promql.ErrQueryTimeout, mapped to 503
// promql.ErrStorage mapped to 500
// anything else is mapped to 422
//
// Querier code produces different kinds of errors, and we want to map them to the above-mentioned HTTP status codes correctly.
//
// Details:
// - vendor/github.com/prometheus/prometheus/web/api/v1/api.go, respondError function only accepts *apiError types.
// - translation of error to *apiError happens in vendor/github.com/prometheus/prometheus/web/api/v1/api.go, returnAPIError method.
func TranslateToPromqlAPIError(err error) error {
if err == nil {
return err
}

// vendor/github.com/prometheus/prometheus/web/api/v1/api.go, respondError function only accepts
// *apiError types.
// Translation of error to *apiError happens in vendor/github.com/prometheus/prometheus/web/api/v1/api.go, returnAPIError method.
// It only supports:
// promql.ErrQueryCanceled, mapped to 503
// promql.ErrQueryTimeout, mapped to 503
// promql.ErrStorage mapped to 500
// anything else is mapped to 422

switch errors.Cause(err).(type) {
case promql.ErrStorage, promql.ErrTooManySamples, promql.ErrQueryCanceled, promql.ErrQueryTimeout:
// Don't translate those, just in case we use them internally.
Expand Down Expand Up @@ -63,18 +69,22 @@ func translateError(err error) error {
}
}

// NewErrorTranslateQueryable returns a storage.SampleAndChunkQueryable that
// wraps q in an errorTranslateQueryable, so that errors surfaced through the
// returned queryable go through the PromQL API error translation.
func NewErrorTranslateQueryable(q storage.SampleAndChunkQueryable) storage.SampleAndChunkQueryable {
	wrapped := errorTranslateQueryable{q: q}
	return wrapped
}

// errorTranslateQueryable is a storage.SampleAndChunkQueryable wrapper whose
// methods delegate to q and translate the errors it returns.
type errorTranslateQueryable struct {
// q is the wrapped queryable that all calls are delegated to.
q storage.SampleAndChunkQueryable
}

// Querier asks the wrapped queryable for a querier covering [mint, maxt] and
// returns it wrapped in an errorTranslateQuerier. The error from the wrapped
// call is passed through the PromQL API error translation. Note that the
// wrapper is returned even when the wrapped call reports an error.
func (e errorTranslateQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
q, err := e.q.Querier(ctx, mint, maxt)
return errorTranslateQuerier{q: q}, translateError(err)
return errorTranslateQuerier{q: q}, TranslateToPromqlAPIError(err)
}

// ChunkQuerier asks the wrapped queryable for a chunk querier covering
// [mint, maxt] and returns it wrapped in an errorTranslateChunkQuerier. The
// error from the wrapped call is passed through the PromQL API error
// translation. Note that the wrapper is returned even when the wrapped call
// reports an error.
func (e errorTranslateQueryable) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
q, err := e.q.ChunkQuerier(ctx, mint, maxt)
return errorTranslateChunkQuerier{q: q}, translateError(err)
return errorTranslateChunkQuerier{q: q}, TranslateToPromqlAPIError(err)
}

type errorTranslateQuerier struct {
Expand All @@ -83,16 +93,16 @@ type errorTranslateQuerier struct {

// LabelValues forwards the call to the wrapped querier. Values and warnings
// are returned unchanged; only the error goes through the PromQL API error
// translation.
func (e errorTranslateQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
values, warnings, err := e.q.LabelValues(name, matchers...)
return values, warnings, translateError(err)
return values, warnings, TranslateToPromqlAPIError(err)
}

// LabelNames forwards the call to the wrapped querier. Names and warnings are
// returned unchanged; only the error goes through the PromQL API error
// translation.
func (e errorTranslateQuerier) LabelNames() ([]string, storage.Warnings, error) {
values, warnings, err := e.q.LabelNames()
return values, warnings, translateError(err)
return values, warnings, TranslateToPromqlAPIError(err)
}

// Close closes the wrapped querier and returns its error after passing it
// through the PromQL API error translation.
func (e errorTranslateQuerier) Close() error {
return translateError(e.q.Close())
return TranslateToPromqlAPIError(e.q.Close())
}

func (e errorTranslateQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
Expand All @@ -106,16 +116,16 @@ type errorTranslateChunkQuerier struct {

// LabelValues forwards the call to the wrapped chunk querier. Values and
// warnings are returned unchanged; only the error goes through the PromQL API
// error translation.
func (e errorTranslateChunkQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
values, warnings, err := e.q.LabelValues(name, matchers...)
return values, warnings, translateError(err)
return values, warnings, TranslateToPromqlAPIError(err)
}

// LabelNames forwards the call to the wrapped chunk querier. Names and
// warnings are returned unchanged; only the error goes through the PromQL API
// error translation.
func (e errorTranslateChunkQuerier) LabelNames() ([]string, storage.Warnings, error) {
values, warnings, err := e.q.LabelNames()
return values, warnings, translateError(err)
return values, warnings, TranslateToPromqlAPIError(err)
}

// Close closes the wrapped chunk querier and returns its error after passing
// it through the PromQL API error translation.
func (e errorTranslateChunkQuerier) Close() error {
return translateError(e.q.Close())
return TranslateToPromqlAPIError(e.q.Close())
}

func (e errorTranslateChunkQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.ChunkSeriesSet {
Expand All @@ -136,7 +146,7 @@ func (e errorTranslateSeriesSet) At() storage.Series {
}

// Err returns the wrapped series set's error after passing it through the
// PromQL API error translation.
func (e errorTranslateSeriesSet) Err() error {
return translateError(e.s.Err())
return TranslateToPromqlAPIError(e.s.Err())
}

func (e errorTranslateSeriesSet) Warnings() storage.Warnings {
Expand All @@ -156,7 +166,7 @@ func (e errorTranslateChunkSeriesSet) At() storage.ChunkSeries {
}

// Err returns the wrapped chunk series set's error after passing it through
// the PromQL API error translation.
func (e errorTranslateChunkSeriesSet) Err() error {
return translateError(e.s.Err())
return TranslateToPromqlAPIError(e.s.Err())
}

func (e errorTranslateChunkSeriesSet) Warnings() storage.Warnings {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package api
package querier

import (
"context"
Expand All @@ -9,6 +9,7 @@ import (
"testing"
"time"

"github.com/go-kit/kit/log"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/route"
Expand All @@ -22,8 +23,6 @@ import (
"github.com/weaveworks/common/user"

"github.com/cortexproject/cortex/pkg/chunk"
"github.com/cortexproject/cortex/pkg/querier"
util_log "github.com/cortexproject/cortex/pkg/util/log"
"github.com/cortexproject/cortex/pkg/util/validation"
)

Expand Down Expand Up @@ -109,9 +108,9 @@ func TestApiStatusCodes(t *testing.T) {
},
} {
for k, q := range map[string]storage.SampleAndChunkQueryable{
"error from queryable": testQueryable{err: tc.err},
"error from querier": testQueryable{q: testQuerier{err: tc.err}},
"error from seriesset": testQueryable{q: testQuerier{s: testSeriesSet{err: tc.err}}},
"error from queryable": errorTestQueryable{err: tc.err},
"error from querier": errorTestQueryable{q: errorTestQuerier{err: tc.err}},
"error from seriesset": errorTestQueryable{q: errorTestQuerier{s: errorTestSeriesSet{err: tc.err}}},
} {
t.Run(fmt.Sprintf("%s/%d", k, ix), func(t *testing.T) {
r := createPrometheusAPI(errorTranslateQueryable{q: q})
Expand All @@ -131,7 +130,7 @@ func TestApiStatusCodes(t *testing.T) {

func createPrometheusAPI(q storage.SampleAndChunkQueryable) *route.Router {
engine := promql.NewEngine(promql.EngineOpts{
Logger: util_log.Logger,
Logger: log.NewNopLogger(),
Reg: nil,
ActiveQueryTracker: nil,
MaxSamples: 100,
Expand All @@ -143,17 +142,17 @@ func createPrometheusAPI(q storage.SampleAndChunkQueryable) *route.Router {
q,
nil,
nil,
func(context.Context) v1.TargetRetriever { return &querier.DummyTargetRetriever{} },
func(context.Context) v1.AlertmanagerRetriever { return &querier.DummyAlertmanagerRetriever{} },
func(context.Context) v1.TargetRetriever { return &DummyTargetRetriever{} },
func(context.Context) v1.AlertmanagerRetriever { return &DummyAlertmanagerRetriever{} },
func() config.Config { return config.Config{} },
map[string]string{}, // TODO: include configuration flags
v1.GlobalURLOptions{},
func(f http.HandlerFunc) http.HandlerFunc { return f },
nil, // Only needed for admin APIs.
"", // This is for snapshots, which is disabled when admin APIs are disabled. Hence empty.
false, // Disable admin APIs.
util_log.Logger,
func(context.Context) v1.RulesRetriever { return &querier.DummyRulesRetriever{} },
log.NewNopLogger(),
func(context.Context) v1.RulesRetriever { return &DummyRulesRetriever{} },
0, 0, 0, // Remote read samples and concurrency limit.
regexp.MustCompile(".*"),
func() (v1.RuntimeInfo, error) { return v1.RuntimeInfo{}, errors.New("not implemented") },
Expand All @@ -168,62 +167,62 @@ func createPrometheusAPI(q storage.SampleAndChunkQueryable) *route.Router {
return promRouter
}

// errorTestQueryable is a test queryable whose Querier method hands out a
// preconfigured querier or, when none is set, a preconfigured error.
type testQueryable struct {
type errorTestQueryable struct {
// q, when non-nil, is the querier returned by Querier.
q storage.Querier
// err is the error returned by ChunkQuerier, and by Querier when q is nil.
err error
}

// ChunkQuerier never provides a chunk querier: it always returns nil together
// with the configured err (which may itself be nil).
func (t testQueryable) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
func (t errorTestQueryable) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
return nil, t.err
}

// Querier returns the configured querier q with a nil error when q is set;
// otherwise it returns a nil querier and the configured err.
func (t testQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
func (t errorTestQueryable) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
if t.q != nil {
return t.q, nil
}
return nil, t.err
}

// errorTestQuerier is a test querier that reports a preconfigured error from
// its label methods and serves a preconfigured series set from Select.
type testQuerier struct {
type errorTestQuerier struct {
// s, when non-nil, is the series set returned by Select.
s storage.SeriesSet
// err is returned by LabelValues and LabelNames, and is used by Select to
// build an error series set when s is nil.
err error
}

// LabelValues ignores its arguments and returns no values, no warnings and the
// configured err.
func (t testQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
func (t errorTestQuerier) LabelValues(name string, matchers ...*labels.Matcher) ([]string, storage.Warnings, error) {
return nil, nil, t.err
}

// LabelNames returns no names, no warnings and the configured err.
func (t testQuerier) LabelNames() ([]string, storage.Warnings, error) {
func (t errorTestQuerier) LabelNames() ([]string, storage.Warnings, error) {
return nil, nil, t.err
}

// Close is a no-op that always returns nil; the configured err is not used here.
func (t testQuerier) Close() error {
func (t errorTestQuerier) Close() error {
return nil
}

// Select ignores its arguments. It returns the configured series set s when
// one is set; otherwise it returns storage.ErrSeriesSet built from the
// configured err.
func (t testQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
func (t errorTestQuerier) Select(sortSeries bool, hints *storage.SelectHints, matchers ...*labels.Matcher) storage.SeriesSet {
if t.s != nil {
return t.s
}
return storage.ErrSeriesSet(t.err)
}

// errorTestSeriesSet is a test series set that yields no series and reports a
// preconfigured error from Err.
type testSeriesSet struct {
type errorTestSeriesSet struct {
// err is the error returned by Err.
err error
}

// Next always returns false, so the set never yields a series.
func (t testSeriesSet) Next() bool {
func (t errorTestSeriesSet) Next() bool {
return false
}

// At always returns nil; Next never reports an available series.
func (t testSeriesSet) At() storage.Series {
func (t errorTestSeriesSet) At() storage.Series {
return nil
}

// Err returns the configured err.
func (t testSeriesSet) Err() error {
func (t errorTestSeriesSet) Err() error {
return t.err
}

// Warnings always returns nil.
func (t testSeriesSet) Warnings() storage.Warnings {
func (t errorTestSeriesSet) Warnings() storage.Warnings {
return nil
}
Loading