Skip to content

Commit 76d5b3c

Browse files
authored
Send new label 'status' for ingester failures (#4442)
* Send new label statusFamily for ingester failures Signed-off-by: Daniel Blando <[email protected]> * update changelog Signed-off-by: Daniel Blando <[email protected]> * Change changelog Signed-off-by: Daniel Blando <[email protected]> * Update label name Signed-off-by: Daniel Blando <[email protected]>
1 parent c2ed27e commit 76d5b3c

File tree

3 files changed

+48
-7
lines changed

3 files changed

+48
-7
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* [FEATURE] Ruler: Add new `-ruler.query-stats-enabled` which when enabled will report the `cortex_ruler_query_seconds_total` as a per-user metric that tracks the sum of the wall time of executing queries in the ruler in seconds. #4317
1818
* [FEATURE] Query Frontend: Add `cortex_query_fetched_series_total` and `cortex_query_fetched_chunks_bytes_total` per-user counters to expose the number of series and bytes fetched as part of queries. These metrics can be enabled with the `-frontend.query-stats-enabled` flag (or its respective YAML config option `query_stats_enabled`). #4343
1919
* [FEATURE] AlertManager: Add support for SNS Receiver. #4382
20+
* [FEATURE] Distributor: Add label `status` (value `4xx` or `5xx`, derived from the ingester error) to metric `cortex_distributor_ingester_append_failures_total` #4442
2021
* [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262
2122
* [ENHANCEMENT] Reduce memory used by streaming queries, particularly in ruler. #4341
2223
* [ENHANCEMENT] Ring: allow experimental configuration of disabling of heartbeat timeouts by setting the relevant configuration value to zero. Applies to the following: #4342

pkg/distributor/distributor.go

+13-3
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove
300300
Namespace: "cortex",
301301
Name: "distributor_ingester_append_failures_total",
302302
Help: "The total number of failed batch appends sent to ingesters.",
303-
}, []string{"ingester", "type"}),
303+
}, []string{"ingester", "type", "status"}),
304304
ingesterQueries: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
305305
Namespace: "cortex",
306306
Name: "distributor_ingester_queries_total",
@@ -819,19 +819,29 @@ func (d *Distributor) send(ctx context.Context, ingester ring.InstanceDesc, time
819819
if len(metadata) > 0 {
820820
d.ingesterAppends.WithLabelValues(ingester.Addr, typeMetadata).Inc()
821821
if err != nil {
822-
d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeMetadata).Inc()
822+
d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeMetadata, getErrorStatus(err)).Inc()
823823
}
824824
}
825825
if len(timeseries) > 0 {
826826
d.ingesterAppends.WithLabelValues(ingester.Addr, typeSamples).Inc()
827827
if err != nil {
828-
d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeSamples).Inc()
828+
d.ingesterAppendFailures.WithLabelValues(ingester.Addr, typeSamples, getErrorStatus(err)).Inc()
829829
}
830830
}
831831

832832
return err
833833
}
834834

835+
// getErrorStatus maps an ingester append error to the status-family string
// used as the "status" label value of the ingester append-failures counter.
//
// It returns "4xx" only when httpgrpc.HTTPResponseFromError reports that err
// carries an HTTP response whose Code divided by 100 equals 4. Every other
// error, including one that carries no HTTP response at all, is reported as
// "5xx".
func getErrorStatus(err error) string {
836+
// Default bucket: anything not positively identified as a 4xx response.
status := "5xx"
837+
httpResp, ok := httpgrpc.HTTPResponseFromError(err)
838+
// Code/100 == 4 selects the 400-499 range, assuming Code is an integer
// type (so the division truncates) — TODO confirm against httpgrpc.
if ok && httpResp.Code/100 == 4 {
839+
status = "4xx"
840+
}
841+
842+
return status
843+
}
844+
835845
// ForReplicationSet runs f, in parallel, for all ingesters in the input replication set.
836846
func (d *Distributor) ForReplicationSet(ctx context.Context, replicationSet ring.ReplicationSet, f func(context.Context, ingester_client.IngesterClient) (interface{}, error)) ([]interface{}, error) {
837847
return replicationSet.Do(ctx, d.cfg.ExtraQueryDelay, func(ctx context.Context, ing *ring.InstanceDesc) (interface{}, error) {

pkg/distributor/distributor_test.go

+34-4
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ import (
4848
)
4949

5050
var (
51-
errFail = fmt.Errorf("Fail")
51+
errFail = httpgrpc.Errorf(http.StatusInternalServerError, "Fail")
5252
emptyResponse = &cortexpb.WriteResponse{}
5353
)
5454

@@ -124,6 +124,7 @@ func TestDistributor_Push(t *testing.T) {
124124
expectedResponse *cortexpb.WriteResponse
125125
expectedError error
126126
expectedMetrics string
127+
ingesterError error
127128
}{
128129
"A push of no samples shouldn't block or return error, even if ingesters are sad": {
129130
numIngesters: 3,
@@ -203,7 +204,7 @@ func TestDistributor_Push(t *testing.T) {
203204
expectedMetrics: `
204205
# HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters.
205206
# TYPE cortex_distributor_ingester_append_failures_total counter
206-
cortex_distributor_ingester_append_failures_total{ingester="2",type="samples"} 1
207+
cortex_distributor_ingester_append_failures_total{ingester="2",status="5xx",type="samples"} 1
207208
# HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters.
208209
# TYPE cortex_distributor_ingester_appends_total counter
209210
cortex_distributor_ingester_appends_total{ingester="0",type="samples"} 1
@@ -218,10 +219,30 @@ func TestDistributor_Push(t *testing.T) {
218219
metadata: 1,
219220
metricNames: []string{distributorAppend, distributorAppendFailure},
220221
expectedResponse: emptyResponse,
222+
ingesterError: httpgrpc.Errorf(http.StatusInternalServerError, "Fail"),
221223
expectedMetrics: `
222224
# HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters.
223225
# TYPE cortex_distributor_ingester_append_failures_total counter
224-
cortex_distributor_ingester_append_failures_total{ingester="2",type="metadata"} 1
226+
cortex_distributor_ingester_append_failures_total{ingester="2",status="5xx",type="metadata"} 1
227+
# HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters.
228+
# TYPE cortex_distributor_ingester_appends_total counter
229+
cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1
230+
cortex_distributor_ingester_appends_total{ingester="1",type="metadata"} 1
231+
cortex_distributor_ingester_appends_total{ingester="2",type="metadata"} 1
232+
`,
233+
},
234+
"A push to overloaded ingesters should report the correct metrics": {
235+
numIngesters: 3,
236+
happyIngesters: 2,
237+
samples: samplesIn{num: 0, startTimestampMs: 123456789000},
238+
metadata: 1,
239+
metricNames: []string{distributorAppend, distributorAppendFailure},
240+
expectedResponse: emptyResponse,
241+
ingesterError: httpgrpc.Errorf(http.StatusTooManyRequests, "Fail"),
242+
expectedMetrics: `
243+
# HELP cortex_distributor_ingester_append_failures_total The total number of failed batch appends sent to ingesters.
244+
# TYPE cortex_distributor_ingester_append_failures_total counter
245+
cortex_distributor_ingester_append_failures_total{ingester="2",status="4xx",type="metadata"} 1
225246
# HELP cortex_distributor_ingester_appends_total The total number of batch appends sent to ingesters.
226247
# TYPE cortex_distributor_ingester_appends_total counter
227248
cortex_distributor_ingester_appends_total{ingester="0",type="metadata"} 1
@@ -243,6 +264,7 @@ func TestDistributor_Push(t *testing.T) {
243264
numDistributors: 1,
244265
shardByAllLabels: shardByAllLabels,
245266
limits: limits,
267+
errFail: tc.ingesterError,
246268
})
247269
defer stopAll(ds, r)
248270

@@ -1905,6 +1927,7 @@ type prepConfig struct {
19051927
maxInflightRequests int
19061928
maxIngestionRate float64
19071929
replicationFactor int
1930+
errFail error
19081931
}
19091932

19101933
func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, *ring.Ring, []*prometheus.Registry) {
@@ -1916,8 +1939,14 @@ func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, *rin
19161939
})
19171940
}
19181941
for i := cfg.happyIngesters; i < cfg.numIngesters; i++ {
1942+
miError := errFail
1943+
if cfg.errFail != nil {
1944+
miError = cfg.errFail
1945+
}
1946+
19191947
ingesters = append(ingesters, mockIngester{
19201948
queryDelay: cfg.queryDelay,
1949+
failResp: miError,
19211950
})
19221951
}
19231952

@@ -2149,6 +2178,7 @@ type mockIngester struct {
21492178
client.IngesterClient
21502179
grpc_health_v1.HealthClient
21512180
happy bool
2181+
failResp error
21522182
stats client.UsersStatsResponse
21532183
timeseries map[uint32]*cortexpb.PreallocTimeseries
21542184
metadata map[uint32]map[cortexpb.MetricMetadata]struct{}
@@ -2187,7 +2217,7 @@ func (i *mockIngester) Push(ctx context.Context, req *cortexpb.WriteRequest, opt
21872217
i.trackCall("Push")
21882218

21892219
if !i.happy {
2190-
return nil, errFail
2220+
return nil, i.failResp
21912221
}
21922222

21932223
if i.timeseries == nil {

0 commit comments

Comments
 (0)