Expose more TSDB metrics from ingesters when running blocks storage #2583

Merged
Changes from 3 commits
11 changes: 10 additions & 1 deletion CHANGELOG.md
@@ -56,9 +56,18 @@
* [ENHANCEMENT] Redis Cache: Added `idle_timeout`, `wait_on_pool_exhaustion` and `max_conn_lifetime` options to redis cache configuration. #2550
* [ENHANCEMENT] WAL: the experimental tag has been removed on the WAL in ingesters.
* [ENHANCEMENT] Use newer AWS API for paginated queries - removes 'Deprecated' message from logfiles. #2452
-* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580
+* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580 #2583
  * `cortex_ingester_tsdb_appender_add_duration_seconds`
  * `cortex_ingester_tsdb_appender_commit_duration_seconds`
+  * `cortex_ingester_tsdb_refcache_purge_duration_seconds`
+  * `cortex_ingester_tsdb_compactions_total`
+  * `cortex_ingester_tsdb_compaction_duration_seconds`
+  * `cortex_ingester_tsdb_wal_fsync_duration_seconds`
+  * `cortex_ingester_tsdb_wal_page_flushes_total`
+  * `cortex_ingester_tsdb_wal_completed_pages_total`
+  * `cortex_ingester_tsdb_wal_truncations_failed_total`
+  * `cortex_ingester_tsdb_wal_truncations_total`
+  * `cortex_ingester_tsdb_wal_writes_failed_total`
* [BUGFIX] Ruler: Ensure temporary rule files with special characters are properly mapped and cleaned up. #2506
* [BUGFIX] Fixes #2411, Ensure requests are properly routed to the prometheus api embedded in the query if `-server.path-prefix` is set. #2372
* [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400
14 changes: 12 additions & 2 deletions pkg/ingester/ingester_v2.go
@@ -79,6 +79,7 @@ type TSDBState struct {
walReplayTime prometheus.Histogram
appenderAddDuration prometheus.Histogram
appenderCommitDuration prometheus.Histogram
refCachePurgeDuration prometheus.Histogram
}

// NewV2 returns a new Ingester that uses prometheus block storage instead of chunk storage
@@ -126,6 +127,11 @@ func NewV2(cfg Config, clientConfig client.Config, limits *validation.Overrides,
Help: "The total time it takes for a push request to commit samples appended to TSDB.",
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
}),
refCachePurgeDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_ingester_tsdb_refcache_purge_duration_seconds",
Help: "The total time it takes to purge the TSDB series reference cache for a single tenant.",
Buckets: prometheus.DefBuckets,
}),
},
}

@@ -228,9 +234,13 @@ func (i *Ingester) updateLoop(ctx context.Context) error {
case <-refCachePurgeTicker.C:
for _, userID := range i.getTSDBUsers() {
userDB := i.getTSDB(userID)
-if userDB != nil {
-userDB.refCache.Purge(time.Now().Add(-cortex_tsdb.DefaultRefCacheTTL))
+if userDB == nil {
+continue
}
+
+startTime := time.Now()
+userDB.refCache.Purge(time.Now().Add(-cortex_tsdb.DefaultRefCacheTTL))
+i.TSDBState.refCachePurgeDuration.Observe(time.Since(startTime).Seconds())
}
case <-ctx.Done():
return nil
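
The change above follows the standard Prometheus timing idiom: capture time.Now() before the operation and Observe the elapsed seconds on a histogram afterwards. A minimal, self-contained sketch of the same pattern; the package, metric name, and helper below are illustrative, not taken from this PR:

package example

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// purgeDuration plays the role of refCachePurgeDuration above; promauto
// registers it on the default registry, whereas NewV2 uses
// promauto.With(registerer) so tests can supply their own registry.
var purgeDuration = promauto.NewHistogram(prometheus.HistogramOpts{
	Name:    "example_refcache_purge_duration_seconds",
	Help:    "Time taken by one reference-cache purge pass.",
	Buckets: prometheus.DefBuckets,
})

// timedPurge runs purge() and records its wall-clock duration in seconds,
// matching the Prometheus base-unit convention used by the PR's metric names.
func timedPurge(purge func()) {
	startTime := time.Now()
	purge()
	purgeDuration.Observe(time.Since(startTime).Seconds())
}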
64 changes: 62 additions & 2 deletions pkg/ingester/metrics.go
@@ -218,13 +218,22 @@ func newIngesterMetrics(r prometheus.Registerer, createMetricsConflictingWithTSD

// TSDB metrics collector. Each tenant has its own registry, that TSDB code uses.
type tsdbMetrics struct {
-// We aggregate metrics from individual TSDB registries into
-// a single set of counters, which are exposed as Cortex metrics.
+// Metrics aggregated from Thanos shipper.
dirSyncs *prometheus.Desc // sum(thanos_shipper_dir_syncs_total)
dirSyncFailures *prometheus.Desc // sum(thanos_shipper_dir_sync_failures_total)
uploads *prometheus.Desc // sum(thanos_shipper_uploads_total)
uploadFailures *prometheus.Desc // sum(thanos_shipper_upload_failures_total)

+// Metrics aggregated from TSDB.
+tsdbCompactionsTotal *prometheus.Desc
+tsdbCompactionDuration *prometheus.Desc
+tsdbFsyncDuration *prometheus.Desc
+tsdbPageFlushes *prometheus.Desc
+tsdbPageCompletions *prometheus.Desc
+tsdbTruncateFail *prometheus.Desc
+tsdbTruncateTotal *prometheus.Desc
+tsdbWritesFailed *prometheus.Desc

// These two metrics replace metrics in ingesterMetrics, as we count them differently
memSeriesCreatedTotal *prometheus.Desc
memSeriesRemovedTotal *prometheus.Desc
@@ -253,6 +262,38 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
"cortex_ingester_shipper_upload_failures_total",
"TSDB: Total number of block upload failures",
nil, nil),
tsdbCompactionsTotal: prometheus.NewDesc(
"cortex_ingester_tsdb_compactions_total",
"Total number of TSDB compactions that were executed.",
nil, nil),
tsdbCompactionDuration: prometheus.NewDesc(
"cortex_ingester_tsdb_compaction_duration_seconds",
"Duration of TSDB compaction runs.",
nil, nil),
tsdbFsyncDuration: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_fsync_duration_seconds",
"Duration of TSDB WAL fsync.",
nil, nil),
tsdbPageFlushes: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_page_flushes_total",
"Total number of TSDB WAL page flushes.",
nil, nil),
tsdbPageCompletions: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_completed_pages_total",
"Total number of TSDB WAL completed pages.",
nil, nil),
tsdbTruncateFail: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_truncations_failed_total",
"Total number of TSDB WAL truncations that failed.",
nil, nil),
tsdbTruncateTotal: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_truncations_total",
"Total number of TSDB WAL truncations attempted.",
nil, nil),
tsdbWritesFailed: prometheus.NewDesc(
"cortex_ingester_tsdb_wal_writes_failed_total",
"Total number of TSDB WAL writes that failed.",
nil, nil),

memSeriesCreatedTotal: prometheus.NewDesc(memSeriesCreatedTotalName, memSeriesCreatedTotalHelp, []string{"user"}, nil),
memSeriesRemovedTotal: prometheus.NewDesc(memSeriesRemovedTotalName, memSeriesRemovedTotalHelp, []string{"user"}, nil),
@@ -269,6 +310,16 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
out <- sm.dirSyncFailures
out <- sm.uploads
out <- sm.uploadFailures

out <- sm.tsdbCompactionsTotal
out <- sm.tsdbCompactionDuration
out <- sm.tsdbFsyncDuration
out <- sm.tsdbPageFlushes
out <- sm.tsdbPageCompletions
out <- sm.tsdbTruncateFail
out <- sm.tsdbTruncateTotal
out <- sm.tsdbWritesFailed

out <- sm.memSeriesCreatedTotal
out <- sm.memSeriesRemovedTotal
}
@@ -282,6 +333,15 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCounters(out, sm.uploads, "thanos_shipper_uploads_total")
data.SendSumOfCounters(out, sm.uploadFailures, "thanos_shipper_upload_failures_total")

data.SendSumOfCounters(out, sm.tsdbCompactionsTotal, "prometheus_tsdb_compactions_total")
data.SendSumOfHistograms(out, sm.tsdbCompactionDuration, "prometheus_tsdb_compaction_duration_seconds")
data.SendSumOfSummaries(out, sm.tsdbFsyncDuration, "prometheus_tsdb_wal_fsync_duration_seconds")
data.SendSumOfCounters(out, sm.tsdbPageFlushes, "prometheus_tsdb_wal_page_flushes_total")
data.SendSumOfCounters(out, sm.tsdbPageCompletions, "prometheus_tsdb_wal_completed_pages_total")
data.SendSumOfCounters(out, sm.tsdbTruncateFail, "prometheus_tsdb_wal_truncations_failed_total")
data.SendSumOfCounters(out, sm.tsdbTruncateTotal, "prometheus_tsdb_wal_truncations_total")
data.SendSumOfCounters(out, sm.tsdbWritesFailed, "prometheus_tsdb_wal_writes_failed_total")

data.SendSumOfCountersPerUser(out, sm.memSeriesCreatedTotal, "prometheus_tsdb_head_series_created_total")
data.SendSumOfCountersPerUser(out, sm.memSeriesRemovedTotal, "prometheus_tsdb_head_series_removed_total")
}
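
Collect sums each underlying metric across all per-tenant TSDB registries and exposes the result as a single Cortex series; the data is gathered once and reused for every SendSumOf* call. A rough sketch of the counter case, assuming a plain slice of per-tenant prometheus.Gatherers (the real helpers also handle histograms, summaries, and the per-user variants used for memSeriesCreatedTotal):

package example

import (
	"github.com/prometheus/client_golang/prometheus"
)

// sumCounters gathers the named counter from every per-tenant registry,
// sums the values, and emits one aggregated const metric. Simplified:
// caching of the gathered families and error reporting are omitted.
func sumCounters(out chan<- prometheus.Metric, desc *prometheus.Desc, name string, gatherers []prometheus.Gatherer) {
	var sum float64
	for _, g := range gatherers {
		mfs, err := g.Gather()
		if err != nil {
			continue // a real implementation would surface this error
		}
		for _, mf := range mfs {
			if mf.GetName() != name {
				continue
			}
			for _, m := range mf.GetMetric() {
				sum += m.GetCounter().GetValue()
			}
		}
	}
	out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, sum)
}

Aggregating at scrape time keeps Cortex-level metric names out of the per-tenant registries while still exposing one global series per metric.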
100 changes: 99 additions & 1 deletion pkg/ingester/metrics_test.go
@@ -40,6 +40,54 @@ func TestTSDBMetrics(t *testing.T) {
# 4*(12345 + 85787 + 999)
cortex_ingester_shipper_upload_failures_total 396524

# HELP cortex_ingester_tsdb_compactions_total Total number of TSDB compactions that were executed.
# TYPE cortex_ingester_tsdb_compactions_total counter
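# 7 * (12345 + 85787 + 999)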
cortex_ingester_tsdb_compactions_total 693917

# HELP cortex_ingester_tsdb_compaction_duration_seconds Duration of TSDB compaction runs.
# TYPE cortex_ingester_tsdb_compaction_duration_seconds histogram
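# one 9s observation per registry: 3 observations, sum 27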
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="1"} 0
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="2"} 0
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="4"} 0
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="8"} 0
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="16"} 3
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="32"} 3
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="64"} 3
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="128"} 3
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="256"} 3
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="512"} 3
cortex_ingester_tsdb_compaction_duration_seconds_bucket{le="+Inf"} 3
cortex_ingester_tsdb_compaction_duration_seconds_sum 27
cortex_ingester_tsdb_compaction_duration_seconds_count 3

# HELP cortex_ingester_tsdb_wal_fsync_duration_seconds Duration of TSDB WAL fsync.
# TYPE cortex_ingester_tsdb_wal_fsync_duration_seconds summary
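# one 10s observation per registry, summed across 3 registries: 3 * 10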
cortex_ingester_tsdb_wal_fsync_duration_seconds{quantile="0.5"} 30
cortex_ingester_tsdb_wal_fsync_duration_seconds{quantile="0.9"} 30
cortex_ingester_tsdb_wal_fsync_duration_seconds{quantile="0.99"} 30
cortex_ingester_tsdb_wal_fsync_duration_seconds_sum 30
cortex_ingester_tsdb_wal_fsync_duration_seconds_count 3

# HELP cortex_ingester_tsdb_wal_page_flushes_total Total number of TSDB WAL page flushes.
# TYPE cortex_ingester_tsdb_wal_page_flushes_total counter
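# 11 * (12345 + 85787 + 999)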
cortex_ingester_tsdb_wal_page_flushes_total 1090441

# HELP cortex_ingester_tsdb_wal_completed_pages_total Total number of TSDB WAL completed pages.
# TYPE cortex_ingester_tsdb_wal_completed_pages_total counter
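# 12 * (12345 + 85787 + 999)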
cortex_ingester_tsdb_wal_completed_pages_total 1189572

# HELP cortex_ingester_tsdb_wal_truncations_failed_total Total number of TSDB WAL truncations that failed.
# TYPE cortex_ingester_tsdb_wal_truncations_failed_total counter
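# 13 * (12345 + 85787 + 999)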
cortex_ingester_tsdb_wal_truncations_failed_total 1288703

# HELP cortex_ingester_tsdb_wal_truncations_total Total number of TSDB WAL truncations attempted.
# TYPE cortex_ingester_tsdb_wal_truncations_total counter
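# 14 * (12345 + 85787 + 999)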
cortex_ingester_tsdb_wal_truncations_total 1387834

# HELP cortex_ingester_tsdb_wal_writes_failed_total Total number of TSDB WAL writes that failed.
# TYPE cortex_ingester_tsdb_wal_writes_failed_total counter
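# 15 * (12345 + 85787 + 999)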
cortex_ingester_tsdb_wal_writes_failed_total 1486965

# HELP cortex_ingester_memory_series_created_total The total number of series that were created per user.
# TYPE cortex_ingester_memory_series_created_total counter
# 5 * (12345, 85787 and 999 respectively)
@@ -60,7 +108,7 @@
func populateTSDBMetrics(base float64) *prometheus.Registry {
r := prometheus.NewRegistry()

-// shipper
+// Thanos shipper.
dirSyncs := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "thanos_shipper_dir_syncs_total",
Help: "Total number of dir syncs",
@@ -96,5 +144,55 @@
})
seriesRemoved.Add(6 * base)

ran := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_compactions_total",
Help: "Total number of compactions that were executed for the partition.",
})
ran.Add(7 * base)

duration := promauto.With(r).NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_compaction_duration_seconds",
Help: "Duration of compaction runs",
Buckets: prometheus.ExponentialBuckets(1, 2, 10),
})
duration.Observe(9)

fsyncDuration := promauto.With(r).NewSummary(prometheus.SummaryOpts{
Name: "prometheus_tsdb_wal_fsync_duration_seconds",
Help: "Duration of WAL fsync.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
})
fsyncDuration.Observe(10)

pageFlushes := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_page_flushes_total",
Help: "Total number of page flushes.",
})
pageFlushes.Add(11 * base)

pageCompletions := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_completed_pages_total",
Help: "Total number of completed pages.",
})
pageCompletions.Add(12 * base)

truncateFail := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncations_failed_total",
Help: "Total number of WAL truncations that failed.",
})
truncateFail.Add(13 * base)

truncateTotal := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncations_total",
Help: "Total number of WAL truncations attempted.",
})
truncateTotal.Add(14 * base)

writesFailed := promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_writes_failed_total",
Help: "Total number of WAL writes that failed.",
})
writesFailed.Add(15 * base)

return r
}