Skip to content

Track TSDB appender timing #2580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,11 @@
* [ENHANCEMENT] Experimental WAL: Ingester WAL records now have type header and the custom WAL records have been replaced by Prometheus TSDB's WAL records. Old records will not be supported from 1.3 onwards. Note: once this is deployed, you cannot downgrade without data loss. #2436
* [ENHANCEMENT] Redis Cache: Added `idle_timeout`, `wait_on_pool_exhaustion` and `max_conn_lifetime` options to redis cache configuration. #2550
* [ENHANCEMENT] WAL: the experimental tag has been removed from the WAL in ingesters.
* [BUGFIX] Ruler: Ensure temporary rule files with special characters are properly mapped and cleaned up. #2506
* [ENHANCEMENT] Use newer AWS API for paginated queries - removes 'Deprecated' message from logfiles. #2452
* [ENHANCEMENT] Experimental TSDB: added the following metrics to the ingester: #2580
  * `cortex_ingester_tsdb_appender_add_duration_seconds`
  * `cortex_ingester_tsdb_appender_commit_duration_seconds`
* [BUGFIX] Ruler: Ensure temporary rule files with special characters are properly mapped and cleaned up. #2506
* [BUGFIX] Fixes #2411: ensure requests are properly routed to the Prometheus API embedded in the query if `-server.path-prefix` is set. #2372
* [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400
* [BUGFIX] Cassandra Storage: Fix endpoint TLS host verification. #2109
Expand Down
30 changes: 24 additions & 6 deletions pkg/ingester/ingester_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,11 @@ type TSDBState struct {
tsdbMetrics *tsdbMetrics

// Head compactions metrics.
compactionsTriggered prometheus.Counter
compactionsFailed prometheus.Counter
walReplayTime prometheus.Histogram
compactionsTriggered prometheus.Counter
compactionsFailed prometheus.Counter
walReplayTime prometheus.Histogram
appenderAddDuration prometheus.Histogram
appenderCommitDuration prometheus.Histogram
}

// NewV2 returns a new Ingester that uses prometheus block storage instead of chunk storage
Expand Down Expand Up @@ -114,6 +116,16 @@ func NewV2(cfg Config, clientConfig client.Config, limits *validation.Overrides,
Help: "The total time it takes to open and replay a TSDB WAL.",
Buckets: prometheus.DefBuckets,
}),
appenderAddDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_ingester_tsdb_appender_add_duration_seconds",
Help: "The total time it takes for a push request to add samples to the TSDB appender.",
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
}),
appenderCommitDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Name: "cortex_ingester_tsdb_appender_commit_duration_seconds",
Help: "The total time it takes for a push request to commit samples appended to TSDB.",
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
}),
},
}

Expand Down Expand Up @@ -272,7 +284,7 @@ func (i *Ingester) v2Push(ctx context.Context, req *client.WriteRequest) (*clien
// successfully committed
succeededSamplesCount := 0
failedSamplesCount := 0
now := time.Now()
startAppend := time.Now()

// Walk the samples, appending them to the users database
app := db.Appender()
Expand All @@ -281,7 +293,7 @@ func (i *Ingester) v2Push(ctx context.Context, req *client.WriteRequest) (*clien
// that even if we have a reference it's not guaranteed to be still valid.
// The labels must be sorted (in our case, a write request is guaranteed
// to have sorted labels once it hits the ingester).
cachedRef, cachedRefExists := db.refCache.Ref(now, client.FromLabelAdaptersToLabels(ts.Labels))
cachedRef, cachedRefExists := db.refCache.Ref(startAppend, client.FromLabelAdaptersToLabels(ts.Labels))

for _, s := range ts.Samples {
var err error
Expand All @@ -307,7 +319,7 @@ func (i *Ingester) v2Push(ctx context.Context, req *client.WriteRequest) (*clien
copiedLabels := client.FromLabelAdaptersToLabelsWithCopy(ts.Labels)

if ref, err = app.Add(copiedLabels, s.TimestampMs, s.Value); err == nil {
db.refCache.SetRef(now, copiedLabels, ref)
db.refCache.SetRef(startAppend, copiedLabels, ref)
cachedRef = ref
cachedRefExists = true

Expand Down Expand Up @@ -348,9 +360,15 @@ func (i *Ingester) v2Push(ctx context.Context, req *client.WriteRequest) (*clien
return nil, wrapWithUser(err, userID)
}
}

// At this point all samples have been added to the appender, so we can track the time it took.
i.TSDBState.appenderAddDuration.Observe(time.Since(startAppend).Seconds())

startCommit := time.Now()
if err := app.Commit(); err != nil {
return nil, wrapWithUser(err, userID)
}
i.TSDBState.appenderCommitDuration.Observe(time.Since(startCommit).Seconds())

// Increment metrics only if the samples have been successfully committed.
// If the code didn't reach this point, it means that we returned an error
Expand Down