Skip to content

Commit 58b701c

Browse files
NickAnge, ArthurSens
authored and committed
[exporter/prometheusremotewrite] : WAL write success/failure telemetry (open-telemetry#39843)
<!--Ex. Fixing a bug - Describe the bug and how this fixes the issue. Ex. Adding a feature - Explain what this achieves.--> #### Description This PR introduces WAL write and WAL write-failure metrics, from which we can derive the success ratio or the failure ratio. - `otelcol_exporter_prometheusremotewrite_wal_writes`: Total WAL write requests - `otelcol_exporter_prometheusremotewrite_wal_writes_failures`: Total WAL write failures I decided to introduce the code in the `handleExport` function of the exporter, just before calling `wal.persistToWAL`. <!-- Issue number (e.g. #1234) or full URL to issue, if applicable. --> #### Link to tracking issue Part of open-telemetry#39556 <!--Describe what testing was performed and which tests were added.--> #### Testing [ ![Screenshot 2025-05-09 at 15 34 45](https://github.com/user-attachments/assets/4489b13a-a538-40ef-9ff7-de6d9f23290a) ](url) <!--Describe the documentation added.--> #### Documentation <!--Please delete paragraphs that you did not use before submitting.--> --------- Co-authored-by: Arthur Silva Sens <[email protected]>
1 parent 6cc3de6 commit 58b701c

File tree

10 files changed

+246
-13
lines changed

10 files changed

+246
-13
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: prometheusremotewriteexporter
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: |
11+
Adds WAL metrics to the Prometheus Remote Write Exporter. The new metrics are:
12+
- `otelcol_exporter_prometheusremotewrite_wal_writes`: The total number of WAL writes.
13+
- `otelcol_exporter_prometheusremotewrite_wal_writes_failures`: The total number of WAL write failures.
14+
15+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
16+
issues: [39556]
17+
18+
# (Optional) One or more lines of additional information to render under the primary note.
19+
# These lines will be padded with 2 spaces and then inserted directly into the document.
20+
# Use pipe (|) for multiline entries.
21+
subtext:
22+
23+
# If your change doesn't affect end users or the exported elements of any package,
24+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
25+
# Optional: The change log or logs in which this entry should be included.
26+
# e.g. '[user]' or '[user, api]'
27+
# Include 'user' if the change is relevant to end users.
28+
# Include 'api' if there is a change to a library API.
29+
# Default: '[user]'
30+
change_logs: [user]

exporter/prometheusremotewriteexporter/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,4 +177,4 @@ Out-of-order support in Prometheus must be enabled for multiple consumers.
177177
This can be done by using the `tsdb.out_of_order_time_window: 10m` settings. Please choose an appropriate time window to support pushing the worst-case scenarios of a "queue" build-up on the sender side.
178178

179179
See for more info:
180-
- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tsdb
180+
- https://prometheus.io/docs/prometheus/latest/configuration/configuration/#tsdb

exporter/prometheusremotewriteexporter/documentation.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,19 @@ Number of Prometheus time series that were translated from OTel metrics
3737
| Unit | Metric Type | Value Type | Monotonic |
3838
| ---- | ----------- | ---------- | --------- |
3939
| 1 | Sum | Int | true |
40+
41+
### otelcol_exporter_prometheusremotewrite_wal_writes
42+
43+
Number of WAL writes
44+
45+
| Unit | Metric Type | Value Type | Monotonic |
46+
| ---- | ----------- | ---------- | --------- |
47+
| 1 | Sum | Int | true |
48+
49+
### otelcol_exporter_prometheusremotewrite_wal_writes_failures
50+
51+
Number of WAL writes that failed
52+
53+
| Unit | Metric Type | Value Type | Monotonic |
54+
| ---- | ----------- | ---------- | --------- |
55+
| 1 | Sum | Int | true |

exporter/prometheusremotewriteexporter/exporter.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func newPRWExporter(cfg *Config, set exporter.Settings) (*prwExporter, error) {
135135
return nil, err
136136
}
137137

138-
if err := config.RemoteWriteProtoMsg.Validate(cfg.RemoteWriteProtoMsg); err != nil {
138+
if err = config.RemoteWriteProtoMsg.Validate(cfg.RemoteWriteProtoMsg); err != nil {
139139
return nil, err
140140
}
141141

@@ -177,7 +177,10 @@ func newPRWExporter(cfg *Config, set exporter.Settings) (*prwExporter, error) {
177177

178178
prwe.settings.Logger.Info("starting prometheus remote write exporter", zap.Any("ProtoMsg", cfg.RemoteWriteProtoMsg))
179179

180-
prwe.wal = newWAL(cfg.WAL, prwe.export)
180+
prwe.wal, err = newWAL(cfg.WAL, set, prwe.export)
181+
if err != nil {
182+
return nil, err
183+
}
181184
return prwe, nil
182185
}
183186

@@ -288,8 +291,10 @@ func (prwe *prwExporter) handleExport(ctx context.Context, tsMap map[string]*pro
288291
}
289292

290293
// Otherwise the WAL is enabled, and just persist the requests to the WAL
291-
// and they'll be exported in another goroutine to the RemoteWrite endpoint.
292-
if err = prwe.wal.persistToWAL(requests); err != nil {
294+
prwe.wal.telemetry.recordWALWrites(ctx)
295+
err = prwe.wal.persistToWAL(requests)
296+
if err != nil {
297+
prwe.wal.telemetry.recordWALWritesFailures(ctx)
293298
return consumererror.NewPermanent(err)
294299
}
295300
return nil

exporter/prometheusremotewriteexporter/internal/metadata/generated_telemetry.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/prometheusremotewriteexporter/internal/metadatatest/generated_telemetrytest.go

Lines changed: 32 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/prometheusremotewriteexporter/internal/metadatatest/generated_telemetrytest_test.go

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/prometheusremotewriteexporter/metadata.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,17 @@ telemetry:
4646
sum:
4747
value_type: int
4848
monotonic: true
49+
exporter_prometheusremotewrite_wal_writes:
50+
enabled: true
51+
description: Number of WAL writes
52+
unit: "1"
53+
sum:
54+
value_type: int
55+
monotonic: true
56+
exporter_prometheusremotewrite_wal_writes_failures:
57+
enabled: true
58+
description: Number of WAL writes that failed
59+
unit: "1"
60+
sum:
61+
value_type: int
62+
monotonic: true

exporter/prometheusremotewriteexporter/wal.go

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,44 @@ import (
1515
"github.com/gogo/protobuf/proto"
1616
"github.com/prometheus/prometheus/prompb"
1717
"github.com/tidwall/wal"
18+
"go.opentelemetry.io/collector/exporter"
19+
"go.opentelemetry.io/otel/attribute"
20+
"go.opentelemetry.io/otel/metric"
1821
"go.uber.org/multierr"
1922
"go.uber.org/zap"
23+
24+
"github.com/open-telemetry/opentelemetry-collector-contrib/exporter/prometheusremotewriteexporter/internal/metadata"
2025
)
2126

27+
type prwWalTelemetry interface {
28+
recordWALWrites(ctx context.Context)
29+
recordWALWritesFailures(ctx context.Context)
30+
}
31+
32+
type prwWalTelemetryOTel struct {
33+
telemetryBuilder *metadata.TelemetryBuilder
34+
otelAttrs []attribute.KeyValue
35+
}
36+
37+
func (p *prwWalTelemetryOTel) recordWALWrites(ctx context.Context) {
38+
p.telemetryBuilder.ExporterPrometheusremotewriteWalWrites.Add(ctx, 1, metric.WithAttributes(p.otelAttrs...))
39+
}
40+
41+
func (p *prwWalTelemetryOTel) recordWALWritesFailures(ctx context.Context) {
42+
p.telemetryBuilder.ExporterPrometheusremotewriteWalWritesFailures.Add(ctx, 1, metric.WithAttributes(p.otelAttrs...))
43+
}
44+
45+
func newPRWWalTelemetry(set exporter.Settings) (prwWalTelemetry, error) {
46+
telemetryBuilder, err := metadata.NewTelemetryBuilder(set.TelemetrySettings)
47+
if err != nil {
48+
return nil, err
49+
}
50+
return &prwWalTelemetryOTel{
51+
telemetryBuilder: telemetryBuilder,
52+
otelAttrs: []attribute.KeyValue{},
53+
}, nil
54+
}
55+
2256
type prweWAL struct {
2357
wg sync.WaitGroup // wg waits for the go routines to finish.
2458
mu sync.Mutex // mu protects the fields below.
@@ -33,6 +67,8 @@ type prweWAL struct {
3367
rNotify chan struct{}
3468
rWALIndex *atomic.Uint64
3569
wWALIndex *atomic.Uint64
70+
71+
telemetry prwWalTelemetry
3672
}
3773

3874
const (
@@ -60,11 +96,16 @@ func (wc *WALConfig) truncateFrequency() time.Duration {
6096
return defaultWALTruncateFrequency
6197
}
6298

63-
func newWAL(walConfig *WALConfig, exportSink func(context.Context, []*prompb.WriteRequest) error) *prweWAL {
99+
func newWAL(walConfig *WALConfig, set exporter.Settings, exportSink func(context.Context, []*prompb.WriteRequest) error) (*prweWAL, error) {
64100
if walConfig == nil {
65101
// There are cases for which the WAL can be disabled.
66102
// TODO: Perhaps log that the WAL wasn't enabled.
67-
return nil
103+
return nil, nil
104+
}
105+
106+
telemetryPRWWal, err := newPRWWalTelemetry(set)
107+
if err != nil {
108+
return nil, err
68109
}
69110

70111
return &prweWAL{
@@ -74,7 +115,8 @@ func newWAL(walConfig *WALConfig, exportSink func(context.Context, []*prompb.Wri
74115
rNotify: make(chan struct{}),
75116
rWALIndex: &atomic.Uint64{},
76117
wWALIndex: &atomic.Uint64{},
77-
}
118+
telemetry: telemetryPRWWal,
119+
}, nil
78120
}
79121

80122
func (wc *WALConfig) createWAL() (*wal.Log, string, error) {
@@ -323,6 +365,7 @@ func (prweWAL *prweWAL) persistToWAL(requests []*prompb.WriteRequest) error {
323365
case prweWAL.rNotify <- struct{}{}:
324366
default:
325367
}
368+
326369
return prweWAL.wal.WriteBatch(batch)
327370
}
328371

0 commit comments

Comments
 (0)