[processor/tailsampling] Include componentID as prefix in metrics 'policy' (open-telemetry#34192)

EOjeah · f7o · commit f21ab4602794 · 2024-09-11T22:49:17.000-07:00
**Description:** Bug fix: this change prepends the component ID's name, followed by a dot, to the value of the `policy` metrics dimension when generating metrics for the processor. This ensures that identically named policies belonging to different tail sampling processor instances each get a unique value in the `policy` field of the metrics. It also includes a minor refactor to rename `telemetry` to `telemetryBuilder` where applicable (return type == `NewTelemetryBuilder`).

Resolves: open-telemetry#34099

**Link to tracking Issue:** <Issue number if applicable>

**Testing:** Ran the collector locally with `make run` using the configuration below, which uses the tail sampling processor and exposes metrics in Prometheus format, and sent sample Zipkin spans to the receiver.

```yaml
receivers:
  zipkin:

processors:
  tail_sampling:
    policies: [ { name: test-policy-1, type: always_sample } ]
  tail_sampling/custom_name:
    policies: [ { name: test-policy-1, type: always_sample } ]

exporters:
  debug:

service:
  telemetry:
    logs:
    metrics:
  pipelines:
    traces:
      receivers: [zipkin]
      processors: [tail_sampling, tail_sampling/custom_name]
      exporters: [debug]
```

Curling the metrics endpoint shows that the policy name is unique for each of the two tail sampling processors:

```bash
otelcol_processor_tail_sampling_sampling_decision_latency_bucket{policy="custom_name.test-policy-1",service_instance_id="X",service_name="otelcontribcol",service_version="0.105.0-dev",le="5000"} 1
otelcol_processor_tail_sampling_sampling_decision_latency_bucket{policy="test-policy-1",service_instance_id="X",service_name="otelcontribcol",service_version="0.105.0-dev",le="5000"} 1
```

Tasks
- [ ] Confirm prefix separator as `.`
- [ ] Update change log entry
diff --git a/.chloggen/bug_unique-policy-name-tail-sampling-processor.yaml b/.chloggen/bug_unique-policy-name-tail-sampling-processor.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: 'bug_fix'
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: tailsamplingprocessor
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: "Make the value of the `policy` metrics dimension unique across multiple tail sampling components that use the same policy name."
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [34192]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: "This change ensures that the `policy` value in the metrics exported by the tail sampling processor is unique across multiple tail sampling processors with the same policy name."
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []
diff --git a/processor/tailsamplingprocessor/factory.go b/processor/tailsamplingprocessor/factory.go
@@ -38,5 +38,5 @@ func createTracesProcessor(
 	nextConsumer consumer.Traces,
 ) (processor.Traces, error) {
 	tCfg := cfg.(*Config)
-	return newTracesProcessor(ctx, params.TelemetrySettings, nextConsumer, *tCfg)
+	return newTracesProcessor(ctx, params, nextConsumer, *tCfg)
 }
diff --git a/processor/tailsamplingprocessor/processor.go b/processor/tailsamplingprocessor/processor.go
@@ -83,8 +83,9 @@ type Option func(*tailSamplingSpanProcessor)
 
 // newTracesProcessor returns a processor.TracesProcessor that will perform tail sampling according to the given
 // configuration.
-func newTracesProcessor(ctx context.Context, settings component.TelemetrySettings, nextConsumer consumer.Traces, cfg Config, opts ...Option) (processor.Traces, error) {
-	telemetry, err := metadata.NewTelemetryBuilder(settings)
+func newTracesProcessor(ctx context.Context, set processor.Settings, nextConsumer consumer.Traces, cfg Config, opts ...Option) (processor.Traces, error) {
+	telemetrySettings := set.TelemetrySettings
+	telemetry, err := metadata.NewTelemetryBuilder(telemetrySettings)
 	if err != nil {
 		return nil, err
 	}
@@ -102,7 +103,7 @@ func newTracesProcessor(ctx context.Context, settings component.TelemetrySetting
 		nextConsumer:   nextConsumer,
 		maxNumTraces:   cfg.NumTraces,
 		sampledIDCache: sampledDecisions,
-		logger:         settings.Logger,
+		logger:         telemetrySettings.Logger,
 		numTracesOnMap: &atomic.Uint64{},
 		deleteChan:     make(chan pcommon.TraceID, cfg.NumTraces),
 	}
@@ -119,6 +120,7 @@ func newTracesProcessor(ctx context.Context, settings component.TelemetrySetting
 	if tsp.policies == nil {
 		policyNames := map[string]bool{}
 		tsp.policies = make([]*policy, len(cfg.PolicyCfgs))
+		componentID := set.ID.Name()
 		for i := range cfg.PolicyCfgs {
 			policyCfg := &cfg.PolicyCfgs[i]
 
@@ -127,14 +129,18 @@ func newTracesProcessor(ctx context.Context, settings component.TelemetrySetting
 			}
 			policyNames[policyCfg.Name] = true
 
-			eval, err := getPolicyEvaluator(settings, policyCfg)
+			eval, err := getPolicyEvaluator(telemetrySettings, policyCfg)
 			if err != nil {
 				return nil, err
 			}
+			uniquePolicyName := policyCfg.Name
+			if componentID != "" {
+				uniquePolicyName = fmt.Sprintf("%s.%s", componentID, policyCfg.Name)
+			}
 			p := &policy{
 				name:      policyCfg.Name,
 				evaluator: eval,
-				attribute: metric.WithAttributes(attribute.String("policy", policyCfg.Name)),
+				attribute: metric.WithAttributes(attribute.String("policy", uniquePolicyName)),
 			}
 			tsp.policies[i] = p
 		}
diff --git a/processor/tailsamplingprocessor/processor_benchmarks_test.go b/processor/tailsamplingprocessor/processor_benchmarks_test.go
@@ -12,6 +12,7 @@ import (
 	"go.opentelemetry.io/collector/component/componenttest"
 	"go.opentelemetry.io/collector/consumer/consumertest"
 	"go.opentelemetry.io/collector/pdata/ptrace"
+	"go.opentelemetry.io/collector/processor/processortest"
 
 	"github.com/open-telemetry/opentelemetry-collector-contrib/processor/tailsamplingprocessor/internal/sampling"
 )
@@ -24,8 +25,7 @@ func BenchmarkSampling(b *testing.B) {
 		ExpectedNewTracesPerSec: 64,
 		PolicyCfgs:              testPolicy,
 	}
-
-	sp, _ := newTracesProcessor(context.Background(), componenttest.NewNopTelemetrySettings(), consumertest.NewNop(), cfg)
+	sp, _ := newTracesProcessor(context.Background(), processortest.NewNopSettings(), consumertest.NewNop(), cfg)
 	tsp := sp.(*tailSamplingSpanProcessor)
 	require.NoError(b, tsp.Start(context.Background(), componenttest.NewNopHost()))
 	defer func() {
diff --git a/processor/tailsamplingprocessor/processor_decisions_test.go b/processor/tailsamplingprocessor/processor_decisions_test.go
@@ -26,7 +26,7 @@ func TestSamplingPolicyTypicalPath(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe1 := &mockPolicyEvaluator{}
@@ -71,7 +71,7 @@ func TestSamplingPolicyInvertSampled(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe1 := &mockPolicyEvaluator{}
@@ -116,7 +116,7 @@ func TestSamplingMultiplePolicies(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe1 := &mockPolicyEvaluator{}
@@ -167,7 +167,7 @@ func TestSamplingPolicyDecisionNotSampled(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe1 := &mockPolicyEvaluator{}
@@ -213,7 +213,7 @@ func TestSamplingPolicyDecisionInvertNotSampled(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe1 := &mockPolicyEvaluator{}
@@ -264,7 +264,7 @@ func TestLateArrivingSpansAssignedOriginalDecision(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe1 := &mockPolicyEvaluator{}
@@ -334,7 +334,7 @@ func TestLateArrivingSpanUsesDecisionCache(t *testing.T) {
 	}
 	nextConsumer := new(consumertest.TracesSink)
 	s := setupTestTelemetry()
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	idb := newSyncIDBatcher()
 
 	mpe := &mockPolicyEvaluator{}
diff --git a/processor/tailsamplingprocessor/processor_telemetry_test.go b/processor/tailsamplingprocessor/processor_telemetry_test.go
@@ -9,6 +9,7 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/collector/component"
 	"go.opentelemetry.io/collector/component/componenttest"
 	"go.opentelemetry.io/collector/consumer/consumertest"
 	"go.opentelemetry.io/collector/featuregate"
@@ -37,7 +38,7 @@ func TestMetricsAfterOneEvaluation(t *testing.T) {
 		},
 	}
 	cs := &consumertest.TracesSink{}
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
 	require.NoError(t, err)
 	defer func() {
@@ -211,6 +212,102 @@ func TestMetricsAfterOneEvaluation(t *testing.T) {
 	assert.Len(t, cs.AllTraces(), 1)
 }
 
+func TestMetricsWithComponentID(t *testing.T) {
+	// prepare: one always_sample policy named "always", run under a component ID that has a name
+	s := setupTestTelemetry()
+	b := newSyncIDBatcher()
+	syncBatcher := b.(*syncIDBatcher)
+
+	cfg := Config{
+		DecisionWait: 1,
+		NumTraces:    100,
+		PolicyCfgs: []PolicyCfg{
+			{
+				sharedPolicyCfg: sharedPolicyCfg{
+					Name: "always",
+					Type: AlwaysSample,
+				},
+			},
+		},
+	}
+	cs := &consumertest.TracesSink{}
+	ct := s.NewSettings()
+	ct.ID = component.MustNewIDWithName("tail_sampling", "unique_id") // i.e. type "tail_sampling", name "unique_id" (tail_sampling/unique_id)
+	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
+	require.NoError(t, err)
+	defer func() {
+		err = proc.Shutdown(context.Background())
+		require.NoError(t, err)
+	}()
+
+	err = proc.Start(context.Background(), componenttest.NewNopHost())
+	require.NoError(t, err)
+
+	// test: consume the sample traces, then trigger the policy ticker twice by hand
+	err = proc.ConsumeTraces(context.Background(), simpleTraces())
+	require.NoError(t, err)
+
+	tsp := proc.(*tailSamplingSpanProcessor)
+	tsp.policyTicker.OnTick() // the first tick always gets an empty batch
+	tsp.policyTicker.OnTick()
+
+	// verify: the "policy" attribute must be "<component name>.<policy name>", here "unique_id.always"
+	var md metricdata.ResourceMetrics
+	require.NoError(t, s.reader.Collect(context.Background(), &md))
+	require.Equal(t, 8, s.len(md))
+
+	for _, tt := range []struct {
+		opts []metricdatatest.Option
+		m    metricdata.Metrics
+	}{
+		{
+			opts: []metricdatatest.Option{metricdatatest.IgnoreTimestamp()},
+			m: metricdata.Metrics{
+				Name:        "otelcol_processor_tail_sampling_count_traces_sampled",
+				Description: "Count of traces that were sampled or not per sampling policy",
+				Unit:        "{traces}",
+				Data: metricdata.Sum[int64]{
+					IsMonotonic: true,
+					Temporality: metricdata.CumulativeTemporality,
+					DataPoints: []metricdata.DataPoint[int64]{
+						{
+							Attributes: attribute.NewSet(
+								attribute.String("policy", "unique_id.always"),
+								attribute.String("sampled", "true"),
+							),
+							Value: 1,
+						},
+					},
+				},
+			},
+		},
+		{
+			opts: []metricdatatest.Option{metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()},
+			m: metricdata.Metrics{
+				Name:        "otelcol_processor_tail_sampling_sampling_decision_latency",
+				Description: "Latency (in microseconds) of a given sampling policy",
+				Unit:        "µs",
+				Data: metricdata.Histogram[int64]{
+					Temporality: metricdata.CumulativeTemporality,
+					DataPoints: []metricdata.HistogramDataPoint[int64]{
+						{
+							Attributes: attribute.NewSet(
+								attribute.String("policy", "unique_id.always"),
+							),
+						},
+					},
+				},
+			},
+		},
+	} {
+		got := s.getMetric(tt.m.Name, md)
+		metricdatatest.AssertEqual(t, tt.m, got, tt.opts...)
+	}
+
+	// sanity check: the sink must have recorded exactly one Traces entry
+	assert.Len(t, cs.AllTraces(), 1)
+}
+
 func TestProcessorTailSamplingCountSpansSampled(t *testing.T) {
 	err := featuregate.GlobalRegistry().Set("processor.tailsamplingprocessor.metricstatcountspanssampled", true)
 	require.NoError(t, err)
@@ -238,7 +335,7 @@ func TestProcessorTailSamplingCountSpansSampled(t *testing.T) {
 		},
 	}
 	cs := &consumertest.TracesSink{}
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
 	require.NoError(t, err)
 	defer func() {
@@ -303,7 +400,7 @@ func TestProcessorTailSamplingSamplingTraceRemovalAge(t *testing.T) {
 		},
 	}
 	cs := &consumertest.TracesSink{}
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
 	require.NoError(t, err)
 	defer func() {
@@ -364,7 +461,7 @@ func TestProcessorTailSamplingSamplingLateSpanAge(t *testing.T) {
 		},
 	}
 	cs := &consumertest.TracesSink{}
-	ct := s.NewSettings().TelemetrySettings
+	ct := s.NewSettings()
 	proc, err := newTracesProcessor(context.Background(), ct, cs, cfg, withDecisionBatcher(syncBatcher))
 	require.NoError(t, err)
 	defer func() {
diff --git a/processor/tailsamplingprocessor/processor_test.go b/processor/tailsamplingprocessor/processor_test.go

Original file line number	Diff line number	Diff line change
`@@ -38,5 +38,5 @@ func createTracesProcessor(`
`38`	`38`	`nextConsumer consumer.Traces,`
`39`	`39`	`) (processor.Traces, error) {`
`40`	`40`	`tCfg := cfg.(*Config)`
`41`		`- return newTracesProcessor(ctx, params.TelemetrySettings, nextConsumer, *tCfg)`
	`41`	`+ return newTracesProcessor(ctx, params, nextConsumer, *tCfg)`
`42`	`42`	`}`