Skip to content

Commit 73d7699

Browse files
committed
fix(ddtrace/tracer): postpone span events/links serialization (#3429)
1 parent 60887f3 commit 73d7699

File tree

2 files changed

+62
-9
lines changed

2 files changed

+62
-9
lines changed

ddtrace/tracer/span.go

+15-9
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ type span struct {
9191

9292
pprofCtxActive context.Context `msg:"-"` // contains pprof.WithLabel labels to tell the profiler more about this span
9393
pprofCtxRestore context.Context `msg:"-"` // contains pprof.WithLabel labels of the parent span (if any) that need to be restored when this span finishes
94-
finishGuard atomic.Uint32 `msg:"-"` // finishGuard value to detect if Span.Finish has been called to only finish the span once
9594

9695
taskEnd func() // ends execution tracer (runtime/trace) task, if started
9796
}
@@ -485,6 +484,15 @@ func (s *span) setMetric(key string, v float64) {
485484

486485
// AddSpanLink appends the given link to the span's span links.
487486
func (s *span) AddSpanLink(link ddtrace.SpanLink) {
487+
s.Lock()
488+
defer s.Unlock()
489+
// We don't lock spans when flushing, so we could have a data race when
490+
// modifying a span as it's being flushed. This protects us against that
491+
// race, since spans are marked `finished` before we flush them.
492+
if s.finished {
493+
// already finished
494+
return
495+
}
488496
s.SpanLinks = append(s.SpanLinks, link)
489497
}
490498

@@ -529,11 +537,6 @@ func (s *span) serializeSpanEvents() {
529537
// Finish closes this Span (but not its children) providing the duration
530538
// of its part of the tracing session.
531539
func (s *span) Finish(opts ...ddtrace.FinishOption) {
532-
// If Span.Finish has already been called, do nothing.
533-
// In this way, we can ensure that Span.Finish is called at most once.
534-
if !s.finishGuard.CompareAndSwap(0, 1) {
535-
return
536-
}
537540
t := now()
538541
if len(opts) > 0 {
539542
cfg := ddtrace.FinishConfig{
@@ -581,9 +584,6 @@ func (s *span) Finish(opts ...ddtrace.FinishOption) {
581584
}
582585
}
583586

584-
s.serializeSpanLinksInMeta()
585-
s.serializeSpanEvents()
586-
587587
s.finish(t)
588588
orchestrion.GLSPopValue(sharedinternal.ActiveSpanKey)
589589
}
@@ -612,6 +612,10 @@ func (s *span) finish(finishTime int64) {
612612
// already finished
613613
return
614614
}
615+
616+
s.serializeSpanLinksInMeta()
617+
s.serializeSpanEvents()
618+
615619
if s.Duration == 0 {
616620
s.Duration = finishTime - s.Start
617621
}
@@ -795,6 +799,8 @@ func (s *span) Format(f fmt.State, c rune) {
795799

796800
// AddEvent attaches a new event to the current span.
797801
func (s *span) AddEvent(name string, opts ...ddtrace.SpanEventOption) {
802+
s.Lock()
803+
defer s.Unlock()
798804
if s.finished {
799805
return
800806
}

ddtrace/tracer/spancontext_test.go

+47
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"testing"
1414
"time"
1515

16+
"gopkg.in/DataDog/dd-trace-go.v1/ddtrace"
1617
"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/ext"
1718
"gopkg.in/DataDog/dd-trace-go.v1/internal/globalconfig"
1819
"gopkg.in/DataDog/dd-trace-go.v1/internal/log"
@@ -55,6 +56,52 @@ func TestNewSpanContextPushError(t *testing.T) {
5556
assert.Contains(t, tp.Logs()[0], "ERROR: trace buffer full (2)")
5657
}
5758

59+
/*
60+
This test is an attempt to reproduce one of the panics from incident 37240 [1].
61+
Run the test with -count 100, and you should get a crash like the one shown
62+
below.
63+
64+
[1] https://dd.slack.com/archives/C08NGNZR0C8/p1744390495200339
65+
66+
fatal error: concurrent map iteration and map write
67+
68+
goroutine 354 [running]:
69+
internal/runtime/maps.fatal({0x102db4b48?, 0x14000101d98?})
70+
/Users/felix.geisendoerfer/.local/share/mise/installs/go/1.24.2/src/runtime/panic.go:1058 +0x20
71+
internal/runtime/maps.(*Iter).Next(0x1400009dca0?)
72+
/Users/felix.geisendoerfer/.local/share/mise/installs/go/1.24.2/src/internal/runtime/maps/table.go:683 +0x94
73+
gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.(*span).EncodeMsg(0x14000140140, 0x14000c1c000)
74+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/span_msgp.go:392 +0x2e8
75+
gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.spanList.EncodeMsg({0x140003e0320, 0x1, 0x18?}, 0x14000c1c000)
76+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/span_msgp.go:596 +0x80
77+
github.com/tinylib/msgp/msgp.Encode({0x1031ebd00?, 0x14000294d48?}, {0x1031ec060?, 0x1400000e090?})
78+
/Users/felix.geisendoerfer/go/pkg/mod/github.com/tinylib/[email protected]/msgp/write.go:156 +0x60
79+
gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.(*payload).push(0x14000294d20, {0x140003e0320, 0x1, 0xa})
80+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/payload.go:76 +0x98
81+
gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.(*agentTraceWriter).add(0x140003e0000, {0x140003e0320?, 0x0?, 0x0?})
82+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/writer.go:69 +0x28
83+
gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.(*tracer).worker(0x140000c40d0, 0x1400012b2d0)
84+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/tracer.go:457 +0x154
85+
gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.newTracer.func2()
86+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/tracer.go:412 +0xa8
87+
created by gopkg.in/DataDog/dd-trace-go.v1/ddtrace/tracer.newTracer in goroutine 326
88+
/Users/felix.geisendoerfer/go/src/github.com/DataDog/dd-trace-go/ddtrace/tracer/tracer.go:404 +0x2b4
89+
*/
90+
func TestIncident37240DoubleFinish(t *testing.T) {
91+
_, _, _, stop := startTestTracer(t)
92+
defer stop()
93+
94+
root, _ := StartSpanFromContext(context.Background(), "root", Tag(ext.ManualKeep, true))
95+
// My theory is that contrib/aws/internal/span_pointers/span_pointers.go
96+
// adds a span link, which causes `serializeSpanLinksInMeta` to write to
97+
// `s.Meta` without holding the lock. This crashes when the span flushing
98+
// code tries to read `s.Meta` without holding the lock.
99+
root.(*span).AddSpanLink(ddtrace.SpanLink{TraceID: 1, SpanID: 2})
100+
for i := 0; i < 1000; i++ {
101+
root.Finish()
102+
}
103+
}
104+
58105
func TestAsyncSpanRace(t *testing.T) {
59106
testAsyncSpanRace(t)
60107
}

0 commit comments

Comments
 (0)