@@ -56,6 +56,8 @@ import (
 //go:embed parca-agent.bpf.o
 var bpfObj []byte

+var errUnrecoverable = errors.New("unrecoverable error")
+
 const (
 	stackDepth       = 127 // Always needs to be in sync with MAX_STACK_DEPTH in parca-agent.bpf.c
 	doubleStackDepth = 254
@@ -118,9 +120,51 @@ func (m bpfMaps) clean() error {
 	return nil
 }

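+// metrics groups the profiler's per-target Prometheus collectors so they can
+// be registered on construction and unregistered together when the profiler
+// stops.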
+type metrics struct {
+	reg prometheus.Registerer
+
+	missingStacks                *prometheus.CounterVec
+	missingPIDs                  prometheus.Counter
+	failedStackUnwindingAttempts *prometheus.CounterVec
+}
+
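+// unregister removes every collector from the registry and reports whether
+// all of them were removed. Note that && short-circuits: if an early
+// Unregister returns false, the remaining collectors are not unregistered.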
+func (m metrics) unregister() bool {
+	return m.reg.Unregister(m.missingStacks) &&
+		m.reg.Unregister(m.missingPIDs) &&
+		m.reg.Unregister(m.failedStackUnwindingAttempts)
+}
+
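+// newMetrics builds the profiler's counters and registers them with reg,
+// attaching the profiling target as a constant label so multiple profilers
+// can share one registry.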
+func newMetrics(reg prometheus.Registerer, target model.LabelSet) *metrics {
+	return &metrics{
+		reg: reg,
+		missingStacks: promauto.With(reg).NewCounterVec(
+			prometheus.CounterOpts{
+				Name:        "parca_agent_profiler_missing_stacks_total",
+				Help:        "Number of missing profile stacks",
+				ConstLabels: map[string]string{"target": target.String()},
+			},
+			[]string{"type"},
+		),
+		missingPIDs: promauto.With(reg).NewCounter(
+			prometheus.CounterOpts{
+				Name:        "parca_agent_profiler_missing_pid_total",
+				Help:        "Number of missing PIDs",
+				ConstLabels: map[string]string{"target": target.String()},
+			},
+		),
+		failedStackUnwindingAttempts: promauto.With(reg).NewCounterVec(
+			prometheus.CounterOpts{
+				Name:        "parca_agent_profiler_failed_stack_unwinding_attempts_total",
+				Help:        "Number of failed stack unwinding attempts",
+				ConstLabels: map[string]string{"target": target.String()},
+			},
+			[]string{"type"},
+		),
+	}
+}
+
 type CgroupProfiler struct {
 	logger log.Logger
-	reg    prometheus.Registerer

 	mtx    *sync.RWMutex
 	cancel func()
@@ -130,9 +174,9 @@ type CgroupProfiler struct {
 	ksymCache *ksym.Cache
 	objCache  objectfile.Cache

-	bpfMaps *bpfMaps
+	bpfMaps   *bpfMaps
+	byteOrder binary.ByteOrder

-	missingStacks *prometheus.CounterVec
 	lastError          error
 	lastProfileTakenAt time.Time

@@ -143,6 +187,8 @@ type CgroupProfiler struct {
 	profilingDuration time.Duration

 	profileBufferPool sync.Pool
+
+	metrics *metrics
 }

 func NewCgroupProfiler(
@@ -158,7 +204,6 @@ func NewCgroupProfiler(
 ) *CgroupProfiler {
 	return &CgroupProfiler{
 		logger: log.With(logger, "labels", target.String()),
-		reg:    reg,
 		mtx:    &sync.RWMutex{},
 		target: target,
 		profilingDuration: profilingDuration,
@@ -172,19 +217,13 @@ func NewCgroupProfiler(
 			debugInfoClient,
 			tmp,
 		),
-		missingStacks: promauto.With(reg).NewCounterVec(
-			prometheus.CounterOpts{
-				Name:        "parca_agent_profiler_missing_stacks_total",
-				Help:        "Number of missing profile stacks",
-				ConstLabels: map[string]string{"target": target.String()},
-			},
-			[]string{"type"},
-		),
 		profileBufferPool: sync.Pool{
 			New: func() interface{} {
 				return bytes.NewBuffer(nil)
 			},
 		},
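+		// Determine the host byte order once at construction; profileLoop and
+		// the stack readers reuse it to decode BPF map keys and values.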
+		byteOrder: byteorder.GetHostByteOrder(),
+		metrics:   newMetrics(reg, target),
 	}
 }

@@ -212,8 +251,8 @@ func (p *CgroupProfiler) Stop() {
 	p.mtx.Lock()
 	defer p.mtx.Unlock()
 	level.Debug(p.logger).Log("msg", "stopping cgroup profiler")
-	if !p.reg.Unregister(p.missingStacks) {
-		level.Debug(p.logger).Log("msg", "cannot unregister metric")
+	if !p.metrics.unregister() {
+		level.Debug(p.logger).Log("msg", "cannot unregister metrics")
 	}
 	if p.cancel != nil {
 		p.cancel()
@@ -366,64 +405,55 @@ func (p *CgroupProfiler) profileLoop(ctx context.Context, captureTime time.Time)
 	samples := map[[doubleStackDepth]uint64]*profile.Sample{}

 	it := p.bpfMaps.counts.Iterator()
-	byteOrder := byteorder.GetHostByteOrder()

 	// TODO(brancz): Use libbpf batch functions.
 	for it.Next() {
 		// This byte slice is only valid for this iteration, so it must be
 		// copied if we want to do anything with it outside of this loop.
 		keyBytes := it.Key()
-
 		r := bytes.NewBuffer(keyBytes)

 		pidBytes := make([]byte, 4)
 		if _, err := io.ReadFull(r, pidBytes); err != nil {
 			return fmt.Errorf("read pid bytes: %w", err)
 		}
-		pid := byteOrder.Uint32(pidBytes)
-
-		userStackIDBytes := make([]byte, 4)
-		if _, err := io.ReadFull(r, userStackIDBytes); err != nil {
-			return fmt.Errorf("read user stack ID bytes: %w", err)
+		pid := p.byteOrder.Uint32(pidBytes)
+		if pid == 0 {
+			level.Debug(p.logger).Log("msg", "missing pid")
+			p.metrics.missingPIDs.Inc()
+			continue
 		}
-		userStackID := int32(byteOrder.Uint32(userStackIDBytes))

-		kernelStackIDBytes := make([]byte, 4)
-		if _, err := io.ReadFull(r, kernelStackIDBytes); err != nil {
-			return fmt.Errorf("read kernel stack ID bytes: %w", err)
+		// Twice the stack depth because we have a user and a potential kernel stack.
+		// Read order matters, since we read from the key buffer.
+		stack := [doubleStackDepth]uint64{}
+		userErr := p.readUserStack(r, &stack)
+		if userErr != nil {
+			if errors.Is(userErr, errUnrecoverable) {
+				return userErr
+			}
+			level.Debug(p.logger).Log("msg", "failed to read user stack", "err", userErr)
 		}
-		kernelStackID := int32(byteOrder.Uint32(kernelStackIDBytes))
-
-		valueBytes, err := p.bpfMaps.counts.GetValue(unsafe.Pointer(&keyBytes[0]))
-		if err != nil {
-			return fmt.Errorf("get count value: %w", err)
+		kernelErr := p.readKernelStack(r, &stack)
+		if kernelErr != nil {
+			if errors.Is(kernelErr, errUnrecoverable) {
+				return kernelErr
+			}
+			level.Debug(p.logger).Log("msg", "failed to read kernel stack", "err", kernelErr)
 		}
-		value := byteOrder.Uint64(valueBytes)
-
-		stackBytes, err := p.bpfMaps.stackTraces.GetValue(unsafe.Pointer(&userStackID))
-		if err != nil {
-			p.missingStacks.WithLabelValues("user").Inc()
+		if userErr != nil && kernelErr != nil {
+			// Both stacks are missing. Nothing to do.
 			continue
 		}

-		// Twice the stack depth because we have a user and a potential Kernel stack.
-		stack := [doubleStackDepth]uint64{}
-		err = binary.Read(bytes.NewBuffer(stackBytes), byteOrder, stack[:stackDepth])
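+		// The count value is how many times this (PID, user stack, kernel
+		// stack) combination was sampled.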
+		value, err := p.readValue(keyBytes)
 		if err != nil {
-			return fmt.Errorf("read user stack trace: %w", err)
+			return fmt.Errorf("read value: %w", err)
 		}
-
-		if kernelStackID >= 0 {
-			stackBytes, err = p.bpfMaps.stackTraces.GetValue(unsafe.Pointer(&kernelStackID))
-			if err != nil {
-				p.missingStacks.WithLabelValues("kernel").Inc()
-				continue
-			}
-
-			err = binary.Read(bytes.NewBuffer(stackBytes), byteOrder, stack[stackDepth:])
-			if err != nil {
-				return fmt.Errorf("read kernel stack trace: %w", err)
-			}
+		if value == 0 {
+			// This should never happen, but it's here just in case.
+			// If we have a zero value, we don't want to add it to the profile.
+			continue
 		}

 		sample, ok := samples[stack]
@@ -587,6 +617,64 @@ func (p *CgroupProfiler) profileLoop(ctx context.Context, captureTime time.Time)
 	return nil
 }

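+// readUserStack reads the user stack ID from the key buffer and, if a trace
+// with that ID exists in the BPF stack traces map, decodes it into the first
+// half of stack. Errors that wrap errUnrecoverable signal that the caller
+// should abort the profile loop instead of skipping the sample.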
+func (p *CgroupProfiler) readUserStack(r *bytes.Buffer, stack *[254]uint64) error {
+	userStackIDBytes := make([]byte, 4)
+	if _, err := io.ReadFull(r, userStackIDBytes); err != nil {
+		return fmt.Errorf("read user stack bytes, %s: %w", err, errUnrecoverable)
+	}
+
+	userStackID := int32(p.byteOrder.Uint32(userStackIDBytes))
+	if userStackID == 0 {
+		p.metrics.failedStackUnwindingAttempts.WithLabelValues("user").Inc()
+		return errors.New("user stack ID is 0, probably stack unwinding failed")
+	}
+
+	stackBytes, err := p.bpfMaps.stackTraces.GetValue(unsafe.Pointer(&userStackID))
+	if err != nil {
+		p.metrics.missingStacks.WithLabelValues("user").Inc()
+		return fmt.Errorf("read user stack trace: %w", err)
+	}
+
+	if err := binary.Read(bytes.NewBuffer(stackBytes), p.byteOrder, stack[:stackDepth]); err != nil {
+		return fmt.Errorf("read user stack bytes, %s: %w", err, errUnrecoverable)
+	}
+
+	return nil
+}
+
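+// readKernelStack is the kernel-side counterpart of readUserStack: it reads
+// the kernel stack ID from the key buffer and decodes the trace into the
+// second half of stack.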
+func (p *CgroupProfiler) readKernelStack(r *bytes.Buffer, stack *[254]uint64) error {
+	kernelStackIDBytes := make([]byte, 4)
+	if _, err := io.ReadFull(r, kernelStackIDBytes); err != nil {
+		return fmt.Errorf("read kernel stack bytes, %s: %w", err, errUnrecoverable)
+	}
+
+	kernelStackID := int32(p.byteOrder.Uint32(kernelStackIDBytes))
+	if kernelStackID == 0 {
+		p.metrics.failedStackUnwindingAttempts.WithLabelValues("kernel").Inc()
+		return errors.New("kernel stack ID is 0, probably stack unwinding failed")
+	}
+
+	stackBytes, err := p.bpfMaps.stackTraces.GetValue(unsafe.Pointer(&kernelStackID))
+	if err != nil {
+		p.metrics.missingStacks.WithLabelValues("kernel").Inc()
+		return fmt.Errorf("read kernel stack trace: %w", err)
+	}
+
+	if err := binary.Read(bytes.NewBuffer(stackBytes), p.byteOrder, stack[stackDepth:]); err != nil {
+		return fmt.Errorf("read kernel stack bytes, %s: %w", err, errUnrecoverable)
+	}
+
+	return nil
+}
+
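+// readValue fetches the observation count stored under this key in the counts
+// map and decodes it using the host byte order.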
+func (p *CgroupProfiler) readValue(keyBytes []byte) (uint64, error) {
+	valueBytes, err := p.bpfMaps.counts.GetValue(unsafe.Pointer(&keyBytes[0]))
+	if err != nil {
+		return 0, fmt.Errorf("get count value: %w", err)
+	}
+	return p.byteOrder.Uint64(valueBytes), nil
+}
+
 func (p *CgroupProfiler) normalizeAddress(m *profile.Mapping, pid uint32, addr uint64) uint64 {
 	if m == nil {
 		return addr