Skip to content

Commit 433f7ae

Browse files
mwearAlex Boten
and
Alex Boten
authored
Automate status reporting on start (#8836)
This is part of the continued component status reporting effort. Currently we have automated status reporting for the following component lifecycle events: `Starting`, `Stopping`, `Stopped` as well as definitive errors that occur in the starting or stopping process (e.g. as determined by an error return value). This leaves the responsibility to the component to report runtime status after start and before stop. We'd like to be able to extend the automatic status reporting to report `StatusOK` if `Start` completes without an error. One complication with this approach is that some components spawn async work (via goroutines) that, depending on the Go scheduler, can report status before `Start` returns. As such, we cannot assume a nil return value from `Start` means the component has started properly. The solution is to detect if the component has already reported status when start returns, if it has, we will use the component-reported status and will not automatically report status. If it hasn't, and `Start` returns without an error, we can report `StatusOK`. Any subsequent reports from the component (async or otherwise) will transition the component status accordingly. The tl;dr is that we cannot control the execution of async code, that's up to the Go scheduler, but we can handle the race, report the status based on the execution, and not clobber status reported from within the component during the startup process. That said, for components with async starts, you may see a `StatusOK` before the component-reported status, or just the component-reported status depending on the actual execution of the code. In both cases, the end status will be same. The work in this PR will allow us to simplify #8684 and #8788 and ultimately choose which direction we want to go for runtime status reporting. **Link to tracking Issue:** #7682 **Testing:** units / manual --------- Co-authored-by: Alex Boten <[email protected]>
1 parent 1a4ed9e commit 433f7ae

15 files changed

+346
-112
lines changed
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver)
7+
component: statusreporting
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Automates status reporting upon the completion of component.Start().
11+
12+
# One or more tracking issues or pull requests related to the change
13+
issues: [7682]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# Optional: The change log or logs in which this entry should be included.
21+
# e.g. '[user]' or '[user, api]'
22+
# Include 'user' if the change is relevant to end users.
23+
# Include 'api' if there is a change to a library API.
24+
# Default: '[user]'
25+
change_logs: []

component/telemetry.go

+37-4
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,43 @@ import (
1212
"go.opentelemetry.io/collector/pdata/pcommon"
1313
)
1414

15+
// TelemetrySettings provides components with APIs to report telemetry.
16+
//
17+
// Note: there is a service version of this struct, servicetelemetry.TelemetrySettings, that mirrors
18+
// this struct with the exception of ReportComponentStatus. When adding or removing anything from
19+
// this struct consider whether or not the same should be done for the service version.
20+
type TelemetrySettings struct {
21+
// Logger that the factory can use during creation and can pass to the created
22+
// component to be used later as well.
23+
Logger *zap.Logger
24+
25+
// TracerProvider that the factory can pass to other instrumented third-party libraries.
26+
TracerProvider trace.TracerProvider
27+
28+
// MeterProvider that the factory can pass to other instrumented third-party libraries.
29+
MeterProvider metric.MeterProvider
30+
31+
// MetricsLevel controls the level of detail for metrics emitted by the collector.
32+
// Experimental: *NOTE* this field is experimental and may be changed or removed.
33+
MetricsLevel configtelemetry.Level
34+
35+
// Resource contains the resource attributes for the collector's telemetry.
36+
Resource pcommon.Resource
37+
38+
// ReportComponentStatus allows a component to report runtime changes in status. The service
39+
// will automatically report status for a component during startup and shutdown. Components can
40+
// use this method to report status after start and before shutdown. ReportComponentStatus
41+
// will only return errors if the API used incorrectly. The two scenarios where an error will
42+
// be returned are:
43+
//
44+
// - An illegal state transition
45+
// - Calling this method before component startup
46+
//
47+
// If the API is being used properly, these errors are safe to ignore.
48+
ReportComponentStatus StatusFunc
49+
}
50+
51+
// Deprecated: [0.91.0] Use TelemetrySettings directly
1552
type TelemetrySettingsBase[T any] struct {
1653
// Logger that the factory can use during creation and can pass to the created
1754
// component to be used later as well.
@@ -42,7 +79,3 @@ type TelemetrySettingsBase[T any] struct {
4279
// If the API is being used properly, these errors are safe to ignore.
4380
ReportComponentStatus T
4481
}
45-
46-
// TelemetrySettings and servicetelemetry.Settings differ in the method signature for
47-
// ReportComponentStatus
48-
type TelemetrySettings TelemetrySettingsBase[StatusFunc]

internal/sharedcomponent/sharedcomponent.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,11 @@ func (scs *SharedComponents[K, V]) GetOrAdd(key K, create func() (V, error), tel
3636
c.seenSettings[telemetrySettings] = struct{}{}
3737
prev := c.telemetry.ReportComponentStatus
3838
c.telemetry.ReportComponentStatus = func(ev *component.StatusEvent) error {
39-
if err := telemetrySettings.ReportComponentStatus(ev); err != nil {
40-
return err
39+
err := telemetrySettings.ReportComponentStatus(ev)
40+
if prevErr := prev(ev); prevErr != nil {
41+
err = prevErr
4142
}
42-
return prev(ev)
43+
return err
4344
}
4445
}
4546
return c, nil

otelcol/collector_test.go

+19-5
Original file line numberDiff line numberDiff line change
@@ -182,11 +182,24 @@ func TestComponentStatusWatcher(t *testing.T) {
182182
// Start the newly created collector.
183183
wg := startCollector(context.Background(), t, col)
184184

185-
// An unhealthy processor asynchronously reports a recoverable error.
186-
expectedStatuses := []component.Status{
185+
// An unhealthy processor asynchronously reports a recoverable error. Depending on the Go
186+
// Scheduler the statuses reported at startup will be one of the two valid sequnces below.
187+
startupStatuses1 := []component.Status{
187188
component.StatusStarting,
189+
component.StatusOK,
188190
component.StatusRecoverableError,
189191
}
192+
startupStatuses2 := []component.Status{
193+
component.StatusStarting,
194+
component.StatusRecoverableError,
195+
}
196+
// the modulus of the actual statuses will match the modulus of the startup statuses
197+
startupStatuses := func(actualStatuses []component.Status) []component.Status {
198+
if len(actualStatuses)%2 == 1 {
199+
return startupStatuses1
200+
}
201+
return startupStatuses2
202+
}
190203

191204
// The "unhealthy" processors will now begin to asynchronously report StatusRecoverableError.
192205
// We expect to see these reports.
@@ -197,8 +210,8 @@ func TestComponentStatusWatcher(t *testing.T) {
197210
for k, v := range changedComponents {
198211
// All processors must report a status change with the same ID
199212
assert.EqualValues(t, component.NewID(unhealthyProcessorFactory.Type()), k.ID)
200-
// And all must have the expected statuses
201-
assert.Equal(t, expectedStatuses, v)
213+
// And all must have a valid startup sequence
214+
assert.Equal(t, startupStatuses(v), v)
202215
}
203216
// We have 3 processors with exactly the same ID in otelcol-statuswatcher.yaml
204217
// We must have exactly 3 items in our map. This ensures that the "source" argument
@@ -212,8 +225,9 @@ func TestComponentStatusWatcher(t *testing.T) {
212225
wg.Wait()
213226

214227
// Check for additional statuses after Shutdown.
215-
expectedStatuses = append(expectedStatuses, component.StatusStopping, component.StatusStopped)
216228
for _, v := range changedComponents {
229+
expectedStatuses := append([]component.Status{}, startupStatuses(v)...)
230+
expectedStatuses = append(expectedStatuses, component.StatusStopping, component.StatusStopped)
217231
assert.Equal(t, expectedStatuses, v)
218232
}
219233

service/extensions/extensions.go

+21-5
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,18 @@ func (bes *Extensions) Start(ctx context.Context, host component.Host) error {
3737
extLogger.Info("Extension is starting...")
3838
instanceID := bes.instanceIDs[extID]
3939
ext := bes.extMap[extID]
40-
_ = bes.telemetry.ReportComponentStatus(instanceID, component.NewStatusEvent(component.StatusStarting))
40+
_ = bes.telemetry.Status.ReportComponentStatus(
41+
instanceID,
42+
component.NewStatusEvent(component.StatusStarting),
43+
)
4144
if err := ext.Start(ctx, components.NewHostWrapper(host, extLogger)); err != nil {
42-
_ = bes.telemetry.ReportComponentStatus(instanceID, component.NewPermanentErrorEvent(err))
45+
_ = bes.telemetry.Status.ReportComponentStatus(
46+
instanceID,
47+
component.NewPermanentErrorEvent(err),
48+
)
4349
return err
4450
}
51+
_ = bes.telemetry.Status.ReportComponentOKIfStarting(instanceID)
4552
extLogger.Info("Extension started.")
4653
}
4754
return nil
@@ -55,13 +62,22 @@ func (bes *Extensions) Shutdown(ctx context.Context) error {
5562
extID := bes.extensionIDs[i]
5663
instanceID := bes.instanceIDs[extID]
5764
ext := bes.extMap[extID]
58-
_ = bes.telemetry.ReportComponentStatus(instanceID, component.NewStatusEvent(component.StatusStopping))
65+
_ = bes.telemetry.Status.ReportComponentStatus(
66+
instanceID,
67+
component.NewStatusEvent(component.StatusStopping),
68+
)
5969
if err := ext.Shutdown(ctx); err != nil {
60-
_ = bes.telemetry.ReportComponentStatus(instanceID, component.NewPermanentErrorEvent(err))
70+
_ = bes.telemetry.Status.ReportComponentStatus(
71+
instanceID,
72+
component.NewPermanentErrorEvent(err),
73+
)
6174
errs = multierr.Append(errs, err)
6275
continue
6376
}
64-
_ = bes.telemetry.ReportComponentStatus(instanceID, component.NewStatusEvent(component.StatusStopped))
77+
_ = bes.telemetry.Status.ReportComponentStatus(
78+
instanceID,
79+
component.NewStatusEvent(component.StatusStopped),
80+
)
6581
}
6682

6783
return errs

service/extensions/extensions_test.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,7 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
381381
name: "successful startup/shutdown",
382382
expectedStatuses: []*component.StatusEvent{
383383
component.NewStatusEvent(component.StatusStarting),
384+
component.NewStatusEvent(component.StatusOK),
384385
component.NewStatusEvent(component.StatusStopping),
385386
component.NewStatusEvent(component.StatusStopped),
386387
},
@@ -400,6 +401,7 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
400401
name: "shutdown error",
401402
expectedStatuses: []*component.StatusEvent{
402403
component.NewStatusEvent(component.StatusStarting),
404+
component.NewStatusEvent(component.StatusOK),
403405
component.NewStatusEvent(component.StatusStopping),
404406
component.NewPermanentErrorEvent(assert.AnError),
405407
},
@@ -430,11 +432,11 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
430432
assert.NoError(t, err)
431433

432434
var actualStatuses []*component.StatusEvent
433-
init, statusFunc := status.NewServiceStatusFunc(func(id *component.InstanceID, ev *component.StatusEvent) {
435+
rep := status.NewReporter(func(id *component.InstanceID, ev *component.StatusEvent) {
434436
actualStatuses = append(actualStatuses, ev)
435437
})
436-
extensions.telemetry.ReportComponentStatus = statusFunc
437-
init()
438+
extensions.telemetry.Status = rep
439+
rep.Ready()
438440

439441
assert.Equal(t, tc.startErr, extensions.Start(context.Background(), componenttest.NewNopHost()))
440442
if tc.startErr == nil {

service/internal/graph/graph.go

+22-5
Original file line numberDiff line numberDiff line change
@@ -386,12 +386,20 @@ func (g *Graph) StartAll(ctx context.Context, host component.Host) error {
386386
}
387387

388388
instanceID := g.instanceIDs[node.ID()]
389-
_ = g.telemetry.ReportComponentStatus(instanceID, component.NewStatusEvent(component.StatusStarting))
389+
_ = g.telemetry.Status.ReportComponentStatus(
390+
instanceID,
391+
component.NewStatusEvent(component.StatusStarting),
392+
)
390393

391394
if compErr := comp.Start(ctx, host); compErr != nil {
392-
_ = g.telemetry.ReportComponentStatus(instanceID, component.NewPermanentErrorEvent(compErr))
395+
_ = g.telemetry.Status.ReportComponentStatus(
396+
instanceID,
397+
component.NewPermanentErrorEvent(compErr),
398+
)
393399
return compErr
394400
}
401+
402+
_ = g.telemetry.Status.ReportComponentOKIfStarting(instanceID)
395403
}
396404
return nil
397405
}
@@ -417,15 +425,24 @@ func (g *Graph) ShutdownAll(ctx context.Context) error {
417425
}
418426

419427
instanceID := g.instanceIDs[node.ID()]
420-
_ = g.telemetry.ReportComponentStatus(instanceID, component.NewStatusEvent(component.StatusStopping))
428+
_ = g.telemetry.Status.ReportComponentStatus(
429+
instanceID,
430+
component.NewStatusEvent(component.StatusStopping),
431+
)
421432

422433
if compErr := comp.Shutdown(ctx); compErr != nil {
423434
errs = multierr.Append(errs, compErr)
424-
_ = g.telemetry.ReportComponentStatus(instanceID, component.NewPermanentErrorEvent(compErr))
435+
_ = g.telemetry.Status.ReportComponentStatus(
436+
instanceID,
437+
component.NewPermanentErrorEvent(compErr),
438+
)
425439
continue
426440
}
427441

428-
_ = g.telemetry.ReportComponentStatus(instanceID, component.NewStatusEvent(component.StatusStopped))
442+
_ = g.telemetry.Status.ReportComponentStatus(
443+
instanceID,
444+
component.NewStatusEvent(component.StatusStopped),
445+
)
429446
}
430447
return errs
431448
}

service/internal/graph/graph_test.go

+10-3
Original file line numberDiff line numberDiff line change
@@ -2163,11 +2163,13 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
21632163
expectedStatuses: map[*component.InstanceID][]*component.StatusEvent{
21642164
instanceIDs[rNoErr]: {
21652165
component.NewStatusEvent(component.StatusStarting),
2166+
component.NewStatusEvent(component.StatusOK),
21662167
component.NewStatusEvent(component.StatusStopping),
21672168
component.NewStatusEvent(component.StatusStopped),
21682169
},
21692170
instanceIDs[eNoErr]: {
21702171
component.NewStatusEvent(component.StatusStarting),
2172+
component.NewStatusEvent(component.StatusOK),
21712173
component.NewStatusEvent(component.StatusStopping),
21722174
component.NewStatusEvent(component.StatusStopped),
21732175
},
@@ -2194,6 +2196,7 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
21942196
},
21952197
instanceIDs[eNoErr]: {
21962198
component.NewStatusEvent(component.StatusStarting),
2199+
component.NewStatusEvent(component.StatusOK),
21972200
component.NewStatusEvent(component.StatusStopping),
21982201
component.NewStatusEvent(component.StatusStopped),
21992202
},
@@ -2206,11 +2209,13 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
22062209
expectedStatuses: map[*component.InstanceID][]*component.StatusEvent{
22072210
instanceIDs[rSdErr]: {
22082211
component.NewStatusEvent(component.StatusStarting),
2212+
component.NewStatusEvent(component.StatusOK),
22092213
component.NewStatusEvent(component.StatusStopping),
22102214
component.NewPermanentErrorEvent(assert.AnError),
22112215
},
22122216
instanceIDs[eNoErr]: {
22132217
component.NewStatusEvent(component.StatusStarting),
2218+
component.NewStatusEvent(component.StatusOK),
22142219
component.NewStatusEvent(component.StatusStopping),
22152220
component.NewStatusEvent(component.StatusStopped),
22162221
},
@@ -2223,11 +2228,13 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
22232228
expectedStatuses: map[*component.InstanceID][]*component.StatusEvent{
22242229
instanceIDs[rNoErr]: {
22252230
component.NewStatusEvent(component.StatusStarting),
2231+
component.NewStatusEvent(component.StatusOK),
22262232
component.NewStatusEvent(component.StatusStopping),
22272233
component.NewStatusEvent(component.StatusStopped),
22282234
},
22292235
instanceIDs[eSdErr]: {
22302236
component.NewStatusEvent(component.StatusStarting),
2237+
component.NewStatusEvent(component.StatusOK),
22312238
component.NewStatusEvent(component.StatusStopping),
22322239
component.NewPermanentErrorEvent(assert.AnError),
22332240
},
@@ -2240,12 +2247,12 @@ func TestStatusReportedOnStartupShutdown(t *testing.T) {
22402247
pg.telemetry = servicetelemetry.NewNopTelemetrySettings()
22412248

22422249
actualStatuses := make(map[*component.InstanceID][]*component.StatusEvent)
2243-
init, statusFunc := status.NewServiceStatusFunc(func(id *component.InstanceID, ev *component.StatusEvent) {
2250+
rep := status.NewReporter(func(id *component.InstanceID, ev *component.StatusEvent) {
22442251
actualStatuses[id] = append(actualStatuses[id], ev)
22452252
})
22462253

2247-
pg.telemetry.ReportComponentStatus = statusFunc
2248-
init()
2254+
pg.telemetry.Status = rep
2255+
rep.Ready()
22492256

22502257
e0, e1 := tc.edge[0], tc.edge[1]
22512258
pg.instanceIDs = map[int64]*component.InstanceID{

service/internal/servicetelemetry/nop_telemetry_settings.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"go.opentelemetry.io/collector/component"
1212
"go.opentelemetry.io/collector/config/configtelemetry"
1313
"go.opentelemetry.io/collector/pdata/pcommon"
14+
"go.opentelemetry.io/collector/service/internal/status"
1415
)
1516

1617
// NewNopTelemetrySettings returns a new nop settings for Create* functions.
@@ -21,8 +22,6 @@ func NewNopTelemetrySettings() TelemetrySettings {
2122
MeterProvider: noopmetric.NewMeterProvider(),
2223
MetricsLevel: configtelemetry.LevelNone,
2324
Resource: pcommon.NewResource(),
24-
ReportComponentStatus: func(*component.InstanceID, *component.StatusEvent) error {
25-
return nil
26-
},
25+
Status: status.NewReporter(func(*component.InstanceID, *component.StatusEvent) {}),
2726
}
2827
}

service/internal/servicetelemetry/nop_telemetry_settings_test.go

+8-2
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,19 @@ import (
1818

1919
func TestNewNopSettings(t *testing.T) {
2020
set := NewNopTelemetrySettings()
21-
21+
set.Status.Ready()
2222
require.NotNil(t, set)
2323
require.IsType(t, TelemetrySettings{}, set)
2424
require.Equal(t, zap.NewNop(), set.Logger)
2525
require.Equal(t, nooptrace.NewTracerProvider(), set.TracerProvider)
2626
require.Equal(t, noopmetric.NewMeterProvider(), set.MeterProvider)
2727
require.Equal(t, configtelemetry.LevelNone, set.MetricsLevel)
2828
require.Equal(t, pcommon.NewResource(), set.Resource)
29-
require.NoError(t, set.ReportComponentStatus(&component.InstanceID{}, component.NewStatusEvent(component.StatusStarting)))
29+
require.NoError(t,
30+
set.Status.ReportComponentStatus(
31+
&component.InstanceID{},
32+
component.NewStatusEvent(component.StatusStarting),
33+
),
34+
)
35+
require.NoError(t, set.Status.ReportComponentOKIfStarting(&component.InstanceID{}))
3036
}

0 commit comments

Comments
 (0)