Skip to content

Commit 43c35b3

Browse files
committed
store/serverservice: return errors with a bit more consistency
This ensures the worker can decide if the work needs to head back to the queue or is an uncorrectable error.
1 parent 9096a7f commit 43c35b3

File tree

4 files changed

+103
-73
lines changed

4 files changed

+103
-73
lines changed

internal/model/model.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package model
22

33
import (
4+
"errors"
45
"net"
56

67
"github.com/bmc-toolbox/common"
@@ -20,6 +21,10 @@ type (
2021
CollectorError string
2122
)
2223

24+
var (
25+
ErrInventoryQuery = errors.New("inventory query returned error")
26+
)
27+
2328
const (
2429
AppName = "alloy"
2530
AppKindInband AppKind = "inband"

internal/store/serverservice/errors.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import "github.com/pkg/errors"
44

55
var (
66
ErrSlugs = errors.New("slugs error")
7-
ErrServerServiceQuery = errors.New("error in server service API query")
87
ErrServerServiceRegisterChanges = errors.New("error in server service API register changes")
98
ErrAssetObject = errors.New("asset object error")
109
ErrAssetObjectConversion = errors.New("error converting asset object")

internal/store/serverservice/serverservice.go

Lines changed: 50 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ func (r *Store) AssetByID(ctx context.Context, id string, fetchBmcCredentials bo
107107
if err != nil {
108108
span.SetStatus(codes.Error, "Get() server failed")
109109

110-
return nil, errors.Wrap(ErrServerServiceQuery, "error querying server attributes: "+err.Error())
110+
return nil, errors.Wrap(model.ErrInventoryQuery, "error querying server attributes: "+err.Error())
111111
}
112112

113113
var credential *serverserviceapi.ServerCredential
@@ -120,7 +120,7 @@ func (r *Store) AssetByID(ctx context.Context, id string, fetchBmcCredentials bo
120120
if err != nil {
121121
span.SetStatus(codes.Error, "GetCredential() failed")
122122

123-
return nil, errors.Wrap(ErrServerServiceQuery, "error querying BMC credentials: "+err.Error())
123+
return nil, errors.Wrap(model.ErrInventoryQuery, "error querying BMC credentials: "+err.Error())
124124
}
125125
}
126126

@@ -157,7 +157,7 @@ func (r *Store) AssetsByOffsetLimit(ctx context.Context, offset, limit int) (ass
157157
if err != nil {
158158
span.SetStatus(codes.Error, "List() servers failed")
159159

160-
return nil, 0, errors.Wrap(ErrServerServiceQuery, err.Error())
160+
return nil, 0, errors.Wrap(model.ErrInventoryQuery, err.Error())
161161
}
162162

163163
assets = make([]*model.Asset, 0, len(servers))
@@ -168,7 +168,7 @@ func (r *Store) AssetsByOffsetLimit(ctx context.Context, offset, limit int) (ass
168168
if err != nil {
169169
span.SetStatus(codes.Error, "GetCredential() failed")
170170

171-
return nil, 0, errors.Wrap(ErrServerServiceQuery, err.Error())
171+
return nil, 0, errors.Wrap(model.ErrInventoryQuery, err.Error())
172172
}
173173

174174
asset, err := toAsset(server, credential, true)
@@ -213,7 +213,7 @@ func (r *Store) AssetUpdate(ctx context.Context, asset *model.Asset) error {
213213
// count error
214214
metrics.ServerServiceQueryErrorCount.With(stageLabel).Inc()
215215

216-
return errors.Wrap(ErrServerServiceQuery, err.Error())
216+
return errors.Wrap(model.ErrInventoryQuery, err.Error())
217217
}
218218

219219
if server == nil {
@@ -223,28 +223,58 @@ func (r *Store) AssetUpdate(ctx context.Context, asset *model.Asset) error {
223223
"hr": hr,
224224
}).Warn("server service server query returned nil object")
225225

226-
return errors.Wrap(ErrServerServiceQuery, "got nil Server object")
226+
return errors.Wrap(model.ErrInventoryQuery, "got nil Server object")
227227
}
228228

229229
// publish server inventory
230-
if errPublish := r.publish(ctx, asset, server); errPublish != nil {
230+
if errPublishInv := r.publishInventory(ctx, asset, server); errPublishInv != nil {
231231
r.logger.WithFields(
232232
logrus.Fields{
233233
"id": id,
234234
"hr": hr,
235-
"err": errPublish.Error(),
236-
}).Warn("inventory asset insert/update error")
235+
"err": errPublishInv.Error(),
236+
}).Warn("asset inventory insert/update error")
237237

238238
metricInventorized.With(prometheus.Labels{"status": "failed"}).Add(1)
239+
240+
return errors.Wrap(model.ErrInventoryQuery, errPublishInv.Error())
239241
}
240242

241243
// count devices with no errors
242244
metricInventorized.With(prometheus.Labels{"status": "success"}).Add(1)
243245

246+
if errPublishBiosCfg := r.publishBiosConfig(ctx, asset, server); errPublishBiosCfg != nil {
247+
r.logger.WithFields(
248+
logrus.Fields{
249+
"id": server.UUID.String(),
250+
"err": errPublishBiosCfg,
251+
}).Warn("asset bios configuration insert/update error")
252+
253+
metricBiosCfgCollected.With(prometheus.Labels{"status": "failed"}).Add(1)
254+
255+
return errors.Wrap(model.ErrInventoryQuery, errPublishBiosCfg.Error())
256+
}
257+
258+
metricBiosCfgCollected.With(prometheus.Labels{"status": "success"}).Add(1)
259+
244260
return nil
245261
}
246262

247-
func (r *Store) publish(ctx context.Context, asset *model.Asset, server *serverserviceapi.Server) error {
263+
func (r *Store) publishBiosConfig(ctx context.Context, asset *model.Asset, server *serverserviceapi.Server) error {
264+
// Don't publish bios config if there's no bios config data made avai
265+
if len(asset.BiosConfig) == 0 {
266+
errNoBiosConfig := errors.New("no BIOS configuration collected")
267+
return errNoBiosConfig
268+
}
269+
270+
if asset.HasError(outofband.GetBiosConfigError) {
271+
return errors.New(string(outofband.GetBiosConfigError))
272+
}
273+
274+
return r.createUpdateServerBIOSConfiguration(ctx, server.UUID, asset.BiosConfig)
275+
}
276+
277+
func (r *Store) publishInventory(ctx context.Context, asset *model.Asset, server *serverserviceapi.Server) error {
248278
// create/update server bmc error attributes - for out of band data collection
249279
if r.appKind == model.AppKindOutOfBand && len(asset.Errors) > 0 {
250280
if err := r.createUpdateServerBMCErrorAttributes(
@@ -253,59 +283,32 @@ func (r *Store) publish(ctx context.Context, asset *model.Asset, server *servers
253283
attributeByNamespace(serverBMCErrorsAttributeNS, server.Attributes),
254284
asset,
255285
); err != nil {
256-
return errors.Wrap(ErrServerServiceQuery, "BMC error attribute create/update error: "+err.Error())
286+
return errors.Wrap(model.ErrInventoryQuery, "BMC error attribute create/update error: "+err.Error())
257287
}
258288

259289
// both inventory and BIOS configuration collection failed on a login failure
260290
if asset.HasError(outofband.LoginError) {
261291
return errors.New(string(outofband.LoginError))
262292
}
263-
}
264293

265-
// publish server inventory
266-
if !asset.HasError(outofband.InventoryError) {
267-
if errPublishInventory := r.publishInventory(ctx, server, asset); errPublishInventory != nil {
268-
return errPublishInventory
294+
if asset.HasError(outofband.InventoryError) {
295+
return errors.New(string(outofband.InventoryError))
269296
}
270297
}
271298

272-
// Don't publish bios config if there's no bios config data
273-
if len(asset.BiosConfig) == 0 {
274-
errNoBiosConfig := errors.New("no BIOS configuration collected")
275-
return errNoBiosConfig
276-
}
277-
278-
if asset.HasError(outofband.GetBiosConfigError) {
279-
return errors.New(string(outofband.GetBiosConfigError))
280-
}
281-
282-
// publish bios configuration
283-
err := r.createUpdateServerBIOSConfiguration(ctx, server.UUID, asset.BiosConfig)
284-
if err != nil {
285-
r.logger.WithFields(
286-
logrus.Fields{
287-
"id": server.UUID.String(),
288-
"err": err,
289-
}).Warn("error in server bios configuration versioned attribute update")
290-
}
291-
292-
return nil
293-
}
294-
295-
func (r *Store) publishInventory(ctx context.Context, server *serverserviceapi.Server, asset *model.Asset) error {
296299
// create/update server serial, vendor, model attributes
297300
if err := r.createUpdateServerAttributes(ctx, server, asset); err != nil {
298-
return errors.Wrap(ErrServerServiceQuery, "Server Vendor attribute create/update error: "+err.Error())
301+
return errors.Wrap(model.ErrInventoryQuery, "Server Vendor attribute create/update error: "+err.Error())
299302
}
300303

301304
// create update server metadata attributes
302305
if err := r.createUpdateServerMetadataAttributes(ctx, server.UUID, asset); err != nil {
303-
return errors.Wrap(ErrServerServiceQuery, "Server Metadata attribute create/update error: "+err.Error())
306+
return errors.Wrap(model.ErrInventoryQuery, "Server Metadata attribute create/update error: "+err.Error())
304307
}
305308

306309
// create update server component
307310
if err := r.createUpdateServerComponents(ctx, server.UUID, asset); err != nil {
308-
return errors.Wrap(ErrServerServiceQuery, "Server Component create/update error: "+err.Error())
311+
return errors.Wrap(model.ErrInventoryQuery, "Server Component create/update error: "+err.Error())
309312
}
310313

311314
return nil
@@ -345,7 +348,7 @@ func (r *Store) createUpdateServerComponents(ctx context.Context, serverID uuid.
345348
// set span status
346349
span.SetStatus(codes.Error, "GetComponents() failed")
347350

348-
return errors.Wrap(ErrServerServiceQuery, err.Error())
351+
return errors.Wrap(model.ErrInventoryQuery, err.Error())
349352
}
350353

351354
// For debugging and to capture test fixtures data.
@@ -372,7 +375,7 @@ func (r *Store) createUpdateServerComponents(ctx context.Context, serverID uuid.
372375
// identify changes to be applied
373376
add, update, remove, err := serverServiceChangeList(ctx, currentInventoryPtrSlice, newInventory)
374377
if err != nil {
375-
return errors.Wrap(ErrServerServiceQuery, err.Error())
378+
return errors.Wrap(model.ErrInventoryQuery, err.Error())
376379
}
377380

378381
if len(add) == 0 && len(update) == 0 && len(remove) == 0 {
@@ -409,7 +412,7 @@ func (r *Store) createUpdateServerComponents(ctx context.Context, serverID uuid.
409412
// set span status
410413
span.SetStatus(codes.Error, "CreateComponents() failed")
411414

412-
return errors.Wrap(ErrServerServiceQuery, "CreateComponents: "+err.Error())
415+
return errors.Wrap(model.ErrInventoryQuery, "CreateComponents: "+err.Error())
413416
}
414417
}
415418

@@ -433,7 +436,7 @@ func (r *Store) createUpdateServerComponents(ctx context.Context, serverID uuid.
433436
// set span status
434437
span.SetStatus(codes.Error, "UpdateComponents() failed")
435438

436-
return errors.Wrap(ErrServerServiceQuery, "UpdateComponents: "+err.Error())
439+
return errors.Wrap(model.ErrInventoryQuery, "UpdateComponents: "+err.Error())
437440
}
438441
}
439442

internal/worker/worker.go

Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,7 @@ var (
4949
errTaskFirmwareParam = errors.New("error in task firmware parameters")
5050
errInitTask = errors.New("error initializing new task from event")
5151

52-
errAssetNotFound = errors.New("asset not found in inventory")
53-
errInventoryQuery = errors.New("inventory query returned error")
52+
errAssetNotFound = errors.New("asset not found in inventory")
5453

5554
errCollector = errors.New("collector error")
5655
)
@@ -313,67 +312,91 @@ func (w *Worker) doWork(ctx context.Context, condition *rctypes.Condition, e eve
313312
// check no error
314313
err = w.runTaskWithMonitor(ctx, task, e)
315314
switch err {
316-
case errInventoryQuery:
317-
// inventory lookup failure - non 404 errors
318-
task.SetState(rctypes.Failed)
319-
task.Status = err.Error()
315+
case nil:
316+
// work completed successfully
317+
task.SetState(rctypes.Succeeded)
318+
task.Status = "collection completed successfully"
320319

321-
metrics.RegisterEventCounter(true, "nack")
322-
w.eventNak(e) // have the message bus re-deliver the message
320+
publisher.Publish(ctx, task)
321+
322+
w.eventAckComplete(e)
323323

324+
metrics.RegisterConditionMetrics(startTS, string(rctypes.Succeeded))
325+
metrics.RegisterEventCounter(true, "ack")
324326
metrics.RegisterSpanEvent(
325327
span,
326328
condition,
327329
w.id.String(),
328330
task.Parameters.AssetID.String(),
329-
"sent nack: store query error",
330-
err,
331+
"sent ack: condition finalized",
332+
nil,
331333
)
332334

333-
case errAssetNotFound, errCollector:
334-
// asset was not found
335+
publisher.Publish(ctx, task)
336+
337+
w.logger.WithFields(logrus.Fields{
338+
"serverID": task.Parameters.AssetID.String(),
339+
"conditionID": task.ID,
340+
"elapsed": time.Since(startTS).String(),
341+
"state": task.state,
342+
"status": task.Status,
343+
}).Info("task for device completed")
344+
345+
case model.ErrInventoryQuery:
346+
// inventory lookup failure - non 404 errors
335347
task.SetState(rctypes.Failed)
336348
task.Status = err.Error()
337349

338-
w.eventAckComplete(e)
350+
w.eventNak(e) // have the message bus re-deliver the message
351+
352+
publisher.Publish(ctx, task)
339353

354+
metrics.RegisterEventCounter(true, "nack")
355+
metrics.RegisterConditionMetrics(startTS, string(rctypes.Failed))
340356
metrics.RegisterSpanEvent(
341357
span,
342358
condition,
343359
w.id.String(),
344360
task.Parameters.AssetID.String(),
345-
"sent ack: error"+err.Error(),
361+
"sent nack: store query error",
346362
err,
347363
)
348364

349-
case nil:
350-
// work completed successfully
351-
task.SetState(rctypes.Succeeded)
352-
task.Status = "collection completed successfully"
365+
w.logger.WithFields(logrus.Fields{
366+
"serverID": task.Parameters.AssetID.String(),
367+
"conditionID": task.ID,
368+
"elapsed": time.Since(startTS).String(),
369+
"state": task.state,
370+
"status": task.Status,
371+
}).Info("task for device failed and will be retried")
353372

354-
publisher.Publish(ctx, task)
373+
default:
374+
// all other error cases
375+
task.SetState(rctypes.Failed)
376+
task.Status = err.Error()
355377

356378
w.eventAckComplete(e)
357379

358-
metrics.RegisterConditionMetrics(startTS, string(rctypes.Succeeded))
380+
publisher.Publish(ctx, task)
381+
382+
metrics.RegisterConditionMetrics(startTS, string(rctypes.Failed))
359383
metrics.RegisterEventCounter(true, "ack")
360384
metrics.RegisterSpanEvent(
361385
span,
362386
condition,
363387
w.id.String(),
364388
task.Parameters.AssetID.String(),
365-
"sent ack: condition finalized",
366-
nil,
389+
"sent ack: error"+err.Error(),
390+
err,
367391
)
368392

369-
publisher.Publish(ctx, task)
370393
w.logger.WithFields(logrus.Fields{
371394
"serverID": task.Parameters.AssetID.String(),
372395
"conditionID": task.ID,
373396
"elapsed": time.Since(startTS).String(),
374397
"state": task.state,
375398
"status": task.Status,
376-
}).Info("task for device completed")
399+
}).Info("task for device failed")
377400
}
378401
}
379402

@@ -446,7 +469,7 @@ func (w *Worker) inventoryOutofband(ctx context.Context, task *Task, doneCh chan
446469
return errors.Wrap(errAssetNotFound, err.Error())
447470
}
448471

449-
return errors.Wrap(errInventoryQuery, err.Error())
472+
return errors.Wrap(model.ErrInventoryQuery, err.Error())
450473
}
451474

452475
c, err := collector.NewDeviceCollectorWithStore(w.repository, w.cfg.AppKind, w.logger)

0 commit comments

Comments
 (0)