
Commit c2ac20f

status: Report the operators that have not yet deployed
The current single error return strategy from the CVO sync loop predates parallel payload execution and limited retries. Instead, collect all errors from the task execution graph and attempt to synthesize better messages that describe what is actually happening:

1. Filter out cancellation error messages - they aren't useful and are a normal part of execution.
2. When multiple errors are reported, display a reasonable multi-line error that summarizes any blockers.
3. Treat ClusterOperatorNotAvailable as a special case - if all reported errors are of that type, convert it to ClusterOperatorsNotAvailable and synthesize a better message.
4. In the sync loop, if we are still making progress towards the update goal, we haven't waited too long for the update, and the error is one of the cluster-operator-not-available types, display the condition Progressing=True with a synthetic message instead of Failing=True.

This also passes the task along with the UpdateError so that we can produce more selective error messages for specific error cases.
1 parent 1fd0041 commit c2ac20f
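
The actual implementation lives in pkg/cvo/sync_worker.go below; the following standalone sketch only illustrates the aggregation idea from the commit message (the updateError type, the summarize function, and the operator names are illustrative, not the repository's API). If every error is a ClusterOperatorNotAvailable, the names are merged into one ClusterOperatorsNotAvailable message; otherwise the distinct messages become a multi-line MultipleErrors summary.

package main

import (
	"fmt"
	"sort"
	"strings"
)

// updateError is a stand-in for payload.UpdateError in this sketch.
type updateError struct {
	Reason  string
	Name    string
	Message string
}

func (e *updateError) Error() string { return e.Message }

// summarize collapses a slice of errors: all-ClusterOperatorNotAvailable inputs
// become a single ClusterOperatorsNotAvailable error naming every operator,
// anything else becomes a multi-line "multiple errors" message.
func summarize(errs []error) error {
	if len(errs) == 0 {
		return nil
	}
	if len(errs) == 1 {
		return errs[0]
	}
	names := make([]string, 0, len(errs))
	allNotAvailable := true
	for _, err := range errs {
		uErr, ok := err.(*updateError)
		if !ok || uErr.Reason != "ClusterOperatorNotAvailable" {
			allNotAvailable = false
			break
		}
		names = append(names, uErr.Name)
	}
	if allNotAvailable && len(names) > 0 {
		sort.Strings(names)
		return &updateError{
			Reason:  "ClusterOperatorsNotAvailable",
			Name:    strings.Join(names, ", "),
			Message: fmt.Sprintf("Some cluster operators are still updating: %s", strings.Join(names, ", ")),
		}
	}
	messages := make([]string, 0, len(errs))
	for _, err := range errs {
		messages = append(messages, err.Error())
	}
	sort.Strings(messages)
	return fmt.Errorf("Multiple errors are preventing progress:\n* %s", strings.Join(messages, "\n* "))
}

func main() {
	// Hypothetical operator names used only to show the merged message.
	err := summarize([]error{
		&updateError{Reason: "ClusterOperatorNotAvailable", Name: "openshift-apiserver", Message: "Cluster operator openshift-apiserver has not yet reported success"},
		&updateError{Reason: "ClusterOperatorNotAvailable", Name: "monitoring", Message: "Cluster operator monitoring has not yet reported success"},
	})
	fmt.Println(err) // Some cluster operators are still updating: monitoring, openshift-apiserver
}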

File tree

13 files changed: +545 -98 lines

pkg/cvo/cvo_scenarios_test.go

Lines changed: 262 additions & 62 deletions
Large diffs are not rendered by default.

pkg/cvo/status.go

Lines changed: 33 additions & 3 deletions
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/golang/glog"

@@ -156,6 +157,8 @@ func (optr *Operator) syncStatus(original, config *configv1.ClusterVersion, stat
 	now := metav1.Now()
 	version := versionString(status.Actual)

+	mergeOperatorHistory(config, status.Actual, now, status.Completed > 0)
+
 	// update validation errors
 	var reason string
 	if len(validationErrs) > 0 {
@@ -200,7 +203,9 @@ func (optr *Operator) syncStatus(original, config *configv1.ClusterVersion, stat
 		})
 	}

-	if err := status.Failure; err != nil {
+	progressReason, progressShortMessage, skipFailure := convertErrorToProgressing(config.Status.History, now.Time, status)
+
+	if err := status.Failure; err != nil && !skipFailure {
 		var reason string
 		msg := "an error occurred"
 		if uErr, ok := err.(*payload.UpdateError); ok {
@@ -258,13 +263,19 @@ func (optr *Operator) syncStatus(original, config *configv1.ClusterVersion, stat
 		switch {
 		case len(validationErrs) > 0:
 			message = fmt.Sprintf("Reconciling %s: the cluster version is invalid", version)
+		case status.Fraction > 0 && skipFailure:
+			reason = progressReason
+			message = fmt.Sprintf("Working towards %s: %.0f%% complete, %s", version, status.Fraction*100, progressShortMessage)
 		case status.Fraction > 0:
 			message = fmt.Sprintf("Working towards %s: %.0f%% complete", version, status.Fraction*100)
 		case status.Step == "RetrievePayload":
 			if len(reason) == 0 {
 				reason = "DownloadingUpdate"
 			}
 			message = fmt.Sprintf("Working towards %s: downloading update", version)
+		case skipFailure:
+			reason = progressReason
+			message = fmt.Sprintf("Working towards %s: %s", version, progressShortMessage)
 		default:
 			message = fmt.Sprintf("Working towards %s", version)
 		}
@@ -287,8 +298,6 @@ func (optr *Operator) syncStatus(original, config *configv1.ClusterVersion, stat
 		})
 	}

-	mergeOperatorHistory(config, status.Actual, now, status.Completed > 0)
-
 	if glog.V(6) {
 		glog.Infof("Apply config: %s", diff.ObjectReflectDiff(original, config))
 	}
@@ -297,6 +306,27 @@ func (optr *Operator) syncStatus(original, config *configv1.ClusterVersion, stat
 	return err
 }

+// convertErrorToProgressing returns true if the provided status indicates a failure condition can be interpreted as
+// still making internal progress. The general error we try to suppress is an operator or operators still being
+// unavailable AND the general payload task making progress towards its goal. An operator is given 10 minutes since
+// its last update to go ready, or an hour has elapsed since the update began, before the condition is ignored.
+func convertErrorToProgressing(history []configv1.UpdateHistory, now time.Time, status *SyncWorkerStatus) (reason string, message string, ok bool) {
+	if len(history) == 0 || status.Failure == nil || status.Reconciling || status.LastProgress.IsZero() {
+		return "", "", false
+	}
+	if now.Sub(status.LastProgress) > 10*time.Minute || now.Sub(history[0].StartedTime.Time) > time.Hour {
+		return "", "", false
+	}
+	uErr, ok := status.Failure.(*payload.UpdateError)
+	if !ok {
+		return "", "", false
+	}
+	if uErr.Reason == "ClusterOperatorNotAvailable" || uErr.Reason == "ClusterOperatorsNotAvailable" {
+		return uErr.Reason, fmt.Sprintf("waiting on %s", uErr.Name), true
+	}
+	return "", "", false
+}
+
 // syncDegradedStatus handles generic errors in the cluster version. It tries to preserve
 // all status fields that it can by using the provided config or loading the latest version
 // from the cache (instead of clearing the status).
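
As a rough illustration of the window that convertErrorToProgressing applies above: the failure is only reinterpreted as progress while the worker reported progress in the last 10 minutes and the update started under an hour ago. The helper name withinProgressWindow, the version string, and the operator name below are hypothetical, used only for the sketch.

package main

import (
	"fmt"
	"time"
)

// withinProgressWindow condenses the time checks made by convertErrorToProgressing.
func withinProgressWindow(lastProgress, updateStart, now time.Time) bool {
	if lastProgress.IsZero() {
		return false
	}
	return now.Sub(lastProgress) <= 10*time.Minute && now.Sub(updateStart) <= time.Hour
}

func main() {
	now := time.Now()
	if withinProgressWindow(now.Add(-3*time.Minute), now.Add(-20*time.Minute), now) {
		// The Progressing condition then carries the short "waiting on" message
		// instead of setting Failing=True (version and operator name are examples).
		fmt.Printf("Working towards %s: %.0f%% complete, %s\n", "4.0.0-example", 87.0, "waiting on monitoring")
	}
}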

pkg/cvo/sync_worker.go

Lines changed: 166 additions & 5 deletions
@@ -4,6 +4,8 @@ import (
 	"context"
 	"fmt"
 	"reflect"
+	"sort"
+	"strings"
 	"sync"
 	"time"

@@ -12,6 +14,7 @@ import (
 	"golang.org/x/time/rate"

 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/errors"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"

@@ -67,6 +70,8 @@ type SyncWorkerStatus struct {
 	Initial bool
 	VersionHash string

+	LastProgress time.Time
+
 	Actual configv1.Update
 }

@@ -304,6 +309,9 @@ func (w *statusWrapper) Report(status SyncWorkerStatus) {
 			}
 		}
 	}
+	if status.Fraction > p.Fraction || status.Completed > p.Completed || (status.Failure == nil && status.Actual != p.Actual) {
+		status.LastProgress = time.Now()
+	}
 	w.w.updateStatus(status)
 }

@@ -471,7 +479,7 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w
 	}

 	// update each object
-	err := payload.RunGraph(ctx, graph, maxWorkers, func(ctx context.Context, tasks []*payload.Task) error {
+	errs := payload.RunGraph(ctx, graph, maxWorkers, func(ctx context.Context, tasks []*payload.Task) error {
 		for _, task := range tasks {
 			if contextIsCancelled(ctx) {
 				return cr.CancelError()
@@ -495,8 +503,8 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w
 		}
 		return nil
 	})
-	if err != nil {
-		cr.Error(err)
+	if len(errs) > 0 {
+		err := cr.Errors(errs)
 		return err
 	}

@@ -518,6 +526,12 @@ func init() {
 	)
 }

+type errCanceled struct {
+	err error
+}
+
+func (e errCanceled) Error() string { return e.err.Error() }
+
 // consistentReporter hides the details of calculating the status based on the progress
 // of the graph runner.
 type consistentReporter struct {
@@ -553,14 +567,31 @@ func (r *consistentReporter) Error(err error) {
 	copied := r.status
 	copied.Step = "ApplyResources"
 	copied.Fraction = float32(r.done) / float32(r.total)
-	copied.Failure = err
+	if !isCancelledError(err) {
+		copied.Failure = err
+	}
+	r.reporter.Report(copied)
+}
+
+func (r *consistentReporter) Errors(errs []error) error {
+	err := summarizeTaskGraphErrors(errs)
+
+	r.lock.Lock()
+	defer r.lock.Unlock()
+	copied := r.status
+	copied.Step = "ApplyResources"
+	copied.Fraction = float32(r.done) / float32(r.total)
+	if err != nil {
+		copied.Failure = err
+	}
 	r.reporter.Report(copied)
+	return err
 }

 func (r *consistentReporter) CancelError() error {
 	r.lock.Lock()
 	defer r.lock.Unlock()
-	return fmt.Errorf("update was cancelled at %d/%d", r.done, r.total)
+	return errCanceled{fmt.Errorf("update was cancelled at %d/%d", r.done, r.total)}
 }

 func (r *consistentReporter) Complete() {
@@ -576,6 +607,136 @@ func (r *consistentReporter) Complete() {
 	r.reporter.Report(copied)
 }

+func isCancelledError(err error) bool {
+	if err == nil {
+		return false
+	}
+	_, ok := err.(errCanceled)
+	return ok
+}
+
+// summarizeTaskGraphErrors takes a set of errors returned by the execution of a graph and attempts
+// to reduce them to a single cause or message. This is domain specific to the payload and our update
+// algorithms. The return value is the summarized error which may be nil if provided conditions are
+// not truly an error (cancellation).
+// TODO: take into account install vs upgrade
+func summarizeTaskGraphErrors(errs []error) error {
+	// we ignore cancellation errors since they don't provide good feedback to users and are an internal
+	// detail of the server
+	err := errors.FilterOut(errors.NewAggregate(errs), isCancelledError)
+	if err == nil {
+		glog.V(4).Infof("All errors were cancellation errors: %v", errs)
+		return nil
+	}
+	agg, ok := err.(errors.Aggregate)
+	if !ok {
+		errs = []error{err}
+	} else {
+		errs = agg.Errors()
+	}
+
+	// log the errors to assist in debugging future summarization
+	if glog.V(4) {
+		glog.Infof("Summarizing %d errors", len(errs))
+		for _, err := range errs {
+			if uErr, ok := err.(*payload.UpdateError); ok {
+				if uErr.Task != nil {
+					glog.Infof("Update error %d/%d: %s %s (%T: %v)", uErr.Task.Index, uErr.Task.Total, uErr.Reason, uErr.Message, uErr.Nested, uErr.Nested)
+				} else {
+					glog.Infof("Update error: %s %s (%T: %v)", uErr.Reason, uErr.Message, uErr.Nested, uErr.Nested)
+				}
+			} else {
+				glog.Infof("Update error: %T: %v", err, err)
+			}
+		}
+	}
+
+	// collapse into a set of common errors where necessary
+	if len(errs) == 1 {
+		return errs[0]
+	}
+	if err := newClusterOperatorsNotAvailable(errs); err != nil {
+		return err
+	}
+	return newMultipleError(errs)
+}
+
+// newClusterOperatorsNotAvailable unifies multiple ClusterOperatorNotAvailable errors into
+// a single error. It returns nil if the provided errors are not of the same type.
+func newClusterOperatorsNotAvailable(errs []error) error {
+	names := make([]string, 0, len(errs))
+	for _, err := range errs {
+		uErr, ok := err.(*payload.UpdateError)
+		if !ok || uErr.Reason != "ClusterOperatorNotAvailable" {
+			return nil
+		}
+		if len(uErr.Name) > 0 {
+			names = append(names, uErr.Name)
+		}
+	}
+	if len(names) == 0 {
+		return nil
+	}
+
+	nested := make([]error, 0, len(errs))
+	for _, err := range errs {
+		nested = append(nested, err)
+	}
+	sort.Strings(names)
+	name := strings.Join(names, ", ")
+	return &payload.UpdateError{
+		Nested: errors.NewAggregate(errs),
+		Reason: "ClusterOperatorsNotAvailable",
+		Message: fmt.Sprintf("Some cluster operators are still updating: %s", name),
+		Name: name,
+	}
+}
+
+// uniqueStrings returns an array with all sequential identical items removed. It modifies the contents
+// of arr. Sort the input array before calling to remove all duplicates.
+func uniqueStrings(arr []string) []string {
+	var last int
+	for i := 1; i < len(arr); i++ {
+		if arr[i] == arr[last] {
+			continue
+		}
+		last++
+		if last != i {
+			arr[last] = arr[i]
+		}
+	}
+	if last < len(arr) {
+		last++
+	}
+	return arr[:last]
+}
+
+// newMultipleError reports a generic set of errors that block progress. This method expects multiple
+// errors but handles singular and empty arrays gracefully. If all errors have the same message, the
+// first item is returned.
+func newMultipleError(errs []error) error {
+	if len(errs) == 0 {
+		return nil
+	}
+	if len(errs) == 1 {
+		return errs[0]
+	}
+	messages := make([]string, 0, len(errs))
+	for _, err := range errs {
+		messages = append(messages, err.Error())
+	}
+	sort.Strings(messages)
+	messages = uniqueStrings(messages)
+	if len(messages) == 0 {
+		return errs[0]
+	}
+	return &payload.UpdateError{
+		Nested: errors.NewAggregate(errs),
+		Reason: "MultipleErrors",
+		Message: fmt.Sprintf("Multiple errors are preventing progress:\n* %s", strings.Join(messages, "\n* ")),
+	}
+}
+
 // getOverrideForManifest returns the override and true when override exists for manifest.
 func getOverrideForManifest(overrides []configv1.ComponentOverride, manifest *lib.Manifest) (configv1.ComponentOverride, bool) {
 	for idx, ov := range overrides {
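
A small runnable sketch of how the sorted and de-duplicated messages feed the MultipleErrors summary above; the uniqueStrings body is copied from the diff, while the sample error messages are invented for illustration.

package main

import (
	"fmt"
	"sort"
	"strings"
)

// uniqueStrings removes sequential duplicates in place, as in the diff above.
func uniqueStrings(arr []string) []string {
	var last int
	for i := 1; i < len(arr); i++ {
		if arr[i] == arr[last] {
			continue
		}
		last++
		if last != i {
			arr[last] = arr[i]
		}
	}
	if last < len(arr) {
		last++
	}
	return arr[:last]
}

func main() {
	// Hypothetical task error messages; repeated failures from retried tasks
	// collapse to a single bullet once sorted and de-duplicated.
	messages := []string{
		"Could not update deployment openshift-monitoring/prometheus",
		"Cluster operator machine-config has not yet reported success",
		"Could not update deployment openshift-monitoring/prometheus",
	}
	sort.Strings(messages)
	messages = uniqueStrings(messages)
	fmt.Printf("Multiple errors are preventing progress:\n* %s\n", strings.Join(messages, "\n* "))
}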
