Skip to content

OTA-1531: Rework error handling in FeatureGate processing #1206

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 29 additions & 36 deletions pkg/start/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"k8s.io/apimachinery/pkg/fields"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
coreinformers "k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
Expand Down Expand Up @@ -244,48 +243,42 @@ func (o *Options) getOpenShiftVersion() string {
}

func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (configv1.FeatureSet, featuregates.CvoGates, error) {
var startingFeatureSet configv1.FeatureSet
var cvoGates featuregates.CvoGates

featureGates := configInformerFactory.Config().V1().FeatureGates().Lister()
configInformerFactory.Start(ctx.Done())
configInformerFactory.WaitForCacheSync(ctx.Done())

ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just want to understand the reasoning behind this number:
is the 30s here meant to cover WaitForCacheSync (maybe 5s?) plus the poll timeout (25s)?

defer cancel()

for key, synced := range configInformerFactory.WaitForCacheSync(ctx.Done()) {
if !synced {
return startingFeatureSet, cvoGates, fmt.Errorf("failed to sync %s informer cache: %v", key.String(), ctx.Err())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

Suggested change
return startingFeatureSet, cvoGates, fmt.Errorf("failed to sync %s informer cache: %v", key.String(), ctx.Err())
return startingFeatureSet, cvoGates, fmt.Errorf("failed to sync %s informer cache: %w", key.String(), ctx.Err())

}
}

cvoOpenShiftVersion := o.getOpenShiftVersion()
cvoGates := featuregates.DefaultCvoGates(cvoOpenShiftVersion)
cvoGates = featuregates.DefaultCvoGates(cvoOpenShiftVersion)

var startingFeatureSet configv1.FeatureSet
var clusterFeatureGate *configv1.FeatureGate

// client-go automatically retries some network blip errors on GETs for 30s by default, and we want to
// retry the remaining ones ourselves. If we fail longer than that, CVO won't be able to do work anyway.
// Return the error and crashloop.
//
// We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave
// well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go
var lastError error
if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) {
gate, fgErr := featureGates.Get("cluster")
switch {
case apierrors.IsNotFound(fgErr):
// if we have no featuregates, then the cluster is using the default featureset, which is "".
// This excludes everything that could possibly depend on a different feature set.
startingFeatureSet = ""
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
return true, nil
case fgErr != nil:
lastError = fgErr
klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr)
return false, nil
default:
clusterFeatureGate = gate
startingFeatureSet = gate.Spec.FeatureSet
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
return true, nil
}
}); err != nil {
if lastError != nil {
return "", cvoGates, lastError
}
return "", cvoGates, err
gate, err := featureGates.Get("cluster")
switch {
case err != nil && apierrors.IsNotFound(err):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: the non-nil check seems redundant, because IsNotFound already returns false for a nil error:

// IsNotFound returns true if the specified error was created by NewNotFound.
// It supports wrapped errors and returns false when the error is nil.
func IsNotFound(err error) bool {

Suggested change
case err != nil && apierrors.IsNotFound(err):
case apierrors.IsNotFound(err):

// if we have no featuregates, then the cluster is using the default featureset, which is "".
// This excludes everything that could possibly depend on a different feature set.
startingFeatureSet = ""
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
case err != nil:
// This should not happen because featureGates is backed by the informer cache which successfully synced earlier
klog.Errorf("Failed to get FeatureGate from cluster: %v", err)
return startingFeatureSet, cvoGates, fmt.Errorf("failed to get FeatureGate from informer cache: %w", err)
default:
clusterFeatureGate = gate
startingFeatureSet = gate.Spec.FeatureSet
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
}

if cvoGates.UnknownVersion() {
Expand Down