Skip to content

Commit 8357a33

Browse files
committed
OTA-1531: Rework error handling in FeatureGate processing
- The `FeatureGate` GET is now made over an informer client which is backed by a cache, so polling to smooth over non-isNotFound errors no longer makes sense. The GET runs after a cache sync succeeded earlier so no errors are expected (and if an error happens, it is unlikely to be resolved by a retry) - Waiting for cache sync was not bounded; impose a 30s timeout on startup (this is similar to the deadline imposed by the poll before we started to use the informer client)
1 parent 2d837d9 commit 8357a33

File tree

1 file changed

+29
-36
lines changed

1 file changed

+29
-36
lines changed

pkg/start/start.go

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"k8s.io/apimachinery/pkg/fields"
2020
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2121
"k8s.io/apimachinery/pkg/util/sets"
22-
"k8s.io/apimachinery/pkg/util/wait"
2322
coreinformers "k8s.io/client-go/informers"
2423
"k8s.io/client-go/kubernetes"
2524
"k8s.io/client-go/kubernetes/scheme"
@@ -244,48 +243,42 @@ func (o *Options) getOpenShiftVersion() string {
244243
}
245244

246245
func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (configv1.FeatureSet, featuregates.CvoGates, error) {
246+
var startingFeatureSet configv1.FeatureSet
247+
var cvoGates featuregates.CvoGates
248+
247249
featureGates := configInformerFactory.Config().V1().FeatureGates().Lister()
248250
configInformerFactory.Start(ctx.Done())
249-
configInformerFactory.WaitForCacheSync(ctx.Done())
251+
252+
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
253+
defer cancel()
254+
255+
for key, synced := range configInformerFactory.WaitForCacheSync(ctx.Done()) {
256+
if !synced {
257+
return startingFeatureSet, cvoGates, fmt.Errorf("failed to sync %s informer cache: %v", key.String(), ctx.Err())
258+
}
259+
}
250260

251261
cvoOpenShiftVersion := o.getOpenShiftVersion()
252-
cvoGates := featuregates.DefaultCvoGates(cvoOpenShiftVersion)
262+
cvoGates = featuregates.DefaultCvoGates(cvoOpenShiftVersion)
253263

254-
var startingFeatureSet configv1.FeatureSet
255264
var clusterFeatureGate *configv1.FeatureGate
256265

257-
// client-go automatically retries some network blip errors on GETs for 30s by default, and we want to
258-
// retry the remaining ones ourselves. If we fail longer than that, CVO won't be able to do work anyway.
259-
// Return the error and crashloop.
260-
//
261-
// We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave
262-
// well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go
263-
var lastError error
264-
if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) {
265-
gate, fgErr := featureGates.Get("cluster")
266-
switch {
267-
case apierrors.IsNotFound(fgErr):
268-
// if we have no featuregates, then the cluster is using the default featureset, which is "".
269-
// This excludes everything that could possibly depend on a different feature set.
270-
startingFeatureSet = ""
271-
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
272-
return true, nil
273-
case fgErr != nil:
274-
lastError = fgErr
275-
klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr)
276-
return false, nil
277-
default:
278-
clusterFeatureGate = gate
279-
startingFeatureSet = gate.Spec.FeatureSet
280-
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
281-
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
282-
return true, nil
283-
}
284-
}); err != nil {
285-
if lastError != nil {
286-
return "", cvoGates, lastError
287-
}
288-
return "", cvoGates, err
266+
gate, err := featureGates.Get("cluster")
267+
switch {
268+
case err != nil && apierrors.IsNotFound(err):
269+
// if we have no featuregates, then the cluster is using the default featureset, which is "".
270+
// This excludes everything that could possibly depend on a different feature set.
271+
startingFeatureSet = ""
272+
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
273+
case err != nil:
274+
// This should not happen because featureGates is backed by the informer cache which successfully synced earlier
275+
klog.Errorf("Failed to get FeatureGate from cluster: %v", err)
276+
return startingFeatureSet, cvoGates, fmt.Errorf("failed to get FeatureGate from informer cache: %w", err)
277+
default:
278+
clusterFeatureGate = gate
279+
startingFeatureSet = gate.Spec.FeatureSet
280+
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
281+
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
289282
}
290283

291284
if cvoGates.UnknownVersion() {

0 commit comments

Comments
 (0)