Skip to content

Commit db8190f

Browse files
Merge pull request #1190 from petr-muller/ota-1531-05-load-gates-early
OTA-1531: [5/x] cvo: read cluster `FeatureGate` early
2 parents 07c658d + 9751e8f commit db8190f

File tree

2 files changed

+91
-25
lines changed

2 files changed

+91
-25
lines changed

pkg/featuregates/featuregates.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ import (
55
"github.com/openshift/api/features"
66
)
77

8+
// StubOpenShiftVersion is the default OpenShift version placeholder for the purpose of determining
9+
// enabled and disabled CVO feature gates. It is assumed to never conflict with a real OpenShift
10+
// version. When given this version, both DefaultCvoGates and CvoGatesFromFeatureGate should return a
11+
// safe conservative default set of enabled and disabled gates, typically with unknownVersion set to true.
12+
const StubOpenShiftVersion = "0.0.1-snapshot"
13+
814
// CvoGateChecker allows CVO code to check which feature gates are enabled
915
type CvoGateChecker interface {
1016
// UnknownVersion flag is set to true if CVO did not find a matching version in the FeatureGate

pkg/start/start.go

Lines changed: 85 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -189,32 +189,11 @@ func (o *Options) Run(ctx context.Context) error {
189189
return err
190190
}
191191

192-
payloadRoot := payload.DefaultRootPath
193-
if o.PayloadOverride != "" {
194-
payloadRoot = payload.RootPath(o.PayloadOverride)
195-
}
196-
197-
cvoOcpVersion := "0.0.1-snapshot"
198-
// Peek at the local release metadata to determine the version of OCP this CVO belongs to. This assumes the CVO is
199-
// executing in a container from the payload image. Full payload content is only read later once leader lease is
200-
// acquired, and here we should only read as little data as possible to determine the version so we can establish
201-
// enabled feature gate checker for all following code.
202-
//
203-
// We cannot refuse to start CVO if for some reason we cannot determine the OCP version on startup from the local
204-
// release metadata. The only consequence is we fail to determine enabled/disabled feature gates and will have to use
205-
// some defaults.
206-
releaseMetadata, err := payloadRoot.LoadReleaseMetadata()
207-
switch {
208-
case err != nil:
209-
klog.Warningf("Failed to read release metadata to determine OCP version for this CVO (will use placeholder version %q): %v", cvoOcpVersion, err)
210-
case releaseMetadata.Version == "":
211-
klog.Warningf("Version missing from release metadata, cannot determine OCP version for this CVO (will use placeholder version %q)", cvoOcpVersion)
212-
default:
213-
cvoOcpVersion = releaseMetadata.Version
214-
klog.Infof("Determined OCP version for this CVO: %q", cvoOcpVersion)
215-
}
216-
217192
clusterVersionConfigInformerFactory, configInformerFactory := o.prepareConfigInformerFactories(cb)
193+
_, _, err = o.processInitialFeatureGate(ctx, configInformerFactory)
194+
if err != nil {
195+
return fmt.Errorf("error processing feature gates: %w", err)
196+
}
218197

219198
// initialize the controllers and attempt to load the payload information
220199
controllerCtx, err := o.NewControllerContext(cb, clusterVersionConfigInformerFactory, configInformerFactory)
@@ -237,6 +216,87 @@ func (o *Options) prepareConfigInformerFactories(cb *ClientBuilder) (configinfor
237216
return clusterVersionConfigInformerFactory, configInformerFactory
238217
}
239218

219+
// getOpenShiftVersion peeks at the local release metadata to determine the version of OpenShift this CVO belongs to.
220+
// This assumes the CVO is executing in a container from the payload image. This does not and should not fully load
221+
// whole payload content, that is only loaded later once leader lease is acquired. Here we should only read as little
222+
// data as possible to determine the version so we can establish enabled feature gate checker for all following code.
223+
func (o *Options) getOpenShiftVersion() string {
224+
payloadRoot := payload.DefaultRootPath
225+
if o.PayloadOverride != "" {
226+
payloadRoot = payload.RootPath(o.PayloadOverride)
227+
}
228+
229+
// We cannot refuse to start CVO if for some reason we cannot determine the OpenShift version on startup from the local
230+
// release metadata. The only consequence is we fail to determine enabled/disabled feature gates and will have to use
231+
// some defaults.
232+
releaseMetadata, err := payloadRoot.LoadReleaseMetadata()
233+
if err != nil {
234+
klog.Warningf("Failed to read release metadata to determine OpenShift version for this CVO (will use placeholder version %q): %v", featuregates.StubOpenShiftVersion, err)
235+
return featuregates.StubOpenShiftVersion
236+
}
237+
238+
if releaseMetadata.Version == "" {
239+
klog.Warningf("Version missing from release metadata, cannot determine OpenShift version for this CVO (will use placeholder version %q)", featuregates.StubOpenShiftVersion)
240+
return featuregates.StubOpenShiftVersion
241+
}
242+
243+
klog.Infof("Determined OpenShift version for this CVO: %q", releaseMetadata.Version)
244+
return releaseMetadata.Version
245+
}
246+
247+
func (o *Options) processInitialFeatureGate(ctx context.Context, configInformerFactory configinformers.SharedInformerFactory) (configv1.FeatureSet, *featuregates.CvoGates, error) {
248+
featureGates := configInformerFactory.Config().V1().FeatureGates().Lister()
249+
configInformerFactory.Start(ctx.Done())
250+
configInformerFactory.WaitForCacheSync(ctx.Done())
251+
252+
cvoOpenShiftVersion := o.getOpenShiftVersion()
253+
cvoGates := featuregates.DefaultCvoGates(cvoOpenShiftVersion)
254+
255+
var startingFeatureSet configv1.FeatureSet
256+
var clusterFeatureGate *configv1.FeatureGate
257+
258+
// client-go automatically retries some network blip errors on GETs for 30s by default, and we want to
259+
// retry the remaining ones ourselves. If we fail longer than that, the operator won't be able to do work
260+
// anyway. Return the error and crashloop.
261+
//
262+
// We implement the timeout with a context because the timeout in PollImmediateWithContext does not behave
263+
// well when ConditionFunc takes longer time to execute, like here where the GET can be retried by client-go
264+
var lastError error
265+
if err := wait.PollUntilContextTimeout(context.Background(), 2*time.Second, 25*time.Second, true, func(ctx context.Context) (bool, error) {
266+
gate, fgErr := featureGates.Get("cluster")
267+
switch {
268+
case apierrors.IsNotFound(fgErr):
269+
// if we have no featuregates, then the cluster is using the default featureset, which is "".
270+
// This excludes everything that could possibly depend on a different feature set.
271+
startingFeatureSet = ""
272+
klog.Infof("FeatureGate not found in cluster, will assume default feature set %q at startup", startingFeatureSet)
273+
return true, nil
274+
case fgErr != nil:
275+
lastError = fgErr
276+
klog.Warningf("Failed to get FeatureGate from cluster: %v", fgErr)
277+
return false, nil
278+
default:
279+
clusterFeatureGate = gate
280+
startingFeatureSet = gate.Spec.FeatureSet
281+
cvoGates = featuregates.CvoGatesFromFeatureGate(clusterFeatureGate, cvoOpenShiftVersion)
282+
klog.Infof("FeatureGate found in cluster, using its feature set %q at startup", startingFeatureSet)
283+
return true, nil
284+
}
285+
}); err != nil {
286+
if lastError != nil {
287+
return "", nil, lastError
288+
}
289+
return "", nil, err
290+
}
291+
292+
if cvoGates.UnknownVersion() {
293+
klog.Warningf("CVO features for version %s could not be detected from FeatureGate; will use defaults plus special UnknownVersion feature gate", cvoOpenShiftVersion)
294+
}
295+
klog.Infof("CVO features for version %s enabled at startup: %+v", cvoOpenShiftVersion, cvoGates)
296+
297+
return startingFeatureSet, &cvoGates, nil
298+
}
299+
240300
// run launches a number of goroutines to handle manifest application,
241301
// metrics serving, etc. It continues operating until ctx.Done(),
242302
// and then attempts a clean shutdown limited by an internal context

0 commit comments

Comments
 (0)