
Commit 5001599

Merge pull request #10422 from smartxworks/huangwei/cherry-pick-10196-to-release-1.6
[release-1.6] 🐛 Delete out of date machines with unhealthy control plane component conditions when rolling out KCP
2 parents e20e649 + d0530a6 · commit 5001599

File tree: 9 files changed, +197 -22 lines

controlplane/kubeadm/internal/control_plane.go

+14 -7

@@ -238,19 +238,26 @@ func (c *ControlPlane) IsEtcdManaged() bool {
 	return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil
 }
 
-// UnhealthyMachines returns the list of control plane machines marked as unhealthy by MHC.
-func (c *ControlPlane) UnhealthyMachines() collections.Machines {
+// UnhealthyMachinesWithUnhealthyControlPlaneComponents returns all unhealthy control plane machines that
+// have unhealthy control plane components.
+// It differs from UnhealthyMachinesByMachineHealthCheck, which checks `MachineHealthCheck` conditions.
+func (c *ControlPlane) UnhealthyMachinesWithUnhealthyControlPlaneComponents(machines collections.Machines) collections.Machines {
+	return machines.Filter(collections.HasUnhealthyControlPlaneComponents(c.IsEtcdManaged()))
+}
+
+// UnhealthyMachinesByMachineHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check.
+func (c *ControlPlane) UnhealthyMachinesByMachineHealthCheck() collections.Machines {
 	return c.Machines.Filter(collections.HasUnhealthyCondition)
 }
 
-// HealthyMachines returns the list of control plane machines not marked as unhealthy by MHC.
-func (c *ControlPlane) HealthyMachines() collections.Machines {
+// HealthyMachinesByMachineHealthCheck returns the list of control plane machines not marked as unhealthy by Machine Health Check.
+func (c *ControlPlane) HealthyMachinesByMachineHealthCheck() collections.Machines {
 	return c.Machines.Filter(collections.Not(collections.HasUnhealthyCondition))
 }
 
-// HasUnhealthyMachine returns true if any machine in the control plane is marked as unhealthy by MHC.
-func (c *ControlPlane) HasUnhealthyMachine() bool {
-	return len(c.UnhealthyMachines()) > 0
+// HasUnhealthyMachineByMachineHealthCheck returns true if any machine in the control plane is marked as unhealthy by Machine Health Check.
+func (c *ControlPlane) HasUnhealthyMachineByMachineHealthCheck() bool {
+	return len(c.UnhealthyMachinesByMachineHealthCheck()) > 0
 }
 
 // PatchMachines patches all the machines conditions.

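The renames above are the heart of the change: "unhealthy" now comes in two distinct flavors, one driven by Machine Health Check conditions (consumed by remediation) and one driven by control plane component conditions (consumed by scale-down selection). Below is a minimal sketch of how a caller might use the two helpers side by side; it assumes it lives in the controlplane/kubeadm/internal package, and the function name is hypothetical:

// Hypothetical sketch, not part of this commit: contrasts the two notions of
// "unhealthy" that the renamed helpers keep apart.
func exampleUnhealthySets(c *ControlPlane, outdated collections.Machines) {
	// MHC view: machines whose MachineHealthCheck marked them for remediation.
	byMHC := c.UnhealthyMachinesByMachineHealthCheck()

	// Component view: machines from the outdated set with any control plane
	// component condition set to False (API server, controller manager,
	// scheduler, plus etcd when it is managed).
	byComponents := c.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdated)

	_, _ = byMHC, byComponents // a real caller would remediate or scale down accordingly
}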
controlplane/kubeadm/internal/control_plane_test.go

+1 -1

@@ -86,7 +86,7 @@ func TestHasUnhealthyMachine(t *testing.T) {
 	}
 
 	g := NewWithT(t)
-	g.Expect(c.HasUnhealthyMachine()).To(BeTrue())
+	g.Expect(c.HasUnhealthyMachineByMachineHealthCheck()).To(BeTrue())
 }
 
 type machineOpt func(*clusterv1.Machine)

controlplane/kubeadm/internal/controllers/remediation.go

+3 -3

@@ -47,7 +47,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 	// Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1)
 	// if the underlying machine is now back to healthy / not deleting.
 	errList := []error{}
-	healthyMachines := controlPlane.HealthyMachines()
+	healthyMachines := controlPlane.HealthyMachinesByMachineHealthCheck()
 	for _, m := range healthyMachines {
 		if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
 			conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) &&
@@ -73,7 +73,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 
 	// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
 	// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
-	unhealthyMachines := controlPlane.UnhealthyMachines()
+	unhealthyMachines := controlPlane.UnhealthyMachinesByMachineHealthCheck()
 
 	// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
 	if len(unhealthyMachines) == 0 {
@@ -183,7 +183,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
 
 	// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
 	if controlPlane.IsEtcdManaged() {
-		etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
+		etcdLeaderCandidate := controlPlane.HealthyMachinesByMachineHealthCheck().Newest()
 		if etcdLeaderCandidate == nil {
 			log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
 			conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,

controlplane/kubeadm/internal/controllers/remediation_test.go

+6 -0

@@ -1680,6 +1680,12 @@ func withUnhealthyEtcdMember() machineOption {
 	}
 }
 
+func withUnhealthyAPIServerPod() machineOption {
+	return func(machine *clusterv1.Machine) {
+		conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "")
+	}
+}
+
 func withNodeRef(ref string) machineOption {
 	return func(machine *clusterv1.Machine) {
 		machine.Status.NodeRef = &corev1.ObjectReference{

controlplane/kubeadm/internal/controllers/scale.go

+2 -0

@@ -230,6 +230,8 @@ func selectMachineForScaleDown(controlPlane *internal.ControlPlane, outdatedMach
 		machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
 	case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0:
 		machines = controlPlane.MachineWithDeleteAnnotation(machines)
+	case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
+		machines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines)
 	case outdatedMachines.Len() > 0:
 		machines = outdatedMachines
 	}

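With the new case in place, selectMachineForScaleDown prefers, in descending order: outdated machines carrying the delete annotation, any machine carrying the delete annotation, outdated machines with unhealthy control plane components, and finally any outdated machine. A sketch of that cascade under assumed names — withDeleteAnnotation and unhealthyComponents are hypothetical stand-ins for the ControlPlane methods shown in the diff:

// Sketch only: the scale-down priority after this change, with hypothetical
// helpers standing in for the real ControlPlane methods.
func pickScaleDownPool(all, outdated collections.Machines) collections.Machines {
	switch {
	case withDeleteAnnotation(outdated).Len() > 0:
		return withDeleteAnnotation(outdated) // outdated machines marked for deletion
	case withDeleteAnnotation(all).Len() > 0:
		return withDeleteAnnotation(all) // any machine marked for deletion
	case unhealthyComponents(outdated).Len() > 0:
		return unhealthyComponents(outdated) // new: outdated machines with unhealthy components
	case outdated.Len() > 0:
		return outdated // any outdated machine
	default:
		return all // fall back to the full machine set
	}
}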
controlplane/kubeadm/internal/controllers/scale_test.go

+37 -9

@@ -368,17 +368,26 @@ func TestSelectMachineForScaleDown(t *testing.T) {
 		Spec: controlplanev1.KubeadmControlPlaneSpec{},
 	}
 	startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC)
-	m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)))
-	m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)))
-	m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)))
-	m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)))
-	m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)))
-	m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)))
-	m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"))
-	m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"))
+	m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), machineOpt(withNodeRef("machine-1")))
+	m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), machineOpt(withNodeRef("machine-2")))
+	m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), machineOpt(withNodeRef("machine-3")))
+	m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), machineOpt(withNodeRef("machine-4")))
+	m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), machineOpt(withNodeRef("machine-5")))
+	m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)), machineOpt(withNodeRef("machine-6")))
+	m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)),
+		withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-7")))
+	m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)),
+		withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-8")))
+	m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)),
+		machineOpt(withNodeRef("machine-9")))
+	m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-4*time.Hour)),
+		machineOpt(withNodeRef("machine-10")), machineOpt(withUnhealthyAPIServerPod()))
+	m11 := machine("machine-11", withFailureDomain("two"), withTimestamp(startDate.Add(-3*time.Hour)),
+		machineOpt(withNodeRef("machine-11")), machineOpt(withUnhealthyEtcdMember()))
 
 	mc3 := collections.FromMachines(m1, m2, m3, m4, m5)
 	mc6 := collections.FromMachines(m6, m7, m8)
+	mc9 := collections.FromMachines(m9, m10, m11)
 	fd := clusterv1.FailureDomains{
 		"one": failureDomain(true),
 		"two": failureDomain(true),
@@ -389,6 +398,11 @@ func TestSelectMachineForScaleDown(t *testing.T) {
 		Cluster:  &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
 		Machines: mc3,
 	}
+	needsUpgradeControlPlane1 := &internal.ControlPlane{
+		KCP:      &kcp,
+		Cluster:  &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
+		Machines: mc9,
+	}
 	upToDateControlPlane := &internal.ControlPlane{
 		KCP:      &kcp,
 		Cluster:  &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
@@ -452,12 +466,26 @@ func TestSelectMachineForScaleDown(t *testing.T) {
 			expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-7"}},
 		},
 		{
-			name:             "when there is an up to date machine with delete annotation, while there are any outdated machines without annotatio that still exist, it returns oldest marked machine first",
+			name:             "when there is an up to date machine with delete annotation, while there are any outdated machines without annotation that still exist, it returns oldest marked machine first",
 			cp:               upToDateControlPlane,
 			outDatedMachines: collections.FromMachines(m5, m3, m8, m7, m6, m1, m2),
 			expectErr:        false,
 			expectedMachine:  clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-8"}},
 		},
+		{
+			name:             "when there are machines needing upgrade, it returns the single unhealthy machine with MachineAPIServerPodHealthyCondition set to False",
+			cp:               needsUpgradeControlPlane1,
+			outDatedMachines: collections.FromMachines(m9, m10),
+			expectErr:        false,
+			expectedMachine:  clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}},
+		},
+		{
+			name:             "when there are machines needing upgrade, it returns the oldest unhealthy machine with MachineEtcdMemberHealthyCondition set to False",
+			cp:               needsUpgradeControlPlane1,
+			outDatedMachines: collections.FromMachines(m9, m10, m11),
+			expectErr:        false,
+			expectedMachine:  clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}},
+		},
 	}
 
 	for _, tc := range testCases {

docs/proposals/20191017-kubeadm-based-control-plane.md

+2 -1

@@ -401,8 +401,9 @@ spec:
 - See [Preflight checks](#preflight-checks) below.
 - Scale down operations removes the oldest machine in the failure domain that has the most control-plane machines on it.
 - Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down:
-  - outdatedMachines with the delete annotation
+  - outdated machines with the delete annotation
   - machines with the delete annotation
+  - outdated machines with unhealthy control plane component pods
   - outdated machines
   - all machines
 

util/collections/machine_filters.go

+36 -0

@@ -158,6 +158,42 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool {
 	return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition)
 }
 
+// HasUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that
+// have any of the following control plane component conditions set to False:
+// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy & EtcdMemberHealthy (if using managed etcd).
+// It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions.
+func HasUnhealthyControlPlaneComponents(isEtcdManaged bool) Func {
+	controlPlaneMachineHealthConditions := []clusterv1.ConditionType{
+		controlplanev1.MachineAPIServerPodHealthyCondition,
+		controlplanev1.MachineControllerManagerPodHealthyCondition,
+		controlplanev1.MachineSchedulerPodHealthyCondition,
+	}
+	if isEtcdManaged {
+		controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions,
+			controlplanev1.MachineEtcdPodHealthyCondition,
+			controlplanev1.MachineEtcdMemberHealthyCondition,
+		)
+	}
+	return func(machine *clusterv1.Machine) bool {
+		if machine == nil {
+			return false
+		}
+
+		// The machine without a node could be in failure status due to the kubelet config error, or still provisioning components (including etcd).
+		// So do not treat it as unhealthy.
+
+		for _, condition := range controlPlaneMachineHealthConditions {
+			// Do not return true when the condition is not set or is set to Unknown because
+			// it means a transient state and can not be considered as unhealthy.
+			// preflightCheckCondition() can cover these two cases and skip the scaling up/down.
+			if conditions.IsFalse(machine, condition) {
+				return true
+			}
+		}
+		return false
+	}
+}
+
 // IsReady returns a filter to find all machines with the ReadyCondition equals to True.
 func IsReady() Func {
 	return func(machine *clusterv1.Machine) bool {

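A short usage sketch for the new filter; the function name below is hypothetical, while collections.Machines (a map keyed by machine name) and its Filter method are the existing cluster-api collection API:

// Sketch only: list the names of machines the new filter flags as having an
// unhealthy control plane component. etcdManaged controls whether the
// EtcdPodHealthy and EtcdMemberHealthy conditions are checked as well.
func namesOfComponentUnhealthy(machines collections.Machines, etcdManaged bool) []string {
	var names []string
	for name := range machines.Filter(collections.HasUnhealthyControlPlaneComponents(etcdManaged)) {
		names = append(names, name)
	}
	return names
}

Note that an unset or Unknown condition deliberately does not count as unhealthy: the filter reacts only to an explicit False, leaving transient states to the preflight checks.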
util/collections/machine_filters_test.go

+96 -1

@@ -21,6 +21,7 @@ import (
 	"time"
 
 	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/pointer"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
@@ -377,7 +378,7 @@ func TestWithVersion(t *testing.T) {
 	})
 }
 
-func TestHealtyAPIServer(t *testing.T) {
+func TestHealthyAPIServer(t *testing.T) {
 	t.Run("nil machine returns false", func(t *testing.T) {
 		g := NewWithT(t)
 		g.Expect(collections.HealthyAPIServer()(nil)).To(BeFalse())
@@ -432,6 +433,100 @@ func TestGetFilteredMachinesForCluster(t *testing.T) {
 	g.Expect(machines).To(HaveLen(1))
 }
 
+func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) {
+	t.Run("nil machine returns false", func(t *testing.T) {
+		g := NewWithT(t)
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(nil)).To(BeFalse())
+	})
+
+	t.Run("machine without node returns false", func(t *testing.T) {
+		g := NewWithT(t)
+		machine := &clusterv1.Machine{}
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
+	})
+
+	t.Run("machine with all healthy controlPlane component conditions returns false when the Etcd is not managed", func(t *testing.T) {
+		g := NewWithT(t)
+		machine := &clusterv1.Machine{}
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Name: "node1",
+		}
+		machine.Status.Conditions = clusterv1.Conditions{
+			*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
+		}
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
+	})
+
+	t.Run("machine with unhealthy 'APIServerPodHealthy' condition returns true when the Etcd is not managed", func(t *testing.T) {
+		g := NewWithT(t)
+		machine := &clusterv1.Machine{}
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Name: "node1",
+		}
+		machine.Status.Conditions = clusterv1.Conditions{
+			*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
+			*conditions.FalseCondition(controlplanev1.MachineAPIServerPodHealthyCondition, "",
+				clusterv1.ConditionSeverityWarning, ""),
+		}
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue())
+	})
+
+	t.Run("machine with unhealthy etcd component conditions returns false when Etcd is not managed", func(t *testing.T) {
+		g := NewWithT(t)
+		machine := &clusterv1.Machine{}
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Name: "node1",
+		}
+		machine.Status.Conditions = clusterv1.Conditions{
+			*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
+			*conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "",
+				clusterv1.ConditionSeverityWarning, ""),
+			*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "",
+				clusterv1.ConditionSeverityWarning, ""),
+		}
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
+	})
+
+	t.Run("machine with unhealthy etcd conditions returns true when Etcd is managed", func(t *testing.T) {
+		g := NewWithT(t)
+		machine := &clusterv1.Machine{}
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Name: "node1",
+		}
+		machine.Status.Conditions = clusterv1.Conditions{
+			*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
+			*conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "",
+				clusterv1.ConditionSeverityWarning, ""),
+			*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "",
+				clusterv1.ConditionSeverityWarning, ""),
+		}
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(true)(machine)).To(BeTrue())
+	})
+
+	t.Run("machine with all healthy controlPlane and the Etcd component conditions returns false when Etcd is managed", func(t *testing.T) {
+		g := NewWithT(t)
+		machine := &clusterv1.Machine{}
+		machine.Status.NodeRef = &corev1.ObjectReference{
+			Name: "node1",
+		}
+		machine.Status.Conditions = clusterv1.Conditions{
+			*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition),
+			*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
+		}
+		g.Expect(collections.HasUnhealthyControlPlaneComponents(true)(machine)).To(BeFalse())
+	})
+}
+
 func testControlPlaneMachine(name string) *clusterv1.Machine {
 	owned := true
 	ownedRef := []metav1.OwnerReference{
