From 22474f936f0d066835cb7d2791a9ce3af0946dae Mon Sep 17 00:00:00 2001
From: Johannes Nicolai
Date: Sat, 13 Feb 2021 12:50:41 +0100
Subject: [PATCH] Introduce pod deletion timeout and forcefully delete stuck
 pods

* if a k8s node becomes unresponsive, the kube controller will soft-delete
  all of its pods after the eviction time (default 5 mins)
* as long as the node stays unresponsive, the pod will never leave its last
  status, so the runner controller will assume that everything is fine with
  the pod and will not try to create a new one
* this can result in a situation where a horizontal autoscaler thinks that
  none of its runners are currently busy and will not schedule any further
  runners / pods, resulting in a broken runner deployment until the
  runnerreplicaset is deleted or the node comes back online
* introduce a pod deletion timeout (1 minute) after which the runner
  controller will try to reboot the runner and create a replacement pod on
  a working node
* use forceful deletion and requeue for pods that have been stuck in
  terminating state for more than one minute
* gracefully handle race conditions if the pod finally gets forcefully
  deleted in the meantime
---
 controllers/runner_controller.go | 34 +++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 1279109568..00a1b514f2 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -185,7 +185,39 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 	}
 
 	if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
-		return ctrl.Result{}, err
+		deletionTimeout := 1 * time.Minute
+		currentTime := time.Now()
+		deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
+
+		if deletionDidTimeout {
+			log.Info(
+				"Pod failed to delete itself in a timely manner. "+
+					"This is typically the case when a Kubernetes node became unreachable "+
+					"and the kube controller started evicting its pods. Forcefully deleting the pod to not get stuck.",
+				"podDeletionTimestamp", pod.DeletionTimestamp,
+				"currentTime", currentTime,
+				"configuredDeletionTimeout", deletionTimeout,
+			)
+
+			var force int64 = 0
+			// forcefully delete the runner pod as we would otherwise get stuck if the node stays unreachable
+			if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
+				// probably the pod has already been deleted in the meantime
+				if !kerrors.IsNotFound(err) {
+					log.Error(err, "Failed to forcefully delete pod resource ...")
+					return ctrl.Result{}, err
+				}
+				// forceful deletion finally succeeded
+				return ctrl.Result{Requeue: true}, nil
+			}
+
+			r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name))
+			log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository)
+			// give the kube manager a little time to forcefully delete the stuck pod
+			return ctrl.Result{RequeueAfter: 3 * time.Second}, err
+		} else {
+			return ctrl.Result{}, err
+		}
 	}
 
 	if pod.Status.Phase == corev1.PodRunning {
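
Reviewer note: the expression currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
simply asks whether more than deletionTimeout has elapsed since the pod's
deletionTimestamp. Below is a minimal, self-contained sketch for exercising
that check outside the controller; the helper name podDeletionDidTimeout and
the sample timestamps are illustrative only and not part of the patch:

    package main

    import (
    	"fmt"
    	"time"

    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // podDeletionDidTimeout reports whether a soft-deleted pod has been stuck
    // in terminating state for longer than timeout. It mirrors the patch's
    // check, now.Sub(deletionTimestamp.Add(timeout)) > 0, which is equivalent
    // to now.After(deletionTimestamp.Add(timeout)).
    func podDeletionDidTimeout(deletionTimestamp *metav1.Time, now time.Time, timeout time.Duration) bool {
    	if deletionTimestamp == nil {
    		// the pod was never soft-deleted, so there is nothing to time out
    		return false
    	}
    	return now.Sub(deletionTimestamp.Add(timeout)) > 0
    }

    func main() {
    	now := time.Now()
    	stuck := metav1.NewTime(now.Add(-2 * time.Minute))   // terminating for 2 min
    	recent := metav1.NewTime(now.Add(-10 * time.Second)) // terminating for 10 s

    	fmt.Println(podDeletionDidTimeout(&stuck, now, time.Minute))  // true: force-delete
    	fmt.Println(podDeletionDidTimeout(&recent, now, time.Minute)) // false: keep waiting
    }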
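
For comparison with manual operations: deleting through the controller-runtime
client with DeleteOptions{GracePeriodSeconds: &force} where force is 0 is the
API-level counterpart of kubectl delete pod <pod-name> --grace-period=0 --force,
the usual hands-on remedy for pods stuck in Terminating on an unreachable node.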