From 22474f936f0d066835cb7d2791a9ce3af0946dae Mon Sep 17 00:00:00 2001
From: Johannes Nicolai
Date: Sat, 13 Feb 2021 12:50:41 +0100
Subject: [PATCH] Introduce pod deletion timeout and forcefully delete stuck
 pods

* if a k8s node becomes unresponsive, the kube controller will soft-delete
  all of its pods after the eviction time (default 5 mins)
* as long as the node stays unresponsive, the pod will never leave its last
  status, so the runner controller will assume that everything is fine with
  the pod and will not try to create a new one
* this can result in a situation where a horizontal autoscaler thinks that
  none of its runners are currently busy and will not schedule any further
  runners / pods, resulting in a broken runner deployment until the
  runnerreplicaset is deleted or the node comes back online
* introduce a pod deletion timeout (1 minute) after which the runner
  controller will try to reboot the runner and create a replacement pod on
  a working node
* use forceful deletion and requeue for pods that have been stuck in
  terminating state for more than one minute
* gracefully handle race conditions if the pod finally gets forcefully
  deleted in the meantime
---
 controllers/runner_controller.go | 34 +++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 1279109568..00a1b514f2 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -185,7 +185,39 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 	}
 
 	if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
-		return ctrl.Result{}, err
+		deletionTimeout := 1 * time.Minute
+		currentTime := time.Now()
+		deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
+
+		if deletionDidTimeout {
+			log.Info(
+				"Pod failed to delete itself in a timely manner. "+
+					"This is typically the case when a Kubernetes node became unreachable "+
+					"and the kube controller started evicting its pods. Forcefully deleting the pod to not get stuck.",
+				"podDeletionTimestamp", pod.DeletionTimestamp,
+				"currentTime", currentTime,
+				"configuredDeletionTimeout", deletionTimeout,
+			)
+
+			var force int64 = 0
+			// forcefully delete the runner pod as we would otherwise get stuck if the node stays unreachable
+			if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
+				// probably the pod has already been deleted in the meantime
+				if !kerrors.IsNotFound(err) {
+					log.Error(err, "Failed to forcefully delete pod resource ...")
+					return ctrl.Result{}, err
+				}
+				// forceful deletion finally succeeded
+				return ctrl.Result{Requeue: true}, nil
+			}
+
+			r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name))
+			log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository)
+			// give the kube manager a little time to forcefully delete the stuck pod
+			return ctrl.Result{RequeueAfter: 3 * time.Second}, err
+		} else {
+			return ctrl.Result{}, err
+		}
 	}
 
 	if pod.Status.Phase == corev1.PodRunning {
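
Reviewer note: the expression currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
simply asks whether more than deletionTimeout has elapsed since the pod's
deletionTimestamp. Below is a minimal, self-contained sketch for exercising
that check outside the controller; the helper name podDeletionDidTimeout and
the sample timestamps are illustrative only and not part of the patch:

    package main

    import (
    	"fmt"
    	"time"

    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // podDeletionDidTimeout reports whether a soft-deleted pod has been stuck
    // in terminating state for longer than timeout. It mirrors the patch's
    // check, now.Sub(deletionTimestamp.Add(timeout)) > 0, which is equivalent
    // to now.After(deletionTimestamp.Add(timeout)).
    func podDeletionDidTimeout(deletionTimestamp *metav1.Time, now time.Time, timeout time.Duration) bool {
    	if deletionTimestamp == nil {
    		// the pod was never soft-deleted, so there is nothing to time out
    		return false
    	}
    	return now.Sub(deletionTimestamp.Add(timeout)) > 0
    }

    func main() {
    	now := time.Now()
    	stuck := metav1.NewTime(now.Add(-2 * time.Minute))   // terminating for 2 min
    	recent := metav1.NewTime(now.Add(-10 * time.Second)) // terminating for 10 s

    	fmt.Println(podDeletionDidTimeout(&stuck, now, time.Minute))  // true: force-delete
    	fmt.Println(podDeletionDidTimeout(&recent, now, time.Minute)) // false: keep waiting
    }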
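
For comparison with manual operations: deleting through the controller-runtime
client with DeleteOptions{GracePeriodSeconds: &force} where force is 0 is the
API-level counterpart of kubectl delete pod <pod-name> --grace-period=0 --force,
the usual hands-on remedy for pods stuck in Terminating on an unreachable node.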