
Commit e6083c4

Introduce pod deletion timeout
* if a k8s node becomes unresponsive, the kube controller soft deletes all of its pods after the eviction time (default 5 mins)
* as long as the node stays unresponsive, the pod never leaves its last status, so the runner controller assumes everything is fine with the pod and does not try to create a new one
* this can result in a situation where a horizontal autoscaler thinks that none of its runners are currently busy and will not schedule any further runners / pods, leaving a dead runner deployment until the runnerreplicaset is deleted or the node comes back online
* introduce a pod deletion timeout (1 minute) after which the runner controller tries to reboot the runner and create the pod on a working node
1 parent bbb036e commit e6083c4


1 file changed: +18 −1


controllers/runner_controller.go

+18 −1
@@ -185,7 +185,24 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 	}
 
 	if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
-		return ctrl.Result{}, err
+		deletionTimeout := 1 * time.Minute
+		currentTime := time.Now()
+		deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
+
+		if deletionDidTimeout {
+			log.Info(
+				"Runner failed to delete itself in a timely manner. "+
+					"Recreating the pod to see if it resolves the issue. "+
+					"This is typically the case when a Kubernetes node became unreachable "+
+					"and the kube controller started evicting pods.",
+				"podDeletionTimestamp", pod.DeletionTimestamp,
+				"currentTime", currentTime,
+				"configuredDeletionTimeout", deletionTimeout,
+			)
+			restart = true
+		} else {
+			return ctrl.Result{}, err
+		}
 	}
 
 	if pod.Status.Phase == corev1.PodRunning {