Commit 89a1c42

Merge pull request #1045 from aleksandra-malinowska/cherry-pick-gpu-metrics-2
Cherry pick of GPU metrics fixes in #1017 and #1043
2 parents (99301bb + 3dcad89); commit 89a1c42

5 files changed: +90 -19 lines

cluster-autoscaler/core/scale_down.go (+11 -6)

@@ -565,6 +565,7 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 	nodesWithoutMaster := filterOutMasters(allNodes, pods)
 	candidates := make([]*apiv1.Node, 0)
 	readinessMap := make(map[string]bool)
+	candidateNodeGroups := make(map[string]cloudprovider.NodeGroup)
 
 	resourceLimiter, errCP := sd.context.CloudProvider.GetResourceLimiter()
 	if errCP != nil {
@@ -635,6 +636,7 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 			}
 
 			candidates = append(candidates, node)
+			candidateNodeGroups[node.Name] = nodeGroup
 		}
 	}
 	if len(candidates) == 0 {
@@ -649,7 +651,7 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 	if len(emptyNodes) > 0 {
 		nodeDeletionStart := time.Now()
 		confirmation := make(chan errors.AutoscalerError, len(emptyNodes))
-		sd.scheduleDeleteEmptyNodes(emptyNodes, sd.context.ClientSet, sd.context.Recorder, readinessMap, confirmation)
+		sd.scheduleDeleteEmptyNodes(emptyNodes, sd.context.ClientSet, sd.context.Recorder, readinessMap, candidateNodeGroups, confirmation)
 		err := sd.waitForEmptyNodesDeleted(emptyNodes, confirmation)
 		nodeDeletionDuration = time.Now().Sub(nodeDeletionStart)
 		if err == nil {
@@ -701,10 +703,11 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 				glog.Errorf("Failed to delete %s: %v", toRemove.Node.Name, err)
 				return
 			}
+			nodeGroup := candidateNodeGroups[toRemove.Node.Name]
 			if readinessMap[toRemove.Node.Name] {
-				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
+				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node, nodeGroup), metrics.Underutilized)
 			} else {
-				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
+				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node, nodeGroup), metrics.Unready)
 			}
 		}()
 
@@ -784,7 +787,8 @@ func getEmptyNodes(candidates []*apiv1.Node, pods []*apiv1.Pod, maxEmptyBulkDele
 }
 
 func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client kube_client.Interface,
-	recorder kube_record.EventRecorder, readinessMap map[string]bool, confirmation chan errors.AutoscalerError) {
+	recorder kube_record.EventRecorder, readinessMap map[string]bool,
+	candidateNodeGroups map[string]cloudprovider.NodeGroup, confirmation chan errors.AutoscalerError) {
 	for _, node := range emptyNodes {
 		glog.V(0).Infof("Scale-down: removing empty node %s", node.Name)
 		sd.context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDownEmpty", "Scale-down: removing empty node %s", node.Name)
@@ -809,10 +813,11 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
 			deleteErr = deleteNodeFromCloudProvider(nodeToDelete, sd.context.CloudProvider,
 				sd.context.Recorder, sd.clusterStateRegistry)
 			if deleteErr == nil {
+				nodeGroup := candidateNodeGroups[nodeToDelete.Name]
 				if readinessMap[nodeToDelete.Name] {
-					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete, nodeGroup), metrics.Empty)
 				} else {
-					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete, nodeGroup), metrics.Unready)
 				}
 			}
 			confirmation <- deleteErr
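
The scale_down.go changes thread a new candidateNodeGroups map through the scale-down path, so that when a deletion later succeeds the node's NodeGroup is still at hand and GetGpuTypeForMetrics can consult the group's template if the node itself does not (or no longer does) report GPU capacity. A minimal sketch of the consuming side; the helper name and signature are hypothetical, only the metrics and gpu calls come from this commit:

package core

import (
	apiv1 "k8s.io/api/core/v1"

	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/metrics"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

// recordScaleDown is an illustrative helper (hypothetical, not part of this
// commit): it shows how the candidateNodeGroups map is consumed once a node
// deletion succeeds.
func recordScaleDown(node *apiv1.Node, candidateNodeGroups map[string]cloudprovider.NodeGroup,
	ready bool, reason metrics.NodeScaleDownReason) {
	// A nil NodeGroup is fine: GetGpuTypeForMetrics then relies only on the
	// node's own label and capacity and never consults a node-group template.
	nodeGroup := candidateNodeGroups[node.Name]
	if !ready {
		reason = metrics.Unready
	}
	metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(node, nodeGroup), reason)
}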

cluster-autoscaler/core/scale_up.go (+1 -1)

@@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 	}
 	glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 	for _, info := range scaleUpInfos {
-		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
+		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node(), nil))
 		if typedErr != nil {
 			return nil, typedErr
 		}
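
ScaleUp passes nil for the NodeGroup, so only the node's own label and capacity are inspected when classifying the GPU type. A hedged sketch of what that call yields for a node that already advertises its accelerator; the node fixture below is purely illustrative:

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// A node that already advertises its GPU: GKE accelerator label plus capacity.
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "gpu-node-1",
			Labels: map[string]string{gpu.GPULabel: "nvidia-tesla-k80"},
		},
		Status: apiv1.NodeStatus{
			Capacity: apiv1.ResourceList{
				gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
			},
		},
	}

	// As in ScaleUp above, a nil NodeGroup is passed; the node-group template is
	// only consulted when the label is present but capacity is missing.
	fmt.Println(gpu.GetGpuTypeForMetrics(node, nil)) // prints "nvidia-tesla-k80"
}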

cluster-autoscaler/metrics/metrics.go (+3 -2)

@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
 
 	"github.com/golang/glog"
 	"github.com/prometheus/client_golang/prometheus"
@@ -311,7 +312,7 @@ func RegisterError(err errors.AutoscalerError) {
 // RegisterScaleUp records number of nodes added by scale up
 func RegisterScaleUp(nodesCount int, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
-	if gpuType != "" {
+	if gpuType != gpu.MetricsNoGPU {
 		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
 	}
 }
@@ -324,7 +325,7 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 // RegisterScaleDown records number of nodes removed by scale down
 func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
-	if gpuType != "" {
+	if gpuType != gpu.MetricsNoGPU {
 		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
 	}
 }
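
Comparing against gpu.MetricsNoGPU instead of a literal empty string keeps the no-GPU sentinel defined in one place, next to the other GPU metric values. A hedged usage sketch of the two registration helpers; it assumes metrics.RegisterAll as the Prometheus registration entry point (as used by the autoscaler's main), and the counts are arbitrary:

package main

import (
	"k8s.io/autoscaler/cluster-autoscaler/metrics"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// Register the collectors with Prometheus first.
	metrics.RegisterAll()

	// Non-GPU scale-up: only scaled_up_nodes_total moves, because gpu.MetricsNoGPU
	// (the empty string) short-circuits the GPU-specific counter.
	metrics.RegisterScaleUp(2, gpu.MetricsNoGPU)

	// GPU scale-down of an empty node: both scaled_down_nodes_total{reason="empty"}
	// and scaled_down_gpu_nodes_total{reason="empty",gpu_name="nvidia-tesla-k80"} move.
	metrics.RegisterScaleDown(1, "nvidia-tesla-k80", metrics.Empty)
}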

cluster-autoscaler/proposals/metrics.md (+8)

@@ -74,6 +74,8 @@ This metrics describe internal state and actions taken by Cluster Autoscaler.
 | errors_total | Counter | `type`=<error-type> | The number of CA loops failed due to an error. |
 | scaled_up_nodes_total | Counter | | Number of nodes added by CA. |
 | scaled_down_nodes_total | Counter | `reason`=<scale-down-reason> | Number of nodes removed by CA. |
+| scaled_up_gpu_nodes_total | Counter | `gpu_name`=<gpu-name> | Number of GPU-enabled nodes added by CA. |
+| scaled_down_gpu_nodes_total | Counter | `reason`=<scale-down-reason>, `gpu_name`=<gpu-name> | Number of GPU-enabled nodes removed by CA. |
 | failed_scale_ups_total | Counter | `reason`=<failure-reason> | Number of times scale-up operation has failed. |
 | evicted_pods_total | Counter | | Number of pods evicted by CA. |
 | unneeded_nodes_count | Gauge | | Number of nodes currently considered unneeded by CA. |
@@ -106,6 +108,12 @@
   at all in that case).
 * `scaled_down_nodes_total` counts the number of nodes removed by CA. Possible
   scale down reasons are `empty`, `underutilized`, `unready`.
+* `scaled_up_gpu_nodes_total` counts the number of GPU-enabled nodes
+  successfully added by CA, similar to `scaled_up_nodes_total`. Additionally
+  `gpu_name` specifies name of the GPU (e.g. nvidia-tesla-k80).
+* `scaled_down_gpu_nodes_total` counts the number of nodes removed by CA. Scale
+  down reasons are identical to `scaled_down_nodes_total`, `gpu_name` to
+  `scaled_up_gpu_nodes_total`.
 
 ### Node Autoprovisioning operations
 
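
Both new counters sit alongside the existing scale-up and scale-down counters, so dashboards extend naturally; for example, GPU node churn per accelerator model over the last hour could be charted with a query along the lines of sum by (gpu_name) (increase(cluster_autoscaler_scaled_down_gpu_nodes_total[1h])), assuming the cluster_autoscaler name prefix that CA applies to its metrics.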

cluster-autoscaler/utils/gpu/gpu.go (+67 -10)

@@ -20,9 +20,9 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 
 	"github.com/golang/glog"
-	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 )
 
 const (
@@ -35,6 +35,31 @@
 	DefaultGPUType = "nvidia-tesla-k80"
 )
 
+const (
+	// MetricsGenericGPU - for when there is no information about GPU type
+	MetricsGenericGPU = "generic"
+	// MetricsMissingGPU - for when there's a label, but GPU didn't appear
+	MetricsMissingGPU = "missing-gpu"
+	// MetricsUnexpectedLabelGPU - for when there's a label, but no GPU at all
+	MetricsUnexpectedLabelGPU = "unexpected-label"
+	// MetricsUnknownGPU - for when GPU type is unknown
+	MetricsUnknownGPU = "not-listed"
+	// MetricsErrorGPU - for when there was an error obtaining GPU type
+	MetricsErrorGPU = "error"
+	// MetricsNoGPU - for when there is no GPU and no label at all
+	MetricsNoGPU = ""
+)
+
+var (
+	// knownGpuTypes lists all known GPU types, to be used in metrics; map for convenient access
+	// TODO(kgolab) obtain this from Cloud Provider
+	knownGpuTypes = map[string]struct{}{
+		"nvidia-tesla-k80":  {},
+		"nvidia-tesla-p100": {},
+		"nvidia-tesla-v100": {},
+	}
+)
+
 // FilterOutNodesWithUnreadyGpus removes nodes that should have GPU, but don't have it in allocatable
 // from ready nodes list and updates their status to unready on all nodes list.
 // This is a hack/workaround for nodes with GPU coming up without installed drivers, resulting
@@ -71,19 +96,51 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
 // GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
 // if the GPU type is unknown, "generic" is returned
 // NOTE: current implementation is GKE/GCE-specific
-func GetGpuTypeForMetrics(node *apiv1.Node) string {
+func GetGpuTypeForMetrics(node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) string {
 	// we use the GKE label if there is one
-	gpuType, found := node.Labels[GPULabel]
-	if found {
-		return gpuType
+	gpuType, labelFound := node.Labels[GPULabel]
+	capacity, capacityFound := node.Status.Capacity[ResourceNvidiaGPU]
+
+	if !labelFound {
+		// no label, fallback to generic solution
+		if capacityFound && !capacity.IsZero() {
+			return MetricsGenericGPU
+		}
+
+		// no signs of GPU
+		return MetricsNoGPU
 	}
 
-	// no label, fallback to generic solution
-	capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
-	if !found || capacity.IsZero() {
-		return ""
+	// GKE-specific label & capacity are present - consistent state
+	if capacityFound {
+		return validateGpuType(gpuType)
+	}
+
+	// GKE-specific label present but no capacity (yet?) - check the node template
+	if nodeGroup != nil {
+		template, err := nodeGroup.TemplateNodeInfo()
+		if err != nil {
+			glog.Warningf("Failed to build template for getting GPU metrics for node %v: %v", node.Name, err)
+			return MetricsErrorGPU
+		}
+
+		if _, found := template.Node().Status.Capacity[ResourceNvidiaGPU]; found {
+			return MetricsMissingGPU
+		}
+
+		// if template does not define GPUs we assume node will not have any even if it has gpu label
+		glog.Warningf("Template does not define GPUs even though node from its node group does; node=%v", node.Name)
+		return MetricsUnexpectedLabelGPU
+	}
+
+	return MetricsUnexpectedLabelGPU
+}
+
+func validateGpuType(gpu string) string {
+	if _, found := knownGpuTypes[gpu]; found {
+		return gpu
 	}
-	return "generic"
+	return MetricsUnknownGPU
 }
 
 func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
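
The rewritten GetGpuTypeForMetrics distinguishes several label and capacity combinations. Below is a test-style sketch (not part of this commit) of the expected results when no NodeGroup is supplied; with a non-nil group, the label-without-capacity case would instead consult the group's template and may report missing-gpu or error.

package gpu

import (
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Illustrative sketch: expected results of GetGpuTypeForMetrics for the
// label/capacity combinations it distinguishes, with a nil NodeGroup so the
// template lookup is skipped.
func TestGetGpuTypeForMetricsSketch(t *testing.T) {
	makeNode := func(label string, gpus int64) *apiv1.Node {
		node := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node", Labels: map[string]string{}}}
		if label != "" {
			node.Labels[GPULabel] = label
		}
		node.Status.Capacity = apiv1.ResourceList{}
		if gpus > 0 {
			node.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(gpus, resource.DecimalSI)
		}
		return node
	}

	cases := []struct {
		node *apiv1.Node
		want string
	}{
		{makeNode("", 0), MetricsNoGPU},                              // no label, no capacity
		{makeNode("", 1), MetricsGenericGPU},                         // capacity but no label
		{makeNode("nvidia-tesla-k80", 1), "nvidia-tesla-k80"},        // known type, consistent state
		{makeNode("some-future-gpu", 1), MetricsUnknownGPU},          // label not in knownGpuTypes
		{makeNode("nvidia-tesla-k80", 0), MetricsUnexpectedLabelGPU}, // label but no capacity, nil group
	}
	for _, tc := range cases {
		if got := GetGpuTypeForMetrics(tc.node, nil); got != tc.want {
			t.Errorf("GetGpuTypeForMetrics(%v, nil) = %q, want %q", tc.node.Labels, got, tc.want)
		}
	}
}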
