Commit 89a1c42

Merge pull request #1045 from aleksandra-malinowska/cherry-pick-gpu-metrics-2
Cherry pick of GPU metrics fixes in #1017 and #1043
2 parents (99301bb + 3dcad89); commit 89a1c42

5 files changed: +90 -19 lines

cluster-autoscaler/core/scale_down.go (+11 -6)

@@ -565,6 +565,7 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 	nodesWithoutMaster := filterOutMasters(allNodes, pods)
 	candidates := make([]*apiv1.Node, 0)
 	readinessMap := make(map[string]bool)
+	candidateNodeGroups := make(map[string]cloudprovider.NodeGroup)
 
 	resourceLimiter, errCP := sd.context.CloudProvider.GetResourceLimiter()
 	if errCP != nil {
@@ -635,6 +636,7 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 			}
 
 			candidates = append(candidates, node)
+			candidateNodeGroups[node.Name] = nodeGroup
 		}
 	}
 	if len(candidates) == 0 {
@@ -649,7 +651,7 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 	if len(emptyNodes) > 0 {
 		nodeDeletionStart := time.Now()
 		confirmation := make(chan errors.AutoscalerError, len(emptyNodes))
-		sd.scheduleDeleteEmptyNodes(emptyNodes, sd.context.ClientSet, sd.context.Recorder, readinessMap, confirmation)
+		sd.scheduleDeleteEmptyNodes(emptyNodes, sd.context.ClientSet, sd.context.Recorder, readinessMap, candidateNodeGroups, confirmation)
 		err := sd.waitForEmptyNodesDeleted(emptyNodes, confirmation)
 		nodeDeletionDuration = time.Now().Sub(nodeDeletionStart)
 		if err == nil {
@@ -701,10 +703,11 @@ func (sd *ScaleDown) TryToScaleDown(allNodes []*apiv1.Node, pods []*apiv1.Pod, p
 				glog.Errorf("Failed to delete %s: %v", toRemove.Node.Name, err)
 				return
 			}
+			nodeGroup := candidateNodeGroups[toRemove.Node.Name]
 			if readinessMap[toRemove.Node.Name] {
-				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Underutilized)
+				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node, nodeGroup), metrics.Underutilized)
 			} else {
-				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node), metrics.Unready)
+				metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(toRemove.Node, nodeGroup), metrics.Unready)
 			}
 		}()
 
@@ -784,7 +787,8 @@ func getEmptyNodes(candidates []*apiv1.Node, pods []*apiv1.Pod, maxEmptyBulkDele
 }
 
 func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client kube_client.Interface,
-	recorder kube_record.EventRecorder, readinessMap map[string]bool, confirmation chan errors.AutoscalerError) {
+	recorder kube_record.EventRecorder, readinessMap map[string]bool,
+	candidateNodeGroups map[string]cloudprovider.NodeGroup, confirmation chan errors.AutoscalerError) {
 	for _, node := range emptyNodes {
 		glog.V(0).Infof("Scale-down: removing empty node %s", node.Name)
 		sd.context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDownEmpty", "Scale-down: removing empty node %s", node.Name)
@@ -809,10 +813,11 @@ func (sd *ScaleDown) scheduleDeleteEmptyNodes(emptyNodes []*apiv1.Node, client k
 			deleteErr = deleteNodeFromCloudProvider(nodeToDelete, sd.context.CloudProvider,
 				sd.context.Recorder, sd.clusterStateRegistry)
 			if deleteErr == nil {
+				nodeGroup := candidateNodeGroups[nodeToDelete.Name]
 				if readinessMap[nodeToDelete.Name] {
-					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Empty)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete, nodeGroup), metrics.Empty)
 				} else {
-					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete), metrics.Unready)
+					metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(nodeToDelete, nodeGroup), metrics.Unready)
 				}
 			}
 			confirmation <- deleteErr
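
The scale_down.go changes thread a new candidateNodeGroups map through the scale-down path, so that when a deletion later succeeds the node's NodeGroup is still at hand and GetGpuTypeForMetrics can consult the group's template if the node itself does not (or no longer does) report GPU capacity. A minimal sketch of the consuming side; the helper name and signature are hypothetical, only the metrics and gpu calls come from this commit:

package core

import (
	apiv1 "k8s.io/api/core/v1"

	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
	"k8s.io/autoscaler/cluster-autoscaler/metrics"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

// recordScaleDown is an illustrative helper (hypothetical, not part of this
// commit): it shows how the candidateNodeGroups map is consumed once a node
// deletion succeeds.
func recordScaleDown(node *apiv1.Node, candidateNodeGroups map[string]cloudprovider.NodeGroup,
	ready bool, reason metrics.NodeScaleDownReason) {
	// A nil NodeGroup is fine: GetGpuTypeForMetrics then relies only on the
	// node's own label and capacity and never consults a node-group template.
	nodeGroup := candidateNodeGroups[node.Name]
	if !ready {
		reason = metrics.Unready
	}
	metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(node, nodeGroup), reason)
}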

cluster-autoscaler/core/scale_up.go (+1 -1)

@@ -465,7 +465,7 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 	}
 	glog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 	for _, info := range scaleUpInfos {
-		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node()))
+		typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(nodeInfo.Node(), nil))
 		if typedErr != nil {
 			return nil, typedErr
 		}
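
ScaleUp passes nil for the NodeGroup, so only the node's own label and capacity are inspected when classifying the GPU type. A hedged sketch of what that call yields for a node that already advertises its accelerator; the node fixture below is purely illustrative:

package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// A node that already advertises its GPU: GKE accelerator label plus capacity.
	node := &apiv1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "gpu-node-1",
			Labels: map[string]string{gpu.GPULabel: "nvidia-tesla-k80"},
		},
		Status: apiv1.NodeStatus{
			Capacity: apiv1.ResourceList{
				gpu.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
			},
		},
	}

	// As in ScaleUp above, a nil NodeGroup is passed; the node-group template is
	// only consulted when the label is present but capacity is missing.
	fmt.Println(gpu.GetGpuTypeForMetrics(node, nil)) // prints "nvidia-tesla-k80"
}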

cluster-autoscaler/metrics/metrics.go (+3 -2)

@@ -20,6 +20,7 @@ import (
 	"time"
 
 	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
 
 	"github.com/golang/glog"
 	"github.com/prometheus/client_golang/prometheus"
@@ -311,7 +312,7 @@ func RegisterError(err errors.AutoscalerError) {
 // RegisterScaleUp records number of nodes added by scale up
 func RegisterScaleUp(nodesCount int, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
-	if gpuType != "" {
+	if gpuType != gpu.MetricsNoGPU {
 		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
 	}
 }
@@ -324,7 +325,7 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 // RegisterScaleDown records number of nodes removed by scale down
 func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
-	if gpuType != "" {
+	if gpuType != gpu.MetricsNoGPU {
 		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
 	}
 }
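
Comparing against gpu.MetricsNoGPU instead of a literal empty string keeps the no-GPU sentinel defined in one place, next to the other GPU metric values. A hedged usage sketch of the two registration helpers; it assumes metrics.RegisterAll as the Prometheus registration entry point (as used by the autoscaler's main), and the counts are arbitrary:

package main

import (
	"k8s.io/autoscaler/cluster-autoscaler/metrics"
	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
)

func main() {
	// Register the collectors with Prometheus first.
	metrics.RegisterAll()

	// Non-GPU scale-up: only scaled_up_nodes_total moves, because gpu.MetricsNoGPU
	// (the empty string) short-circuits the GPU-specific counter.
	metrics.RegisterScaleUp(2, gpu.MetricsNoGPU)

	// GPU scale-down of an empty node: both scaled_down_nodes_total{reason="empty"}
	// and scaled_down_gpu_nodes_total{reason="empty",gpu_name="nvidia-tesla-k80"} move.
	metrics.RegisterScaleDown(1, "nvidia-tesla-k80", metrics.Empty)
}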

cluster-autoscaler/proposals/metrics.md (+8)

@@ -74,6 +74,8 @@ This metrics describe internal state and actions taken by Cluster Autoscaler.
 | errors_total | Counter | `type`=<error-type> | The number of CA loops failed due to an error. |
 | scaled_up_nodes_total | Counter | | Number of nodes added by CA. |
 | scaled_down_nodes_total | Counter | `reason`=<scale-down-reason> | Number of nodes removed by CA. |
+| scaled_up_gpu_nodes_total | Counter | `gpu_name`=<gpu-name> | Number of GPU-enabled nodes added by CA. |
+| scaled_down_gpu_nodes_total | Counter | `reason`=<scale-down-reason>, `gpu_name`=<gpu-name> | Number of GPU-enabled nodes removed by CA. |
 | failed_scale_ups_total | Counter | `reason`=<failure-reason> | Number of times scale-up operation has failed. |
 | evicted_pods_total | Counter | | Number of pods evicted by CA. |
 | unneeded_nodes_count | Gauge | | Number of nodes currently considered unneeded by CA. |
@@ -106,6 +108,12 @@
   at all in that case).
 * `scaled_down_nodes_total` counts the number of nodes removed by CA. Possible
   scale down reasons are `empty`, `underutilized`, `unready`.
+* `scaled_up_gpu_nodes_total` counts the number of GPU-enabled nodes
+  successfully added by CA, similar to `scaled_up_nodes_total`. Additionally
+  `gpu_name` specifies name of the GPU (e.g. nvidia-tesla-k80).
+* `scaled_down_gpu_nodes_total` counts the number of nodes removed by CA. Scale
+  down reasons are identical to `scaled_down_nodes_total`, `gpu_name` to
+  `scaled_up_gpu_nodes_total`.
 
 ### Node Autoprovisioning operations
 
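
Both new counters sit alongside the existing scale-up and scale-down counters, so dashboards extend naturally; for example, GPU node churn per accelerator model over the last hour could be charted with a query along the lines of sum by (gpu_name) (increase(cluster_autoscaler_scaled_down_gpu_nodes_total[1h])), assuming the cluster_autoscaler name prefix that CA applies to its metrics.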

cluster-autoscaler/utils/gpu/gpu.go (+67 -10)

@@ -20,9 +20,9 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
+	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 
 	"github.com/golang/glog"
-	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 )
 
 const (
@@ -35,6 +35,31 @@
 	DefaultGPUType = "nvidia-tesla-k80"
 )
 
+const (
+	// MetricsGenericGPU - for when there is no information about GPU type
+	MetricsGenericGPU = "generic"
+	// MetricsMissingGPU - for when there's a label, but GPU didn't appear
+	MetricsMissingGPU = "missing-gpu"
+	// MetricsUnexpectedLabelGPU - for when there's a label, but no GPU at all
+	MetricsUnexpectedLabelGPU = "unexpected-label"
+	// MetricsUnknownGPU - for when GPU type is unknown
+	MetricsUnknownGPU = "not-listed"
+	// MetricsErrorGPU - for when there was an error obtaining GPU type
+	MetricsErrorGPU = "error"
+	// MetricsNoGPU - for when there is no GPU and no label at all
+	MetricsNoGPU = ""
+)
+
+var (
+	// knownGpuTypes lists all known GPU types, to be used in metrics; map for convenient access
+	// TODO(kgolab) obtain this from Cloud Provider
+	knownGpuTypes = map[string]struct{}{
+		"nvidia-tesla-k80":  {},
+		"nvidia-tesla-p100": {},
+		"nvidia-tesla-v100": {},
+	}
+)
+
 // FilterOutNodesWithUnreadyGpus removes nodes that should have GPU, but don't have it in allocatable
 // from ready nodes list and updates their status to unready on all nodes list.
 // This is a hack/workaround for nodes with GPU coming up without installed drivers, resulting
@@ -71,19 +96,51 @@ func FilterOutNodesWithUnreadyGpus(allNodes, readyNodes []*apiv1.Node) ([]*apiv1
 // GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
 // if the GPU type is unknown, "generic" is returned
 // NOTE: current implementation is GKE/GCE-specific
-func GetGpuTypeForMetrics(node *apiv1.Node) string {
+func GetGpuTypeForMetrics(node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) string {
 	// we use the GKE label if there is one
-	gpuType, found := node.Labels[GPULabel]
-	if found {
-		return gpuType
+	gpuType, labelFound := node.Labels[GPULabel]
+	capacity, capacityFound := node.Status.Capacity[ResourceNvidiaGPU]
+
+	if !labelFound {
+		// no label, fallback to generic solution
+		if capacityFound && !capacity.IsZero() {
+			return MetricsGenericGPU
+		}
+
+		// no signs of GPU
+		return MetricsNoGPU
 	}
 
-	// no label, fallback to generic solution
-	capacity, found := node.Status.Capacity[ResourceNvidiaGPU]
-	if !found || capacity.IsZero() {
-		return ""
+	// GKE-specific label & capacity are present - consistent state
+	if capacityFound {
+		return validateGpuType(gpuType)
+	}
+
+	// GKE-specific label present but no capacity (yet?) - check the node template
+	if nodeGroup != nil {
+		template, err := nodeGroup.TemplateNodeInfo()
+		if err != nil {
+			glog.Warningf("Failed to build template for getting GPU metrics for node %v: %v", node.Name, err)
+			return MetricsErrorGPU
+		}
+
+		if _, found := template.Node().Status.Capacity[ResourceNvidiaGPU]; found {
+			return MetricsMissingGPU
+		}
+
+		// if template does not define GPUs we assume node will not have any even if it has gpu label
+		glog.Warningf("Template does not define GPUs even though node from its node group does; node=%v", node.Name)
+		return MetricsUnexpectedLabelGPU
+	}
+
+	return MetricsUnexpectedLabelGPU
+}
+
+func validateGpuType(gpu string) string {
+	if _, found := knownGpuTypes[gpu]; found {
+		return gpu
 	}
-	return "generic"
+	return MetricsUnknownGPU
 }
 
 func getUnreadyNodeCopy(node *apiv1.Node) *apiv1.Node {
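
The rewritten GetGpuTypeForMetrics distinguishes several label and capacity combinations. Below is a test-style sketch (not part of this commit) of the expected results when no NodeGroup is supplied; with a non-nil group, the label-without-capacity case would instead consult the group's template and may report missing-gpu or error.

package gpu

import (
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Illustrative sketch: expected results of GetGpuTypeForMetrics for the
// label/capacity combinations it distinguishes, with a nil NodeGroup so the
// template lookup is skipped.
func TestGetGpuTypeForMetricsSketch(t *testing.T) {
	makeNode := func(label string, gpus int64) *apiv1.Node {
		node := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node", Labels: map[string]string{}}}
		if label != "" {
			node.Labels[GPULabel] = label
		}
		node.Status.Capacity = apiv1.ResourceList{}
		if gpus > 0 {
			node.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(gpus, resource.DecimalSI)
		}
		return node
	}

	cases := []struct {
		node *apiv1.Node
		want string
	}{
		{makeNode("", 0), MetricsNoGPU},                              // no label, no capacity
		{makeNode("", 1), MetricsGenericGPU},                         // capacity but no label
		{makeNode("nvidia-tesla-k80", 1), "nvidia-tesla-k80"},        // known type, consistent state
		{makeNode("some-future-gpu", 1), MetricsUnknownGPU},          // label not in knownGpuTypes
		{makeNode("nvidia-tesla-k80", 0), MetricsUnexpectedLabelGPU}, // label but no capacity, nil group
	}
	for _, tc := range cases {
		if got := GetGpuTypeForMetrics(tc.node, nil); got != tc.want {
			t.Errorf("GetGpuTypeForMetrics(%v, nil) = %q, want %q", tc.node.Labels, got, tc.want)
		}
	}
}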
