Remove support for p2 instances which require NVIDIA driver <= 470 (#8226)

bryantbiggs · web-flow · commit d8e9633a3216 · 2025-02-19T11:36:23.000-06:00
* Remove support for `p2` instances which require NVIDIA driver <= 470

* chore: Update references of `p2` to `g5`
diff --git a/cmd/ec2geninfo/main.go b/cmd/ec2geninfo/main.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"regexp"
 	"text/template"
 
 	"github.com/aws/aws-sdk-go-v2/aws"
@@ -120,6 +121,7 @@ func getEC2Instances(region string, instances map[string]InstanceInfo) (map[stri
 	}
 
 	paginator := ec2.NewDescribeInstanceTypesPaginator(client, input)
+	unsupportedRegexp, _ := regexp.Compile("^(p2).*")
 
 	for paginator.HasMorePages() {
 		page, err := paginator.NextPage(context.TODO())
@@ -130,6 +132,10 @@ func getEC2Instances(region string, instances map[string]InstanceInfo) (map[stri
 		for _, inst := range page.InstanceTypes {
 			itype := string(inst.InstanceType)
 
+			if unsupportedRegexp.MatchString(itype) {
+				continue
+			}
+
 			efaSupported := inst.NetworkInfo != nil && inst.NetworkInfo.EfaSupported != nil && *inst.NetworkInfo.EfaSupported
 
 			nvidiaGPUSupported := false
diff --git a/examples/23-kubeflow-spot-instance.yaml b/examples/23-kubeflow-spot-instance.yaml
@@ -1,12 +1,12 @@
 # Cost-Optimized EKS cluster for Kubeflow with spot GPU instances and node scale down to zero
 # Built in efforts to reducing training costs of ML workloads.
-# Supporting tutorial can be found at the following link: 
+# Supporting tutorial can be found at the following link:
 # https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf
-# This spec creates a cluster on EKS with the following active nodes 
+# This spec creates a cluster on EKS with the following active nodes
 # - 2x m5a.2xlarge - Accomodates all pods of Kubeflow
 # It also creates the following nodegroups with 0 nodes running unless a pod comes along and requests for the node to get spun up
 # - m5a.2xlarge   -- Max Allowed 10 worker nodes
-# - p2.xlarge     -- Max Allowed 10 worker nodes
+# - g5.xlarge     -- Max Allowed 10 worker nodes
 # - p3.2xlarge    -- Max Allowed 10 worker nodes
 # - p3.8xlarge    -- Max Allowed 04 worker nodes
 # - p3dn.24xlarge -- Max Allowed 01 worker nodes
@@ -16,7 +16,7 @@ kind: ClusterConfig
 
 metadata:
   # Name of your cluster, change to whatever you find fit.
-  # If changed, make sure to change all nodegroup tags from 
+  # If changed, make sure to change all nodegroup tags from
   # 'k8s.io/cluster-autoscaler/cluster-23: "owned"' --> 'k8s.io/cluster-autoscaler/your-new-name: "owned"'
   name: cluster-23
   # choose your region wisely, this will significantly impact the cost incurred
@@ -27,7 +27,7 @@ metadata:
     # Add more cloud tags if needed for billing
     environment: staging
 
-# Add all possible AZs to ensure nodes can be spun up in any AZ later on. 
+# Add all possible AZs to ensure nodes can be spun up in any AZ later on.
 # THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT.
 # This list applies to the whole cluster and isn't specific to nodegroups
 availabilityZones: ["us-east-1a", "us-east-1b",  "us-east-1d",  "us-east-1f"]
@@ -37,8 +37,8 @@ nodeGroups:
     desiredCapacity: 2
     minSize: 0
     maxSize: 3
-    # Set one nodegroup with 100GB volumes for Kubeflow to get deployed. 
-    # Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node. 
+    # Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
+    # Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
     volumeSize: 100
     volumeType: gp2
     instanceType: m5a.2xlarge
@@ -78,23 +78,23 @@ nodeGroups:
         autoScaler: true
         cloudWatch: true
 
-  - name: 1-gpu-spot-p2-xlarge
+  - name: 1-gpu-spot-g5-xlarge
     minSize: 0
     maxSize: 10
     instancesDistribution:
-      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
-      # Comment out the field to default to OnDemand as max price. 
+      # set your own max price. AWS spot instance prices no longer cross OnDemand price.
+      # Comment out the field to default to OnDemand as max price.
       maxPrice: 1.2
-      instanceTypes: ["p2.xlarge"]
+      instanceTypes: ["g5.xlarge"]
       onDemandBaseCapacity: 0
       onDemandPercentageAboveBaseCapacity: 0
       spotAllocationStrategy: capacity-optimized
     labels:
       lifecycle: Ec2Spot
       aws.amazon.com/spot: "true"
       gpu-count: "1"
-    # Stick to one AZ for all GPU nodes. 
-    # In case of termination, this will prevent volumes from being unavailable 
+    # Stick to one AZ for all GPU nodes.
+    # In case of termination, this will prevent volumes from being unavailable
     # if the new instance got spun up in another AZ.
     availabilityZones: ["us-east-1a"]
     taints:
@@ -118,8 +118,8 @@ nodeGroups:
     minSize: 0
     maxSize: 10
     instancesDistribution:
-      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
-      # Comment out the field to default to OnDemand as max price. 
+      # set your own max price. AWS spot instance prices no longer cross OnDemand price.
+      # Comment out the field to default to OnDemand as max price.
       maxPrice: 1.2
       instanceTypes: ["p3.2xlarge"]
       onDemandBaseCapacity: 0
@@ -129,8 +129,8 @@ nodeGroups:
       lifecycle: Ec2Spot
       aws.amazon.com/spot: "true"
       gpu-count: "1"
-    # Stick to one AZ for all GPU nodes. 
-    # In case of termination, this will prevent volumes from being unavailable 
+    # Stick to one AZ for all GPU nodes.
+    # In case of termination, this will prevent volumes from being unavailable
     # if the new instance got spun up in another AZ.
     availabilityZones: ["us-east-1a"]
     taints:
@@ -154,8 +154,8 @@ nodeGroups:
     minSize: 0
     maxSize: 4
     instancesDistribution:
-      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
-      # Comment out the field to default to OnDemand as max price. 
+      # set your own max price. AWS spot instance prices no longer cross OnDemand price.
+      # Comment out the field to default to OnDemand as max price.
       # maxPrice: 4.4
       instanceTypes: ["p3.8xlarge"]
       onDemandBaseCapacity: 0
@@ -165,8 +165,8 @@ nodeGroups:
       lifecycle: Ec2Spot
       aws.amazon.com/spot: "true"
       gpu-count: "4"
-    # Stick to one AZ for all GPU nodes. 
-    # In case of termination, this will prevent volumes from being unavailable 
+    # Stick to one AZ for all GPU nodes.
+    # In case of termination, this will prevent volumes from being unavailable
     # if the new instance got spun up in another AZ.
     availabilityZones: ["us-east-1a"]
     taints:
@@ -190,8 +190,8 @@ nodeGroups:
     minSize: 0
     maxSize: 1
     instancesDistribution:
-      # set your own max price. AWS spot instance prices no longer cross OnDemand price. 
-      # Comment out the field to default to OnDemand as max price. 
+      # set your own max price. AWS spot instance prices no longer cross OnDemand price.
+      # Comment out the field to default to OnDemand as max price.
       maxPrice: 11
       instanceTypes: ["p3dn.24xlarge"]
       onDemandBaseCapacity: 0
diff --git a/pkg/ami/auto_resolver_test.go b/pkg/ami/auto_resolver_test.go
@@ -217,7 +217,7 @@ var _ = Describe("AMI Auto Resolution", func() {
 
 			Context("and gpu instance type", func() {
 				BeforeEach(func() {
-					instanceType = "p2.xlarge"
+					instanceType = "g5.xlarge"
 				})
 
 				Context("and ami is available", func() {
diff --git a/pkg/ami/ssm_resolver_test.go b/pkg/ami/ssm_resolver_test.go
@@ -96,7 +96,7 @@ var _ = Describe("AMI Auto Resolution", func() {
 
 			Context("and gpu instance type", func() {
 				BeforeEach(func() {
-					instanceType = "p2.xlarge"
+					instanceType = "g5.xlarge"
 				})
 
 				Context("and ami is available", func() {
diff --git a/pkg/cfn/builder/managed_nodegroup_ami_type_test.go b/pkg/cfn/builder/managed_nodegroup_ami_type_test.go
@@ -39,7 +39,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
 		[]string{}, // local zones
 		[]ec2types.InstanceType{
 			ec2types.InstanceTypeM5Large,
-			ec2types.InstanceTypeP2Xlarge,
+			ec2types.InstanceTypeG5Xlarge,
 			ec2types.InstanceTypeA12xlarge,
 			ec2types.InstanceTypeG5gXlarge,
 			ec2types.InstanceTypeG4dnXlarge,
@@ -81,7 +81,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
 		nodeGroup: &api.ManagedNodeGroup{
 			NodeGroupBase: &api.NodeGroupBase{
 				Name:         "test",
-				InstanceType: "p2.8xlarge",
+				InstanceType: "g5.8xlarge",
 			},
 		},
 		expectedAMIType: "AL2023_x86_64_NVIDIA",
@@ -102,7 +102,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
 			NodeGroupBase: &api.NodeGroupBase{
 				Name:         "test",
 				AMIFamily:    api.NodeImageFamilyAmazonLinux2,
-				InstanceType: "p2.xlarge",
+				InstanceType: "g5.xlarge",
 			},
 		},
 		expectedAMIType: "AL2_x86_64_GPU",
diff --git a/pkg/eks/api_test.go b/pkg/eks/api_test.go
@@ -355,7 +355,7 @@ var _ = Describe("eksctl API", func() {
 
 		It("should retrieve the AMI from EC2 when AMI is auto", func() {
 			ng.AMI = "auto"
-			ng.InstanceType = "p2.xlarge"
+			ng.InstanceType = "g5.xlarge"
 			mockDescribeImages(provider, "ami-auto", func(input *ec2.DescribeImagesInput) bool {
 				return len(input.ImageIds) == 0
 			})
diff --git a/pkg/eks/instance_selection_test.go b/pkg/eks/instance_selection_test.go
@@ -59,9 +59,9 @@ var _ = DescribeTable("Instance type selection", func(t instanceTypeCase) {
 	}),
 
 	Entry("all GPU instances", instanceTypeCase{
-		instanceTypes: []string{"p2.8xlarge", "p3.8xlarge", "g4dn.xlarge"},
+		instanceTypes: []string{"g5.8xlarge", "p3.8xlarge", "g4dn.xlarge"},
 
-		expectedInstanceType: "p2.8xlarge",
+		expectedInstanceType: "g5.8xlarge",
 	}),
 
 	Entry("single instance type", instanceTypeCase{
diff --git a/pkg/eks/tasks_test.go b/pkg/eks/tasks_test.go
@@ -23,7 +23,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
 			nodeGroups: []*v1alpha5.NodeGroup{
 				{
 					NodeGroupBase: &v1alpha5.NodeGroupBase{
-						InstanceType: "p2.xlarge",
+						InstanceType: "g5.xlarge",
 						AMIFamily:    v1alpha5.NodeImageFamilyAmazonLinux2,
 					},
 				},
@@ -46,7 +46,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
 			nodeGroups: []*v1alpha5.NodeGroup{
 				{
 					NodeGroupBase: &v1alpha5.NodeGroupBase{
-						InstanceType: "p2.xlarge",
+						InstanceType: "g5.xlarge",
 						AMIFamily:    v1alpha5.NodeImageFamilyAmazonLinux2,
 					},
 				},
@@ -117,7 +117,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
 			nodeGroups: []*v1alpha5.NodeGroup{
 				{
 					NodeGroupBase: &v1alpha5.NodeGroupBase{
-						InstanceType: "p2.xlarge",
+						InstanceType: "g5.xlarge",
 						AMIFamily:    v1alpha5.NodeImageFamilyAmazonLinux2023,
 					},
 				},
@@ -140,7 +140,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
 			nodeGroups: []*v1alpha5.NodeGroup{
 				{
 					NodeGroupBase: &v1alpha5.NodeGroupBase{
-						InstanceType: "p2.xlarge",
+						InstanceType: "g5.xlarge",
 						AMIFamily:    v1alpha5.NodeImageFamilyAmazonLinux2023,
 					},
 				},
diff --git a/pkg/utils/instance/instance_types.go b/pkg/utils/instance/instance_types.go
@@ -5160,39 +5160,6 @@ var InstanceTypes = []InstanceInfo{
 		CBRSupported:             false,
 		CPUArch:                  "arm64",
 	},
-	{
-		InstanceType:             "p2.16xlarge",
-		InstanceStorageSupported: false,
-		EFASupported:             false,
-		NvidiaGPUSupported:       true,
-		NvidiaGPUType:            "K80",
-		NeuronSupported:          false,
-		NeuronDeviceType:         "",
-		CBRSupported:             false,
-		CPUArch:                  "x86-64",
-	},
-	{
-		InstanceType:             "p2.8xlarge",
-		InstanceStorageSupported: false,
-		EFASupported:             false,
-		NvidiaGPUSupported:       true,
-		NvidiaGPUType:            "K80",
-		NeuronSupported:          false,
-		NeuronDeviceType:         "",
-		CBRSupported:             false,
-		CPUArch:                  "x86-64",
-	},
-	{
-		InstanceType:             "p2.xlarge",
-		InstanceStorageSupported: false,
-		EFASupported:             false,
-		NvidiaGPUSupported:       true,
-		NvidiaGPUType:            "K80",
-		NeuronSupported:          false,
-		NeuronDeviceType:         "",
-		CBRSupported:             false,
-		CPUArch:                  "x86-64",
-	},
 	{
 		InstanceType:             "p3.16xlarge",
 		InstanceStorageSupported: false,
diff --git a/userdocs/src/usage/custom-ami-support.md b/userdocs/src/usage/custom-ami-support.md
@@ -30,7 +30,7 @@ Config file example:
 ```yaml
 nodeGroups:
   - name: ng1
-    instanceType: p2.xlarge
+    instanceType: g5.xlarge
     amiFamily: AmazonLinux2
     ami: auto
   - name: ng2
diff --git a/userdocs/src/usage/gpu-support.md b/userdocs/src/usage/gpu-support.md
@@ -4,7 +4,7 @@ Eksctl supports selecting GPU instance types for nodegroups. Simply supply a
 compatible instance type to the create command, or via the config file.
 
 ```
-eksctl create cluster --node-type=p2.xlarge
+eksctl create cluster --node-type=g5.xlarge
 ```
 
 ???+ note
@@ -23,7 +23,7 @@ To disable the automatic plugin installation, and manually install a specific ve
 use `--install-nvidia-plugin=false` with the create command. For example:
 
 ```
-eksctl create cluster --node-type=p2.xlarge --install-nvidia-plugin=false
+eksctl create cluster --node-type=g5.xlarge --install-nvidia-plugin=false
 ```
 
 and, for versions 0.15.0 and above,
diff --git a/userdocs/src/usage/spot-instances.md b/userdocs/src/usage/spot-instances.md
@@ -96,9 +96,9 @@ nodeGroups:
     desiredCapacity: 1
     instancesDistribution:
       instanceTypes:
-        - p2.xlarge
-        - p2.8xlarge
-        - p2.16xlarge
+        - g5.xlarge
+        - g5.8xlarge
+        - g5.16xlarge
       maxPrice: 0.50
 ```