Skip to content

Commit d8e9633

Browse files
authored
Remove support for p2 instances which require NVIDIA driver <= 470 (#8226)
* Remove support for `p2` instances which require NVIDIA driver <= 470
* chore: Update references of `p2` to `g5`
1 parent 8f510ad commit d8e9633

12 files changed

+47
-74
lines changed

cmd/ec2geninfo/main.go

+6
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"os"
7+
"regexp"
78
"text/template"
89

910
"github.com/aws/aws-sdk-go-v2/aws"
@@ -120,6 +121,7 @@ func getEC2Instances(region string, instances map[string]InstanceInfo) (map[stri
120121
}
121122

122123
paginator := ec2.NewDescribeInstanceTypesPaginator(client, input)
124+
unsupportedRegexp, _ := regexp.Compile("^(p2).*")
123125

124126
for paginator.HasMorePages() {
125127
page, err := paginator.NextPage(context.TODO())
@@ -130,6 +132,10 @@ func getEC2Instances(region string, instances map[string]InstanceInfo) (map[stri
130132
for _, inst := range page.InstanceTypes {
131133
itype := string(inst.InstanceType)
132134

135+
if unsupportedRegexp.MatchString(itype) {
136+
continue
137+
}
138+
133139
efaSupported := inst.NetworkInfo != nil && inst.NetworkInfo.EfaSupported != nil && *inst.NetworkInfo.EfaSupported
134140

135141
nvidiaGPUSupported := false

examples/23-kubeflow-spot-instance.yaml

+23-23
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
# Cost-Optimized EKS cluster for Kubeflow with spot GPU instances and node scale down to zero
22
# Built in efforts to reducing training costs of ML workloads.
3-
# Supporting tutorial can be found at the following link:
3+
# Supporting tutorial can be found at the following link:
44
# https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf
5-
# This spec creates a cluster on EKS with the following active nodes
5+
# This spec creates a cluster on EKS with the following active nodes
66
# - 2x m5a.2xlarge - Accommodates all pods of Kubeflow
77
# It also creates the following nodegroups with 0 nodes running unless a pod comes along and requests for the node to get spun up
88
# - m5a.2xlarge -- Max Allowed 10 worker nodes
9-
# - p2.xlarge -- Max Allowed 10 worker nodes
9+
# - g5.xlarge -- Max Allowed 10 worker nodes
1010
# - p3.2xlarge -- Max Allowed 10 worker nodes
1111
# - p3.8xlarge -- Max Allowed 04 worker nodes
1212
# - p3dn.24xlarge -- Max Allowed 01 worker nodes
@@ -16,7 +16,7 @@ kind: ClusterConfig
1616

1717
metadata:
1818
# Name of your cluster, change to whatever you find fit.
19-
# If changed, make sure to change all nodegroup tags from
19+
# If changed, make sure to change all nodegroup tags from
2020
# 'k8s.io/cluster-autoscaler/cluster-23: "owned"' --> 'k8s.io/cluster-autoscaler/your-new-name: "owned"'
2121
name: cluster-23
2222
# choose your region wisely, this will significantly impact the cost incurred
@@ -27,7 +27,7 @@ metadata:
2727
# Add more cloud tags if needed for billing
2828
environment: staging
2929

30-
# Add all possible AZs to ensure nodes can be spun up in any AZ later on.
30+
# Add all possible AZs to ensure nodes can be spun up in any AZ later on.
3131
# THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT.
3232
# This list applies to the whole cluster and isn't specific to nodegroups
3333
availabilityZones: ["us-east-1a", "us-east-1b", "us-east-1d", "us-east-1f"]
@@ -37,8 +37,8 @@ nodeGroups:
3737
desiredCapacity: 2
3838
minSize: 0
3939
maxSize: 3
40-
# Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
41-
# Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
40+
# Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
41+
# Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
4242
volumeSize: 100
4343
volumeType: gp2
4444
instanceType: m5a.2xlarge
@@ -78,23 +78,23 @@ nodeGroups:
7878
autoScaler: true
7979
cloudWatch: true
8080

81-
- name: 1-gpu-spot-p2-xlarge
81+
- name: 1-gpu-spot-g5-xlarge
8282
minSize: 0
8383
maxSize: 10
8484
instancesDistribution:
85-
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
86-
# Comment out the field to default to OnDemand as max price.
85+
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
86+
# Comment out the field to default to OnDemand as max price.
8787
maxPrice: 1.2
88-
instanceTypes: ["p2.xlarge"]
88+
instanceTypes: ["g5.xlarge"]
8989
onDemandBaseCapacity: 0
9090
onDemandPercentageAboveBaseCapacity: 0
9191
spotAllocationStrategy: capacity-optimized
9292
labels:
9393
lifecycle: Ec2Spot
9494
aws.amazon.com/spot: "true"
9595
gpu-count: "1"
96-
# Stick to one AZ for all GPU nodes.
97-
# In case of termination, this will prevent volumes from being unavailable
96+
# Stick to one AZ for all GPU nodes.
97+
# In case of termination, this will prevent volumes from being unavailable
9898
# if the new instance got spun up in another AZ.
9999
availabilityZones: ["us-east-1a"]
100100
taints:
@@ -118,8 +118,8 @@ nodeGroups:
118118
minSize: 0
119119
maxSize: 10
120120
instancesDistribution:
121-
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
122-
# Comment out the field to default to OnDemand as max price.
121+
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
122+
# Comment out the field to default to OnDemand as max price.
123123
maxPrice: 1.2
124124
instanceTypes: ["p3.2xlarge"]
125125
onDemandBaseCapacity: 0
@@ -129,8 +129,8 @@ nodeGroups:
129129
lifecycle: Ec2Spot
130130
aws.amazon.com/spot: "true"
131131
gpu-count: "1"
132-
# Stick to one AZ for all GPU nodes.
133-
# In case of termination, this will prevent volumes from being unavailable
132+
# Stick to one AZ for all GPU nodes.
133+
# In case of termination, this will prevent volumes from being unavailable
134134
# if the new instance got spun up in another AZ.
135135
availabilityZones: ["us-east-1a"]
136136
taints:
@@ -154,8 +154,8 @@ nodeGroups:
154154
minSize: 0
155155
maxSize: 4
156156
instancesDistribution:
157-
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
158-
# Comment out the field to default to OnDemand as max price.
157+
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
158+
# Comment out the field to default to OnDemand as max price.
159159
# maxPrice: 4.4
160160
instanceTypes: ["p3.8xlarge"]
161161
onDemandBaseCapacity: 0
@@ -165,8 +165,8 @@ nodeGroups:
165165
lifecycle: Ec2Spot
166166
aws.amazon.com/spot: "true"
167167
gpu-count: "4"
168-
# Stick to one AZ for all GPU nodes.
169-
# In case of termination, this will prevent volumes from being unavailable
168+
# Stick to one AZ for all GPU nodes.
169+
# In case of termination, this will prevent volumes from being unavailable
170170
# if the new instance got spun up in another AZ.
171171
availabilityZones: ["us-east-1a"]
172172
taints:
@@ -190,8 +190,8 @@ nodeGroups:
190190
minSize: 0
191191
maxSize: 1
192192
instancesDistribution:
193-
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
194-
# Comment out the field to default to OnDemand as max price.
193+
# set your own max price. AWS spot instance prices no longer cross OnDemand price.
194+
# Comment out the field to default to OnDemand as max price.
195195
maxPrice: 11
196196
instanceTypes: ["p3dn.24xlarge"]
197197
onDemandBaseCapacity: 0

pkg/ami/auto_resolver_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ var _ = Describe("AMI Auto Resolution", func() {
217217

218218
Context("and gpu instance type", func() {
219219
BeforeEach(func() {
220-
instanceType = "p2.xlarge"
220+
instanceType = "g5.xlarge"
221221
})
222222

223223
Context("and ami is available", func() {

pkg/ami/ssm_resolver_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ var _ = Describe("AMI Auto Resolution", func() {
9696

9797
Context("and gpu instance type", func() {
9898
BeforeEach(func() {
99-
instanceType = "p2.xlarge"
99+
instanceType = "g5.xlarge"
100100
})
101101

102102
Context("and ami is available", func() {

pkg/cfn/builder/managed_nodegroup_ami_type_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
3939
[]string{}, // local zones
4040
[]ec2types.InstanceType{
4141
ec2types.InstanceTypeM5Large,
42-
ec2types.InstanceTypeP2Xlarge,
42+
ec2types.InstanceTypeG5Xlarge,
4343
ec2types.InstanceTypeA12xlarge,
4444
ec2types.InstanceTypeG5gXlarge,
4545
ec2types.InstanceTypeG4dnXlarge,
@@ -81,7 +81,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
8181
nodeGroup: &api.ManagedNodeGroup{
8282
NodeGroupBase: &api.NodeGroupBase{
8383
Name: "test",
84-
InstanceType: "p2.8xlarge",
84+
InstanceType: "g5.8xlarge",
8585
},
8686
},
8787
expectedAMIType: "AL2023_x86_64_NVIDIA",
@@ -102,7 +102,7 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) {
102102
NodeGroupBase: &api.NodeGroupBase{
103103
Name: "test",
104104
AMIFamily: api.NodeImageFamilyAmazonLinux2,
105-
InstanceType: "p2.xlarge",
105+
InstanceType: "g5.xlarge",
106106
},
107107
},
108108
expectedAMIType: "AL2_x86_64_GPU",

pkg/eks/api_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ var _ = Describe("eksctl API", func() {
355355

356356
It("should retrieve the AMI from EC2 when AMI is auto", func() {
357357
ng.AMI = "auto"
358-
ng.InstanceType = "p2.xlarge"
358+
ng.InstanceType = "g5.xlarge"
359359
mockDescribeImages(provider, "ami-auto", func(input *ec2.DescribeImagesInput) bool {
360360
return len(input.ImageIds) == 0
361361
})

pkg/eks/instance_selection_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ var _ = DescribeTable("Instance type selection", func(t instanceTypeCase) {
5959
}),
6060

6161
Entry("all GPU instances", instanceTypeCase{
62-
instanceTypes: []string{"p2.8xlarge", "p3.8xlarge", "g4dn.xlarge"},
62+
instanceTypes: []string{"g5.8xlarge", "p3.8xlarge", "g4dn.xlarge"},
6363

64-
expectedInstanceType: "p2.8xlarge",
64+
expectedInstanceType: "g5.8xlarge",
6565
}),
6666

6767
Entry("single instance type", instanceTypeCase{

pkg/eks/tasks_test.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
2323
nodeGroups: []*v1alpha5.NodeGroup{
2424
{
2525
NodeGroupBase: &v1alpha5.NodeGroupBase{
26-
InstanceType: "p2.xlarge",
26+
InstanceType: "g5.xlarge",
2727
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2,
2828
},
2929
},
@@ -46,7 +46,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
4646
nodeGroups: []*v1alpha5.NodeGroup{
4747
{
4848
NodeGroupBase: &v1alpha5.NodeGroupBase{
49-
InstanceType: "p2.xlarge",
49+
InstanceType: "g5.xlarge",
5050
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2,
5151
},
5252
},
@@ -117,7 +117,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
117117
nodeGroups: []*v1alpha5.NodeGroup{
118118
{
119119
NodeGroupBase: &v1alpha5.NodeGroupBase{
120-
InstanceType: "p2.xlarge",
120+
InstanceType: "g5.xlarge",
121121
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2023,
122122
},
123123
},
@@ -140,7 +140,7 @@ var _ = Describe("ClusterTasksForNodeGroups", func() {
140140
nodeGroups: []*v1alpha5.NodeGroup{
141141
{
142142
NodeGroupBase: &v1alpha5.NodeGroupBase{
143-
InstanceType: "p2.xlarge",
143+
InstanceType: "g5.xlarge",
144144
AMIFamily: v1alpha5.NodeImageFamilyAmazonLinux2023,
145145
},
146146
},

pkg/utils/instance/instance_types.go

-33
Original file line numberDiff line numberDiff line change
@@ -5160,39 +5160,6 @@ var InstanceTypes = []InstanceInfo{
51605160
CBRSupported: false,
51615161
CPUArch: "arm64",
51625162
},
5163-
{
5164-
InstanceType: "p2.16xlarge",
5165-
InstanceStorageSupported: false,
5166-
EFASupported: false,
5167-
NvidiaGPUSupported: true,
5168-
NvidiaGPUType: "K80",
5169-
NeuronSupported: false,
5170-
NeuronDeviceType: "",
5171-
CBRSupported: false,
5172-
CPUArch: "x86-64",
5173-
},
5174-
{
5175-
InstanceType: "p2.8xlarge",
5176-
InstanceStorageSupported: false,
5177-
EFASupported: false,
5178-
NvidiaGPUSupported: true,
5179-
NvidiaGPUType: "K80",
5180-
NeuronSupported: false,
5181-
NeuronDeviceType: "",
5182-
CBRSupported: false,
5183-
CPUArch: "x86-64",
5184-
},
5185-
{
5186-
InstanceType: "p2.xlarge",
5187-
InstanceStorageSupported: false,
5188-
EFASupported: false,
5189-
NvidiaGPUSupported: true,
5190-
NvidiaGPUType: "K80",
5191-
NeuronSupported: false,
5192-
NeuronDeviceType: "",
5193-
CBRSupported: false,
5194-
CPUArch: "x86-64",
5195-
},
51965163
{
51975164
InstanceType: "p3.16xlarge",
51985165
InstanceStorageSupported: false,

userdocs/src/usage/custom-ami-support.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Config file example:
3030
```yaml
3131
nodeGroups:
3232
- name: ng1
33-
instanceType: p2.xlarge
33+
instanceType: g5.xlarge
3434
amiFamily: AmazonLinux2
3535
ami: auto
3636
- name: ng2

userdocs/src/usage/gpu-support.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Eksctl supports selecting GPU instance types for nodegroups. Simply supply a
44
compatible instance type to the create command, or via the config file.
55

66
```
7-
eksctl create cluster --node-type=p2.xlarge
7+
eksctl create cluster --node-type=g5.xlarge
88
```
99

1010
???+ note
@@ -23,7 +23,7 @@ To disable the automatic plugin installation, and manually install a specific ve
2323
use `--install-nvidia-plugin=false` with the create command. For example:
2424

2525
```
26-
eksctl create cluster --node-type=p2.xlarge --install-nvidia-plugin=false
26+
eksctl create cluster --node-type=g5.xlarge --install-nvidia-plugin=false
2727
```
2828

2929
and, for versions 0.15.0 and above,

userdocs/src/usage/spot-instances.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ nodeGroups:
9696
desiredCapacity: 1
9797
instancesDistribution:
9898
instanceTypes:
99-
- p2.xlarge
100-
- p2.8xlarge
101-
- p2.16xlarge
99+
- g5.xlarge
100+
- g5.8xlarge
101+
- g5.16xlarge
102102
maxPrice: 0.50
103103
```
104104

0 commit comments

Comments (0)