1
1
# Cost-Optimized EKS cluster for Kubeflow with spot GPU instances and node scale down to zero
2
2
# Built in an effort to reduce the training costs of ML workloads.
3
- # Supporting tutorial can be found at the following link:
3
+ # Supporting tutorial can be found at the following link:
4
4
# https://blog.gofynd.com/how-we-reduced-our-ml-training-costs-by-78-a33805cb00cf
5
- # This spec creates a cluster on EKS with the following active nodes
5
+ # This spec creates a cluster on EKS with the following active nodes
6
6
# - 2x m5a.2xlarge - Accommodates all pods of Kubeflow
7
7
# It also creates the following nodegroups, each with 0 nodes running until a pod requests that a node be spun up
8
8
# - m5a.2xlarge -- Max Allowed 10 worker nodes
9
- # - p2.xlarge -- Max Allowed 10 worker nodes
9
+ # - g5.xlarge -- Max Allowed 10 worker nodes
10
10
# - p3.2xlarge -- Max Allowed 10 worker nodes
11
11
# - p3.8xlarge -- Max Allowed 04 worker nodes
12
12
# - p3dn.24xlarge -- Max Allowed 01 worker nodes
@@ -16,7 +16,7 @@ kind: ClusterConfig
16
16
17
17
metadata :
18
18
# Name of your cluster, change to whatever you find fit.
19
- # If changed, make sure to change all nodegroup tags from
19
+ # If changed, make sure to change all nodegroup tags from
20
20
# 'k8s.io/cluster-autoscaler/cluster-23: "owned"' --> 'k8s.io/cluster-autoscaler/your-new-name: "owned"'
21
21
name : cluster-23
22
22
# choose your region wisely, this will significantly impact the cost incurred
@@ -27,7 +27,7 @@ metadata:
27
27
# Add more cloud tags if needed for billing
28
28
environment : staging
29
29
30
- # Add all possible AZs to ensure nodes can be spun up in any AZ later on.
30
+ # Add all possible AZs to ensure nodes can be spun up in any AZ later on.
31
31
# THIS CAN'T BE CHANGED LATER. YOU WILL HAVE TO CREATE A NEW CLUSTER TO ADD NEW AZ SUPPORT.
32
32
# This list applies to the whole cluster and isn't specific to nodegroups
33
33
availabilityZones : ["us-east-1a", "us-east-1b", "us-east-1d", "us-east-1f"]
@@ -37,8 +37,8 @@ nodeGroups:
37
37
desiredCapacity : 2
38
38
minSize : 0
39
39
maxSize : 3
40
- # Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
41
- # Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
40
+ # Set one nodegroup with 100GB volumes for Kubeflow to get deployed.
41
+ # Kubeflow requirement states 1-2 Nodes with 100GB volume attached to the node.
42
42
volumeSize : 100
43
43
volumeType : gp2
44
44
instanceType : m5a.2xlarge
@@ -78,23 +78,23 @@ nodeGroups:
78
78
autoScaler : true
79
79
cloudWatch : true
80
80
81
- - name : 1-gpu-spot-p2 -xlarge
81
+ - name : 1-gpu-spot-g5 -xlarge
82
82
minSize : 0
83
83
maxSize : 10
84
84
instancesDistribution :
85
- # set your own max price. AWS spot instance prices no longer cross OnDemand price.
86
- # Comment out the field to default to OnDemand as max price.
85
+ # set your own max price. AWS spot instance prices no longer cross OnDemand price.
86
+ # Comment out the field to default to OnDemand as max price.
87
87
maxPrice : 1.2
88
- instanceTypes : ["p2 .xlarge"]
88
+ instanceTypes : ["g5 .xlarge"]
89
89
onDemandBaseCapacity : 0
90
90
onDemandPercentageAboveBaseCapacity : 0
91
91
spotAllocationStrategy : capacity-optimized
92
92
labels :
93
93
lifecycle : Ec2Spot
94
94
aws.amazon.com/spot : " true"
95
95
gpu-count : " 1"
96
- # Stick to one AZ for all GPU nodes.
97
- # In case of termination, this will prevent volumes from being unavailable
96
+ # Stick to one AZ for all GPU nodes.
97
+ # In case of termination, this will prevent volumes from being unavailable
98
98
# if the new instance got spun up in another AZ.
99
99
availabilityZones : ["us-east-1a"]
100
100
taints :
@@ -118,8 +118,8 @@ nodeGroups:
118
118
minSize : 0
119
119
maxSize : 10
120
120
instancesDistribution :
121
- # set your own max price. AWS spot instance prices no longer cross OnDemand price.
122
- # Comment out the field to default to OnDemand as max price.
121
+ # set your own max price. AWS spot instance prices no longer cross OnDemand price.
122
+ # Comment out the field to default to OnDemand as max price.
123
123
maxPrice : 1.2
124
124
instanceTypes : ["p3.2xlarge"]
125
125
onDemandBaseCapacity : 0
@@ -129,8 +129,8 @@ nodeGroups:
129
129
lifecycle : Ec2Spot
130
130
aws.amazon.com/spot : " true"
131
131
gpu-count : " 1"
132
- # Stick to one AZ for all GPU nodes.
133
- # In case of termination, this will prevent volumes from being unavailable
132
+ # Stick to one AZ for all GPU nodes.
133
+ # In case of termination, this will prevent volumes from being unavailable
134
134
# if the new instance got spun up in another AZ.
135
135
availabilityZones : ["us-east-1a"]
136
136
taints :
@@ -154,8 +154,8 @@ nodeGroups:
154
154
minSize : 0
155
155
maxSize : 4
156
156
instancesDistribution :
157
- # set your own max price. AWS spot instance prices no longer cross OnDemand price.
158
- # Comment out the field to default to OnDemand as max price.
157
+ # set your own max price. AWS spot instance prices no longer cross OnDemand price.
158
+ # Comment out the field to default to OnDemand as max price.
159
159
# maxPrice: 4.4
160
160
instanceTypes : ["p3.8xlarge"]
161
161
onDemandBaseCapacity : 0
@@ -165,8 +165,8 @@ nodeGroups:
165
165
lifecycle : Ec2Spot
166
166
aws.amazon.com/spot : " true"
167
167
gpu-count : " 4"
168
- # Stick to one AZ for all GPU nodes.
169
- # In case of termination, this will prevent volumes from being unavailable
168
+ # Stick to one AZ for all GPU nodes.
169
+ # In case of termination, this will prevent volumes from being unavailable
170
170
# if the new instance got spun up in another AZ.
171
171
availabilityZones : ["us-east-1a"]
172
172
taints :
@@ -190,8 +190,8 @@ nodeGroups:
190
190
minSize : 0
191
191
maxSize : 1
192
192
instancesDistribution :
193
- # set your own max price. AWS spot instance prices no longer cross OnDemand price.
194
- # Comment out the field to default to OnDemand as max price.
193
+ # set your own max price. AWS spot instance prices no longer cross OnDemand price.
194
+ # Comment out the field to default to OnDemand as max price.
195
195
maxPrice : 11
196
196
instanceTypes : ["p3dn.24xlarge"]
197
197
onDemandBaseCapacity : 0
0 commit comments