Skip to content

Commit db1f78a

Browse files
authored
Allow provisioner to be configured to force on-demand nodes & disable auto-upgrade (GoogleCloudPlatform#656)
* Allow provisioner to be configured to force on-demand nodes
* Disable auto-upgrade on node pools
1 parent b732181 commit db1f78a

File tree

4 files changed

+120
-24
lines changed

4 files changed

+120
-24
lines changed

tpu-provisioner/cmd/main.go

+5
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ func main() {
8383
GCPNodeSecondaryDisk string `envconfig:"GCP_NODE_SECONDARY_DISK" default:""`
8484
GCPNodeSecureBoot bool `envconfig:"GCP_NODE_SECURE_BOOT" default:"true"`
8585

86+
// GCPForceOnDemand forces the controller to create nodes on demand, even if
87+
// the Pod requests a reservation or spot.
88+
GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`
89+
8690
// NodeMinLifespan is the amount of time that should pass between a Node object
8791
// creation and a cleanup of that Node. This needs to be long enough to allow
8892
// the node to become Ready and for a pending Pod to be scheduled on it.
@@ -203,6 +207,7 @@ func main() {
203207
NodeSecondaryDisk: cfg.GCPNodeSecondaryDisk,
204208
NodeTags: cfg.GCPNodeTags,
205209
NodeSecureBoot: cfg.GCPNodeSecureBoot,
210+
ForceOnDemand: cfg.GCPForceOnDemand,
206211
},
207212
Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
208213
}

tpu-provisioner/internal/cloud/gke.go

+23-20
Original file line numberDiff line numberDiff line change
@@ -276,27 +276,30 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
276276
}
277277

278278
var reservation *containerv1beta1.ReservationAffinity
279-
if resName, ok := p.Spec.NodeSelector["cloud.google.com/reservation-name"]; ok {
280-
reservation = &containerv1beta1.ReservationAffinity{
281-
ConsumeReservationType: "SPECIFIC_RESERVATION",
282-
Key: "compute.googleapis.com/reservation-name",
283-
Values: []string{
284-
resName,
285-
},
286-
}
287-
}
288-
289279
var taints []*containerv1beta1.NodeTaint
280+
var spot bool
281+
282+
if !g.ClusterContext.ForceOnDemand {
283+
if resName, ok := p.Spec.NodeSelector["cloud.google.com/reservation-name"]; ok {
284+
reservation = &containerv1beta1.ReservationAffinity{
285+
ConsumeReservationType: "SPECIFIC_RESERVATION",
286+
Key: "compute.googleapis.com/reservation-name",
287+
Values: []string{
288+
resName,
289+
},
290+
}
291+
}
290292

291-
spot := p.Spec.NodeSelector["cloud.google.com/gke-spot"] == "true"
292-
if spot {
293-
// Add the taint that NAP would add.
294-
// https://cloud.google.com/kubernetes-engine/docs/concepts/spot-vms#spotvms-nap
295-
taints = append(taints, &containerv1beta1.NodeTaint{
296-
Key: "cloud.google.com/gke-spot",
297-
Value: "true",
298-
Effect: "NO_SCHEDULE",
299-
})
293+
spot = p.Spec.NodeSelector["cloud.google.com/gke-spot"] == "true"
294+
if spot {
295+
// Add the taint that NAP would add.
296+
// https://cloud.google.com/kubernetes-engine/docs/concepts/spot-vms#spotvms-nap
297+
taints = append(taints, &containerv1beta1.NodeTaint{
298+
Key: "cloud.google.com/gke-spot",
299+
Value: "true",
300+
Effect: "NO_SCHEDULE",
301+
})
302+
}
300303
}
301304

302305
var secondaryDisks []*containerv1beta1.SecondaryBootDisk
@@ -336,7 +339,7 @@ func (g *GKE) nodePoolForPod(name string, p *corev1.Pod) (*containerv1beta1.Node
336339
},
337340
Management: &containerv1beta1.NodeManagement{
338341
AutoRepair: true,
339-
AutoUpgrade: true,
342+
AutoUpgrade: false,
340343
},
341344
UpgradeSettings: &containerv1beta1.UpgradeSettings{
342345
MaxSurge: 1,

tpu-provisioner/internal/cloud/gke_context.go

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ type GKEContext struct {
1111
NodeSecondaryDisk string
1212
NodeTags []string
1313
NodeSecureBoot bool
14+
ForceOnDemand bool
1415
}
1516

1617
func (c GKEContext) ClusterName() string {

tpu-provisioner/internal/cloud/gke_test.go

+91-4
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,67 @@ func TestNodePoolForPod(t *testing.T) {
242242
},
243243
InitialNodeCount: 512,
244244
Locations: []string{""},
245-
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
245+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
246+
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
247+
Name: "test-pool",
248+
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
249+
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
250+
},
251+
},
252+
{
253+
desc: "spot",
254+
selector: map[string]string{
255+
"cloud.google.com/gke-spot": "true",
256+
},
257+
want: &containerv1beta1.NodePool{
258+
Config: &container.NodeConfig{
259+
Labels: map[string]string{
260+
"google.com/nodepool-manager": "tpu-provisioner",
261+
"google.com/tpu-provisioner-jobset-name": "jobset-test",
262+
"google.com/tpu-provisioner-jobset-namespace": "default",
263+
"google.com/tpu-provisioner-parent-kind": "job",
264+
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
265+
"google.com/tpu-provisioner-parent-namespace": "default",
266+
},
267+
MachineType: "ct5p-hightpu-4t",
268+
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
269+
Spot: true,
270+
Taints: []*container.NodeTaint{
271+
{Effect: "NO_SCHEDULE", Key: "cloud.google.com/gke-spot", Value: "true"},
272+
},
273+
},
274+
InitialNodeCount: 512,
275+
Locations: []string{""},
276+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
277+
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
278+
Name: "test-pool",
279+
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
280+
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
281+
},
282+
},
283+
{
284+
desc: "spot with forced on demand",
285+
gkeContext: GKEContext{ForceOnDemand: true},
286+
selector: map[string]string{
287+
"cloud.google.com/gke-spot": "true",
288+
},
289+
want: &containerv1beta1.NodePool{
290+
Config: &container.NodeConfig{
291+
Labels: map[string]string{
292+
"google.com/nodepool-manager": "tpu-provisioner",
293+
"google.com/tpu-provisioner-jobset-name": "jobset-test",
294+
"google.com/tpu-provisioner-jobset-namespace": "default",
295+
"google.com/tpu-provisioner-parent-kind": "job",
296+
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
297+
"google.com/tpu-provisioner-parent-namespace": "default",
298+
},
299+
MachineType: "ct5p-hightpu-4t",
300+
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
301+
Spot: false,
302+
},
303+
InitialNodeCount: 512,
304+
Locations: []string{""},
305+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
246306
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
247307
Name: "test-pool",
248308
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -272,7 +332,34 @@ func TestNodePoolForPod(t *testing.T) {
272332
},
273333
InitialNodeCount: 512,
274334
Locations: []string{""},
275-
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
335+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
336+
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
337+
Name: "test-pool",
338+
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
339+
UpgradeSettings: &container.UpgradeSettings{MaxSurge: 1},
340+
},
341+
},
342+
{
343+
desc: "pod with reservation selector but on demand is forced",
344+
selector: map[string]string{"cloud.google.com/reservation-name": "tpu-rsv"},
345+
gkeContext: GKEContext{ForceOnDemand: true},
346+
want: &containerv1beta1.NodePool{
347+
Config: &container.NodeConfig{
348+
Labels: map[string]string{
349+
"google.com/nodepool-manager": "tpu-provisioner",
350+
"google.com/tpu-provisioner-jobset-name": "jobset-test",
351+
"google.com/tpu-provisioner-jobset-namespace": "default",
352+
"google.com/tpu-provisioner-parent-kind": "job",
353+
"google.com/tpu-provisioner-parent-name": "jobset-test-job-1-0",
354+
"google.com/tpu-provisioner-parent-namespace": "default",
355+
},
356+
MachineType: "ct5p-hightpu-4t",
357+
ReservationAffinity: nil,
358+
ShieldedInstanceConfig: &container.ShieldedInstanceConfig{EnableIntegrityMonitoring: true},
359+
},
360+
InitialNodeCount: 512,
361+
Locations: []string{""},
362+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
276363
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
277364
Name: "test-pool",
278365
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -298,7 +385,7 @@ func TestNodePoolForPod(t *testing.T) {
298385
},
299386
InitialNodeCount: 512,
300387
Locations: []string{""},
301-
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
388+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
302389
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
303390
Name: "test-pool",
304391
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},
@@ -329,7 +416,7 @@ func TestNodePoolForPod(t *testing.T) {
329416
},
330417
InitialNodeCount: 512,
331418
Locations: []string{""},
332-
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: true},
419+
Management: &container.NodeManagement{AutoRepair: true, AutoUpgrade: false},
333420
MaxPodsConstraint: &container.MaxPodsConstraint{MaxPodsPerNode: 15},
334421
Name: "test-pool",
335422
PlacementPolicy: &container.PlacementPolicy{TpuTopology: "8x16x16", Type: "COMPACT"},

0 commit comments

Comments (0)