Skip to content

Commit 393dd27

Browse files
authored
TPU Provisioner: Node pool hash comparison (#967)
* TPU Provisioner: Node pool hash comparison
* Update hash to be selective
* Update go.mod
* Use interface for GKE interactions and start test case
* Add test cases
* Add logging, improve error handling
* Check if NP needs deletion immediately after Pod termination
* Add res affinity and remove taints from hash func
* Address comments
1 parent 7634ed1 commit 393dd27

File tree

9 files changed

+746
-223
lines changed

9 files changed

+746
-223
lines changed

tpu-provisioner/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Build the manager binary
2-
FROM golang:1.23 as builder
2+
FROM golang:1.23 AS builder
33
ARG TARGETOS
44
ARG TARGETARCH
55

tpu-provisioner/cmd/main.go

+28-21
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ func main() {
9494
GCPForceOnDemand bool `envconfig:"GCP_FORCE_ON_DEMAND" default:"false"`
9595

9696
// NodeMinLifespan is the amount of time that should pass between a Node object
97-
// creation and a cleanup of that Node. This needs to be long enough to allow
98-
// the node to become Ready and for a pending Pod to be scheduled on it.
99-
NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"3m"`
97+
// creation and a cleanup of that Node. This is mostly irrelevant now that JobSet
98+
// existence is checked before deleting a NodePool.
99+
NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"10s"`
100100

101101
NodepoolDeletionDelay time.Duration `envconfig:"NODEPOOL_DELETION_DELAY" default:"30s"`
102102

@@ -198,30 +198,37 @@ func main() {
198198
"podToNodeLabels", cfg.GCPPodToNodeLabels,
199199
)
200200

201+
clusterCtx := cloud.GKEContext{
202+
ProjectID: cfg.GCPProjectID,
203+
ClusterLocation: cfg.GCPClusterLocation,
204+
Cluster: cfg.GCPCluster,
205+
NodeZone: cfg.GCPZone,
206+
NodeServiceAccount: cfg.GCPNodeServiceAccount,
207+
NodeAdditionalNetworks: cfg.GCPNodeAdditionalNetworks,
208+
NodeSecondaryDisk: cfg.GCPNodeSecondaryDisk,
209+
NodeTags: cfg.GCPNodeTags,
210+
NodeDiskType: cfg.GCPNodeDiskType,
211+
NodeConfidentialStorage: cfg.GCPNodeConfidentialStorage,
212+
NodeBootDiskKMSKey: cfg.GCPNodeBootDiskKMSKey,
213+
PodToNodeLabels: cfg.GCPPodToNodeLabels,
214+
NodeSecureBoot: cfg.GCPNodeSecureBoot,
215+
ForceOnDemand: cfg.GCPForceOnDemand,
216+
}
217+
201218
containers, err := containerv1beta1.NewService(context.Background() /*, option.WithCredentials(creds)*/)
202219
if err != nil {
203220
setupLog.Error(err, "unable to create gke client")
204221
os.Exit(1)
205222
}
223+
nodePoolsService := &cloud.GKENodePoolService{
224+
ClusterContext: clusterCtx,
225+
Service: containers,
226+
}
227+
206228
provider = &cloud.GKE{
207-
Service: containers,
208-
ClusterContext: cloud.GKEContext{
209-
ProjectID: cfg.GCPProjectID,
210-
ClusterLocation: cfg.GCPClusterLocation,
211-
Cluster: cfg.GCPCluster,
212-
NodeZone: cfg.GCPZone,
213-
NodeServiceAccount: cfg.GCPNodeServiceAccount,
214-
NodeAdditionalNetworks: cfg.GCPNodeAdditionalNetworks,
215-
NodeSecondaryDisk: cfg.GCPNodeSecondaryDisk,
216-
NodeTags: cfg.GCPNodeTags,
217-
NodeDiskType: cfg.GCPNodeDiskType,
218-
NodeConfidentialStorage: cfg.GCPNodeConfidentialStorage,
219-
NodeBootDiskKMSKey: cfg.GCPNodeBootDiskKMSKey,
220-
PodToNodeLabels: cfg.GCPPodToNodeLabels,
221-
NodeSecureBoot: cfg.GCPNodeSecureBoot,
222-
ForceOnDemand: cfg.GCPForceOnDemand,
223-
},
224-
Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
229+
NodePools: nodePoolsService,
230+
ClusterContext: clusterCtx,
231+
Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
225232
}
226233
case "mock":
227234
provider = &cloud.Mock{}

tpu-provisioner/examples/jobset.yaml

+11-7
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
maxRestarts: 3
1010
replicatedJobs:
1111
- name: workers
12-
replicas: 3 # set to number of node pools
12+
replicas: 1 # set to number of node pools
1313
template:
1414
spec:
1515
backoffLimit: 0
@@ -21,8 +21,15 @@ spec:
2121
spec:
2222
restartPolicy: Never
2323
nodeSelector:
24-
cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
24+
cloud.google.com/gke-tpu-accelerator: tpu-v5p-slice
2525
cloud.google.com/gke-tpu-topology: 2x2x2
26+
cloud.google.com/gke-spot: "true"
27+
abc: xyz
28+
tolerations:
29+
- key: cloud.google.com/gke-spot
30+
operator: Equal
31+
value: "true"
32+
effect: NoSchedule
2633
containers:
2734
- name: tpu-job
2835
image: python:3.8
@@ -31,11 +38,8 @@ spec:
3138
securityContext:
3239
privileged: true
3340
command:
34-
- bash
35-
- -c
36-
- |
37-
pip install 'jax[tpu]' -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
38-
python -c 'import jax; print("TPU cores:", jax.device_count())'
41+
- "sleep"
42+
- "600"
3943
resources:
4044
requests:
4145
google.com/tpu: 4

tpu-provisioner/internal/cloud/common.go

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ const (
2222
LabelJobSetName = keyPrefix + "tpu-provisioner-jobset-name"
2323
LabelJobSetNamespace = keyPrefix + "tpu-provisioner-jobset-namespace"
2424

25+
LabelNodePoolHash = keyPrefix + "tpu-provisioner-nodepool-hash"
26+
2527
LabelProvisionerNodepoolID = "provisioner-nodepool-id"
2628

2729
// AnnotationCopyLabels is a comma-separated list of labels to copy from the Pod to the node pool config (Nodes).

0 commit comments

Comments
 (0)