Skip to content

Commit 5b2dc82

Browse files
committed
Refactored workloads and Kubernetes manifest actions (#119)
1 parent 7d86595 commit 5b2dc82

File tree

102 files changed

+2935
-129
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

102 files changed

+2935
-129
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
#
3+
# Copyright 2024 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# Links a shared Terraform configuration into the current working directory.
#
# Arguments:
#   $1 - path to the shared config directory; must start with "." (for example
#        "./shared" or "../shared")
#   $2 - name of the shared config; "<name>_variables.tf" must exist in $1
#
# Creates two symlinks in the current directory:
#   _<name>_variables.tf -> <directory>/<name>_variables.tf
#   _<name>.auto.tfvars  -> <directory>/<name>.auto.tfvars
#
# Exit codes:
#   1 - the directory path does not start with "."
#   2 - the shared config directory does not exist
#   3 - <directory>/<name>_variables.tf does not exist
# A missing argument aborts with an explanatory message, and a failing `ln`
# aborts the script because errexit is enabled.
set -o errexit
18+
set -o nounset
19+
set -o pipefail
20+
21+
SHARED_CONFIG_DIRECTORY=${1?"missing argument 1: relative path to the shared config directory"}
22+
SHARED_CONFIG_NAME=${2?"missing argument 2: name of the shared config"}
23+
24+
if [[ ${SHARED_CONFIG_DIRECTORY} != \.* ]]; then
25+
echo "The shared config directory path must be a relative path!" >&2
26+
exit 1
27+
fi
28+
29+
if [[ ! -d "${SHARED_CONFIG_DIRECTORY}" ]]; then
30+
echo "Shared config directory '${SHARED_CONFIG_DIRECTORY}' does not exist!" >&2
31+
exit 2
32+
fi
33+
34+
if [[ ! -f "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf" ]]; then
35+
echo "Shared config '${SHARED_CONFIG_NAME}' does not exist in '${SHARED_CONFIG_DIRECTORY}'!" >&2
36+
exit 3
37+
fi
38+
39+
# Expansions are quoted so paths and names containing whitespace or glob characters are linked verbatim.
ln -s -- "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf" "_${SHARED_CONFIG_NAME}_variables.tf"
40+
# Only the _variables.tf file is checked above; if the .auto.tfvars file is missing this creates a dangling symlink.
ln -s -- "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}.auto.tfvars" "_${SHARED_CONFIG_NAME}.auto.tfvars"
41+
42+
echo "Successfully linked shared config '${SHARED_CONFIG_NAME}' from '${SHARED_CONFIG_DIRECTORY}'."

platforms/gke/base/_shared_config/scripts/set_environment_variables.sh

+2
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,7 @@ for SHARED_CONFIG_PATH in "${SHARED_CONFIG_PATHS[@]}"; do
2222
terraform -chdir="${SHARED_CONFIG_PATH}" apply -auto-approve -input=false >/dev/null
2323
terraform -chdir="${SHARED_CONFIG_PATH}" output
2424
echo -e "-------------------------------------------------------------------------\n"
25+
set -o allexport
2526
eval "$(terraform -chdir="${SHARED_CONFIG_PATH}" output | sed -r 's/(\".*\")|\s*/\1/g')"
27+
set +o allexport
2628
done

platforms/gke/base/_shared_config/workloads_variables.tf

+6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ variable "jobset_version" {
2626
type = string
2727
}
2828

29+
# KubeRay release to install; the default is 1.3.1 unless a caller overrides it.
variable "kuberay_version" {
30+
default = "1.3.1"
31+
description = "Version of KubeRay (https://github.com/ray-project/kuberay) to install."
32+
type = string
33+
}
34+
2935
variable "kueue_version" {
3036
default = "0.10.2"
3137
description = "Version of Kueue (https://kueue.sigs.k8s.io/) to install."

platforms/gke/base/core/container_cluster/container_cluster.tf

+1
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ resource "google_container_node_pool" "system" {
247247
node_config {
248248
# Variables
249249
labels = {
250+
"node-provisioning-model" : "on-demand"
250251
"resource-type" : "system"
251252
}
252253
machine_type = var.cluster_system_node_pool_machine_type

platforms/gke/base/core/container_node_pool/cpu/region/us-central1/container_node_pool_cpu_n4.tf

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
4848
node_config {
4949
# Variables
5050
labels = {
51+
"node-provisioning-model" : "on-demand"
5152
"resource-model" : "n4"
5253
"resource-type" : "cpu"
5354
}
@@ -118,6 +119,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
118119
node_config {
119120
# Variables
120121
labels = {
122+
"node-provisioning-model" : "spot"
121123
"resource-model" : "n4"
122124
"resource-type" : "cpu"
123125
}

platforms/gke/base/core/container_node_pool/cpu/region/us-east4/container_node_pool_cpu_n4.tf

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
4848
node_config {
4949
# Variables
5050
labels = {
51+
"node-provisioning-model" : "on-demand"
5152
"resource-model" : "n4"
5253
"resource-type" : "cpu"
5354
}
@@ -117,6 +118,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
117118
node_config {
118119
# Variables
119120
labels = {
121+
"node-provisioning-model" : "spot"
120122
"resource-model" : "n4"
121123
"resource-type" : "cpu"
122124
}

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_a100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "a100"
6061
"resource-type" : "gpu"
6162
"resource-variant" : "40GB"
@@ -143,6 +144,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" {
143144
node_config {
144145
# Variables
145146
labels = {
147+
"node-provisioning-model" : "on-demand"
148+
"queued-provisioning" : "true"
146149
"resource-model" : "a100"
147150
"resource-type" : "gpu"
148151
"resource-variant" : "40GB"
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_res" {
234237
node_config {
235238
# Variables
236239
labels = {
240+
"node-provisioning-model" : "reservation"
237241
"resource-model" : "a100"
238242
"resource-type" : "gpu"
239243
"resource-variant" : "40GB"
@@ -321,6 +325,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_spot" {
321325
node_config {
322326
# Variables
323327
labels = {
328+
"node-provisioning-model" : "spot"
324329
"resource-model" : "a100"
325330
"resource-type" : "gpu"
326331
"resource-variant" : "40GB"

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_h100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
5454
node_config {
5555
# Variables
5656
labels = {
57+
"node-provisioning-model" : "on-demand"
5758
"resource-model" : "h100"
5859
"resource-type" : "gpu"
5960
}
@@ -142,6 +143,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
142143
node_config {
143144
# Variables
144145
labels = {
146+
"node-provisioning-model" : "on-demand"
147+
"queued-provisioning" : "true"
145148
"resource-model" : "h100"
146149
"resource-type" : "gpu"
147150
}
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
234237
node_config {
235238
# Variables
236239
labels = {
240+
"node-provisioning-model" : "reservation"
237241
"resource-model" : "h100"
238242
"resource-type" : "gpu"
239243
}
@@ -322,6 +326,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
322326
node_config {
323327
# Variables
324328
labels = {
329+
"node-provisioning-model" : "spot"
325330
"resource-model" : "h100"
326331
"resource-type" : "gpu"
327332
}

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_l4.tf

+5
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" {
5454
node_config {
5555
# Variables
5656
labels = {
57+
"node-provisioning-model" : "on-demand"
5758
"resource-model" : "l4"
5859
"resource-type" : "gpu"
5960
}
@@ -138,6 +139,8 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
138139
node_config {
139140
# Variables
140141
labels = {
142+
"node-provisioning-model" : "on-demand"
143+
"queued-provisioning" : "true"
141144
"resource-model" : "l4"
142145
"resource-type" : "gpu"
143146
}
@@ -226,6 +229,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
226229
node_config {
227230
# Variables
228231
labels = {
232+
"node-provisioning-model" : "reservation"
229233
"resource-model" : "l4"
230234
"resource-type" : "gpu"
231235
}
@@ -311,6 +315,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
311315
node_config {
312316
# Variables
313317
labels = {
318+
"node-provisioning-model" : "spot"
314319
"resource-model" : "l4"
315320
"resource-type" : "gpu"
316321
}

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_a100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2" {
5555
node_config {
5656
# Variables
5757
labels = {
58+
"node-provisioning-model" : "on-demand"
5859
"resource-model" : "a100"
5960
"resource-type" : "gpu"
6061
"resource-variant" : "80GB"
@@ -141,6 +142,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_dws" {
141142
node_config {
142143
# Variables
143144
labels = {
145+
"node-provisioning-model" : "on-demand"
146+
"queued-provisioning" : "true"
144147
"resource-model" : "a100"
145148
"resource-type" : "gpu"
146149
"resource-variant" : "80GB"
@@ -231,6 +234,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_res" {
231234
node_config {
232235
# Variables
233236
labels = {
237+
"node-provisioning-model" : "reservation"
234238
"resource-model" : "a100"
235239
"resource-type" : "gpu"
236240
"resource-variant" : "80GB"
@@ -317,6 +321,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_spot" {
317321
node_config {
318322
# Variables
319323
labels = {
324+
"node-provisioning-model" : "spot"
320325
"resource-model" : "a100"
321326
"resource-type" : "gpu"
322327
"resource-variant" : "80GB"

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_h100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "h100"
6061
"resource-type" : "gpu"
6162
}
@@ -146,6 +147,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
146147
node_config {
147148
# Variables
148149
labels = {
150+
"node-provisioning-model" : "on-demand"
151+
"queued-provisioning" : "true"
149152
"resource-model" : "h100"
150153
"resource-type" : "gpu"
151154
}
@@ -240,6 +243,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
240243
node_config {
241244
# Variables
242245
labels = {
246+
"node-provisioning-model" : "reservation"
243247
"resource-model" : "h100"
244248
"resource-type" : "gpu"
245249
}
@@ -330,6 +334,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
330334
node_config {
331335
# Variables
332336
labels = {
337+
"node-provisioning-model" : "spot"
333338
"resource-model" : "h100"
334339
"resource-type" : "gpu"
335340
}

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_l4.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "l4"
6061
"resource-type" : "gpu"
6162
}
@@ -142,6 +143,8 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
142143
node_config {
143144
# Variables
144145
labels = {
146+
"node-provisioning-model" : "on-demand"
147+
"queued-provisioning" : "true"
145148
"resource-model" : "l4"
146149
"resource-type" : "gpu"
147150
}
@@ -232,6 +235,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
232235
node_config {
233236
# Variables
234237
labels = {
238+
"node-provisioning-model" : "reservation"
235239
"resource-model" : "l4"
236240
"resource-type" : "gpu"
237241
}
@@ -318,6 +322,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
318322
node_config {
319323
# Variables
320324
labels = {
325+
"node-provisioning-model" : "spot"
321326
"resource-model" : "l4"
322327
"resource-type" : "gpu"
323328
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Deletes leftover network endpoint groups (NEGs) when this resource is
# destroyed. The destroy-time provisioner lists the NEGs in the project whose
# name matches `k8s.*-.*` and whose network ends with the stored identifier,
# then deletes each one with `gcloud`.
#
# NOTE(review): compute_network.tf adds this resource to the `depends_on` of
# google_compute_network.vpc. Terraform destroys a dependent before the things
# it depends on, so the VPC would be destroyed before this cleanup runs.
# Confirm that this is the intended order.
resource "terraform_data" "cleanup_network_endpoint_groups" {
16+
# The values are copied into `input` so that the destroy-time provisioner can
# read them through `self.input`.
input = {
17+
project_id = var.cluster_project_id
18+
identifier = local.unique_identifier_prefix
19+
}
20+
21+
# In the command, `${self.input.*}` is interpolated by Terraform, while `$${...}`
# is the escape that leaves a literal `${...}` for bash to expand. Each listed
# entry has the form "name,zone" and is split on the comma before deletion.
provisioner "local-exec" {
22+
command = <<EOT
23+
echo "Cleaning up network endpoint groups..."
24+
negs=$(gcloud compute network-endpoint-groups list --filter="name~'k8s.*-.*' AND network~'${self.input.identifier}$'" --format='value(format("{0},{1}", name, zone.basename()))' --project=${self.input.project_id})
25+
for neg in $${negs}; do
26+
name="$${neg%,*}"
27+
zone="$${neg#*,}"
28+
29+
echo "Deleting '$${name}' network endpoint group in $${zone}..."
30+
gcloud compute network-endpoint-groups delete $${name} --project=${self.input.project_id} --quiet --zone=$${zone}
31+
done
32+
EOT
33+
interpreter = ["bash", "-c"]
34+
# Run only when the resource is destroyed, never on create.
when = destroy
35+
working_dir = path.root
36+
}
37+
}

platforms/gke/base/core/networking/compute_network.tf

+4
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@
1313
# limitations under the License.
1414

1515
resource "google_compute_network" "vpc" {
16+
depends_on = [
17+
terraform_data.cleanup_network_endpoint_groups
18+
]
19+
1620
count = var.network_name != null ? 0 : 1
1721

1822
auto_create_subnetworks = false

0 commit comments

Comments
 (0)