Skip to content

Commit 7bd3654

Browse files
committed
Refactored workloads and Kubernetes manifest actions (#119)
1 parent 3662dbf commit 7bd3654

File tree

99 files changed

+2817
-41
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

99 files changed

+2817
-41
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
#
3+
# Copyright 2024 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
set -o errexit
set -o nounset
set -o pipefail

# Links a shared Terraform configuration into the current working directory.
#
# Usage: <script> SHARED_CONFIG_DIRECTORY SHARED_CONFIG_NAME
#
# Arguments:
#   $1 - path of the shared config directory; it must start with '.'
#   $2 - shared config name, i.e. the prefix of '<name>_variables.tf' and
#        '<name>.auto.tfvars' inside that directory
#
# Creates, in the current working directory:
#   _<name>_variables.tf -> <directory>/<name>_variables.tf
#   _<name>.auto.tfvars  -> <directory>/<name>.auto.tfvars
#
# Exit status:
#   0 - both symbolic links were created
#   1 - fewer than two arguments, or the directory path does not start with '.'
#   2 - the shared config directory does not exist
#   3 - '<name>_variables.tf' does not exist in the shared config directory
#   A failing 'ln' also aborts the script, because errexit is set.

# Fail early with a usage message instead of relying on the terse
# "unbound variable" error that nounset raises for a missing argument.
# Extra arguments are still ignored.
if (( $# < 2 )); then
  echo "Usage: ${0##*/} <shared config directory> <shared config name>" >&2
  exit 1
fi

readonly SHARED_CONFIG_DIRECTORY="${1}"
readonly SHARED_CONFIG_NAME="${2}"

# The link targets below are stored verbatim, so the directory has to be
# expressed relative to the current directory. Only paths starting with '.'
# (for example './x' or '../x') are accepted.
if [[ "${SHARED_CONFIG_DIRECTORY}" != .* ]]; then
  echo "The shared config directory path must be a relative path!" >&2
  exit 1
fi

if [[ ! -d "${SHARED_CONFIG_DIRECTORY}" ]]; then
  echo "Shared config directory '${SHARED_CONFIG_DIRECTORY}' does not exist!" >&2
  exit 2
fi

# NOTE(review): only the variables file is checked. If '<name>.auto.tfvars' is
# missing, the second link is still created and dangles - confirm whether that
# is intended.
if [[ ! -f "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf" ]]; then
  echo "Shared config '${SHARED_CONFIG_NAME}' does not exist in '${SHARED_CONFIG_DIRECTORY}'!" >&2
  exit 3
fi

# Operands are quoted so that whitespace or glob characters in the directory or
# the config name cannot split or expand them.
ln -s "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf" "_${SHARED_CONFIG_NAME}_variables.tf"
ln -s "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}.auto.tfvars" "_${SHARED_CONFIG_NAME}.auto.tfvars"

echo "Successfully linked shared config '${SHARED_CONFIG_NAME}' from '${SHARED_CONFIG_DIRECTORY}'."

platforms/gke/base/_shared_config/workloads_variables.tf

+6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ variable "jobset_version" {
2626
type = string
2727
}
2828

29+
variable "kuberay_version" {
30+
default = "1.3.1"
31+
description = "Version of KubeRay (https://github.com/ray-project/kuberay) to install."
32+
type = string
33+
}
34+
2935
variable "kueue_version" {
3036
default = "0.10.2"
3137
description = "Version of Kueue (https://kueue.sigs.k8s.io/) to install."

platforms/gke/base/core/container_cluster/container_cluster.tf

+1
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ resource "google_container_node_pool" "system" {
247247
node_config {
248248
# Variables
249249
labels = {
250+
"node-provisioning-model" : "on-demand"
250251
"resource-type" : "system"
251252
}
252253
machine_type = var.cluster_system_node_pool_machine_type

platforms/gke/base/core/container_node_pool/cpu/region/us-central1/container_node_pool_cpu_n4.tf

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
4848
node_config {
4949
# Variables
5050
labels = {
51+
"node-provisioning-model" : "on-demand"
5152
"resource-model" : "n4"
5253
"resource-type" : "cpu"
5354
}
@@ -118,6 +119,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
118119
node_config {
119120
# Variables
120121
labels = {
122+
"node-provisioning-model" : "spot"
121123
"resource-model" : "n4"
122124
"resource-type" : "cpu"
123125
}

platforms/gke/base/core/container_node_pool/cpu/region/us-east4/container_node_pool_cpu_n4.tf

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
4848
node_config {
4949
# Variables
5050
labels = {
51+
"node-provisioning-model" : "on-demand"
5152
"resource-model" : "n4"
5253
"resource-type" : "cpu"
5354
}
@@ -117,6 +118,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
117118
node_config {
118119
# Variables
119120
labels = {
121+
"node-provisioning-model" : "spot"
120122
"resource-model" : "n4"
121123
"resource-type" : "cpu"
122124
}

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_a100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "a100"
6061
"resource-type" : "gpu"
6162
"resource-variant" : "40GB"
@@ -143,6 +144,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" {
143144
node_config {
144145
# Variables
145146
labels = {
147+
"node-provisioning-model" : "on-demand"
148+
"queued-provisioning" : "true"
146149
"resource-model" : "a100"
147150
"resource-type" : "gpu"
148151
"resource-variant" : "40GB"
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_res" {
234237
node_config {
235238
# Variables
236239
labels = {
240+
"node-provisioning-model" : "reservation"
237241
"resource-model" : "a100"
238242
"resource-type" : "gpu"
239243
"resource-variant" : "40GB"
@@ -321,6 +325,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_spot" {
321325
node_config {
322326
# Variables
323327
labels = {
328+
"node-provisioning-model" : "spot"
324329
"resource-model" : "a100"
325330
"resource-type" : "gpu"
326331
"resource-variant" : "40GB"

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_h100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
5454
node_config {
5555
# Variables
5656
labels = {
57+
"node-provisioning-model" : "on-demand"
5758
"resource-model" : "h100"
5859
"resource-type" : "gpu"
5960
}
@@ -142,6 +143,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
142143
node_config {
143144
# Variables
144145
labels = {
146+
"node-provisioning-model" : "on-demand"
147+
"queued-provisioning" : "true"
145148
"resource-model" : "h100"
146149
"resource-type" : "gpu"
147150
}
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
234237
node_config {
235238
# Variables
236239
labels = {
240+
"node-provisioning-model" : "reservation"
237241
"resource-model" : "h100"
238242
"resource-type" : "gpu"
239243
}
@@ -322,6 +326,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
322326
node_config {
323327
# Variables
324328
labels = {
329+
"node-provisioning-model" : "spot"
325330
"resource-model" : "h100"
326331
"resource-type" : "gpu"
327332
}

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_l4.tf

+5
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" {
5454
node_config {
5555
# Variables
5656
labels = {
57+
"node-provisioning-model" : "on-demand"
5758
"resource-model" : "l4"
5859
"resource-type" : "gpu"
5960
}
@@ -138,6 +139,8 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
138139
node_config {
139140
# Variables
140141
labels = {
142+
"node-provisioning-model" : "on-demand"
143+
"queued-provisioning" : "true"
141144
"resource-model" : "l4"
142145
"resource-type" : "gpu"
143146
}
@@ -226,6 +229,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
226229
node_config {
227230
# Variables
228231
labels = {
232+
"node-provisioning-model" : "reservation"
229233
"resource-model" : "l4"
230234
"resource-type" : "gpu"
231235
}
@@ -311,6 +315,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
311315
node_config {
312316
# Variables
313317
labels = {
318+
"node-provisioning-model" : "spot"
314319
"resource-model" : "l4"
315320
"resource-type" : "gpu"
316321
}

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_a100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2" {
5555
node_config {
5656
# Variables
5757
labels = {
58+
"node-provisioning-model" : "on-demand"
5859
"resource-model" : "a100"
5960
"resource-type" : "gpu"
6061
"resource-variant" : "80GB"
@@ -141,6 +142,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_dws" {
141142
node_config {
142143
# Variables
143144
labels = {
145+
"node-provisioning-model" : "on-demand"
146+
"queued-provisioning" : "true"
144147
"resource-model" : "a100"
145148
"resource-type" : "gpu"
146149
"resource-variant" : "80GB"
@@ -231,6 +234,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_res" {
231234
node_config {
232235
# Variables
233236
labels = {
237+
"node-provisioning-model" : "reservation"
234238
"resource-model" : "a100"
235239
"resource-type" : "gpu"
236240
"resource-variant" : "80GB"
@@ -317,6 +321,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_spot" {
317321
node_config {
318322
# Variables
319323
labels = {
324+
"node-provisioning-model" : "spot"
320325
"resource-model" : "a100"
321326
"resource-type" : "gpu"
322327
"resource-variant" : "80GB"

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_h100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "h100"
6061
"resource-type" : "gpu"
6162
}
@@ -146,6 +147,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
146147
node_config {
147148
# Variables
148149
labels = {
150+
"node-provisioning-model" : "on-demand"
151+
"queued-provisioning" : "true"
149152
"resource-model" : "h100"
150153
"resource-type" : "gpu"
151154
}
@@ -240,6 +243,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
240243
node_config {
241244
# Variables
242245
labels = {
246+
"node-provisioning-model" : "reservation"
243247
"resource-model" : "h100"
244248
"resource-type" : "gpu"
245249
}
@@ -330,6 +334,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
330334
node_config {
331335
# Variables
332336
labels = {
337+
"node-provisioning-model" : "spot"
333338
"resource-model" : "h100"
334339
"resource-type" : "gpu"
335340
}

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_l4.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "l4"
6061
"resource-type" : "gpu"
6162
}
@@ -142,6 +143,8 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
142143
node_config {
143144
# Variables
144145
labels = {
146+
"node-provisioning-model" : "on-demand"
147+
"queued-provisioning" : "true"
145148
"resource-model" : "l4"
146149
"resource-type" : "gpu"
147150
}
@@ -232,6 +235,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
232235
node_config {
233236
# Variables
234237
labels = {
238+
"node-provisioning-model" : "reservation"
235239
"resource-model" : "l4"
236240
"resource-type" : "gpu"
237241
}
@@ -318,6 +322,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
318322
node_config {
319323
# Variables
320324
labels = {
325+
"node-provisioning-model" : "spot"
321326
"resource-model" : "l4"
322327
"resource-type" : "gpu"
323328
}

platforms/gke/base/core/workloads/kuberay/.terraform.lock.hcl

+61
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/cluster.auto.tfvars
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/cluster_variables.tf
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/platform.auto.tfvars
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/platform_variables.tf
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/workloads.auto.tfvars
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../_shared_config/workloads_variables.tf

0 commit comments

Comments
 (0)