Skip to content

Commit 34451cd

Browse files
committed
Refactored workloads and Kubernetes manifest actions (#119)
1 parent b07b509 commit 34451cd

File tree

113 files changed

+3318
-132
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

113 files changed

+3318
-132
lines changed

.github/workflows/dictionary/accelerated-platforms.txt

+1
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,4 @@ schedulable
5454
tunables
5555
usecs
5656
virt
57+
yunikorn

.github/workflows/dictionary/google-cloud.txt

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ negs
6767
nodepool
6868
nodepools
6969
ondemand
70+
parallelstore
7071
pgaudit
7172
podslice
7273
policycontroller
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
comand

.github/workflows/dictionary/kubernetes.txt

+2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ admissionregistration
22
apiextensions
33
apiregistration
44
apiservice
5+
automount
56
clusterregistry
67
clusterrole
78
clusterrolebinding
9+
crds
810
egressgateway
911
flowcontrol
1012
flowschemas

cspell.json

+8-3
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
"name": "kubeflow",
2323
"path": ".github/workflows/dictionary/kubeflow.txt"
2424
},
25+
{
26+
"name": "kuberay",
27+
"path": ".github/workflows/dictionary/kuberay.txt"
28+
},
2529
{
2630
"name": "kubernetes",
2731
"path": ".github/workflows/dictionary/kubernetes.txt"
@@ -61,14 +65,15 @@
6165
{
6266
"name": "vllm",
6367
"path": ".github/workflows/dictionary/vllm.txt"
64-
}
68+
},
6569
],
6670
"dictionaries": [
6771
"accelerated-platforms",
6872
"flipkart",
6973
"google-cloud",
7074
"huggingface",
7175
"kubeflow",
76+
"kuberay",
7277
"kubernetes",
7378
"kueue",
7479
"nvidia",
@@ -78,11 +83,11 @@
7883
"svg",
7984
"terraform",
8085
"use-case-federated-learning",
81-
"vllm"
86+
"vllm",
8287
],
8388
"ignorePaths": [
8489
"node_modules",
8590
"use-cases/inferencing/batch-inference/example_predictions.txt",
86-
"use-cases/model-fine-tuning-pipeline/model-eval/examples/"
91+
"use-cases/model-fine-tuning-pipeline/model-eval/examples/",
8792
]
8893
}
+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# GKE AI/ML Platform reference architecture and implementation
2+
3+
## Deploy the platform
4+
5+
[Reference Architecture](/platforms/gke/base/use-cases/aiml/README.md)
6+
7+
### Examples
8+
9+
[Fine-tuning Pipeline](/platforms/gke/base/use-cases/aiml/examples/fine-tuning-pipeline/README.md)

platforms/gke/base/_shared_config/cluster_variables.tf

+6
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ locals {
4343
kubeconfig_file_name = "${var.cluster_project_id}-${local.cluster_name}"
4444
}
4545

46+
variable "cluster_addons_ray_operator_enabled" {
47+
default = false
48+
description = "Enable Ray Operator Addon. Ref: https://cloud.google.com/kubernetes-engine/docs/add-on/ray-on-gke/concepts/overview"
49+
type = bool
50+
}
51+
4652
variable "cluster_binary_authorization_evaluation_mode" {
4753
default = "DISABLED"
4854
description = "Mode of operation for Binary Authorization policy evaluation. Valid values are DISABLED and PROJECT_SINGLETON_POLICY_ENFORCE."
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
#
3+
# Copyright 2024 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
set -o errexit
18+
set -o nounset
19+
set -o pipefail
20+
21+
SHARED_CONFIG_DIRECTORY=${1}
22+
SHARED_CONFIG_NAME=${2}
23+
24+
if [[ ${SHARED_CONFIG_DIRECTORY} != \.* ]]; then
25+
echo "The shared config directory path must be a relative path!"
26+
exit 1
27+
fi
28+
29+
if test ! -d "${SHARED_CONFIG_DIRECTORY}"; then
30+
echo "Shared config directory '${SHARED_CONFIG_DIRECTORY}' does not exist!"
31+
exit 2
32+
fi
33+
34+
if test ! -f "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf"; then
35+
echo "Shared config '${SHARED_CONFIG_NAME}' does not exist in '${SHARED_CONFIG_DIRECTORY}'!"
36+
exit 3
37+
fi
38+
39+
# Create the local symlinks to the shared config files. Every expansion is
# quoted so a directory or config name containing whitespace or glob
# characters reaches ln as a single argument, matching the quoted checks above.
ln -s "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf" "_${SHARED_CONFIG_NAME}_variables.tf"
40+
ln -s "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}.auto.tfvars" "_${SHARED_CONFIG_NAME}.auto.tfvars"
41+
42+
echo "Successfully linked shared config '${SHARED_CONFIG_NAME}' from '${SHARED_CONFIG_DIRECTORY}'."

platforms/gke/base/_shared_config/workloads_variables.tf

+6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ variable "jobset_version" {
2626
type = string
2727
}
2828

29+
variable "kuberay_version" {
30+
default = "1.3.1"
31+
description = "Version of KubeRay (https://github.com/ray-project/kuberay) to install."
32+
type = string
33+
}
34+
2935
variable "kueue_version" {
3036
default = "0.10.2"
3137
description = "Version of Kueue (https://kueue.sigs.k8s.io/) to install."

platforms/gke/base/core/container_cluster/container_cluster.tf

+18
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,23 @@ resource "google_container_cluster" "cluster" {
3737
gce_persistent_disk_csi_driver_config {
3838
enabled = true
3939
}
40+
41+
parallelstore_csi_driver_config {
42+
enabled = true
43+
}
44+
45+
dynamic "ray_operator_config" {
46+
for_each = var.cluster_addons_ray_operator_enabled ? ["ray_operator_config"] : []
47+
content {
48+
enabled = true
49+
ray_cluster_logging_config {
50+
enabled = true
51+
}
52+
ray_cluster_monitoring_config {
53+
enabled = true
54+
}
55+
}
56+
}
4057
}
4158

4259
cluster_autoscaling {
@@ -247,6 +264,7 @@ resource "google_container_node_pool" "system" {
247264
node_config {
248265
# Variables
249266
labels = {
267+
"node-provisioning-model" : "on-demand"
250268
"resource-type" : "system"
251269
}
252270
machine_type = var.cluster_system_node_pool_machine_type

platforms/gke/base/core/container_node_pool/cpu/region/us-central1/container_node_pool_cpu_n4.tf

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
4848
node_config {
4949
# Variables
5050
labels = {
51+
"node-provisioning-model" : "on-demand"
5152
"resource-model" : "n4"
5253
"resource-type" : "cpu"
5354
}
@@ -118,6 +119,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
118119
node_config {
119120
# Variables
120121
labels = {
122+
"node-provisioning-model" : "spot"
121123
"resource-model" : "n4"
122124
"resource-type" : "cpu"
123125
}

platforms/gke/base/core/container_node_pool/cpu/region/us-east4/container_node_pool_cpu_n4.tf

+2
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
4848
node_config {
4949
# Variables
5050
labels = {
51+
"node-provisioning-model" : "on-demand"
5152
"resource-model" : "n4"
5253
"resource-type" : "cpu"
5354
}
@@ -117,6 +118,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
117118
node_config {
118119
# Variables
119120
labels = {
121+
"node-provisioning-model" : "spot"
120122
"resource-model" : "n4"
121123
"resource-type" : "cpu"
122124
}

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_a100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "a100"
6061
"resource-type" : "gpu"
6162
"resource-variant" : "40GB"
@@ -143,6 +144,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" {
143144
node_config {
144145
# Variables
145146
labels = {
147+
"node-provisioning-model" : "on-demand"
148+
"queued-provisioning" : "true"
146149
"resource-model" : "a100"
147150
"resource-type" : "gpu"
148151
"resource-variant" : "40GB"
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_res" {
234237
node_config {
235238
# Variables
236239
labels = {
240+
"node-provisioning-model" : "reservation"
237241
"resource-model" : "a100"
238242
"resource-type" : "gpu"
239243
"resource-variant" : "40GB"
@@ -321,6 +325,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_spot" {
321325
node_config {
322326
# Variables
323327
labels = {
328+
"node-provisioning-model" : "spot"
324329
"resource-model" : "a100"
325330
"resource-type" : "gpu"
326331
"resource-variant" : "40GB"

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_h100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
5454
node_config {
5555
# Variables
5656
labels = {
57+
"node-provisioning-model" : "on-demand"
5758
"resource-model" : "h100"
5859
"resource-type" : "gpu"
5960
}
@@ -142,6 +143,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
142143
node_config {
143144
# Variables
144145
labels = {
146+
"node-provisioning-model" : "on-demand"
147+
"queued-provisioning" : "true"
145148
"resource-model" : "h100"
146149
"resource-type" : "gpu"
147150
}
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
234237
node_config {
235238
# Variables
236239
labels = {
240+
"node-provisioning-model" : "reservation"
237241
"resource-model" : "h100"
238242
"resource-type" : "gpu"
239243
}
@@ -322,6 +326,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
322326
node_config {
323327
# Variables
324328
labels = {
329+
"node-provisioning-model" : "spot"
325330
"resource-model" : "h100"
326331
"resource-type" : "gpu"
327332
}

platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_l4.tf

+5
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" {
5454
node_config {
5555
# Variables
5656
labels = {
57+
"node-provisioning-model" : "on-demand"
5758
"resource-model" : "l4"
5859
"resource-type" : "gpu"
5960
}
@@ -138,6 +139,8 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
138139
node_config {
139140
# Variables
140141
labels = {
142+
"node-provisioning-model" : "on-demand"
143+
"queued-provisioning" : "true"
141144
"resource-model" : "l4"
142145
"resource-type" : "gpu"
143146
}
@@ -226,6 +229,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
226229
node_config {
227230
# Variables
228231
labels = {
232+
"node-provisioning-model" : "reservation"
229233
"resource-model" : "l4"
230234
"resource-type" : "gpu"
231235
}
@@ -311,6 +315,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
311315
node_config {
312316
# Variables
313317
labels = {
318+
"node-provisioning-model" : "spot"
314319
"resource-model" : "l4"
315320
"resource-type" : "gpu"
316321
}

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_a100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2" {
5555
node_config {
5656
# Variables
5757
labels = {
58+
"node-provisioning-model" : "on-demand"
5859
"resource-model" : "a100"
5960
"resource-type" : "gpu"
6061
"resource-variant" : "80GB"
@@ -141,6 +142,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_dws" {
141142
node_config {
142143
# Variables
143144
labels = {
145+
"node-provisioning-model" : "on-demand"
146+
"queued-provisioning" : "true"
144147
"resource-model" : "a100"
145148
"resource-type" : "gpu"
146149
"resource-variant" : "80GB"
@@ -231,6 +234,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_res" {
231234
node_config {
232235
# Variables
233236
labels = {
237+
"node-provisioning-model" : "reservation"
234238
"resource-model" : "a100"
235239
"resource-type" : "gpu"
236240
"resource-variant" : "80GB"
@@ -317,6 +321,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_spot" {
317321
node_config {
318322
# Variables
319323
labels = {
324+
"node-provisioning-model" : "spot"
320325
"resource-model" : "a100"
321326
"resource-type" : "gpu"
322327
"resource-variant" : "80GB"

platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_h100.tf

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
5656
node_config {
5757
# Variables
5858
labels = {
59+
"node-provisioning-model" : "on-demand"
5960
"resource-model" : "h100"
6061
"resource-type" : "gpu"
6162
}
@@ -146,6 +147,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
146147
node_config {
147148
# Variables
148149
labels = {
150+
"node-provisioning-model" : "on-demand"
151+
"queued-provisioning" : "true"
149152
"resource-model" : "h100"
150153
"resource-type" : "gpu"
151154
}
@@ -240,6 +243,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
240243
node_config {
241244
# Variables
242245
labels = {
246+
"node-provisioning-model" : "reservation"
243247
"resource-model" : "h100"
244248
"resource-type" : "gpu"
245249
}
@@ -330,6 +334,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
330334
node_config {
331335
# Variables
332336
labels = {
337+
"node-provisioning-model" : "spot"
333338
"resource-model" : "h100"
334339
"resource-type" : "gpu"
335340
}

0 commit comments

Comments
 (0)