GoogleCloudPlatform
diff --git a/‎.github/workflows/dictionary/accelerated-platforms.txt
+1 b/‎.github/workflows/dictionary/accelerated-platforms.txt
+1
diff --git a/‎.github/workflows/dictionary/google-cloud.txt
+1 b/‎.github/workflows/dictionary/google-cloud.txt
+1
diff --git a/‎.github/workflows/dictionary/kuberay.txt
+1 b/‎.github/workflows/dictionary/kuberay.txt
+1
diff --git a/‎.github/workflows/dictionary/kubernetes.txt
+2 b/‎.github/workflows/dictionary/kubernetes.txt
+2
diff --git a/‎cspell.json
+8-3 b/‎cspell.json
+8-3
diff --git a/‎docs/platforms/gke/base/aiml/README.md
+9 b/‎docs/platforms/gke/base/aiml/README.md
+9
diff --git a/‎platforms/gke/base/_shared_config/cluster_variables.tf
+6 b/‎platforms/gke/base/_shared_config/cluster_variables.tf
+6
diff --git a/‎platforms/gke/base/_shared_config/scripts/link_shared_config.sh
+42 b/‎platforms/gke/base/_shared_config/scripts/link_shared_config.sh
+42
diff --git a/‎platforms/gke/base/_shared_config/workloads_variables.tf
+6 b/‎platforms/gke/base/_shared_config/workloads_variables.tf
+6
diff --git a/‎platforms/gke/base/core/container_cluster/container_cluster.tf
+18 b/‎platforms/gke/base/core/container_cluster/container_cluster.tf
+18
diff --git a/‎platforms/gke/base/core/container_node_pool/cpu/region/us-central1/container_node_pool_cpu_n4.tf
+2 b/‎platforms/gke/base/core/container_node_pool/cpu/region/us-central1/container_node_pool_cpu_n4.tf
+2
diff --git a/‎platforms/gke/base/core/container_node_pool/cpu/region/us-east4/container_node_pool_cpu_n4.tf
+2 b/‎platforms/gke/base/core/container_node_pool/cpu/region/us-east4/container_node_pool_cpu_n4.tf
+2
diff --git a/‎platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_a100.tf
+5 b/‎platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_a100.tf
+5
diff --git a/‎platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_h100.tf
+5 b/‎platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_h100.tf
+5
diff --git a/‎platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_l4.tf
+5 b/‎platforms/gke/base/core/container_node_pool/gpu/region/us-central1/container_node_pool_gpu_l4.tf
+5
diff --git a/‎platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_a100.tf
+5 b/‎platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_a100.tf
+5
diff --git a/‎platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_h100.tf
+5 b/‎platforms/gke/base/core/container_node_pool/gpu/region/us-east4/container_node_pool_gpu_h100.tf
+5
@@ -54,3 +54,4 @@ schedulable
 tunables
 usecs
 virt
+yunikorn
@@ -67,6 +67,7 @@ negs
 nodepool
 nodepools
 ondemand
+parallelstore
 pgaudit
 podslice
 policycontroller
 
@@ -0,0 +1 @@
+comand
@@ -2,9 +2,11 @@ admissionregistration
 apiextensions
 apiregistration
 apiservice
+automount
 clusterregistry
 clusterrole
 clusterrolebinding
+crds
 egressgateway
 flowcontrol
 flowschemas
 
@@ -22,6 +22,10 @@
       "name": "kubeflow",
       "path": ".github/workflows/dictionary/kubeflow.txt"
     },
+    {
+      "name": "kuberay",
+      "path": ".github/workflows/dictionary/kuberay.txt"
+    },
     {
       "name": "kubernetes",
       "path": ".github/workflows/dictionary/kubernetes.txt"
@@ -61,14 +65,15 @@
     {
       "name": "vllm",
       "path": ".github/workflows/dictionary/vllm.txt"
-    }
+    },
   ],
   "dictionaries": [
     "accelerated-platforms",
     "flipkart",
     "google-cloud",
     "huggingface",
     "kubeflow",
+    "kuberay",
     "kubernetes",
     "kueue",
     "nvidia",
@@ -78,11 +83,11 @@
     "svg",
     "terraform",
     "use-case-federated-learning",
-    "vllm"
+    "vllm",
   ],
   "ignorePaths": [
     "node_modules",
     "use-cases/inferencing/batch-inference/example_predictions.txt",
-    "use-cases/model-fine-tuning-pipeline/model-eval/examples/"
+    "use-cases/model-fine-tuning-pipeline/model-eval/examples/",
   ]
 }
@@ -0,0 +1,9 @@
+# GKE AI/ML Platform reference architecture and implementation
+
+## Deploy the platform
+
+[Reference Architecture](/platforms/gke/base/use-cases/aiml/README.md)
+
+### Examples
+
+[Fine-tuning Pipeline](/platforms/gke/base/use-cases/aiml/examples/fine-tuning-pipeline/README.md)
@@ -43,6 +43,12 @@ locals {
   kubeconfig_file_name = "${var.cluster_project_id}-${local.cluster_name}"
 }
 
+variable "cluster_addons_ray_operator_enabled" {
+  default     = false # opt-in: the google_container_cluster.cluster resource only emits its ray_operator_config addon block when this is true
+  description = "Enable Ray Operator Addon. Ref: https://cloud.google.com/kubernetes-engine/docs/add-on/ray-on-gke/concepts/overview"
+  type        = bool
+}
+
 variable "cluster_binary_authorization_evaluation_mode" {
   default     = "DISABLED"
   description = "Mode of operation for Binary Authorization policy evaluation. Valid values are DISABLED and PROJECT_SINGLETON_POLICY_ENFORCE."
 
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Symlinks <directory>/<name>_variables.tf and <directory>/<name>.auto.tfvars into the
+# current working directory as _<name>_variables.tf and _<name>.auto.tfvars.
+set -o errexit -o nounset -o pipefail
+
+SHARED_CONFIG_DIRECTORY="${1?Usage: link_shared_config.sh SHARED_CONFIG_DIRECTORY SHARED_CONFIG_NAME}"
+SHARED_CONFIG_NAME="${2?Usage: link_shared_config.sh SHARED_CONFIG_DIRECTORY SHARED_CONFIG_NAME}"
+
+if [[ ${SHARED_CONFIG_DIRECTORY} != \.* ]]; then # only paths beginning with "." are accepted as relative
+  echo "The shared config directory path must be a relative path!" >&2
+  exit 1
+fi
+
+if test ! -d "${SHARED_CONFIG_DIRECTORY}"; then
+  echo "Shared config directory '${SHARED_CONFIG_DIRECTORY}' does not exist!" >&2
+  exit 2
+fi
+
+if test ! -f "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf"; then
+  echo "Shared config '${SHARED_CONFIG_NAME}' does not exist in '${SHARED_CONFIG_DIRECTORY}'!" >&2
+  exit 3
+fi
+
+ln -s "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}_variables.tf" "_${SHARED_CONFIG_NAME}_variables.tf" # operands quoted so paths with spaces or glob characters stay one argument
+ln -s "${SHARED_CONFIG_DIRECTORY}/${SHARED_CONFIG_NAME}.auto.tfvars" "_${SHARED_CONFIG_NAME}.auto.tfvars" # NOTE: the .auto.tfvars source is not checked for existence before linking
+
+echo "Successfully linked shared config '${SHARED_CONFIG_NAME}' from '${SHARED_CONFIG_DIRECTORY}'."
@@ -26,6 +26,12 @@ variable "jobset_version" {
   type        = string
 }
 
+variable "kuberay_version" {
+  default     = "1.3.1" # NOTE(review): nothing in the visible changes reads this variable — confirm which configuration consumes it
+  description = "Version of KubeRay (https://github.com/ray-project/kuberay) to install."
+  type        = string
+}
+
 variable "kueue_version" {
   default     = "0.10.2"
   description = "Version of Kueue (https://kueue.sigs.k8s.io/) to install."
 
@@ -37,6 +37,23 @@ resource "google_container_cluster" "cluster" {
     gce_persistent_disk_csi_driver_config {
       enabled = true
     }
+
+    parallelstore_csi_driver_config {
+      enabled = true
+    }
+
+    dynamic "ray_operator_config" {
+      for_each = var.cluster_addons_ray_operator_enabled ? ["ray_operator_config"] : [] # one-element list when enabled, empty otherwise, so the block is emitted at most once; the element value itself is never read
+      content {
+        enabled = true
+        ray_cluster_logging_config {
+          enabled = true # logging is always turned on together with the operator
+        }
+        ray_cluster_monitoring_config {
+          enabled = true # monitoring is always turned on together with the operator
+        }
+      }
+    }
   }
 
   cluster_autoscaling {
@@ -247,6 +264,7 @@ resource "google_container_node_pool" "system" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-type" : "system"
     }
     machine_type    = var.cluster_system_node_pool_machine_type
 
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "n4"
       "resource-type" : "cpu"
     }
@@ -118,6 +119,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "n4"
       "resource-type" : "cpu"
     }
 
@@ -48,6 +48,7 @@ resource "google_container_node_pool" "cpu_n4s8" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "n4"
       "resource-type" : "cpu"
     }
@@ -117,6 +118,7 @@ resource "google_container_node_pool" "cpu_n4s8_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "n4"
       "resource-type" : "cpu"
     }
 
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "40GB"
@@ -143,6 +144,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
+      "queued-provisioning" : "true"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "40GB"
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_res" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "reservation"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "40GB"
@@ -321,6 +325,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "40GB"
 
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
@@ -142,6 +143,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
+      "queued-provisioning" : "true"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
@@ -234,6 +237,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "reservation"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
@@ -322,6 +326,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
 
@@ -54,6 +54,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "l4"
       "resource-type" : "gpu"
     }
@@ -138,6 +139,8 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
+      "queued-provisioning" : "true"
       "resource-model" : "l4"
       "resource-type" : "gpu"
     }
@@ -226,6 +229,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_res" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "reservation"
       "resource-model" : "l4"
       "resource-type" : "gpu"
     }
@@ -311,6 +315,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "l4"
       "resource-type" : "gpu"
     }
 
@@ -55,6 +55,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "80GB"
@@ -141,6 +142,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_dws" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
+      "queued-provisioning" : "true"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "80GB"
@@ -231,6 +234,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_res" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "reservation"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "80GB"
@@ -317,6 +321,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2u2_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "a100"
       "resource-type" : "gpu"
       "resource-variant" : "80GB"
 
@@ -56,6 +56,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
@@ -146,6 +147,8 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "on-demand"
+      "queued-provisioning" : "true"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
@@ -240,6 +243,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_res" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "reservation"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }
@@ -330,6 +334,7 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_spot" {
   node_config {
     # Variables
     labels = {
+      "node-provisioning-model" : "spot"
       "resource-model" : "h100"
       "resource-type" : "gpu"
     }