Qss poc (#990)

danjuan-81 · Yevet · web-flow · commit e390b7eda003 · 2025-02-20T10:17:19.000-08:00
* add scripts to update region
* clean up python script
* parse variable gpu_type to nccl tests
* fix gpu_type variable in nccl tests
* update gpu_type variable call
* modify variable
* update root
* Add ephemeral storage
* void project_id here and read it as an input
* cleanup scripts
* Add back vpc changes #971

---------

Co-authored-by: Yevet <xyiwen@google.com>
diff --git a/applications/hcc/main.tf b/applications/hcc/main.tf
@@ -243,7 +243,6 @@ module "a3-ultragpu-pool" {
   additional_networks = concat([{ network = module.gke-a3-ultra-net-1[0].network_name, subnetwork = module.gke-a3-ultra-net-1[0].subnetwork_name, subnetwork_project = var.project_id, nic_type = "GVNIC", queue_count = null, network_ip = null, stack_type = null, access_config = [{ nat_ip = null, public_ptr_domain_name = null, network_tier = null }], ipv6_access_config = [], alias_ip_range = [] }], module.gke-a3-ultra-rdma-net[0].subnetwork_interfaces_gke)
   auto_upgrade        = true
   cluster_id          = local.gke_cluster_id
-  disk_size_gb        = 100
   disk_type           = "hyperdisk-balanced"
   gke_version         = local.gke_cluster_version
   guest_accelerator = [{
@@ -262,7 +261,8 @@ module "a3-ultragpu-pool" {
     specific_reservations = [{
       name = var.reservation
     }]
-  }
+  } 
+  local_ssd_count_ephemeral_storage = 32
   static_node_count = var.node_count
   zones             = [local.zone]
   providers = {
@@ -312,6 +312,7 @@ module "nemo" {
   checkpoint_bucket = local.result_bucket_name
   recipe = var.recipe
   node_count = var.node_count
+  gpu_type = var.gpu_type
   # Providers need to be explicitly passed in when a depends_on is present in a module.
   providers = {
     helm = helm
diff --git a/applications/hcc/metadata.display.yaml b/applications/hcc/metadata.display.yaml
@@ -3,7 +3,7 @@ kind: BlueprintMetadata
 metadata:
   name: ai-on-gke-display
   annotations:
-    config.kubernetes.io/local-config: "true"
+    config.kubernetes.io/local-config: 'true'
 spec:
   info:
     title: HyperCompute Cluster on GKE
@@ -30,36 +30,60 @@ spec:
           title: GPU Type
           section: required_config
           enumValueLabels:
-            - label: A3 Mega
-              value: "A3 Mega"
-            - label: A3 Ultra
-              value: "A3 Ultra"
+          - label: A3 Mega
+            value: A3 Mega
+          - label: A3 Ultra
+            value: A3 Ultra
         a3_mega_zone:
           name: a3_mega_zone
           title: Location for A3 Mega
           section: required_config
           xGoogleProperty:
             type: ET_GCE_ZONE
-            # specified regions have L4 & T4 GPUs https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
             gce_zone:
-              allowlisted_zones: ["asia-northeast1-b", "asia-southeast1-b", "asia-southeast1-c", "europe-west4-b", "europe-west4-c", "us-central1-a", "us-central1-b", "us-central1-c", "us-east4-a", "us-east4-b", "us-east4-c", "us-east5-a", "us-west1-a", "us-west1-b", "us-west4-a"]
+              allowlisted_zones:
+              - asia-northeast1-b
+              - asia-southeast1-b
+              - asia-southeast1-c
+              - australia-southeast1-c
+              - europe-west1-b
+              - europe-west1-c
+              - europe-west3-c
+              - europe-west4-b
+              - europe-west4-c
+              - us-central1-c
+              - us-central1-a
+              - us-central1-b
+              - us-east4-c
+              - us-east4-b
+              - us-east4-a
+              - us-east5-a
+              - us-east7-b
+              - us-west1-b
+              - us-west1-a
+              - us-west4-a
           toggleUsingVariables:
-            - variableName: gpu_type
-              variableValues: ["A3 Mega"]
-              type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: gpu_type
+            variableValues:
+            - A3 Mega
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         a3_ultra_zone:
           name: a3_ultra_zone
           title: Location for A3 Ultra
           section: required_config
           xGoogleProperty:
             type: ET_GCE_ZONE
-            # specified regions have L4 & T4 GPUs https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
             gce_zone:
-              allowlisted_zones: ["europe-west1-b"]
+              allowlisted_zones:
+              - europe-west1-b
+              - us-east5-a
+              - us-east7-c
+              - us-west1-c
           toggleUsingVariables:
-            - variableName: gpu_type
-              variableValues: ["A3 Ultra"]
-              type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: gpu_type
+            variableValues:
+            - A3 Ultra
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         node_count:
           name: node_count
           title: Node Count
@@ -69,90 +93,110 @@ spec:
           title: Deployment options
           section: required_config
           enumValueLabels:
-            - label: GKE Cluster Only
-              value: gke
-            - label: GKE Cluster with NCCL Tests
-              value: gke-nccl
-            - label: GKE Cluster with Llama-3.1-7B pretraining benchmark
-              value: llama3.1_7b_nemo_pretraining
-            - label: GKE Cluster with Llama-3.1-70B pretraining benchmark
-              value: llama3.1_7b_nemo_pretraining
+          - label: GKE Cluster Only
+            value: gke
+          - label: GKE Cluster with NCCL Tests
+            value: gke-nccl
+          - label: GKE Cluster with Llama-3.1-7B pretraining benchmark
+            value: llama3.1_7b_nemo_pretraining
+          - label: GKE Cluster with Llama-3.1-70B pretraining benchmark
+            value: llama3.1_7b_nemo_pretraining
         consumption_model:
           name: consumption_model
           title: Consumption model
           section: required_config
           enumValueLabels:
-            - label: "Reservation"
-              value: "Reservation"
-            - label: "On Demand"
-              value: "On Demand"
+          - label: Reservation
+            value: Reservation
+          - label: On Demand
+            value: On Demand
         reservation:
           name: reservation
-          title: Reservation Name 
+          title: Reservation Name
           section: required_config
           toggleUsingVariables:
-            - variableName: consumption_model 
-              variableValues: ["Reservation"]
-              type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: consumption_model
+            variableValues:
+            - Reservation
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         reservation_block:
           name: reservation_block
           title: Reservation Block
           section: required_config
           toggleUsingVariables:
-            - variableName: consumption_model
-              variableValues: ["Reservation"]
-              type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: consumption_model
+            variableValues:
+            - Reservation
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         placement_policy_name:
           name: placement_policy_name
           title: Placement Policy
           section: required_config
           toggleUsingVariables:
-            - variableName: consumption_model
-              variableValues: ["Reservation"]
-              type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: consumption_model
+            variableValues:
+            - Reservation
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         host_maintenance:
           name: host_maintenance
           title: Host Maintainance
           section: required_config
           enumValueLabels:
-            - label: "NONE"
-              value: "none"
-            - label: "PERIODIC"
-              value: "periodic"
+          - label: NONE
+            value: none
+          - label: PERIODIC
+            value: periodic
           toggleUsingVariables:
-            - variableName: consumption_model
-              variableValues: ["Reservation"]
-              type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: consumption_model
+            variableValues:
+            - Reservation
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         acknowledge:
           name: acknowledge
-          title: Check to confirm you enabled Google APIs for your project with this command.
+          title: Check to confirm you enabled Google APIs for your project with this
+            command.
           section: acknowledge
-          subtext: |
-                  <pre>
-                    <code style="background: #f4f4f4;border: 1px solid #ddd; border-left: 3px solid #3367d6; color: #6d6868; font-size: 12px; max-width: 100%; padding: 0.5em 0.5em; display: inline; line-height: 45px;">gcloud services enable serviceusage.googleapis.com cloudresourcemanager.googleapis.com</code>
-                  </pre>
+          subtext: "<pre>\n  <code style=\"background: #f4f4f4;border: 1px solid #ddd;\
+            \ border-left: 3px solid #3367d6; color: #6d6868; font-size: 12px; max-width:\
+            \ 100%; padding: 0.5em 0.5em; display: inline; line-height: 45px;\">gcloud\
+            \ services enable serviceusage.googleapis.com cloudresourcemanager.googleapis.com</code>\n\
+            </pre>\n"
           enumValueLabels:
-            - label: Confirm that all prerequisites have been met.
-              value: "true"
+          - label: Confirm that all prerequisites have been met.
+            value: 'true'
       sections:
-        - name: acknowledge
-          title: Before you begin
-          subtext: 
-                This solution deploys a sample <a href="https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute"><i>HyperCompute Cluster</i></a> on GKE in your project.</br>
-        - name: required_config
-          title: Required configuration
+      - name: acknowledge
+        title: Before you begin
+        subtext: This solution deploys a sample <a href="https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute"><i>HyperCompute
+          Cluster</i></a> on GKE in your project.</br>
+      - name: required_config
+        title: Required configuration
     runtime:
       outputMessage: Deployment can take several minutes to complete.
       suggestedActions:
-        - heading: Connect to Ray Cluster
-          description: Connect to Ray Cluster, scroll to <b>Ports</b> section and initiate <b>PORT FORWARDING</b> (Run in Cloud Shell) to the ray dashboard (port 8265). Open another terminal and follow these <a href="https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/ray-on-gke#install-ray">instructions</a> to install ray and submit jobs.
-        - heading: View Job Status in Ray Dashboard
-          description: |-
-            <p>
-            1&#41; If IAP is disabled, open the ray dashboard via the <b>OPEN IN WEB PREVIEW</b> button in the port forwarding page.</br>
-            </p>
-            <p>
-            2&#41; If IAP is enabled, click the <b>Launch Ray Dashboard</b> button and log in with your organization's credentials. Troubleshooting access issues:</br>
-            &emsp;&#x2022; SSL or cert errors indicate the cert is provisioning which takes up to 20 minutes.</br>
-            &emsp;&#x2022; If you're unable to login, go to <a href="https://console.cloud.google.com/security/iap">Google Cloud Platform IAP</a>, select the <b>ray-cluster-kuberay-head-svc</b> service and add the user with the role <b>IAP-secured Web App User</b>.
-            </p>
+      - heading: Connect to Ray Cluster
+        description: Connect to Ray Cluster, scroll to <b>Ports</b> section and initiate
+          <b>PORT FORWARDING</b> (Run in Cloud Shell) to the ray dashboard (port 8265).
+          Open another terminal and follow these <a href="https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/ray-on-gke#install-ray">instructions</a>
+          to install ray and submit jobs.
+      - heading: View Job Status in Ray Dashboard
+        description: '<p>
+
+          1&#41; If IAP is disabled, open the ray dashboard via the <b>OPEN IN WEB
+          PREVIEW</b> button in the port forwarding page.</br>
+
+          </p>
+
+          <p>
+
+          2&#41; If IAP is enabled, click the <b>Launch Ray Dashboard</b> button and
+          log in with your organization''s credentials. Troubleshooting access issues:</br>
+
+          &emsp;&#x2022; SSL or cert errors indicate the cert is provisioning which
+          takes up to 20 minutes.</br>
+
+          &emsp;&#x2022; If you''re unable to login, go to <a href="https://console.cloud.google.com/security/iap">Google
+          Cloud Platform IAP</a>, select the <b>ray-cluster-kuberay-head-svc</b> service
+          and add the user with the role <b>IAP-secured Web App User</b>.
+
+          </p>'
diff --git a/applications/hcc/modules/embedded/modules/scheduler/gke-cluster/main.tf b/applications/hcc/modules/embedded/modules/scheduler/gke-cluster/main.tf
@@ -382,15 +382,15 @@ module "kubectl_apply" {
       {
         source = "${path.module}/templates/gke-network-paramset.yaml.tftpl",
         template_vars = {
-          name            = network_info.subnetwork,
+          name            = "vpc${idx + 1}",
           network_name    = network_info.network
           subnetwork_name = network_info.subnetwork,
           device_mode     = strcontains(upper(network_info.nic_type), "RDMA") ? "RDMA" : "NetDevice"
         }
       },
       {
         source        = "${path.module}/templates/network-object.yaml.tftpl",
-        template_vars = { name = network_info.subnetwork }
+        template_vars = { name = "vpc${idx + 1}" }
       }
     ]
   ])
diff --git a/applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/mega-test.yaml b/applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/mega-test.yaml
@@ -1,7 +1,6 @@
-{{- if eq var.gpuType "A3 Mega" }}
+{{- $root := . -}} {{- if eq $root.Values.workload.gpuType "A3 Mega" }}
 {{ $timestamp := now | unixEpoch }}
 
-{{- $root := . -}}
 apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
diff --git a/applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/nccl-installer.yaml b/applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/nccl-installer.yaml
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-{{- if eq var.gpuType "A3 Ultra" }}
+{{- $root := . -}} {{- if eq $root.Values.workload.gpuType "A3 Ultra" }}
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
diff --git a/applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/ultra-test.yaml b/applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/ultra-test.yaml
@@ -1,8 +1,6 @@
-{{- if eq var.gpuType "A3 Ultra" }}
+{{- $root := . -}} {{- if eq $root.Values.workload.gpuType "A3 Ultra" }}
 {{ $timestamp := now | unixEpoch }}
 
-{{- $root := . -}}
-
 apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
 metadata:
diff --git a/applications/hcc/modules/nemo/main.tf b/applications/hcc/modules/nemo/main.tf
@@ -29,8 +29,6 @@ resource "helm_release" "nemo" {
     "${file("${path.module}/values.yaml")}"
   ]
 
-
-
   set {
     name  = "nemo_config"
     value = "${file("${path.module}/${local.nccl_config}")}"
@@ -64,4 +62,9 @@ resource "helm_release" "nccl_tests" {
     name = "workload.gcsBucketForDataCataPath"
     value = var.checkpoint_bucket
   }
+
+  set {
+    name = "workload.gpuType"
+    value = var.gpu_type
+  }
 }
diff --git a/applications/hcc/modules/nemo/variables.tf b/applications/hcc/modules/nemo/variables.tf
@@ -14,6 +14,14 @@ variable "recipe" {
   }
 }
 
+variable "gpu_type" {
+  type = string
+  validation {
+    condition     = contains(["A3 Mega", "A3 Ultra"], var.gpu_type)
+    error_message = "Invalid gpu value. Must be one of: A3 Mega, A3 Ultra."
+  }
+}
+
 variable "node_count" {
   type = number
 }
diff --git a/applications/hcc/update_zone_region.py b/applications/hcc/update_zone_region.py

Original file line number	Diff line number	Diff line change
`@@ -382,15 +382,15 @@ module "kubectl_apply" {`
`382`	`382`	`{`
`383`	`383`	`source = "${path.module}/templates/gke-network-paramset.yaml.tftpl",`
`384`	`384`	`template_vars = {`
`385`		`- name = network_info.subnetwork,`
	`385`	`+ name = "vpc${idx + 1}",`
`386`	`386`	`network_name = network_info.network`
`387`	`387`	`subnetwork_name = network_info.subnetwork,`
`388`	`388`	`device_mode = strcontains(upper(network_info.nic_type), "RDMA") ? "RDMA" : "NetDevice"`
`389`	`389`	`}`
`390`	`390`	`},`
`391`	`391`	`{`
`392`	`392`	`source = "${path.module}/templates/network-object.yaml.tftpl",`
`393`		`- template_vars = { name = network_info.subnetwork }`
	`393`	`+ template_vars = { name = "vpc${idx + 1}" }`
`394`	`394`	`}`
`395`	`395`	`]`
`396`	`396`	`])`
Original file line number	Diff line number	Diff line change
`@@ -29,8 +29,6 @@ resource "helm_release" "nemo" {`
`29`	`29`	`"${file("${path.module}/values.yaml")}"`
`30`	`30`	`]`
`31`	`31`
`32`		`-`
`33`		`-`
`34`	`32`	`set {`
`35`	`33`	`name = "nemo_config"`
`36`	`34`	`value = "${file("${path.module}/${local.nccl_config}")}"`
`@@ -64,4 +62,9 @@ resource "helm_release" "nccl_tests" {`
`64`	`62`	`name = "workload.gcsBucketForDataCataPath"`
`65`	`63`	`value = var.checkpoint_bucket`
`66`	`64`	`}`
	`65`	`+`
	`66`	`+ set {`
	`67`	`+ name = "workload.gpuType"`
	`68`	`+ value = var.gpu_type`
	`69`	`+ }`
`67`	`70`	`}`
Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,14 @@ variable "recipe" {`
`14`	`14`	`}`
`15`	`15`	`}`
`16`	`16`
	`17`	`+variable "gpu_type" {`
	`18`	`+ type = string`
	`19`	`+ validation {`
	`20`	`+ condition = contains(["A3 Mega", "A3 Ultra"], var.gpu_type)`
	`21`	`+ error_message = "Invalid gpu value. Must be one of: A3 Mega, A3 Ultra."`
	`22`	`+ }`
	`23`	`+}`
	`24`	`+`
`17`	`25`	`variable "node_count" {`
`18`	`26`	`type = number`
`19`	`27`	`}`