GoogleCloudPlatform
diff --git a/‎applications/hcc/a3mega_workloads.tfvars
+1-2 b/‎applications/hcc/a3mega_workloads.tfvars
+1-2
diff --git a/‎applications/hcc/a3ultra_workloads.tfvars
+1-2 b/‎applications/hcc/a3ultra_workloads.tfvars
+1-2
diff --git a/‎applications/hcc/main.tf
+4-4 b/‎applications/hcc/main.tf
+4-4
diff --git a/‎applications/hcc/metadata.display.yaml
+98-41 b/‎applications/hcc/metadata.display.yaml
+98-41
diff --git a/‎applications/hcc/metadata.yaml
+16-9 b/‎applications/hcc/metadata.yaml
+16-9
diff --git a/‎applications/hcc/modules/nemo/helm-charts/nemo-training/Chart.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/Chart.yaml b/‎applications/hcc/modules/nemo/helm-charts/nemo-training/Chart.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/Chart.yaml
diff --git a/‎applications/hcc/modules/nemo/helm-charts/nemo-training/templates/nemo-configmap.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/templates/nemo-configmap.yaml b/‎applications/hcc/modules/nemo/helm-charts/nemo-training/templates/nemo-configmap.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/templates/nemo-configmap.yaml
diff --git a/‎applications/hcc/modules/nemo/helm-charts/nemo-training/templates/nemo-launcher-job.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/templates/nemo-launcher-job.yaml b/‎applications/hcc/modules/nemo/helm-charts/nemo-training/templates/nemo-launcher-job.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/templates/nemo-launcher-job.yaml
diff --git a/‎applications/hcc/modules/nemo/helm-charts/nemo-training/templates/nemo-launcher-svc.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/templates/nemo-launcher-svc.yaml b/‎applications/hcc/modules/nemo/helm-charts/nemo-training/templates/nemo-launcher-svc.yaml renamed to ‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3mega/templates/nemo-launcher-svc.yaml
diff --git a/‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3ultra/Chart.yaml
+20 b/‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3ultra/Chart.yaml
+20
diff --git a/‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3ultra/templates/nemo-configmap.yaml
+21 b/‎applications/hcc/modules/nemo/helm-charts/nemo-training/a3ultra/templates/nemo-configmap.yaml
+21
@@ -28,12 +28,11 @@ project_id = "supercomputer-testing"
 a3_mega_zone = "us-east5-a"
 a3_ultra_zone = ""
 
-node_count = 2
+node_count_gke_nccl = 2
 recipe = "gke-nccl"
 
 reservation = "qss-test-a3mega"
 reservation_block = ""
 placement_policy_name = ""
-host_maintenance = ""
 
 gpu_type = "A3 Mega"
@@ -28,12 +28,11 @@ project_id = "supercomputer-testing"
 a3_mega_zone = ""
 a3_ultra_zone = "europe-west1-b"
 
-node_count = 2
+node_count_gke_nccl = 2
 recipe = "gke-nccl"
 
 reservation = "supercomputer-testing-gsc-asq-fr/reservationBlocks/supercomputer-testing-gsc-asq-fr-block-0001"
 reservation_block = ""
 placement_policy_name = ""
-host_maintenance = ""
 
 gpu_type = "A3 Ultra"
@@ -107,7 +107,7 @@ module "a3_megagpu_pool" {
     }
   )
   local_ssd_count_ephemeral_storage = 16
-  static_node_count = var.node_count
+  static_node_count = local.node_count
   taints            = []
   zones             = [var.a3_mega_zone]
   providers = {
@@ -263,7 +263,7 @@ module "a3-ultragpu-pool" {
     }]
   } 
   local_ssd_count_ephemeral_storage = 32
-  static_node_count = var.node_count
+  static_node_count = local.node_count
   zones             = [local.zone]
   providers = {
     kubectl = kubectl
@@ -291,7 +291,7 @@ module "workload-manager-install" {
   }
   kueue = {
     install = true
-    config_path = "./modules/embedded/modules/management/kubectl-apply/templates/kueue-configuration.yaml.tftpl"
+    config_path = var.gpu_type == "A3 Ultra" ? "./modules/embedded/modules/management/kubectl-apply/templates/kueue-configuration.yaml.tftpl" : null
     config_template_vars = {
       node_pool_name = var.gpu_type == "A3 Ultra" ? module.a3-ultragpu-pool[0].node_pool_name : null
       num_gpus       = var.gpu_type == "A3 Ultra" ? module.a3-ultragpu-pool[0].static_gpu_count : null
@@ -311,7 +311,7 @@ module "nemo" {
   cluster_id = local.gke_cluster_id
   checkpoint_bucket = local.result_bucket_name
   recipe = var.recipe
-  node_count = var.node_count
+  node_count = local.node_count
   gpu_type = var.gpu_type
   # Providers need to be explicitly passed in when a depends_on is present in a module.
   providers = {
 
@@ -27,17 +27,19 @@ spec:
           section: required_config
         gpu_type:
           name: gpu_type
-          title: GPU Type
+          title: Accelerator machine type
           section: required_config
+          subtext: Select the accelerator machine type.
           enumValueLabels:
-          - label: A3 Mega
+          - label: "A3 Mega, NVIDIA H100 80GB MEGA: a3-megagpu-8g"
             value: A3 Mega
-          - label: A3 Ultra
+          - label: "A3 Ultra, NVIDIA H200 141GB: a3-ultragpu-8g"
             value: A3 Ultra
         a3_mega_zone:
           name: a3_mega_zone
-          title: Location for A3 Mega
+          title: Cluster zone
           section: required_config
+          subtext: Select from locations with available accelerators. If you have a reservation, select the zone where your reservation is located.
           xGoogleProperty:
             type: ET_GCE_ZONE
             gce_zone:
@@ -69,8 +71,9 @@ spec:
             type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         a3_ultra_zone:
           name: a3_ultra_zone
-          title: Location for A3 Ultra
+          title: Cluster zone
           section: required_config
+          subtext: Select from locations with available accelerators. If you have a reservation, select the zone where your reservation is located.
           xGoogleProperty:
             type: ET_GCE_ZONE
             gce_zone:
@@ -84,72 +87,126 @@ spec:
             variableValues:
             - A3 Ultra
             type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
-        node_count:
-          name: node_count
-          title: Node Count
-          section: required_config
-        recipe:
-          name: recipe
-          title: Deployment options
+        a3_mega_consumption_model:
+          name: a3_mega_consumption_model
+          title: Consumption options
           section: required_config
+          subtext: For optimal performance with distributed AI workloads, reserve densely allocated accelerator capacity. See <a href="https://cloud.google.com/ai-hypercomputer/docs/consumption-models"><i>Consumption options</i></a> for more details.</br>
           enumValueLabels:
-          - label: GKE Cluster Only
-            value: gke
-          - label: GKE Cluster with NCCL Tests
-            value: gke-nccl
-          - label: GKE Cluster with Llama-3.1-7B pretraining benchmark
-            value: llama3.1_7b_nemo_pretraining
-          - label: GKE Cluster with Llama-3.1-70B pretraining benchmark
-            value: llama3.1_7b_nemo_pretraining
-        consumption_model:
-          name: consumption_model
-          title: Consumption model
+          - label: Reservation
+            value: Reservation
+          toggleUsingVariables:
+          - variableName: gpu_type
+            variableValues:
+            - A3 Mega
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+        a3_ultra_consumption_model:
+          name: a3_ultra_consumption_model
+          title: Consumption options
           section: required_config
+          subtext: For optimal performance with distributed AI workloads, reserve densely allocated accelerator capacity. See <a href="https://cloud.google.com/ai-hypercomputer/docs/consumption-models"><i>Consumption options</i></a> for more details.</br>
           enumValueLabels:
           - label: Reservation
             value: Reservation
-          - label: On Demand
-            value: On Demand
+          toggleUsingVariables:
+          - variableName: gpu_type
+            variableValues:
+            - A3 Ultra
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         reservation:
           name: reservation
-          title: Reservation Name
+          title: Reservation name
           section: required_config
           toggleUsingVariables:
-          - variableName: consumption_model
+          - variableName: a3_mega_consumption_model
+            variableValues:
+            - Reservation
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+          - variableName: a3_ultra_consumption_model
             variableValues:
             - Reservation
             type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         reservation_block:
           name: reservation_block
-          title: Reservation Block
+          title: Reservation block
           section: required_config
           toggleUsingVariables:
-          - variableName: consumption_model
+          - variableName: a3_ultra_consumption_model
             variableValues:
             - Reservation
             type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         placement_policy_name:
           name: placement_policy_name
-          title: Placement Policy
+          title: Placement policy
           section: required_config
           toggleUsingVariables:
-          - variableName: consumption_model
+          - variableName: a3_mega_consumption_model
             variableValues:
             - Reservation
             type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
-        host_maintenance:
-          name: host_maintenance
-          title: Host Maintainance
+        recipe:
+          name: recipe
+          title: Solution deployment option
           section: required_config
+          subtext: Select your deployment option.
           enumValueLabels:
-          - label: NONE
-            value: none
-          - label: PERIODIC
-            value: periodic
+          - label: GKE Cluster Only
+            value: gke
+          - label: GKE Cluster with NCCL Test
+            value: gke-nccl
+          - label: GKE Cluster with Llama-3.1-7B pretraining benchmark
+            value: llama3.1_7b_nemo_pretraining
+          - label: GKE Cluster with Llama-3.1-70B pretraining benchmark
+            value: llama3.1_70b_nemo_pretraining
+        node_count_gke:
+          name: node_count_gke
+          title: Node count
+          section: required_config
+          subtext: Please enter a value >= 0. If using a reservation, ensure that your reservation has the required capacity.
           toggleUsingVariables:
-          - variableName: consumption_model
+          - variableName: recipe
             variableValues:
-            - Reservation
+            - gke
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+        node_count_gke_nccl:
+          name: node_count_gke_nccl
+          title: Node count
+          section: required_config
+          subtext: Please enter a value >= 2. If using a reservation, ensure that your reservation has the required capacity.
+          toggleUsingVariables:
+          - variableName: recipe
+            variableValues:
+            - gke-nccl
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+        node_count_llama_3_7b:
+          name: node_count_llama_3_7b
+          title: Node count
+          section: required_config
+          subtext: Some benchmarks require a minimum number of nodes. If using a reservation, ensure that your reservation has the required capacity.
+          enumValueLabels:
+            - label: 2
+              value: 2
+            - label: 4
+              value: 4
+          toggleUsingVariables:
+          - variableName: recipe
+            variableValues:
+            - llama3.1_7b_nemo_pretraining
+            type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
+        node_count_llama_3_70b:
+          name: node_count_llama_3_70b
+          title: Node count
+          section: required_config
+          subtext: Some benchmarks require a minimum number of nodes. If using a reservation, ensure that your reservation has the required capacity.
+          enumValueLabels:
+            - label: 32
+              value: 32
+            - label: 40
+              value: 40
+          toggleUsingVariables:
+          - variableName: recipe
+            variableValues:
+            - llama3.1_70b_nemo_pretraining
             type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
         acknowledge:
           name: acknowledge
@@ -168,7 +225,7 @@ spec:
       - name: acknowledge
         title: Before you begin
         subtext: This solution deploys a sample <a href="https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute"><i>HyperCompute
-          Cluster</i></a> on GKE in your project.</br>
+          Cluster</i></a> with GKE in your project to run AI/ML and HPC workloads.</br>
       - name: required_config
         title: Required configuration
     runtime:
 
@@ -34,17 +34,11 @@ spec:
       - name: a3_ultra_zone
         varType: string
         defaultValue: ""
-      - name: node_count
-        varType: number
-        required: true
-        defaultValue: 2
-      - name: recipe
+      - name: a3_mega_consumption_model
         varType: string
-        required: true
         defaultValue: ""
-      - name: consumption_model
+      - name: a3_ultra_consumption_model
         varType: string
-        required: true
         defaultValue: ""
       - name: reservation
         varType: string
@@ -55,9 +49,22 @@ spec:
       - name: placement_policy_name
         varType: string
         defaultValue: ""
-      - name: host_maintenance
+      - name: recipe
         varType: string
+        required: true
         defaultValue: ""
+      - name: node_count_gke
+        varType: number
+        defaultValue: 0
+      - name: node_count_gke_nccl
+        varType: number
+        defaultValue: 2
+      - name: node_count_llama_3_7b
+        varType: number
+        defaultValue: 2
+      - name: node_count_llama_3_70b
+        varType: number
+        defaultValue: 32
       - name: additional_labels
         description: Additional labels to add to Kubernetes resources.
         varType: string
 
@@ -0,0 +1,20 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v2
+name: nemo_training_workload
+description: nemo_training_workload
+type: application
+version: 0.1.0
+appVersion: "1.16.0"
@@ -0,0 +1,21 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: "{{ .Release.Name }}"
+data:
+  nemo-configuration.yaml: |-
+{{ .Values.nemo_config | nindent 4 }}