Skip to content

Commit 858f3b5

Browse files
danjuan-81Yevet
andauthored
Qss poc (#991)
* add scripts to update region * clean up python script * parse variable gpu_type to nccl tests * fix gpu_type variable in nccl tests * update gpu_type variable call * modify variable * update root * Add ephemeral storage * void project_id here and read it as an input * cleanup scripts * Add back vpc changes #971 * Add machine type to differentiate the templates for different machine types * Update UI based on approved description doc * Remove host maintenance and other order changes * Set consumption model variable as optional * Fix error: "Invalid template interpolation value" and some cleanup * remove default value --------- Co-authored-by: Yevet <[email protected]>
1 parent e390b7e commit 858f3b5

16 files changed

+536
-66
lines changed

applications/hcc/a3mega_workloads.tfvars

+1-2
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,11 @@ project_id = "supercomputer-testing"
2828
a3_mega_zone = "us-east5-a"
2929
a3_ultra_zone = ""
3030

31-
node_count = 2
31+
node_count_gke_nccl = 2
3232
recipe = "gke-nccl"
3333

3434
reservation = "qss-test-a3mega"
3535
reservation_block = ""
3636
placement_policy_name = ""
37-
host_maintenance = ""
3837

3938
gpu_type = "A3 Mega"

applications/hcc/a3ultra_workloads.tfvars

+1-2
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,11 @@ project_id = "supercomputer-testing"
2828
a3_mega_zone = ""
2929
a3_ultra_zone = "europe-west1-b"
3030

31-
node_count = 2
31+
node_count_gke_nccl = 2
3232
recipe = "gke-nccl"
3333

3434
reservation = "supercomputer-testing-gsc-asq-fr/reservationBlocks/supercomputer-testing-gsc-asq-fr-block-0001"
3535
reservation_block = ""
3636
placement_policy_name = ""
37-
host_maintenance = ""
3837

3938
gpu_type = "A3 Ultra"

applications/hcc/main.tf

+4-4
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ module "a3_megagpu_pool" {
107107
}
108108
)
109109
local_ssd_count_ephemeral_storage = 16
110-
static_node_count = var.node_count
110+
static_node_count = local.node_count
111111
taints = []
112112
zones = [var.a3_mega_zone]
113113
providers = {
@@ -263,7 +263,7 @@ module "a3-ultragpu-pool" {
263263
}]
264264
}
265265
local_ssd_count_ephemeral_storage = 32
266-
static_node_count = var.node_count
266+
static_node_count = local.node_count
267267
zones = [local.zone]
268268
providers = {
269269
kubectl = kubectl
@@ -291,7 +291,7 @@ module "workload-manager-install" {
291291
}
292292
kueue = {
293293
install = true
294-
config_path = "./modules/embedded/modules/management/kubectl-apply/templates/kueue-configuration.yaml.tftpl"
294+
config_path = var.gpu_type == "A3 Ultra" ? "./modules/embedded/modules/management/kubectl-apply/templates/kueue-configuration.yaml.tftpl" : null
295295
config_template_vars = {
296296
node_pool_name = var.gpu_type == "A3 Ultra" ? module.a3-ultragpu-pool[0].node_pool_name : null
297297
num_gpus = var.gpu_type == "A3 Ultra" ? module.a3-ultragpu-pool[0].static_gpu_count : null
@@ -311,7 +311,7 @@ module "nemo" {
311311
cluster_id = local.gke_cluster_id
312312
checkpoint_bucket = local.result_bucket_name
313313
recipe = var.recipe
314-
node_count = var.node_count
314+
node_count = local.node_count
315315
gpu_type = var.gpu_type
316316
# Providers need to be explicitly passed in when a depends_on is present in a module.
317317
providers = {

applications/hcc/metadata.display.yaml

+98-41
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,19 @@ spec:
2727
section: required_config
2828
gpu_type:
2929
name: gpu_type
30-
title: GPU Type
30+
title: Accelerator machine type
3131
section: required_config
32+
subtext: Select the accelerator machine type.
3233
enumValueLabels:
33-
- label: A3 Mega
34+
- label: "A3 Mega, NVIDIA H100 80GB MEGA: a3-megagpu-8g"
3435
value: A3 Mega
35-
- label: A3 Ultra
36+
- label: "A3 Ultra, NVIDIA H200 141GB: a3-ultragpu-8g"
3637
value: A3 Ultra
3738
a3_mega_zone:
3839
name: a3_mega_zone
39-
title: Location for A3 Mega
40+
title: Cluster zone
4041
section: required_config
42+
subtext: Select from locations with available accelerators. If you have a reservation, select the zone where your reservation is located.
4143
xGoogleProperty:
4244
type: ET_GCE_ZONE
4345
gce_zone:
@@ -69,8 +71,9 @@ spec:
6971
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
7072
a3_ultra_zone:
7173
name: a3_ultra_zone
72-
title: Location for A3 Ultra
74+
title: Cluster zone
7375
section: required_config
76+
subtext: Select from locations with available accelerators. If you have a reservation, select the zone where your reservation is located.
7477
xGoogleProperty:
7578
type: ET_GCE_ZONE
7679
gce_zone:
@@ -84,72 +87,126 @@ spec:
8487
variableValues:
8588
- A3 Ultra
8689
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
87-
node_count:
88-
name: node_count
89-
title: Node Count
90-
section: required_config
91-
recipe:
92-
name: recipe
93-
title: Deployment options
90+
a3_mega_consumption_model:
91+
name: a3_mega_consumption_model
92+
title: Consumption options
9493
section: required_config
94+
subtext: For optimal performance with distributed AI workloads, reserve densely allocated accelerator capacity. See <a href="https://cloud.google.com/ai-hypercomputer/docs/consumption-models"><i>Consumption options</i></a> for more details.</br>
9595
enumValueLabels:
96-
- label: GKE Cluster Only
97-
value: gke
98-
- label: GKE Cluster with NCCL Tests
99-
value: gke-nccl
100-
- label: GKE Cluster with Llama-3.1-7B pretraining benchmark
101-
value: llama3.1_7b_nemo_pretraining
102-
- label: GKE Cluster with Llama-3.1-70B pretraining benchmark
103-
value: llama3.1_7b_nemo_pretraining
104-
consumption_model:
105-
name: consumption_model
106-
title: Consumption model
96+
- label: Reservation
97+
value: Reservation
98+
toggleUsingVariables:
99+
- variableName: gpu_type
100+
variableValues:
101+
- A3 Mega
102+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
103+
a3_ultra_consumption_model:
104+
name: a3_ultra_consumption_model
105+
title: Consumption options
107106
section: required_config
107+
subtext: For optimal performance with distributed AI workloads, reserve densely allocated accelerator capacity. See <a href="https://cloud.google.com/ai-hypercomputer/docs/consumption-models"><i>Consumption options</i></a> for more details.</br>
108108
enumValueLabels:
109109
- label: Reservation
110110
value: Reservation
111-
- label: On Demand
112-
value: On Demand
111+
toggleUsingVariables:
112+
- variableName: gpu_type
113+
variableValues:
114+
- A3 Ultra
115+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
113116
reservation:
114117
name: reservation
115-
title: Reservation Name
118+
title: Reservation name
116119
section: required_config
117120
toggleUsingVariables:
118-
- variableName: consumption_model
121+
- variableName: a3_mega_consumption_model
122+
variableValues:
123+
- Reservation
124+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
125+
- variableName: a3_ultra_consumption_model
119126
variableValues:
120127
- Reservation
121128
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
122129
reservation_block:
123130
name: reservation_block
124-
title: Reservation Block
131+
title: Reservation block
125132
section: required_config
126133
toggleUsingVariables:
127-
- variableName: consumption_model
134+
- variableName: a3_ultra_consumption_model
128135
variableValues:
129136
- Reservation
130137
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
131138
placement_policy_name:
132139
name: placement_policy_name
133-
title: Placement Policy
140+
title: Placement policy
134141
section: required_config
135142
toggleUsingVariables:
136-
- variableName: consumption_model
143+
- variableName: a3_mega_consumption_model
137144
variableValues:
138145
- Reservation
139146
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
140-
host_maintenance:
141-
name: host_maintenance
142-
title: Host Maintainance
147+
recipe:
148+
name: recipe
149+
title: Solution deployment option
143150
section: required_config
151+
subtext: Select your deployment option.
144152
enumValueLabels:
145-
- label: NONE
146-
value: none
147-
- label: PERIODIC
148-
value: periodic
153+
- label: GKE Cluster Only
154+
value: gke
155+
- label: GKE Cluster with NCCL Test
156+
value: gke-nccl
157+
- label: GKE Cluster with Llama-3.1-7B pretraining benchmark
158+
value: llama3.1_7b_nemo_pretraining
159+
- label: GKE Cluster with Llama-3.1-70B pretraining benchmark
160+
value: llama3.1_70b_nemo_pretraining
161+
node_count_gke:
162+
name: node_count_gke
163+
title: Node count
164+
section: required_config
165+
subtext: Please enter a value >= 0. If using a reservation, ensure that your reservation has the required capacity.
149166
toggleUsingVariables:
150-
- variableName: consumption_model
167+
- variableName: recipe
151168
variableValues:
152-
- Reservation
169+
- gke
170+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
171+
node_count_gke_nccl:
172+
name: node_count_gke_nccl
173+
title: Node count
174+
section: required_config
175+
subtext: Please enter a value >= 2. If using a reservation, ensure that your reservation has the required capacity.
176+
toggleUsingVariables:
177+
- variableName: recipe
178+
variableValues:
179+
- gke-nccl
180+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
181+
node_count_llama_3_7b:
182+
name: node_count_llama_3_7b
183+
title: Node count
184+
section: required_config
185+
subtext: Some benchmarks require a minimum number of nodes. If using a reservation, ensure that your reservation has the required capacity.
186+
enumValueLabels:
187+
- label: 2
188+
value: 2
189+
- label: 4
190+
value: 4
191+
toggleUsingVariables:
192+
- variableName: recipe
193+
variableValues:
194+
- llama3.1_7b_nemo_pretraining
195+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
196+
node_count_llama_3_70b:
197+
name: node_count_llama_3_70b
198+
title: Node count
199+
section: required_config
200+
subtext: Some benchmarks require a minimum number of nodes. If using a reservation, ensure that your reservation has the required capacity.
201+
enumValueLabels:
202+
- label: 32
203+
value: 32
204+
- label: 40
205+
value: 40
206+
toggleUsingVariables:
207+
- variableName: recipe
208+
variableValues:
209+
- llama3.1_70b_nemo_pretraining
153210
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
154211
acknowledge:
155212
name: acknowledge
@@ -168,7 +225,7 @@ spec:
168225
- name: acknowledge
169226
title: Before you begin
170227
subtext: This solution deploys a sample <a href="https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute"><i>HyperCompute
171-
Cluster</i></a> on GKE in your project.</br>
228+
Cluster</i></a> with GKE in your project to run AI/ML and HPC workloads.</br>
172229
- name: required_config
173230
title: Required configuration
174231
runtime:

applications/hcc/metadata.yaml

+16-9
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,11 @@ spec:
3434
- name: a3_ultra_zone
3535
varType: string
3636
defaultValue: ""
37-
- name: node_count
38-
varType: number
39-
required: true
40-
defaultValue: 2
41-
- name: recipe
37+
- name: a3_mega_consumption_model
4238
varType: string
43-
required: true
4439
defaultValue: ""
45-
- name: consumption_model
40+
- name: a3_ultra_consumption_model
4641
varType: string
47-
required: true
4842
defaultValue: ""
4943
- name: reservation
5044
varType: string
@@ -55,9 +49,22 @@ spec:
5549
- name: placement_policy_name
5650
varType: string
5751
defaultValue: ""
58-
- name: host_maintenance
52+
- name: recipe
5953
varType: string
54+
required: true
6055
defaultValue: ""
56+
- name: node_count_gke
57+
varType: number
58+
defaultValue: 0
59+
- name: node_count_gke_nccl
60+
varType: number
61+
defaultValue: 2
62+
- name: node_count_llama_3_7b
63+
varType: number
64+
defaultValue: 2
65+
- name: node_count_llama_3_70b
66+
varType: number
67+
defaultValue: 32
6168
- name: additional_labels
6269
description: Additional labels to add to Kubernetes resources.
6370
varType: string
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v2
16+
name: nemo_training_workload
17+
description: nemo_training_workload
18+
type: application
19+
version: 0.1.0
20+
appVersion: "1.16.0"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ConfigMap
17+
metadata:
18+
name: "{{ .Release.Name }}"
19+
data:
20+
nemo-configuration.yaml: |-
21+
{{ .Values.nemo_config | nindent 4 }}

0 commit comments

Comments
 (0)