Skip to content

Commit e390b7e

Browse files
danjuan-81Yevet
andauthored
Qss poc (#990)
* add scripts to update region * clean up python script * parse variable gpu_type to nccl tests * fix gpu_type variable in nccl tests * update gpu_type variable call * modify variable * update root * Add ephemeral storage * void project_id here and read it as an input * cleanup scripts * Add back vpc changes #971 --------- Co-authored-by: Yevet <[email protected]>
1 parent cdb7f13 commit e390b7e

File tree

9 files changed

+294
-81
lines changed

9 files changed

+294
-81
lines changed

applications/hcc/main.tf

+3-2
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,6 @@ module "a3-ultragpu-pool" {
243243
additional_networks = concat([{ network = module.gke-a3-ultra-net-1[0].network_name, subnetwork = module.gke-a3-ultra-net-1[0].subnetwork_name, subnetwork_project = var.project_id, nic_type = "GVNIC", queue_count = null, network_ip = null, stack_type = null, access_config = [{ nat_ip = null, public_ptr_domain_name = null, network_tier = null }], ipv6_access_config = [], alias_ip_range = [] }], module.gke-a3-ultra-rdma-net[0].subnetwork_interfaces_gke)
244244
auto_upgrade = true
245245
cluster_id = local.gke_cluster_id
246-
disk_size_gb = 100
247246
disk_type = "hyperdisk-balanced"
248247
gke_version = local.gke_cluster_version
249248
guest_accelerator = [{
@@ -262,7 +261,8 @@ module "a3-ultragpu-pool" {
262261
specific_reservations = [{
263262
name = var.reservation
264263
}]
265-
}
264+
}
265+
local_ssd_count_ephemeral_storage = 32
266266
static_node_count = var.node_count
267267
zones = [local.zone]
268268
providers = {
@@ -312,6 +312,7 @@ module "nemo" {
312312
checkpoint_bucket = local.result_bucket_name
313313
recipe = var.recipe
314314
node_count = var.node_count
315+
gpu_type = var.gpu_type
315316
# Providers need to be explicitly passed in when a depends_on is present in a module.
316317
providers = {
317318
helm = helm

applications/hcc/metadata.display.yaml

+113-69
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: BlueprintMetadata
33
metadata:
44
name: ai-on-gke-display
55
annotations:
6-
config.kubernetes.io/local-config: "true"
6+
config.kubernetes.io/local-config: 'true'
77
spec:
88
info:
99
title: HyperCompute Cluster on GKE
@@ -30,36 +30,60 @@ spec:
3030
title: GPU Type
3131
section: required_config
3232
enumValueLabels:
33-
- label: A3 Mega
34-
value: "A3 Mega"
35-
- label: A3 Ultra
36-
value: "A3 Ultra"
33+
- label: A3 Mega
34+
value: A3 Mega
35+
- label: A3 Ultra
36+
value: A3 Ultra
3737
a3_mega_zone:
3838
name: a3_mega_zone
3939
title: Location for A3 Mega
4040
section: required_config
4141
xGoogleProperty:
4242
type: ET_GCE_ZONE
43-
# specified regions have L4 & T4 GPUs https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
4443
gce_zone:
45-
allowlisted_zones: ["asia-northeast1-b", "asia-southeast1-b", "asia-southeast1-c", "europe-west4-b", "europe-west4-c", "us-central1-a", "us-central1-b", "us-central1-c", "us-east4-a", "us-east4-b", "us-east4-c", "us-east5-a", "us-west1-a", "us-west1-b", "us-west4-a"]
44+
allowlisted_zones:
45+
- asia-northeast1-b
46+
- asia-southeast1-b
47+
- asia-southeast1-c
48+
- australia-southeast1-c
49+
- europe-west1-b
50+
- europe-west1-c
51+
- europe-west3-c
52+
- europe-west4-b
53+
- europe-west4-c
54+
- us-central1-c
55+
- us-central1-a
56+
- us-central1-b
57+
- us-east4-c
58+
- us-east4-b
59+
- us-east4-a
60+
- us-east5-a
61+
- us-east7-b
62+
- us-west1-b
63+
- us-west1-a
64+
- us-west4-a
4665
toggleUsingVariables:
47-
- variableName: gpu_type
48-
variableValues: ["A3 Mega"]
49-
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
66+
- variableName: gpu_type
67+
variableValues:
68+
- A3 Mega
69+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
5070
a3_ultra_zone:
5171
name: a3_ultra_zone
5272
title: Location for A3 Ultra
5373
section: required_config
5474
xGoogleProperty:
5575
type: ET_GCE_ZONE
56-
# specified regions have L4 & T4 GPUs https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
5776
gce_zone:
58-
allowlisted_zones: ["europe-west1-b"]
77+
allowlisted_zones:
78+
- europe-west1-b
79+
- us-east5-a
80+
- us-east7-c
81+
- us-west1-c
5982
toggleUsingVariables:
60-
- variableName: gpu_type
61-
variableValues: ["A3 Ultra"]
62-
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
83+
- variableName: gpu_type
84+
variableValues:
85+
- A3 Ultra
86+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
6387
node_count:
6488
name: node_count
6589
title: Node Count
@@ -69,90 +93,110 @@ spec:
6993
title: Deployment options
7094
section: required_config
7195
enumValueLabels:
72-
- label: GKE Cluster Only
73-
value: gke
74-
- label: GKE Cluster with NCCL Tests
75-
value: gke-nccl
76-
- label: GKE Cluster with Llama-3.1-7B pretraining benchmark
77-
value: llama3.1_7b_nemo_pretraining
78-
- label: GKE Cluster with Llama-3.1-70B pretraining benchmark
79-
value: llama3.1_7b_nemo_pretraining
96+
- label: GKE Cluster Only
97+
value: gke
98+
- label: GKE Cluster with NCCL Tests
99+
value: gke-nccl
100+
- label: GKE Cluster with Llama-3.1-7B pretraining benchmark
101+
value: llama3.1_7b_nemo_pretraining
102+
- label: GKE Cluster with Llama-3.1-70B pretraining benchmark
103+
value: llama3.1_70b_nemo_pretraining  # 70B option must map to its own recipe, not the 7B value
80104
consumption_model:
81105
name: consumption_model
82106
title: Consumption model
83107
section: required_config
84108
enumValueLabels:
85-
- label: "Reservation"
86-
value: "Reservation"
87-
- label: "On Demand"
88-
value: "On Demand"
109+
- label: Reservation
110+
value: Reservation
111+
- label: On Demand
112+
value: On Demand
89113
reservation:
90114
name: reservation
91-
title: Reservation Name
115+
title: Reservation Name
92116
section: required_config
93117
toggleUsingVariables:
94-
- variableName: consumption_model
95-
variableValues: ["Reservation"]
96-
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
118+
- variableName: consumption_model
119+
variableValues:
120+
- Reservation
121+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
97122
reservation_block:
98123
name: reservation_block
99124
title: Reservation Block
100125
section: required_config
101126
toggleUsingVariables:
102-
- variableName: consumption_model
103-
variableValues: ["Reservation"]
104-
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
127+
- variableName: consumption_model
128+
variableValues:
129+
- Reservation
130+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
105131
placement_policy_name:
106132
name: placement_policy_name
107133
title: Placement Policy
108134
section: required_config
109135
toggleUsingVariables:
110-
- variableName: consumption_model
111-
variableValues: ["Reservation"]
112-
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
136+
- variableName: consumption_model
137+
variableValues:
138+
- Reservation
139+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
113140
host_maintenance:
114141
name: host_maintenance
115142
title: Host Maintenance
116143
section: required_config
117144
enumValueLabels:
118-
- label: "NONE"
119-
value: "none"
120-
- label: "PERIODIC"
121-
value: "periodic"
145+
- label: NONE
146+
value: none
147+
- label: PERIODIC
148+
value: periodic
122149
toggleUsingVariables:
123-
- variableName: consumption_model
124-
variableValues: ["Reservation"]
125-
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
150+
- variableName: consumption_model
151+
variableValues:
152+
- Reservation
153+
type: DISPLAY_VARIABLE_TOGGLE_TYPE_UNSPECIFIED
126154
acknowledge:
127155
name: acknowledge
128-
title: Check to confirm you enabled Google APIs for your project with this command.
156+
title: Check to confirm you enabled Google APIs for your project with this
157+
command.
129158
section: acknowledge
130-
subtext: |
131-
<pre>
132-
<code style="background: #f4f4f4;border: 1px solid #ddd; border-left: 3px solid #3367d6; color: #6d6868; font-size: 12px; max-width: 100%; padding: 0.5em 0.5em; display: inline; line-height: 45px;">gcloud services enable serviceusage.googleapis.com cloudresourcemanager.googleapis.com</code>
133-
</pre>
159+
subtext: "<pre>\n <code style=\"background: #f4f4f4;border: 1px solid #ddd;\
160+
\ border-left: 3px solid #3367d6; color: #6d6868; font-size: 12px; max-width:\
161+
\ 100%; padding: 0.5em 0.5em; display: inline; line-height: 45px;\">gcloud\
162+
\ services enable serviceusage.googleapis.com cloudresourcemanager.googleapis.com</code>\n\
163+
</pre>\n"
134164
enumValueLabels:
135-
- label: Confirm that all prerequisites have been met.
136-
value: "true"
165+
- label: Confirm that all prerequisites have been met.
166+
value: 'true'
137167
sections:
138-
- name: acknowledge
139-
title: Before you begin
140-
subtext:
141-
This solution deploys a sample <a href="https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute"><i>HyperCompute Cluster</i></a> on GKE in your project.</br>
142-
- name: required_config
143-
title: Required configuration
168+
- name: acknowledge
169+
title: Before you begin
170+
subtext: This solution deploys a sample <a href="https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute"><i>HyperCompute
171+
Cluster</i></a> on GKE in your project.</br>
172+
- name: required_config
173+
title: Required configuration
144174
runtime:
145175
outputMessage: Deployment can take several minutes to complete.
146176
suggestedActions:
147-
- heading: Connect to Ray Cluster
148-
description: Connect to Ray Cluster, scroll to <b>Ports</b> section and initiate <b>PORT FORWARDING</b> (Run in Cloud Shell) to the ray dashboard (port 8265). Open another terminal and follow these <a href="https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/ray-on-gke#install-ray">instructions</a> to install ray and submit jobs.
149-
- heading: View Job Status in Ray Dashboard
150-
description: |-
151-
<p>
152-
1&#41; If IAP is disabled, open the ray dashboard via the <b>OPEN IN WEB PREVIEW</b> button in the port forwarding page.</br>
153-
</p>
154-
<p>
155-
2&#41; If IAP is enabled, click the <b>Launch Ray Dashboard</b> button and log in with your organization's credentials. Troubleshooting access issues:</br>
156-
&emsp;&#x2022; SSL or cert errors indicate the cert is provisioning which takes up to 20 minutes.</br>
157-
&emsp;&#x2022; If you're unable to login, go to <a href="https://console.cloud.google.com/security/iap">Google Cloud Platform IAP</a>, select the <b>ray-cluster-kuberay-head-svc</b> service and add the user with the role <b>IAP-secured Web App User</b>.
158-
</p>
177+
- heading: Connect to Ray Cluster
178+
description: Connect to Ray Cluster, scroll to <b>Ports</b> section and initiate
179+
<b>PORT FORWARDING</b> (Run in Cloud Shell) to the ray dashboard (port 8265).
180+
Open another terminal and follow these <a href="https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/ray-on-gke#install-ray">instructions</a>
181+
to install ray and submit jobs.
182+
- heading: View Job Status in Ray Dashboard
183+
description: '<p>
184+
185+
1&#41; If IAP is disabled, open the ray dashboard via the <b>OPEN IN WEB
186+
PREVIEW</b> button in the port forwarding page.</br>
187+
188+
</p>
189+
190+
<p>
191+
192+
2&#41; If IAP is enabled, click the <b>Launch Ray Dashboard</b> button and
193+
log in with your organization''s credentials. Troubleshooting access issues:</br>
194+
195+
&emsp;&#x2022; SSL or cert errors indicate the cert is provisioning which
196+
takes up to 20 minutes.</br>
197+
198+
&emsp;&#x2022; If you''re unable to login, go to <a href="https://console.cloud.google.com/security/iap">Google
199+
Cloud Platform IAP</a>, select the <b>ray-cluster-kuberay-head-svc</b> service
200+
and add the user with the role <b>IAP-secured Web App User</b>.
201+
202+
</p>'

applications/hcc/modules/embedded/modules/scheduler/gke-cluster/main.tf

+2-2
Original file line numberDiff line numberDiff line change
@@ -382,15 +382,15 @@ module "kubectl_apply" {
382382
{
383383
source = "${path.module}/templates/gke-network-paramset.yaml.tftpl",
384384
template_vars = {
385-
name = network_info.subnetwork,
385+
name = "vpc${idx + 1}",
386386
network_name = network_info.network
387387
subnetwork_name = network_info.subnetwork,
388388
device_mode = strcontains(upper(network_info.nic_type), "RDMA") ? "RDMA" : "NetDevice"
389389
}
390390
},
391391
{
392392
source = "${path.module}/templates/network-object.yaml.tftpl",
393-
template_vars = { name = network_info.subnetwork }
393+
template_vars = { name = "vpc${idx + 1}" }
394394
}
395395
]
396396
])

applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/mega-test.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
{{- if eq var.gpuType "A3 Mega" }}
1+
{{- $root := . -}} {{- if eq $root.Values.workload.gpuType "A3 Mega" }}
22
{{ $timestamp := now | unixEpoch }}
33

4-
{{- $root := . -}}
54
apiVersion: jobset.x-k8s.io/v1alpha2
65
kind: JobSet
76
metadata:

applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/nccl-installer.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
{{- if eq var.gpuType "A3 Ultra" }}
14+
{{- $root := . -}} {{- if eq $root.Values.workload.gpuType "A3 Ultra" }}
1515
apiVersion: apps/v1
1616
kind: DaemonSet
1717
metadata:

applications/hcc/modules/nemo/helm-charts/nccl-tests/templates/ultra-test.yaml

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
{{- if eq var.gpuType "A3 Ultra" }}
1+
{{- $root := . -}} {{- if eq $root.Values.workload.gpuType "A3 Ultra" }}
22
{{ $timestamp := now | unixEpoch }}
33

4-
{{- $root := . -}}
5-
64
apiVersion: jobset.x-k8s.io/v1alpha2
75
kind: JobSet
86
metadata:

applications/hcc/modules/nemo/main.tf

+5-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ resource "helm_release" "nemo" {
2929
"${file("${path.module}/values.yaml")}"
3030
]
3131

32-
33-
3432
set {
3533
name = "nemo_config"
3634
value = "${file("${path.module}/${local.nccl_config}")}"
@@ -64,4 +62,9 @@ resource "helm_release" "nccl_tests" {
6462
name = "workload.gcsBucketForDataCataPath"
6563
value = var.checkpoint_bucket
6664
}
65+
66+
set {
67+
name = "workload.gpuType"
68+
value = var.gpu_type
69+
}
6770
}

applications/hcc/modules/nemo/variables.tf

+8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ variable "recipe" {
1414
}
1515
}
1616

17+
variable "gpu_type" {
18+
type = string
19+
validation {
20+
condition = contains(["A3 Mega", "A3 Ultra"], var.gpu_type)
21+
error_message = "Invalid gpu value. Must be one of: A3 Mega, A3 Ultra."
22+
}
23+
}
24+
1725
variable "node_count" {
1826
type = number
1927
}

0 commit comments

Comments
 (0)