
Commit a4aca32 (parent c9df187)

write back dlio result to a separate result bucket

8 files changed: +105 −19 lines

benchmarks/benchmark/tools/dlio/README.md

Lines changed: 10 additions & 6 deletions
```diff
@@ -14,6 +14,8 @@ Preinstall the following on your computer:
 Note: Terraform keeps state metadata in a local file called `terraform.tfstate`.
 If you need to reinstall any resources, make sure to delete this file as well.
 
+The workload identity and `k8s_service_account` should be set up for the `gcs_bucket` and `result_bucket` correctly ahead of time because DLIO jobs need to read from and write to them respectively.
+
 ## Run DLIO Job
 1. Update the `variables.tf` file with your desired settings to run your machine learning benchmark workload
 2. Change the dlio image in `modules/dlio/podspec.tpl`
```
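For readers setting this up, bucket access for the Kubernetes service account can be granted with bucket-level IAM bindings on its Workload Identity principal. A hypothetical Terraform sketch, where the project number, project ID, namespace, service account, and bucket names are assumptions, not values from this repo:

```hcl
# Hypothetical sketch: grant the Workload Identity principal of the DLIO
# Kubernetes service account access to the dataset and result buckets.
# "123456789", "my-project", "default", and "dlio-ksa" are assumed values.
resource "google_storage_bucket_iam_member" "dataset_reader" {
  bucket = "my-dlio-dataset-bucket" # assumed gcs_bucket
  role   = "roles/storage.objectViewer" # jobs read the training dataset
  member = "principal://iam.googleapis.com/projects/123456789/locations/global/workloadIdentityPools/my-project.svc.id.goog/subject/ns/default/sa/dlio-ksa"
}

resource "google_storage_bucket_iam_member" "result_writer" {
  bucket = "my-dlio-result-bucket" # assumed result_bucket
  role   = "roles/storage.objectAdmin" # jobs write result reports
  member = "principal://iam.googleapis.com/projects/123456789/locations/global/workloadIdentityPools/my-project.svc.id.goog/subject/ns/default/sa/dlio-ksa"
}
```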
```diff
@@ -22,20 +24,22 @@ If you need to reinstall any resources, make sure to delete this file as well.
 5. After you finish your test, run `terraform destroy` to delete the
 resources
 
+*__Important__*: To isolate results from different runs, `${dlio_benchmark_result}` should be unique to each run.
+
 ## Run DLIO Job with Parallelstore
-Pre-reqs: right now you'll need to manually setup the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.
+Pre-reqs:
+- You'll need to manually setup the VPC peering from the GKE cluster's network to `servicenetworking.googleapis.com`.
 
-1. update `variables.tf` file with your desired settings to run your machine learning benchmark workload, notably set `gcs_fuse_csi_driver_enabled` to `false` and `paralllestore_csi_driver_enabled` to `true`. If you want to use static provisioning, update the "parallelstore variables" and `parallelstore_storageclass` to `""`.
+1. update `variables.tf` file with your desired settings to run your machine learning benchmark workload, notably set `run_with_gcs_fuse_csi` to `false` and `run_with_parallelstore_csi` to `true`. If you want to use static provisioning, update the "parallelstore variables" and `parallelstore_storageclass` to `""`.
 2. Change the dlio image in `dlio/podspec.tpl` to a desired version. We have tested the job with dlio v0.5.1.
 3. run `terraform init`
 4. run `terraform apply -target=module.ps_storage`
 5. run `terraform apply` after the dataloader job is completed; pvc patch failure is OK for dynamic provisioning.
 
+*__Important__*: To isolate results from different runs, `${dlio_benchmark_result}` should be unique to each run.
+
 ## Check Test Result
-The test result reports are located in the `${dlio_benchmark_result}` directory. For example,
-if you use a GCS bucket to store the training dataset, the GCS bucket will be mounted at
-`${dlio_data_mount_path}`, and you can find the test result reports at `${dlio_data_mount_path}/${dlio_benchmark_result}`
-or in the folder with the same name as `${dlio_benchmark_result}` in your GCS bucket.
+The test result reports are located in provided GCS bucket `${result_bucket}` in a directory named `${dlio_benchmark_result}`.
 
 ## Debug Workload
```

benchmarks/benchmark/tools/dlio/main.tf

Lines changed: 5 additions & 4 deletions
```diff
@@ -20,7 +20,7 @@ provider "kubectl" {
 
 module "gcs_pv_pvc" {
   source = "./modules/storage"
-  count = var.gcs_fuse_csi_driver_enabled == "\"true\"" ? 1 : 0
+  count = var.run_with_gcs_fuse_csi == "\"true\"" ? 1 : 0
 
   namespace = var.namespace
   pv_name   = var.pv_name
@@ -33,7 +33,7 @@ module "gcs_pv_pvc" {
 
 module "ps_storage" {
   source = "./modules/parallelstore_storage"
-  count = var.paralllestore_csi_driver_enabled == "\"true\"" ? 1 : 0
+  count = var.run_with_parallelstore_csi == "\"true\"" ? 1 : 0
 
   pv_name  = var.pv_name
   pvc_name = var.pvc_name
@@ -59,11 +59,11 @@ module "dlio" {
   job_backoffLimit = var.job_backoffLimit
   job_completions  = var.job_completions
   job_parallelism  = var.job_parallelism
-  gcs_fuse_csi_driver_enabled = var.gcs_fuse_csi_driver_enabled
+  gcs_fuse_csi_driver_enabled = var.run_with_gcs_fuse_csi
   gcs_fuse_sidecar_cpu_limit = var.gcs_fuse_sidecar_cpu_limit
   gcs_fuse_sidecar_memory_limit = var.gcs_fuse_sidecar_memory_limit
   gcs_fuse_sidecar_ephemeral_storage_limit = var.gcs_fuse_sidecar_ephemeral_storage_limit
-  pscsi_driver_enabled = var.paralllestore_csi_driver_enabled
+  pscsi_driver_enabled = var.run_with_parallelstore_csi
   pscsi_sidecar_cpu_limit = var.pscsi_sidecar_cpu_limit
   pscsi_sidecar_memory_limit = var.pscsi_sidecar_memory_limit
   dlio_container_cpu_limit = var.dlio_container_cpu_limit
@@ -84,6 +84,7 @@ module "dlio" {
   dlio_iostat_devices = var.dlio_iostat_devices
   dlio_read_threads = var.dlio_read_threads
   gcs_bucket = var.gcs_bucket
+  result_bucket = var.result_bucket
   k8s_service_account = var.k8s_service_account
   pvc_name = var.pvc_name
 }
```
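A note on the comparison against `"\"true\""`: the toggle variables appear to carry embedded quotes so their values can be substituted verbatim into templated YAML, which is why main.tf compares against the escaped string rather than a bare `true`. A self-contained sketch of the pattern, with illustrative variable and module names:

```hcl
# Illustrative sketch of gating a module with a quoted-string toggle.
# The embedded quotes survive substitution into YAML, e.g. `enabled: "true"`.
variable "run_with_example_csi" {
  type    = string
  default = "\"true\"" # the value includes its own double quotes
}

module "example_storage" {
  source = "./modules/storage" # assumed module path
  # count is 1 only when the toggle, including its embedded quotes, matches
  count = var.run_with_example_csi == "\"true\"" ? 1 : 0
}
```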

benchmarks/benchmark/tools/dlio/modules/dlio/job.tf

Lines changed: 1 addition & 0 deletions
```diff
@@ -43,6 +43,7 @@ resource "local_file" "podspec" {
     dlio_iostat_devices = "${var.dlio_iostat_devices}"
     dlio_read_threads = "${var.dlio_read_threads}"
     gcs_bucket = "${var.gcs_bucket}"
+    result_bucket = "${var.result_bucket}"
     service_account = "${var.k8s_service_account}"
     pvc_name = "${var.pvc_name}"
   })
```
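For context, this block feeds Terraform's `templatefile()` into a `local_file` resource, so the new `result_bucket` key becomes `${result_bucket}` inside `podspec.tpl`. A stripped-down sketch of that mechanism, assuming the template path and eliding the many other template variables:

```hcl
# Minimal sketch (assumed paths; other template variables elided): render a
# podspec template with the result bucket name and write it to disk.
resource "local_file" "podspec_sketch" {
  content = templatefile("${path.module}/podspec.tpl", {
    result_bucket = var.result_bucket # becomes ${result_bucket} in the template
  })
  filename = "${path.module}/rendered_podspec.yaml"
}
```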

benchmarks/benchmark/tools/dlio/modules/dlio/podspec.tpl

Lines changed: 10 additions & 3 deletions
```diff
@@ -12,7 +12,7 @@ spec:
       labels:
         app: dlio-job
       annotations:
-        gke-gcsfuse/volumes: ${gcs_fuse_csi_driver_enabled}
+        gke-gcsfuse/volumes: "true"
         gke-gcsfuse/cpu-limit: ${gcs_fuse_sidecar_cpu_limit}
         gke-gcsfuse/memory-limit: ${gcs_fuse_sidecar_memory_limit}
         gke-gcsfuse/ephemeral-storage-limit: ${gcs_fuse_sidecar_ephemeral_storage_limit}
@@ -51,8 +51,8 @@ spec:
           python dlio_postprocessor.py --output-folder $OUTPUT_FOLDER;
           rm $OUTPUT_FOLDER/\.*\.pfw;
           echo 'copying results';
-          mkdir -p ${dlio_data_mount_path}/${dlio_benchmark_result}/$MY_POD_NAME;
-          cp -r $OUTPUT_FOLDER ${dlio_data_mount_path}/${dlio_benchmark_result}/$MY_POD_NAME;
+          mkdir -p /dlio_results/${dlio_benchmark_result}/$MY_POD_NAME;
+          cp -r $OUTPUT_FOLDER /dlio_results/${dlio_benchmark_result}/$MY_POD_NAME;
           echo 'done';
         fi
       env:
@@ -67,11 +67,18 @@ spec:
           mountPath: ${dlio_data_mount_path}
         - name: dshm
           mountPath: /dev/shm
+        - name: results
+          mountPath: /dlio_results
       serviceAccountName: ${service_account}
       volumes:
         - name: ml-perf-volume
           persistentVolumeClaim:
             claimName: ${pvc_name}
+        - name: results
+          csi:
+            driver: gcsfuse.csi.storage.gke.io
+            volumeAttributes:
+              bucketName: ${result_bucket}
         - name: dshm
           emptyDir:
             medium: Memory
```
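Unlike the dataset volume, which goes through a PersistentVolumeClaim, the new results volume is a CSI ephemeral (inline) volume, so no PV/PVC pair is needed for the result bucket. The general shape of such a mount, reduced to essentials; the pod name, image, service account, and bucket below are placeholders, not values from this repo:

```yaml
# Generic CSI ephemeral volume sketch for the GCS FUSE driver; pod name,
# image, service account, and bucketName are assumed placeholder values.
apiVersion: v1
kind: Pod
metadata:
  name: gcsfuse-inline-example
  annotations:
    gke-gcsfuse/volumes: "true" # asks GKE to inject the FUSE sidecar
spec:
  serviceAccountName: dlio-ksa # needs Workload Identity access to the bucket
  containers:
    - name: writer
      image: busybox
      command: ["sh", "-c", "date > /dlio_results/started.txt"]
      volumeMounts:
        - name: results
          mountPath: /dlio_results
  volumes:
    - name: results
      csi:
        driver: gcsfuse.csi.storage.gke.io
        volumeAttributes:
          bucketName: my-result-bucket
```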

benchmarks/benchmark/tools/dlio/modules/dlio/variables.tf

Lines changed: 5 additions & 0 deletions
```diff
@@ -27,6 +27,11 @@ variable "gcs_bucket" {
   description = "GCS Bucket name"
 }
 
+variable "result_bucket" {
+  type        = string
+  description = "GCS Bucket name"
+}
+
 variable "pvc_name" {
   type        = string
   description = "Name of the PersistentVolumeClaim used for DLIO dataset"
```

benchmarks/benchmark/tools/dlio/modules/parallelstore_storage/variables.tf

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
```
Lines changed: 62 additions & 0 deletions (new file)

```python
import os
import json
import datetime

# Aggregates DLIO result reports: walks RESULT_FOLDER for each pod's
# summary.json and per_epoch_stats.json, then prints per-metric averages
# across pods, paired with human-readable headers.
RESULT_FOLDER = './tmp'

START_TIME = 'start'
END_TIME = 'end'
GPU = 'train_au_percentage'
M_GPU = 'train_au_mean_percentage'
SAMPLE_THROUGHPUT = 'train_throughput_samples_per_second'
M_SAMPLE_THROUGHPUT = 'train_throughput_mean_samples_per_second'
M_MB = "train_io_mean_MB_per_second"
DURATION = 'duration'


def average(numbers):
    return sum(numbers) / len(numbers)


def process_summary(summary):
    metric = summary['metric']
    gpu = metric[M_GPU]
    spp = metric[M_SAMPLE_THROUGHPUT]
    mmb = metric[M_MB]
    # First-epoch metrics, then averages over the remaining epochs;
    # -1 marks "no subsequent epochs" for single-epoch runs.
    fe_gpu_percentage = metric[GPU][0]
    fe_samples_per_second = metric[SAMPLE_THROUGHPUT][0]
    sub_gpu_percentage = average(metric[GPU][1:]) if len(metric[GPU]) > 1 else -1
    sub_spp = average(metric[SAMPLE_THROUGHPUT][1:]) if len(metric[SAMPLE_THROUGHPUT]) > 1 else -1
    start_time = summary[START_TIME]
    end_time = summary[END_TIME]
    total_time = datetime.datetime.strptime(end_time, "%Y-%m-%dT%H:%M:%S.%f") - datetime.datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S.%f")
    return total_time.total_seconds(), fe_gpu_percentage, fe_samples_per_second, sub_gpu_percentage, sub_spp, gpu, spp, mmb


headers = ['e2e training seconds', 'first epoch au percentage', 'first epoch throughput samples per second', 'subsequent epochs average au percentage', 'subsequent epochs throughput samples per second',
           'mean au percentage', 'mean throughput samples per second', 'mean MB per second']


def process_per_epoch_stats(epochs):
    # Epochs are keyed by 1-based epoch-number strings ('1', '2', ...).
    fe_duration = float(epochs['1'][DURATION])
    sq_durations = []
    for i in range(2, len(epochs)):
        sq_durations.append(float(epochs[str(i)][DURATION]))
    sq_avg_duration = average(sq_durations) if len(sq_durations) > 0 else -1
    return fe_duration, sq_avg_duration


per_epoch_headers = ['first epoch duration seconds', "subsequent epochs average duration seconds"]

summary_results = []
per_epoch_results = []
for root, dirs, files in os.walk(RESULT_FOLDER):
    for file in files:
        if file == 'summary.json':
            with open(root + '/' + file) as f:
                d = json.load(f)
                summary_results.append(process_summary(d))
        if file == 'per_epoch_stats.json':
            with open(root + '/' + file) as f:
                d = json.load(f)
                per_epoch_results.append(process_per_epoch_stats(d))

# zip(*results) transposes per-pod tuples into per-metric columns before
# averaging, so each printed pair is (header, cross-pod mean).
print(list(zip(headers, list(map(average, zip(*summary_results))))))
print(list(zip(per_epoch_headers, list(map(average, zip(*per_epoch_results))))))
```

benchmarks/benchmark/tools/dlio/variables.tf

Lines changed: 11 additions & 5 deletions
```diff
@@ -30,14 +30,20 @@ variable "gcs_bucket" {
   default = "<your gcs bucket>"
 }
 
+variable "result_bucket" {
+  type        = string
+  description = "GCS Bucket name to store dlio results"
+  default     = "<result bucket>"
+}
+
 // at most one of the below trigers can be set to true
-variable "gcs_fuse_csi_driver_enabled" {
+variable "run_with_gcs_fuse_csi" {
   type        = string
-  description = "Set to true if running DLIO on GCSFuse and the Cloud Storage FUSE CSI driver is enabled on your cluster"
+  description = "Set to true if running DLIO on GCSFuse"
   default     = "\"true\""
 }
 
-variable "paralllestore_csi_driver_enabled" {
+variable "run_with_parallelstore_csi" {
   type        = string
   description = "Set to true if running DLIO on Parallelstore and the Parallelstore CSI driver is enabled on your cluster"
   default     = "\"false\""
@@ -119,8 +125,8 @@ variable "dlio_data_mount_path" {
 
 variable "dlio_benchmark_result" {
   type = string
-  description = "The path stores benchmark result reports"
-  default = "results"
+  description = "The path stores benchmark result reports for a specific DLIO run. When doing multi-pod runs, this folder stores results logged from all the pods, needs to be changed every run to guarantee result isolation."
+  default = "<a result folder name unique to your run>"
 }
 
 // DLIO configurations, detailed explanation check
```
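As a usage sketch, a `terraform.tfvars` for a run against this commit might look like the following; the bucket names and the run label are placeholders:

```hcl
# Hypothetical terraform.tfvars; bucket names and run label are placeholders.
# Note the embedded quotes the toggle variables expect.
run_with_gcs_fuse_csi      = "\"true\""
run_with_parallelstore_csi = "\"false\""
gcs_bucket                 = "my-dlio-dataset-bucket"
result_bucket              = "my-dlio-result-bucket"
dlio_benchmark_result      = "run-001-unique-label"
```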
