37
37
parse_internal_config_filename ,
38
38
parse_internal_config_content ,
39
39
get_patheon_job_link ,
40
+ find_xprof_gcs_path ,
40
41
)
41
42
42
43
from dags .map_reproducibility .utils .benchmarkdb_utils import write_run
@@ -151,6 +152,23 @@ def sample_job_configure_project_and_cluster(cluster: str, cluster_region: str):
151
152
return set_project_command
152
153
153
154
155
def sample_workload_gcs_to_cns_cmds(log_file_in_gcs, output_file=None):
    """Build the shell commands that copy a GCS log file to a CNS directory.

    Args:
        log_file_in_gcs: Path of the file in GCS, with or without a leading
            "gs://" scheme (e.g. "gs://bucket/dir/file" or "bucket/dir/file").
        output_file: File name to use for the copy under the CNS directory.
            When falsy, the basename of `log_file_in_gcs` is used.

    Returns:
        A tuple of shell command strings: three variable assignments
        (LOG_FILE_IN_GCS, filename, CNS_PATH), the `fileutil_bs cp`
        invocation that reads from /bigstore/ and writes under CNS_PATH, and
        an `echo` of the destination path.

    Raises:
        ValueError: If `log_file_in_gcs` is None or empty.
    """
    if not log_file_in_gcs:
        # Fail early with a clear message instead of an AttributeError on None
        # or a copy command with an empty source path.
        raise ValueError("log_file_in_gcs must be a non-empty GCS path")

    # Drop only a leading "gs://" scheme; the copy command addresses the
    # object as /bigstore/<bucket>/<path>. A blanket str.replace would also
    # rewrite any "gs://" occurring later inside the object name.
    gcs_scheme = "gs://"
    if log_file_in_gcs.startswith(gcs_scheme):
        log_file_in_gcs = log_file_in_gcs[len(gcs_scheme):]

    # If output_file is not provided, use the same name as the input file.
    if not output_file:
        output_file = os.path.basename(log_file_in_gcs)
    print(f"output_file name is: {output_file} ")

    cmds = (
        f"LOG_FILE_IN_GCS={log_file_in_gcs} ",
        f"filename={output_file} ",
        "CNS_PATH=/cns/pi-d/home/${USER}/tensorboard/multislice ",
        "/google/data/ro/projects/cloud/bigstore/mpm/fileutil_bs/stable/bin/fileutil_bs cp /bigstore/${LOG_FILE_IN_GCS} ${CNS_PATH}/${filename} ",
        "echo file to put into xprof: ${CNS_PATH}/${filename}",
    )
    return cmds
170
+
171
+
154
172
def write_run_results (
155
173
config : Any ,
156
174
result : WorkloadResult ,
@@ -289,8 +307,19 @@ def run_internal_sample_aotc_workload(
289
307
print (f"mfu: { mfu } " )
290
308
print (f"step_time: { step_time } " )
291
309
comment = "sample benchmarking run"
292
- gcs_bucket = get_job_gcs_bucket_folder (job_name )
310
+ gcs_bucket = get_job_gcs_bucket_folder (
311
+ job_name , bucket_name = sample_run_bucket_name
312
+ )
293
313
print (f"GCS bucket is { gcs_bucket } " )
314
+ logs_profile = None
315
+
316
+ if config .profiler :
317
+ logs_profile = find_xprof_gcs_path (gcs_bucket )
318
+ print (f"logs_profile is { logs_profile } " )
319
+ profiler_cmds = sample_workload_gcs_to_cns_cmds (logs_profile )
320
+ profile_success , profiler_error_message = execute_workload_commands (
321
+ profiler_cmds , tmpdir
322
+ )
294
323
295
324
write_run (
296
325
model_id = config .HELM_NAME_MODEL_ID ,
@@ -309,11 +338,12 @@ def run_internal_sample_aotc_workload(
309
338
mfu = mfu ,
310
339
tokens_per_second = 1 ,
311
340
writer_path = bq_writer_repo_root ,
312
- run_type = "internal_perf_regression " ,
341
+ run_type = "sample_helm_workload " ,
313
342
topology = "" ,
314
343
comment = comment ,
315
344
is_test = True ,
316
- logs_profile = gcs_bucket ,
345
+ logs_profile = logs_profile ,
346
+ gcs_metrics_bucket = gcs_bucket ,
317
347
workload_others = str (config ),
318
348
experiment_id = job_name ,
319
349
)
0 commit comments