GoogleCloudPlatform · Doris26 · Apr 25, 2025 · Apr 22, 2025 · Apr 24, 2025 · Apr 24, 2025
@@ -59,7 +59,7 @@ def main():
 
   # Setup configuration
   relative_config_yaml_path = (
-      "recipes/a3ultra/a3ultra_llama3.1-8b_8gpus_bf16_maxtext.yaml"
+      "recipes/a3ultra/a3ultra_llama3.1-8b_8gpus_fp8_maxtext.yaml"
   )
   timeout = DAG_CONFIGS_ULTRA[relative_config_yaml_path]["timeout_minutes"]
 

@@ -0,0 +1,43 @@
+# This test file should be run from the project root directory using:
+# python -m unittest discover dags/map_reproducibility/tests -p "test_sample_workload_utils.py"
+#
+# Other methods that might work:
+# 1. Specific test: python -m unittest dags.map_reproducibility.tests.test_sample_workload_utils.TestSampleWorkloadUtils.test_execute_workload_commands_real_success
+
+
+import unittest
+
+from dags.map_reproducibility.utils.sample_workload_utils import (
+    sample_workload_gcs_to_cns_cmds,
+    execute_workload_commands,
+)
+from dags.map_reproducibility.utils.common_utils import (
+    find_xprof_gcs_path,
+)
+
+
+class TestSampleWorkloadUtils(unittest.TestCase):
+
+  def test_execute_workload_commands_real_success(self):
+    """
+    Run the GCS-to-CNS copy commands through a real subprocess.
+    """
+    # Build the copy commands for an existing .xplane.pb object in GCS
+    xplane_gcs_path = "gs://yujunzou-dev-supercomputer-testing/maxtext/yujunzou-coreml-llama-3-1-8b-1745453263-maxtext-xpbx-1745453272-xppn/tensorboard/plugins/profile/2025_04_24_00_13_31/yujunzou-coreml-llama-3-1-8b-1745453263-maxtext-xpbx-0.xplane.pb"
+    copy_cmds = sample_workload_gcs_to_cns_cmds(xplane_gcs_path)
+
+    # --- Act ---
+    # Nothing is mocked: the commands go through the real subprocess mechanism
+    succeeded, _ = execute_workload_commands(copy_cmds, "/tmp")
+    self.assertTrue(succeeded)
+
+  def test_find_xprof_gcs_path_real_success(self):
+    """
+    Look up the .xplane.pb file under a real GCS run folder.
+    """
+    run_folder = "gs://yujunzou-dev-supercomputer-testing/maxtext/yujunzou-coreml-llama-3-1-8b-1745363352-maxtext-okrp-1745363360-h593/"
+    found_path = find_xprof_gcs_path(run_folder)
+    self.assertIsNotNone(found_path, "xprof_path should not be None")
+    self.assertTrue(
+        found_path.startswith("gs://"), "xprof_path should be a GCS path"
+    )
@@ -50,6 +50,7 @@ def write_run(
     comment: str = "",
     is_test: bool = False,
     logs_profile="",
+    gcs_metrics_bucket="",
     workload_others="",
     experiment_id="",
 ):
@@ -255,6 +256,7 @@ def validate_software_id(software_id: str, is_test: bool = False) -> bool:
         hardware_num_superblocks=num_of_superblock,
         logs_comments=comment,
         logs_profile=logs_profile,
+        gcs_metrics_bucket=gcs_metrics_bucket,
         workload_others=workload_others,
         experiment_id=experiment_id,
     )

@@ -31,6 +31,7 @@
 from dags.map_reproducibility.utils.benchmarkdb_utils import write_run
 from datetime import datetime, timezone
 from dags import composer_env
+from google.cloud import storage
 
 PROJECT = "supercomputer-testing"
 BUCKET_NAME = "regression-testing-xlml"
@@ -128,6 +129,47 @@ def get_internal_pre_workload_job_name(
   return job_name
 
 
+def find_xprof_gcs_path(gcs_path):
+  """
+  Find the .xplane.pb file in the latest date directory under a GCS path.
+
+  Args:
+      gcs_path (str): Full GCS path in the format gs://bucket-name/folder/path/
+
+  Returns:
+      str: gs:// path of the first .xplane.pb file listed in the latest
+      directory, or None if gcs_path is empty or no such file is found
+  """
+  if not gcs_path:
+    print("No GCS path provided, cannot look for .xplane.pb file")
+    return None
+
+  bucket_name, _, prefix = gcs_path.removeprefix("gs://").partition("/")
+  print(f"Bucket name: {bucket_name}")
+
+  storage_client = storage.Client()
+  bucket = storage_client.get_bucket(bucket_name)
+
+  # Keep only the profile files among the blobs under the given prefix
+  print(f"Prefix: {prefix}")
+  xplane_names = [
+      blob.name
+      for blob in bucket.list_blobs(prefix=prefix)
+      if blob.name.endswith(".xplane.pb")
+  ]
+  if not xplane_names:
+    print(f"No .xplane.pb file found in {gcs_path}")
+    return None
+
+  # Directories are compared by name, which orders timestamped profile
+  # directories (e.g. 2025_04_24_00_13_31) chronologically; max() returns the
+  # first listed file among those in the latest directory.
+  xplane_pb_file = max(xplane_names, key=lambda name: name.rpartition("/")[0])
+  full_xplane_pb_file = f"gs://{bucket_name}/{xplane_pb_file}"
+  print(f"Found .xplane.pb file: {full_xplane_pb_file}")
+  return full_xplane_pb_file
+
+
 def get_patheon_job_link(region, cluster_name, job_name):
   pantheon_link = f"https://pantheon.corp.google.com/kubernetes/job/{region}/{cluster_name}/default/{job_name}"
   print(f"{'*' * 20}LINK: {pantheon_link}")
@@ -291,7 +333,7 @@ def internal_wait_for_jobs_cmds(timeout="100m"):
   return wait_for_job
 
 
-def get_job_gcs_bucket_folder(job_name):
+def get_job_gcs_bucket_folder(job_name, bucket_name=BUCKET_NAME):
   """
   Get the GCS bucket folder for a specific job.
 
@@ -302,14 +344,20 @@ def get_job_gcs_bucket_folder(job_name):
   Returns:
       str: The full path to the bucket folder containing the job
   """
-  gcs_location = f"gs://{BUCKET_NAME}/maxtext/"
+  gcs_location = f"gs://{bucket_name}/maxtext/"
   bucket_folder_cmd = f"gcloud storage ls {gcs_location} | grep {job_name}"
+  print(f"bucket_folder_cmd: {bucket_folder_cmd}")
 
   try:
     bucket_folder = (
         subprocess.check_output(bucket_folder_cmd, shell=True).decode().strip()
     )
-    print(f"BUCKET_FOLDER: {bucket_folder}")
+    bucket_folder_prefix_removed = bucket_folder.removeprefix("gs://")
+    pantheon_bucket_link = (
+        "https://pantheon.corp.google.com/storage/browser/"
+        + bucket_folder_prefix_removed
+    )
+    print(f"BUCKET PANTHEON LINK: {pantheon_bucket_link}")
     return bucket_folder
   except subprocess.CalledProcessError as e:
     print(f"Error finding bucket folder: {e}")

@@ -194,7 +194,7 @@ def run_internal_aotc_workload(
         topology="",
         comment=comment,
         is_test=is_db_test_run,
-        logs_profile=gcs_bucket,
+        gcs_metrics_bucket=gcs_bucket,
         workload_others=str(config),
         experiment_id=job_name,
     )
@@ -37,6 +37,7 @@
     parse_internal_config_filename,
     parse_internal_config_content,
     get_patheon_job_link,
+    find_xprof_gcs_path,
 )
 
 from dags.map_reproducibility.utils.benchmarkdb_utils import write_run
@@ -151,6 +152,24 @@ def sample_job_configure_project_and_cluster(cluster: str, cluster_region: str):
   return set_project_command
 
 
+def sample_workload_gcs_to_cns_cmds(log_file_in_gcs, output_file=None):
+  """Build the shell commands that copy a GCS log file to CNS with fileutil_bs.
+
+  Only works on glinux or cloudtop because it relies on fileutil_bs. When
+  output_file is not provided, the input file's base name is used instead.
+  """
+  bigstore_path = log_file_in_gcs.removeprefix("gs://")
+  target_name = output_file or os.path.basename(bigstore_path)
+  print(f"output_file name is: {target_name}")
+  return (
+      f"LOG_FILE_IN_GCS={bigstore_path} ",
+      f"filename={target_name} ",
+      "CNS_PATH=/cns/pi-d/home/${USER}/tensorboard/multislice ",
+      "/google/data/ro/projects/cloud/bigstore/mpm/fileutil_bs/stable/bin/fileutil_bs cp /bigstore/${LOG_FILE_IN_GCS} ${CNS_PATH}/${filename} ",
+      "echo file to put into xprof: ${CNS_PATH}/${filename}",
+  )
+
+
 def write_run_results(
     config: Any,
     result: WorkloadResult,
@@ -289,8 +308,26 @@ def run_internal_sample_aotc_workload(
       print(f"mfu: {mfu}")
       print(f"step_time: {step_time}")
       comment = "sample benchmarking run"
-      gcs_bucket = get_job_gcs_bucket_folder(job_name)
+      gcs_bucket = get_job_gcs_bucket_folder(
+          job_name, bucket_name=sample_run_bucket_name
+      )
       print(f"GCS bucket is {gcs_bucket}")
+      logs_profile = None
+
+      if hasattr(config, "profiler"):
+        logs_profile = find_xprof_gcs_path(gcs_bucket)
+        if not logs_profile:
+          logger.error(f"No xprof file found in {gcs_bucket}")
+        else:
+          print(f"logs_profile is {logs_profile}")
+          profiler_cmds = sample_workload_gcs_to_cns_cmds(logs_profile)
+          profile_success, profiler_error_message = execute_workload_commands(
+              profiler_cmds, tmpdir
+          )
+          if not profile_success:
+            logger.error(
+                f"Profile command failed with error: {profiler_error_message}"
+            )
 
       write_run(
           model_id=config.HELM_NAME_MODEL_ID,
@@ -309,11 +346,12 @@ def run_internal_sample_aotc_workload(
           mfu=mfu,
           tokens_per_second=1,
           writer_path=bq_writer_repo_root,
-          run_type="internal_perf_regression",
+          run_type="sample_helm_workload",
           topology="",
           comment=comment,
           is_test=True,
-          logs_profile=gcs_bucket,
+          logs_profile=logs_profile,
+          gcs_metrics_bucket=gcs_bucket,
           workload_others=str(config),
           experiment_id=job_name,
       )