Skip to content

Commit 47df8e7

Browse files
committed
enable xprof
1 parent 74fac15 commit 47df8e7

File tree

6 files changed

+158
-7
lines changed

6 files changed

+158
-7
lines changed

dags/map_reproducibility/internal_runs/sample_a3ultra_maxtext_single_run.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def main():
5959

6060
# Setup configuration
6161
relative_config_yaml_path = (
62-
"recipes/a3ultra/a3ultra_llama3.1-8b_8gpus_bf16_maxtext.yaml"
62+
"recipes/a3ultra/a3ultra_llama3.1-8b_8gpus_fp8_maxtext.yaml"
6363
)
6464
timeout = DAG_CONFIGS_ULTRA[relative_config_yaml_path]["timeout_minutes"]
6565

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
import sys
import os
import unittest

# --- Setup sys.path and repo check (Keep as is) ---
base_recipe_repo_root = os.path.abspath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "..",
        "..",
        "..",
        "internal-gpu-recipes",
    )
)

if not os.path.exists(base_recipe_repo_root):
  print(
      f"Skipping test_sample_workload_utils.py - required directory not found: {base_recipe_repo_root}"
  )
  # BUGFIX: previously this only printed "Skipping" and then ran the tests
  # anyway. Raising SkipTest at module level makes unittest discovery mark
  # the whole module as skipped instead of failing on missing resources.
  raise unittest.SkipTest(
      f"required directory not found: {base_recipe_repo_root}"
  )

script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(script_dir, "..", "..", ".."))

print(f"Test Script directory: {script_dir}")
print(f"Test Project root: {project_root}")

# Make the project root importable so the `dags.*` imports below resolve
# when this file is run directly (not via the project's test runner).
if project_root not in sys.path:
  sys.path.insert(0, project_root)
# --- End Setup sys.path ---

# Assuming the functions are in this module or imported
from dags.map_reproducibility.utils.sample_workload_utils import (
    sample_workload_gcs_to_cns_cmds,
    execute_workload_commands,
)
from dags.map_reproducibility.utils.common_utils import (
    find_xprof_gcs_path,
)


class TestSampleWorkloadUtils(unittest.TestCase):
  """Integration-style tests that exercise real GCS/subprocess paths.

  NOTE(review): these tests hit live GCS buckets and run real subprocesses;
  they are smoke tests (they print results rather than asserting on them).
  """

  def test_execute_workload_commands_real_success(self):
    """
    Test execute_workload_commands with a real subprocess that succeeds.
    """
    # Use simple commands guaranteed to succeed in most environments
    gcs_path = "gs://yujunzou-dev-supercomputer-testing/maxtext/yujunzou-coreml-llama-3-1-8b-1745453263-maxtext-xpbx-1745453272-xppn/tensorboard/plugins/profile/2025_04_24_00_13_31/yujunzou-coreml-llama-3-1-8b-1745453263-maxtext-xpbx-0.xplane.pb"
    commands = sample_workload_gcs_to_cns_cmds(gcs_path)

    # --- Act ---
    # Execute the commands using the real subprocess mechanism
    success, results = execute_workload_commands(commands, "/tmp")
    print(f"Real execution success flag: {success}")
    print(f"Real execution results: {results}")

  def test_find_xprof_gcs_path_real_success(self):
    """
    Test find_xprof_gcs_path with a real subprocess that succeeds.
    """
    gcs_run_bucket_folder = "gs://yujunzou-dev-supercomputer-testing/maxtext/yujunzou-coreml-llama-3-1-8b-1745363352-maxtext-okrp-1745363360-h593/"
    xprof_path = find_xprof_gcs_path(gcs_run_bucket_folder)
    print(f"xprof_path is {xprof_path}")


if __name__ == "__main__":
  # Run only the tests in the TestSampleWorkloadUtils class
  suite = unittest.TestSuite()
  suite.addTest(
      TestSampleWorkloadUtils("test_execute_workload_commands_real_success")
  )
  runner = unittest.TextTestRunner()
  runner.run(suite)

dags/map_reproducibility/utils/benchmarkdb_utils.py

+2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def write_run(
5050
comment: str = "",
5151
is_test: bool = False,
5252
logs_profile="",
53+
gcs_metrics_bucket="",
5354
workload_others="",
5455
experiment_id="",
5556
):
@@ -255,6 +256,7 @@ def validate_software_id(software_id: str, is_test: bool = False) -> bool:
255256
hardware_num_superblocks=num_of_superblock,
256257
logs_comments=comment,
257258
logs_profile=logs_profile,
259+
gcs_metrics_bucket=gcs_metrics_bucket,
258260
workload_others=workload_others,
259261
experiment_id=experiment_id,
260262
)

dags/map_reproducibility/utils/common_utils.py

+47-2
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,50 @@ def get_internal_pre_workload_job_name(
128128
return job_name
129129

130130

def find_xprof_gcs_path(gcs_path):
  """
  Find the .xplane.pb file in the latest date blob from the specified GCS path.

  Args:
      gcs_path (str): Full GCS path in the format gs://bucket-name/folder/path/

  Returns:
      str: Full gs:// path to the .xplane.pb file in the latest date blob,
          or None if no .xplane.pb file exists under the prefix.
  """
  # Imported lazily so modules that never profile don't need the GCS client.
  from google.cloud import storage

  path_without_prefix = gcs_path.replace("gs://", "")

  # Split into bucket name and (optional) object prefix.
  parts = path_without_prefix.split("/", 1)
  bucket_name = parts[0]
  print(f"Bucket name: {bucket_name}")

  prefix = parts[1] if len(parts) > 1 else ""
  prefix = prefix.rstrip("/")

  storage_client = storage.Client()
  bucket = storage_client.get_bucket(bucket_name)

  # List all blobs in the bucket with the given prefix
  print(f"Prefix: {prefix}")
  blobs = list(bucket.list_blobs(prefix=prefix))

  # BUGFIX: the docstring promises the file from the *latest* date blob,
  # but the original returned the first match in listing order. Profile
  # directories are date-stamped (e.g. 2025_04_24_00_13_31), so the
  # lexicographically greatest blob name is the most recent one.
  xplane_matches = [b.name for b in blobs if b.name.endswith(".xplane.pb")]

  if not xplane_matches:
    print(f"No .xplane.pb file found in {gcs_path}")
    return None

  xplane_pb_file = max(xplane_matches)

  full_xplane_pb_file = f"gs://{bucket_name}/{xplane_pb_file}"
  print(f"Found .xplane.pb file: {full_xplane_pb_file}")
  return full_xplane_pb_file
131175
def get_patheon_job_link(region, cluster_name, job_name):
132176
pantheon_link = f"https://pantheon.corp.google.com/kubernetes/job/{region}/{cluster_name}/default/{job_name}"
133177
print(f"{'*' * 20}LINK: {pantheon_link}")
@@ -291,7 +335,7 @@ def internal_wait_for_jobs_cmds(timeout="100m"):
291335
return wait_for_job
292336

293337

294-
def get_job_gcs_bucket_folder(job_name):
338+
def get_job_gcs_bucket_folder(job_name, bucket_name=BUCKET_NAME):
295339
"""
296340
Get the GCS bucket folder for a specific job.
297341
@@ -302,8 +346,9 @@ def get_job_gcs_bucket_folder(job_name):
302346
Returns:
303347
str: The full path to the bucket folder containing the job
304348
"""
305-
gcs_location = f"gs://{BUCKET_NAME}/maxtext/"
349+
gcs_location = f"gs://{bucket_name}/maxtext/"
306350
bucket_folder_cmd = f"gcloud storage ls {gcs_location} | grep {job_name}"
351+
print(f"bucket_folder_cmd: {bucket_folder_cmd}")
307352

308353
try:
309354
bucket_folder = (

dags/map_reproducibility/utils/internal_aotc_workload.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def run_internal_aotc_workload(
194194
topology="",
195195
comment=comment,
196196
is_test=is_db_test_run,
197-
logs_profile=gcs_bucket,
197+
gcs_metrics_bucket=gcs_bucket,
198198
workload_others=str(config),
199199
experiment_id=job_name,
200200
)

dags/map_reproducibility/utils/sample_workload_utils.py

+33-3
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
parse_internal_config_filename,
3838
parse_internal_config_content,
3939
get_patheon_job_link,
40+
find_xprof_gcs_path,
4041
)
4142

4243
from dags.map_reproducibility.utils.benchmarkdb_utils import write_run
@@ -151,6 +152,23 @@ def sample_job_configure_project_and_cluster(cluster: str, cluster_region: str):
151152
return set_project_command
152153

153154

def sample_workload_gcs_to_cns_cmds(log_file_in_gcs, output_file=None):
  """Build the shell command fragments that copy a GCS profile file to CNS.

  Args:
      log_file_in_gcs: GCS object path, with or without the gs:// scheme.
      output_file: Destination file name; defaults to the GCS object's
          basename when not supplied.

  Returns:
      tuple[str, ...]: command fragments consumed by
      execute_workload_commands.
  """
  # Strip the scheme; the fileutil_bs tool addresses objects via /bigstore/.
  gcs_object = log_file_in_gcs.replace("gs://", "")
  if not output_file:
    output_file = os.path.basename(gcs_object)
    print(f"output_file name is: {output_file}")

  # NOTE(review): the "$(unknown)" placeholder below looks like it was meant
  # to be "${filename}" — confirm against the runner's expectations.
  return (
      f"LOG_FILE_IN_GCS={gcs_object} ",
      f"filename={output_file} ",
      "CNS_PATH=/cns/pi-d/home/${USER}/tensorboard/multislice ",
      "/google/data/ro/projects/cloud/bigstore/mpm/fileutil_bs/stable/bin/fileutil_bs cp /bigstore/${LOG_FILE_IN_GCS} ${CNS_PATH}/$(unknown) ",
      "echo file to put into xprof: ${CNS_PATH}/$(unknown)",
  )
154172
def write_run_results(
155173
config: Any,
156174
result: WorkloadResult,
@@ -289,8 +307,19 @@ def run_internal_sample_aotc_workload(
289307
print(f"mfu: {mfu}")
290308
print(f"step_time: {step_time}")
291309
comment = "sample benchmarking run"
292-
gcs_bucket = get_job_gcs_bucket_folder(job_name)
310+
gcs_bucket = get_job_gcs_bucket_folder(
311+
job_name, bucket_name=sample_run_bucket_name
312+
)
293313
print(f"GCS bucket is {gcs_bucket}")
314+
logs_profile = None
315+
316+
if config.profiler:
317+
logs_profile = find_xprof_gcs_path(gcs_bucket)
318+
print(f"logs_profile is {logs_profile}")
319+
profiler_cmds = sample_workload_gcs_to_cns_cmds(logs_profile)
320+
profile_success, profiler_error_message = execute_workload_commands(
321+
profiler_cmds, tmpdir
322+
)
294323

295324
write_run(
296325
model_id=config.HELM_NAME_MODEL_ID,
@@ -309,11 +338,12 @@ def run_internal_sample_aotc_workload(
309338
mfu=mfu,
310339
tokens_per_second=1,
311340
writer_path=bq_writer_repo_root,
312-
run_type="internal_perf_regression",
341+
run_type="sample_helm_workload",
313342
topology="",
314343
comment=comment,
315344
is_test=True,
316-
logs_profile=gcs_bucket,
345+
logs_profile=logs_profile,
346+
gcs_metrics_bucket=gcs_bucket,
317347
workload_others=str(config),
318348
experiment_id=job_name,
319349
)

0 commit comments

Comments
 (0)