Merge branch 'master' into fix-pw-maxtext-v5e

dipannita08 · web-flow · commit 15a92adc362d · 2025-03-03T21:27:12.000-08:00
diff --git a/dags/map_reproducibility/a3mega_gpt3_175b_nemo.py b/dags/map_reproducibility/a3mega_gpt3_175b_nemo.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""DAGs to run Aotc reproducibility benchmarks."""
+"""DAGs to run hypercomputer recipes"""
 
 import datetime
 import sys
@@ -44,6 +44,8 @@
 from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
 from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
 from dags.map_reproducibility.utils.common_utils import get_scheduled_time
+from dags.map_reproducibility.utils.common_utils import get_cluster
+from dags.map_reproducibility.utils.common_utils import get_docker_image
 
 
 MODEL_ID = "gpt3-175b"
@@ -60,11 +62,10 @@
 VALUE_YAML_PATH = (
     f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
 )
-CLUSTER = "a3plus-benchmark"
-CLUSTER_REGION = "australia-southeast1"
+CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
 SOFTWARE_ID = "pytorch_nemo"
 IMAGE_VERSION = "nemo_workload:24.07"
-DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
+DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
 
 
 @task
diff --git a/dags/map_reproducibility/a3mega_llama_3_1_70b_nemo.py b/dags/map_reproducibility/a3mega_llama_3_1_70b_nemo.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""DAGs to run Aotc reproducibility benchmarks."""
+"""DAGs to run hypercomputer recipes"""
 
 import datetime
 import sys
@@ -44,6 +44,8 @@
 from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
 from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
 from dags.map_reproducibility.utils.common_utils import get_scheduled_time
+from dags.map_reproducibility.utils.common_utils import get_cluster
+from dags.map_reproducibility.utils.common_utils import get_docker_image
 
 
 MODEL_ID = "llama-3.1-70b"
@@ -62,11 +64,10 @@
 VALUE_YAML_PATH = (
     f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
 )
-CLUSTER = "a3plus-benchmark"
-CLUSTER_REGION = "australia-southeast1"
+CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
 SOFTWARE_ID = "pytorch_nemo"
 IMAGE_VERSION = "nemo_workload:24.07"
-DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
+DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
 
 
 @task
diff --git a/dags/map_reproducibility/a3mega_llama_3_70b_nemo.py b/dags/map_reproducibility/a3mega_llama_3_70b_nemo.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""DAGs to run Aotc reproducibility benchmarks."""
+"""DAGs to run hypercomputer recipes"""
 
 import datetime
 import sys
@@ -44,6 +44,8 @@
 from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
 from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
 from dags.map_reproducibility.utils.common_utils import get_scheduled_time
+from dags.map_reproducibility.utils.common_utils import get_cluster
+from dags.map_reproducibility.utils.common_utils import get_docker_image
 
 
 MODEL_ID = "llama-3-70b"
@@ -61,11 +63,10 @@
 VALUE_YAML_PATH = (
     f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
 )
-CLUSTER = "a3plus-benchmark"
-CLUSTER_REGION = "australia-southeast1"
+CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
 SOFTWARE_ID = "pytorch_nemo"
 IMAGE_VERSION = "nemo_workload:24.07"
-DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
+DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
 
 
 @task
diff --git a/dags/map_reproducibility/a3mega_mixtral_8_7b_nemo.py b/dags/map_reproducibility/a3mega_mixtral_8_7b_nemo.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""DAGs to run Aotc reproducibility benchmarks."""
+"""DAGs to run hypercomputer recipes"""
 
 import datetime
 import sys
@@ -44,6 +44,8 @@
 from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
 from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
 from dags.map_reproducibility.utils.common_utils import get_scheduled_time
+from dags.map_reproducibility.utils.common_utils import get_cluster
+from dags.map_reproducibility.utils.common_utils import get_docker_image
 
 
 MODEL_ID = "mixtral-8x7b"
@@ -62,11 +64,10 @@
 VALUE_YAML_PATH = (
     f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
 )
-CLUSTER = "a3plus-benchmark"
-CLUSTER_REGION = "australia-southeast1"
+CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
 SOFTWARE_ID = "pytorch_nemo"
 IMAGE_VERSION = "nemo_workload:24.07"
-DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
+DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
 
 
 @task
diff --git a/dags/map_reproducibility/a3ultra_llama_3_1_70b_nemo.py b/dags/map_reproducibility/a3ultra_llama_3_1_70b_nemo.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""DAGs to run Aotc reproducibility benchmarks."""
+"""DAGs to run hypercomputer recipes"""
 
 import datetime
 import sys
@@ -45,6 +45,7 @@
 from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
 from dags.map_reproducibility.utils.common_utils import get_cluster
 from dags.map_reproducibility.utils.common_utils import get_scheduled_time
+from dags.map_reproducibility.utils.common_utils import get_docker_image
 
 
 MODEL_ID = "llama-3.1-70b"
@@ -67,7 +68,8 @@
 CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
 SOFTWARE_ID = "pytorch_nemo"
 IMAGE_VERSION = "nemo_workload:24.07"
-DOCKER_IMAGE = "us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:nemo24.07-gib1.0.3-A3U"
+DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
+KUEUE_NAME = "a3-ultra"
 
 
 @task
@@ -128,6 +130,7 @@ def run_aotc_workload():
                     recipe_repo_root,
                     DOCKER_IMAGE,
                     cluster_name=CLUSTER,
+                    kueue_name=KUEUE_NAME,
                 )
                 + wait_for_jobs_cmds()
                 + copy_bucket_cmds(
diff --git a/dags/map_reproducibility/a3ultra_mixtral_8_7b_nemo.py b/dags/map_reproducibility/a3ultra_mixtral_8_7b_nemo.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""DAGs to run Aotc reproducibility benchmarks."""
+"""DAGs to run hypercomputer recipes"""
 
 import datetime
 import sys
@@ -45,6 +45,7 @@
 from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
 from dags.map_reproducibility.utils.common_utils import get_cluster
 from dags.map_reproducibility.utils.common_utils import get_scheduled_time
+from dags.map_reproducibility.utils.common_utils import get_docker_image
 
 
 MODEL_ID = "mixtral-8x7b"
@@ -65,7 +66,7 @@
 CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
 SOFTWARE_ID = "pytorch_nemo"
 IMAGE_VERSION = "nemo24.07"
-DOCKER_IMAGE = f"us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:{IMAGE_VERSION}-gib1.0.3-A3U"
+DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
 KUEUE_NAME = "a3-ultra"
 
 
diff --git a/dags/map_reproducibility/utils/common_utils.py b/dags/map_reproducibility/utils/common_utils.py
@@ -122,7 +122,7 @@ def helm_apply_cmds(
   gcs_cmd = ""
   if hypercomputer == "a3ultra":
     gcs_cmd = f" --set clusterName={cluster_name}"
-    # gcs_cmd += f" --set queue={kueue_name}"
+    gcs_cmd += f" --set queue={kueue_name}"
     gcs_cmd += f" --set volumes.gcsMounts[0].bucketName={BUCKET_NAME}"
   else:
     gcs_cmd = f" --set workload.gcsBucketForDataCataPath={BUCKET_NAME}"
@@ -325,3 +325,33 @@ def get_scheduled_time(hardware: str, model: str, framework: str):
         return schedule_map[hardware][model][framework]
 
   return None  # Return None if no schedule is found for the given combination
+
+
+def get_docker_image(hardware: str, framework: str):
+  """
+  Returns the appropriate Docker image based on the given hardware and framework.
+
+  Args:
+      hardware: The hardware type (e.g., "a3ultra", "a3mega").
+      framework: The framework (e.g., "nemo", "maxtext").
+
+  Returns:
+      A Docker image string or None if no image is defined for the given combination.
+  """
+
+  image_map = {
+      "a3ultra": {
+          "nemo": "us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:nemo24.07-gib1.0.3-A3U",
+          "maxtext": "us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/maxtext-benchmark",
+      },
+      "a3mega": {
+          "nemo": "us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07",
+          "maxtext": "us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/maxtext-benchmark",
+      },
+  }
+
+  if hardware in image_map:
+    if framework in image_map[hardware]:
+      return image_map[hardware][framework]
+
+  return None  # Return None if no image is found for the given combination
diff --git a/dags/solutions_team/configs/vllm/vllm_benchmark_config.py b/dags/solutions_team/configs/vllm/vllm_benchmark_config.py
@@ -182,7 +182,7 @@ def get_tpu_vllm_benchmark_cmds(
   ]
 
   for request_rate in request_rates:
-    benchmark_cmd_fmt = "sudo docker exec $CONTAINER_NAME /bin/bash -c \"export HF_TOKEN={HF_TOKEN} && python inference-benchmark/benchmark_serving.py --host localhost --port 8000 --num-prompts {num_prompts} --max-input-length 1024 --max-output-length 1024 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --save-json-results --model {model_id} --tokenizer {model_id} --request-rate {request_rate} --additional-metadata-metrics-to-save '{additional_metadata}'\""
+    benchmark_cmd_fmt = "sudo docker exec $CONTAINER_NAME /bin/bash -c \"export HF_TOKEN={HF_TOKEN} && python inference-benchmark/benchmark_serving.py --stream-request --host localhost --port 8000 --num-prompts {num_prompts} --max-input-length 1024 --max-output-length 1024 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --save-json-results --model {model_id} --tokenizer {model_id} --request-rate {request_rate} --additional-metadata-metrics-to-save '{additional_metadata}'\""
 
     benchmark_cmds = [
         # Run benchmark inside the container

Original file line number	Diff line number	Diff line change
`@@ -182,7 +182,7 @@ def get_tpu_vllm_benchmark_cmds(`
`182`	`182`	`]`
`183`	`183`
`184`	`184`	`for request_rate in request_rates:`
`185`		`- benchmark_cmd_fmt = "sudo docker exec $CONTAINER_NAME /bin/bash -c \"export HF_TOKEN={HF_TOKEN} && python inference-benchmark/benchmark_serving.py --host localhost --port 8000 --num-prompts {num_prompts} --max-input-length 1024 --max-output-length 1024 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --save-json-results --model {model_id} --tokenizer {model_id} --request-rate {request_rate} --additional-metadata-metrics-to-save '{additional_metadata}'\""`
	`185`	`+ benchmark_cmd_fmt = "sudo docker exec $CONTAINER_NAME /bin/bash -c \"export HF_TOKEN={HF_TOKEN} && python inference-benchmark/benchmark_serving.py --stream-request --host localhost --port 8000 --num-prompts {num_prompts} --max-input-length 1024 --max-output-length 1024 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --save-json-results --model {model_id} --tokenizer {model_id} --request-rate {request_rate} --additional-metadata-metrics-to-save '{additional_metadata}'\""`
`186`	`186`
`187`	`187`	`benchmark_cmds = [`
`188`	`188`	`# Run benchmark inside the container`