Skip to content

Commit 15a92ad

Browse files
authored
Merge branch 'master' into fix-pw-maxtext-v5e
2 parents c9efb07 + ca07921 commit 15a92ad

8 files changed

+60
-22
lines changed

dags/map_reproducibility/a3mega_gpt3_175b_nemo.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""DAGs to run Aotc reproducibility benchmarks."""
15+
"""DAGs to run hypercomputer recipes"""
1616

1717
import datetime
1818
import sys
@@ -44,6 +44,8 @@
4444
from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
4545
from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
4646
from dags.map_reproducibility.utils.common_utils import get_scheduled_time
47+
from dags.map_reproducibility.utils.common_utils import get_cluster
48+
from dags.map_reproducibility.utils.common_utils import get_docker_image
4749

4850

4951
MODEL_ID = "gpt3-175b"
@@ -60,11 +62,10 @@
6062
VALUE_YAML_PATH = (
6163
f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
6264
)
63-
CLUSTER = "a3plus-benchmark"
64-
CLUSTER_REGION = "australia-southeast1"
65+
CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
6566
SOFTWARE_ID = "pytorch_nemo"
6667
IMAGE_VERSION = "nemo_workload:24.07"
67-
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
68+
DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
6869

6970

7071
@task

dags/map_reproducibility/a3mega_llama_3_1_70b_nemo.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""DAGs to run Aotc reproducibility benchmarks."""
15+
"""DAGs to run hypercomputer recipes"""
1616

1717
import datetime
1818
import sys
@@ -44,6 +44,8 @@
4444
from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
4545
from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
4646
from dags.map_reproducibility.utils.common_utils import get_scheduled_time
47+
from dags.map_reproducibility.utils.common_utils import get_cluster
48+
from dags.map_reproducibility.utils.common_utils import get_docker_image
4749

4850

4951
MODEL_ID = "llama-3.1-70b"
@@ -62,11 +64,10 @@
6264
VALUE_YAML_PATH = (
6365
f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
6466
)
65-
CLUSTER = "a3plus-benchmark"
66-
CLUSTER_REGION = "australia-southeast1"
67+
CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
6768
SOFTWARE_ID = "pytorch_nemo"
6869
IMAGE_VERSION = "nemo_workload:24.07"
69-
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
70+
DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
7071

7172

7273
@task

dags/map_reproducibility/a3mega_llama_3_70b_nemo.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""DAGs to run Aotc reproducibility benchmarks."""
15+
"""DAGs to run hypercomputer recipes"""
1616

1717
import datetime
1818
import sys
@@ -44,6 +44,8 @@
4444
from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
4545
from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
4646
from dags.map_reproducibility.utils.common_utils import get_scheduled_time
47+
from dags.map_reproducibility.utils.common_utils import get_cluster
48+
from dags.map_reproducibility.utils.common_utils import get_docker_image
4749

4850

4951
MODEL_ID = "llama-3-70b"
@@ -61,11 +63,10 @@
6163
VALUE_YAML_PATH = (
6264
f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
6365
)
64-
CLUSTER = "a3plus-benchmark"
65-
CLUSTER_REGION = "australia-southeast1"
66+
CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
6667
SOFTWARE_ID = "pytorch_nemo"
6768
IMAGE_VERSION = "nemo_workload:24.07"
68-
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
69+
DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
6970

7071

7172
@task

dags/map_reproducibility/a3mega_mixtral_8_7b_nemo.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""DAGs to run Aotc reproducibility benchmarks."""
15+
"""DAGs to run hypercomputer recipes"""
1616

1717
import datetime
1818
import sys
@@ -44,6 +44,8 @@
4444
from dags.map_reproducibility.utils.common_utils import get_bq_writer_path
4545
from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
4646
from dags.map_reproducibility.utils.common_utils import get_scheduled_time
47+
from dags.map_reproducibility.utils.common_utils import get_cluster
48+
from dags.map_reproducibility.utils.common_utils import get_docker_image
4749

4850

4951
MODEL_ID = "mixtral-8x7b"
@@ -62,11 +64,10 @@
6264
VALUE_YAML_PATH = (
6365
f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
6466
)
65-
CLUSTER = "a3plus-benchmark"
66-
CLUSTER_REGION = "australia-southeast1"
67+
CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
6768
SOFTWARE_ID = "pytorch_nemo"
6869
IMAGE_VERSION = "nemo_workload:24.07"
69-
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/{FRAMEWORK}_test/{IMAGE_VERSION}"
70+
DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
7071

7172

7273
@task

dags/map_reproducibility/a3ultra_llama_3_1_70b_nemo.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""DAGs to run Aotc reproducibility benchmarks."""
15+
"""DAGs to run hypercomputer recipes"""
1616

1717
import datetime
1818
import sys
@@ -45,6 +45,7 @@
4545
from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
4646
from dags.map_reproducibility.utils.common_utils import get_cluster
4747
from dags.map_reproducibility.utils.common_utils import get_scheduled_time
48+
from dags.map_reproducibility.utils.common_utils import get_docker_image
4849

4950

5051
MODEL_ID = "llama-3.1-70b"
@@ -67,7 +68,8 @@
6768
CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
6869
SOFTWARE_ID = "pytorch_nemo"
6970
IMAGE_VERSION = "nemo_workload:24.07"
70-
DOCKER_IMAGE = "us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:nemo24.07-gib1.0.3-A3U"
71+
DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
72+
KUEUE_NAME = "a3-ultra"
7173

7274

7375
@task
@@ -128,6 +130,7 @@ def run_aotc_workload():
128130
recipe_repo_root,
129131
DOCKER_IMAGE,
130132
cluster_name=CLUSTER,
133+
kueue_name=KUEUE_NAME,
131134
)
132135
+ wait_for_jobs_cmds()
133136
+ copy_bucket_cmds(

dags/map_reproducibility/a3ultra_mixtral_8_7b_nemo.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
"""DAGs to run Aotc reproducibility benchmarks."""
15+
"""DAGs to run hypercomputer recipes"""
1616

1717
import datetime
1818
import sys
@@ -45,6 +45,7 @@
4545
from dags.map_reproducibility.utils.common_utils import get_recipe_repo_path
4646
from dags.map_reproducibility.utils.common_utils import get_cluster
4747
from dags.map_reproducibility.utils.common_utils import get_scheduled_time
48+
from dags.map_reproducibility.utils.common_utils import get_docker_image
4849

4950

5051
MODEL_ID = "mixtral-8x7b"
@@ -65,7 +66,7 @@
6566
CLUSTER, CLUSTER_REGION = get_cluster(HYPERCOMPUTER)
6667
SOFTWARE_ID = "pytorch_nemo"
6768
IMAGE_VERSION = "nemo24.07"
68-
DOCKER_IMAGE = f"us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:{IMAGE_VERSION}-gib1.0.3-A3U"
69+
DOCKER_IMAGE = get_docker_image(HYPERCOMPUTER, FRAMEWORK)
6970
KUEUE_NAME = "a3-ultra"
7071

7172

dags/map_reproducibility/utils/common_utils.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def helm_apply_cmds(
122122
gcs_cmd = ""
123123
if hypercomputer == "a3ultra":
124124
gcs_cmd = f" --set clusterName={cluster_name}"
125-
# gcs_cmd += f" --set queue={kueue_name}"
125+
gcs_cmd += f" --set queue={kueue_name}"
126126
gcs_cmd += f" --set volumes.gcsMounts[0].bucketName={BUCKET_NAME}"
127127
else:
128128
gcs_cmd = f" --set workload.gcsBucketForDataCataPath={BUCKET_NAME}"
@@ -325,3 +325,33 @@ def get_scheduled_time(hardware: str, model: str, framework: str):
325325
return schedule_map[hardware][model][framework]
326326

327327
return None # Return None if no schedule is found for the given combination
328+
329+
330+
def get_docker_image(hardware: str, framework: str):
    """
    Returns the Docker image configured for the given hardware and framework.

    Args:
      hardware: The hardware type (e.g., "a3ultra", "a3mega").
      framework: The framework (e.g., "nemo", "maxtext").

    Returns:
      A Docker image string or None if no image is defined for the given combination.
    """

    # Outer key is the hardware type, inner key is the framework.
    image_map = {
        "a3ultra": {
            "nemo": "us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-gpu-nemo-nccl:nemo24.07-gib1.0.3-A3U",
            "maxtext": "us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/maxtext-benchmark",
        },
        "a3mega": {
            "nemo": "us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/nemo_test/nemo_workload:24.07",
            "maxtext": "us-central1-docker.pkg.dev/supercomputer-testing/gunjanjalori/maxtext-benchmark",
        },
    }

    # The empty-dict fallback makes an unknown hardware behave like an unknown
    # framework: both resolve to None instead of raising KeyError.
    return image_map.get(hardware, {}).get(framework)

dags/solutions_team/configs/vllm/vllm_benchmark_config.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def get_tpu_vllm_benchmark_cmds(
182182
]
183183

184184
for request_rate in request_rates:
185-
benchmark_cmd_fmt = "sudo docker exec $CONTAINER_NAME /bin/bash -c \"export HF_TOKEN={HF_TOKEN} && python inference-benchmark/benchmark_serving.py --host localhost --port 8000 --num-prompts {num_prompts} --max-input-length 1024 --max-output-length 1024 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --save-json-results --model {model_id} --tokenizer {model_id} --request-rate {request_rate} --additional-metadata-metrics-to-save '{additional_metadata}'\""
185+
benchmark_cmd_fmt = "sudo docker exec $CONTAINER_NAME /bin/bash -c \"export HF_TOKEN={HF_TOKEN} && python inference-benchmark/benchmark_serving.py --stream-request --host localhost --port 8000 --num-prompts {num_prompts} --max-input-length 1024 --max-output-length 1024 --dataset ShareGPT_V3_unfiltered_cleaned_split.json --save-json-results --model {model_id} --tokenizer {model_id} --request-rate {request_rate} --additional-metadata-metrics-to-save '{additional_metadata}'\""
186186

187187
benchmark_cmds = [
188188
# Run benchmark inside the container

0 commit comments

Comments
 (0)