From 0683770dc8cc3558f19a126ac206010e5a457df5 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Wed, 5 Mar 2025 00:19:41 -0800
Subject: [PATCH 1/7] Create vllm_xpk.py

---
 dags/pytorch_xla/vllm_xpk.py | 123 +++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 dags/pytorch_xla/vllm_xpk.py

diff --git a/dags/pytorch_xla/vllm_xpk.py b/dags/pytorch_xla/vllm_xpk.py
new file mode 100644
index 00000000..6549b4b0
--- /dev/null
+++ b/dags/pytorch_xla/vllm_xpk.py
@@ -0,0 +1,123 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This Airflow DAG runs a maxtext machine learning benchmark on a GKE cluster
+
+Copy and reference from https://github.com/GoogleCloudPlatform/ml-auto-solutions/blob/master/dags/mlcompass/maxtext_gke.py
+
+Usage:
+gcloud composer environments run ml-automation-solutions \
+  --project=cloud-ml-auto-solutions \
+  --location=us-central1 dags trigger \
+  -- \
+  pytorch_tpu_vllm_xpk \
+  --conf={\\\"uuid\\\":\\\"abc\\\"}
+"""
+
+import datetime
+import json
+from airflow import models
+from airflow.decorators import task
+from airflow.providers.google.cloud.hooks.gcs import GCSHook
+from dags.common import test_owner
+from xlml.utils import xpk
+
+
+with models.DAG(
+    dag_id="pytorch_tpu_vllm_xpk",
+    schedule=None,
+    tags=["pytorch", "vllm", "xpk"],
+    start_date=datetime.datetime(2025, 3, 5),
+    catchup=False,
+    params={
+        "uuid": "",
+    },
+    default_args={
+        "retries": 0,
+    },
+) as dag:
+
+  # @task.python(multiple_outputs=True)
+  # def load_xlml_state(params: dict = None):
+  #   dag.log.info(params)
+  #   uuid = params["uuid"]
+  #   if not uuid:
+  #     raise RuntimeError("uuid is not set")
+  #   gcs_hook = GCSHook()
+  #   file_content = gcs_hook.download(
+  #       "mlcompass-jax-artifacts", f"xlml/{uuid}/xlml_state.json"
+  #   )
+  #   return json.loads(file_content)
+
+  # xlml_state = load_xlml_state()
+
+  # cluster_name = xlml_state["cluster_name"]
+  # cluster_project = xlml_state["cluster_project"]
+  # cluster_region = xlml_state["cluster_region"]
+  # cluster_zone = xlml_state["cluster_zone"]
+  # benchmark_id = xlml_state["test_name"]
+
+  # docker_image_path = xlml_state["docker_image_path"]
+  # accelerator_type = xlml_state["accelerator_type"]
+  # num_slices = xlml_state["num_slices"]
+
+  # model_name = xlml_state["model_name"]
+  # workdir_bucket = xlml_state["workdir_bucket"]
+  # workdir_path = xlml_state["workdir_path"]
+  # gcs_path = f"gs://{workdir_bucket}/{workdir_path}"
+  # workload_id = f'mlc-{xlml_state["uuid"]}'
+
+  # workload_provision_timeout = datetime.timedelta(minutes=300).total_seconds()
+  # workload_run_timeout = datetime.timedelta(minutes=60).total_seconds()
+
+  run_workload = xpk.run_workload.override(owner=test_owner.Manfei_Bai)(
+      task_id="run_workload",
+      cluster_project=cluster_project,
+      zone=cluster_zone,
+      cluster_name=cluster_name,
+      benchmark_id=benchmark_id,
+      workload_id=workload_id,
+      gcs_path=gcs_path,
+      docker_image=docker_image_path,
+      accelerator_type=accelerator_type,
+      run_cmds=f"source benchmark_run.sh;run {model_name} {gcs_path}",
+      num_slices=num_slices,
+      use_vertex_tensorboard=False,
+      use_pathways=False,
+  )
+
+  wait_for_workload_start = xpk.wait_for_workload_start.override(
+      timeout=workload_provision_timeout
+  )(
+      workload_id=workload_id,
+      project_id=cluster_project,
+      region=cluster_region,
+      cluster_name=cluster_name,
+  )
+
+  wait_for_workload_completion = xpk.wait_for_workload_completion.override(
+      timeout=workload_run_timeout
+  )(
+      workload_id=workload_id,
+      project_id=cluster_project,
+      region=cluster_region,
+      cluster_name=cluster_name,
+  )
+
+  (
+      xlml_state
+      >> run_workload
+      >> wait_for_workload_start
+      >> wait_for_workload_completion
+  )

From da09d251b7a4c83d8368f73fff2227ea0c3d3ffd Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Wed, 5 Mar 2025 00:30:17 -0800
Subject: [PATCH 2/7] Update test_owner.py

---
 dags/common/test_owner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dags/common/test_owner.py b/dags/common/test_owner.py
index 9671cfa8..f8bb15db 100644
--- a/dags/common/test_owner.py
+++ b/dags/common/test_owner.py
@@ -37,6 +37,7 @@ class Team(enum.Enum):
 
 # PYTORCH
 PEI_Z = "Pei Z."
+MANFEI_B = "Manfei B."
 
 # MaxText
 TONY_C = "Tony C."

From a4754ce2bfe6e91920767414f0de2c1fc8b72f93 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Wed, 5 Mar 2025 01:09:25 -0800
Subject: [PATCH 3/7] Update vllm_xpk.py

---
 dags/pytorch_xla/vllm_xpk.py | 57 +++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/dags/pytorch_xla/vllm_xpk.py b/dags/pytorch_xla/vllm_xpk.py
index 6549b4b0..0e79b3d1 100644
--- a/dags/pytorch_xla/vllm_xpk.py
+++ b/dags/pytorch_xla/vllm_xpk.py
@@ -81,43 +81,58 @@
   # workload_provision_timeout = datetime.timedelta(minutes=300).total_seconds()
   # workload_run_timeout = datetime.timedelta(minutes=60).total_seconds()
 
-  run_workload = xpk.run_workload.override(owner=test_owner.Manfei_Bai)(
+  run_workload_server = xpk.run_workload.override(owner=test_owner.MANFEI_B)(
       task_id="run_workload",
-      cluster_project=cluster_project,
-      zone=cluster_zone,
-      cluster_name=cluster_name,
-      benchmark_id=benchmark_id,
-      workload_id=workload_id,
-      gcs_path=gcs_path,
-      docker_image=docker_image_path,
-      accelerator_type=accelerator_type,
-      run_cmds=f"source benchmark_run.sh;run {model_name} {gcs_path}",
-      num_slices=num_slices,
+      cluster_project="cloud-tpu-multipod-dev",
+      zone="europe-west4-b",
+      cluster_name="b397493880-manfei3",
+      benchmark_id="xlml.vllm.llama3-8b.1slice.v5p_128_xpk",
+      workload_id="nightly-vllm-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"),
+      gcs_path=f"gs://vllmnightlyxpk/vllmnightlyxpk/workload_id",
+      docker_image="gcr.io/cloud-tpu-v2-images/vllm-tpu-nightly:latest",
+      accelerator_type="v5p-8",
+      run_cmds=f"bash nightly-benchmarks/scripts/run-nightly-benchmarks.sh",
+      num_slices=1,
       use_vertex_tensorboard=False,
       use_pathways=False,
   )
 
+  # run_workload_inference = xpk.run_workload.override(owner=test_owner.MANFEI_B)(
+  #     task_id="run_workload",
+  #     cluster_project="cloud-tpu-multipod-dev",
+  #     zone="europe-west4-b",
+  #     cluster_name="b397493880-manfei3",
+  #     benchmark_id="xlml.vllm.llama3-8b.1slice.v5p_128_xpk",
+  #     workload_id="nightly-vllm-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"),
+  #     gcs_path=f"gs://vllmnightlyxpk/vllmnightlyxpk/workload_id",
+  #     docker_image="gcr.io/cloud-tpu-v2-images/vllm-tpu-nightly:latest",
+  #     accelerator_type="v5p-8",
+  #     run_cmds=f"source benchmark_run.sh;run {model_name} {gcs_path}",
+  #     num_slices=num_slices,
+  #     use_vertex_tensorboard=False,
+  #     use_pathways=False,
+  # )
+
   wait_for_workload_start = xpk.wait_for_workload_start.override(
       timeout=workload_provision_timeout
   )(
-      workload_id=workload_id,
-      project_id=cluster_project,
-      region=cluster_region,
-      cluster_name=cluster_name,
+      workload_id="nightly-vllm-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"),
+      project_id="cloud-tpu-multipod-dev",
+      region="europe-west4",
+      cluster_name="b397493880-manfei3",
   )
 
   wait_for_workload_completion = xpk.wait_for_workload_completion.override(
       timeout=workload_run_timeout
   )(
-      workload_id=workload_id,
-      project_id=cluster_project,
-      region=cluster_region,
-      cluster_name=cluster_name,
+      workload_id="nightly-vllm-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"),
+      project_id="cloud-tpu-multipod-dev",
+      region="europe-west4",
+      cluster_name="b397493880-manfei3",
   )
 
   (
-      xlml_state
-      >> run_workload
+      run_workload_server
       >> wait_for_workload_start
       >> wait_for_workload_completion
   )

From 7f897659e2141c26373f73ffc7d7d88013c202c9 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Wed, 5 Mar 2025 10:59:14 -0800
Subject: [PATCH 4/7] Update vllm_xpk.py

---
 dags/pytorch_xla/vllm_xpk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/pytorch_xla/vllm_xpk.py b/dags/pytorch_xla/vllm_xpk.py
index 0e79b3d1..643f0599 100644
--- a/dags/pytorch_xla/vllm_xpk.py
+++ b/dags/pytorch_xla/vllm_xpk.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""This Airflow DAG runs a maxtext machine learning benchmark on a GKE cluster
+"""This Airflow DAG runs a vllm benchmark on a GKE cluster
 
 Copy and reference from https://github.com/GoogleCloudPlatform/ml-auto-solutions/blob/master/dags/mlcompass/maxtext_gke.py
 

From 9440a359f231fb0c796fa1c00cb018e331752494 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Wed, 5 Mar 2025 16:21:57 -0800
Subject: [PATCH 5/7] Update vllm_xpk.py

---
 dags/pytorch_xla/vllm_xpk.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/dags/pytorch_xla/vllm_xpk.py b/dags/pytorch_xla/vllm_xpk.py
index 643f0599..b430ce25 100644
--- a/dags/pytorch_xla/vllm_xpk.py
+++ b/dags/pytorch_xla/vllm_xpk.py
@@ -35,17 +35,22 @@
 
 with models.DAG(
-    dag_id="pytorch_tpu_vllm_xpk",
-    schedule=None,
-    tags=["pytorch", "vllm", "xpk"],
-    start_date=datetime.datetime(2025, 3, 5),
+    dag_id="pytorch_xla_model_regression_test_on_trillium",
+    schedule="0 0 * * *",  # everyday at midnight # job["schedule"],
+    tags=["mantaray", "pytorchxla", "xlml"],
+    start_date=datetime.datetime(2024, 4, 22),
     catchup=False,
-    params={
-        "uuid": "",
-    },
-    default_args={
-        "retries": 0,
-    },
+    # dag_id="pytorch_xla_model_regression_test_on_trillium",  # pytorch_tpu_vllm_xpk",
+    # schedule=None,
+    # tags=["pytorch", "vllm", "xpk"],
+    # start_date=datetime.datetime(2025, 3, 5),
+    # catchup=False,
+    # params={
+    #     "uuid": "",
+    # },
+    # default_args={
+    #     "retries": 0,
+    # },
 ) as dag:
 
   # @task.python(multiple_outputs=True)
   # def load_xlml_state(params: dict = None):

From af842c176df7885c813aa76a345cd32407e17f92 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Wed, 5 Mar 2025 16:23:10 -0800
Subject: [PATCH 6/7] Update run_mantaray_jobs.py

---
 dags/mantaray/run_mantaray_jobs.py | 82 ++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/dags/mantaray/run_mantaray_jobs.py b/dags/mantaray/run_mantaray_jobs.py
index 59eec4fb..e32eb6cc 100644
--- a/dags/mantaray/run_mantaray_jobs.py
+++ b/dags/mantaray/run_mantaray_jobs.py
@@ -16,11 +16,15 @@
 
 import datetime
+# import _datetime
 from airflow import models
 from xlml.utils import mantaray
 import yaml
 from dags import composer_env
 import re
+from airflow.decorators import task
+from xlml.utils import xpk
+from dags.common import test_owner
 
 
 # Skip running this script in unit test because gcs loading will fail.
 if composer_env.is_prod_env() or composer_env.is_dev_env():
@@ -38,6 +42,11 @@
     if re.match(pattern, job["task_name"]):
       workload_file_name_list.append(job["file_name"])
 
+  @task.python(multiple_outputs=True)
+  def hello_world_vllm(params: dict = None):
+    dag.log.info(params)
+    print("Hello world vLLM!")
+
   # merge all PyTorch/XLA tests ino one Dag
   with models.DAG(
       dag_id="pytorch_xla_model_regression_test_on_trillium",
@@ -53,6 +62,79 @@
         workload_file_name=workload_file_name,
     )
     run_workload
+    # hello_world_vllm
+    workload_id="nightly-vllm-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
+    cluster_name="b397493880-repo3"
+    cluster_project="cloud-tpu-multipod-dev"
+    zone="europe-west4-b"
+    region="europe-west4"
+    workload_provision_timeout = datetime.timedelta(seconds=30).total_seconds()
+    workload_run_timeout = datetime.timedelta(minutes=3).total_seconds()
+    hello_world_vllm_xpk = xpk.run_workload.override(owner=test_owner.MANFEI_B)(
+        task_id="run_workload_vllm_xpk",
+        cluster_project=cluster_project,
+        zone=zone,
+        cluster_name=cluster_name,  # "b397493880-manfei3",
+        benchmark_id="xlml.vllm.llama3-8b.1slice.v5p_128_xpk",
+        workload_id=workload_id,
+        gcs_path=f"gs://vllmnightlyxpk/vllmnightlyxpk/workload_id",
+        docker_image="gcr.io/cloud-tpu-v2-images/vllm-tpu-nightly:latest",
+        accelerator_type="v5p-128",
+        run_cmds=f"export HF_TOKEN=xxxxx && \
+            export VLLM_SOURCE_CODE_LOC=./ && \
+            vllm serve meta-llama/Meta-Llama-3.1-8B --swap-space 16 --disable-log-requests --tensor_parallel_size=8 --max-model-len=2048 --num-scheduler-steps=4 & sleep 600 \
+        ",
+        num_slices=1,
+        use_vertex_tensorboard=False,
+        use_pathways=False,
+    )
+    # hello_world_vllm_xpk = xpk.run_workload.override(owner=test_owner.MANFEI_B)(
+    #     task_id="run_workload_vllm_xpk",
+    #     cluster_project=cluster_project,
+    #     zone=zone,
+    #     cluster_name=cluster_name,  # "b397493880-manfei3",
+    #     benchmark_id="xlml.vllm.llama3-8b.1slice.v5p_128_xpk",
+    #     workload_id=workload_id,
+    #     gcs_path=f"gs://vllmnightlyxpk/vllmnightlyxpk/workload_id",
+    #     docker_image="gcr.io/cloud-tpu-v2-images/vllm-tpu-nightly:latest",
+    #     accelerator_type="v5p-128",
+    #     run_cmds=f"pip install --upgrade google-cloud-storage && \
+    #         wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && \
+    #         rm -rf inference-benchmark && git clone https://github.com/AI-Hypercomputer/inference-benchmark && \
+    #         echo 'deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main' > /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    #         curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \
+    #         apt-get update && apt-get install -y google-cloud-sdk && \
+    #         apt-get -y install jq && \
+    #         export HF_TOKEN=hf_xxxxx && \
+    #         vllm serve meta-llama/Meta-Llama-3.1-8B --swap-space 16 --disable-log-requests --tensor_parallel_size=8 --max-model-len=2048 --num-scheduler-steps=4 & sleep 600 \
+    #     ",
+    #     num_slices=1,
+    #     use_vertex_tensorboard=False,
+    #     use_pathways=False,
+    # )
+    wait_for_workload_start = xpk.wait_for_workload_start.override(
+        timeout=workload_provision_timeout
+    )(
+        workload_id=workload_id,
+        project_id=cluster_project,
+        region=region,
+        cluster_name=cluster_name,
+    )
+
+    wait_for_workload_completion = xpk.wait_for_workload_completion.override(
+        timeout=workload_run_timeout
+    )(
+        workload_id=workload_id,
+        project_id=cluster_project,
+        region=region,
+        cluster_name=cluster_name,
+    )
+
+    (
+        hello_world_vllm_xpk
+        >> wait_for_workload_start
+        >> wait_for_workload_completion
+    )
 
   # Create a DAG for each job from maxtext
   for job in xlml_jobs:

From 19f1af8bc699ce339d71bcd52f8e1ef408436049 Mon Sep 17 00:00:00 2001
From: Manfei <41607353+ManfeiBai@users.noreply.github.com>
Date: Mon, 10 Mar 2025 11:14:24 -0700
Subject: [PATCH 7/7] Update run_mantaray_jobs.py

---
 dags/mantaray/run_mantaray_jobs.py | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/dags/mantaray/run_mantaray_jobs.py b/dags/mantaray/run_mantaray_jobs.py
index e32eb6cc..77a35135 100644
--- a/dags/mantaray/run_mantaray_jobs.py
+++ b/dags/mantaray/run_mantaray_jobs.py
@@ -88,30 +88,7 @@
         use_vertex_tensorboard=False,
         use_pathways=False,
     )
-    # hello_world_vllm_xpk = xpk.run_workload.override(owner=test_owner.MANFEI_B)(
-    #     task_id="run_workload_vllm_xpk",
-    #     cluster_project=cluster_project,
-    #     zone=zone,
-    #     cluster_name=cluster_name,  # "b397493880-manfei3",
-    #     benchmark_id="xlml.vllm.llama3-8b.1slice.v5p_128_xpk",
-    #     workload_id=workload_id,
-    #     gcs_path=f"gs://vllmnightlyxpk/vllmnightlyxpk/workload_id",
-    #     docker_image="gcr.io/cloud-tpu-v2-images/vllm-tpu-nightly:latest",
-    #     accelerator_type="v5p-128",
-    #     run_cmds=f"pip install --upgrade google-cloud-storage && \
-    #         wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && \
-    #         rm -rf inference-benchmark && git clone https://github.com/AI-Hypercomputer/inference-benchmark && \
-    #         echo 'deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main' > /etc/apt/sources.list.d/google-cloud-sdk.list && \
-    #         curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \
-    #         apt-get update && apt-get install -y google-cloud-sdk && \
-    #         apt-get -y install jq && \
-    #         export HF_TOKEN=hf_xxxxx && \
-    #         vllm serve meta-llama/Meta-Llama-3.1-8B --swap-space 16 --disable-log-requests --tensor_parallel_size=8 --max-model-len=2048 --num-scheduler-steps=4 & sleep 600 \
-    #     ",
-    #     num_slices=1,
-    #     use_vertex_tensorboard=False,
-    #     use_pathways=False,
-    # )
+
     wait_for_workload_start = xpk.wait_for_workload_start.override(
         timeout=workload_provision_timeout
    )(
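
After patch 7, run_mantaray_jobs.py launches the nightly vLLM workload and then polls it with two waiters, all keyed by one workload id computed once at parse time. The sketch below restates that final shape in isolation. It is a sketch, not repository code: the signatures of xpk.run_workload, xpk.wait_for_workload_start and xpk.wait_for_workload_completion, plus the cluster and image values, are taken from the patches above, while the dag_id, the CLUSTER mapping and the timeout choices are illustrative.

    # Sketch of the final nightly vLLM xpk wiring; assumes the xlml.utils.xpk
    # and dags.common.test_owner modules behave as used in the patches above.
    import datetime

    from airflow import models
    from dags.common import test_owner
    from xlml.utils import xpk

    # One id shared by the run task and both waiters; computed at DAG-parse
    # time, as in patch 6 (see the caveat below).
    WORKLOAD_ID = "nightly-vllm-" + datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

    CLUSTER = {  # illustrative grouping of the patch-6 test values
        "name": "b397493880-repo3",
        "project": "cloud-tpu-multipod-dev",
        "zone": "europe-west4-b",
        "region": "europe-west4",
    }

    with models.DAG(
        dag_id="pytorch_tpu_vllm_xpk_nightly",  # illustrative id
        schedule="0 0 * * *",
        start_date=datetime.datetime(2025, 3, 5),
        catchup=False,
    ) as dag:
      run_workload = xpk.run_workload.override(owner=test_owner.MANFEI_B)(
          task_id="run_workload_vllm_xpk",
          cluster_project=CLUSTER["project"],
          zone=CLUSTER["zone"],
          cluster_name=CLUSTER["name"],
          benchmark_id="xlml.vllm.llama3-8b.1slice.v5p_128_xpk",
          workload_id=WORKLOAD_ID,
          gcs_path="gs://vllmnightlyxpk/vllmnightlyxpk/workload_id",
          docker_image="gcr.io/cloud-tpu-v2-images/vllm-tpu-nightly:latest",
          accelerator_type="v5p-128",
          run_cmds="bash nightly-benchmarks/scripts/run-nightly-benchmarks.sh",
          num_slices=1,
          use_vertex_tensorboard=False,
          use_pathways=False,
      )

      # Both waiters poll the same id the run task used, keeping all three
      # tasks pointed at one workload.
      wait_for_start = xpk.wait_for_workload_start.override(
          timeout=datetime.timedelta(minutes=300).total_seconds()
      )(
          workload_id=WORKLOAD_ID,
          project_id=CLUSTER["project"],
          region=CLUSTER["region"],
          cluster_name=CLUSTER["name"],
      )

      wait_for_completion = xpk.wait_for_workload_completion.override(
          timeout=datetime.timedelta(minutes=60).total_seconds()
      )(
          workload_id=WORKLOAD_ID,
          project_id=CLUSTER["project"],
          region=CLUSTER["region"],
          cluster_name=CLUSTER["name"],
      )

      run_workload >> wait_for_start >> wait_for_completion

Two caveats on this pattern. First, evaluating datetime.datetime.now() separately at each call site, as patch 3 does inside vllm_xpk.py, mints a different id per task, so the waiters would poll a workload that was never created; hoisting the id into a single assignment, as patch 6 does, is what makes the chain coherent. Second, even a hoisted id is re-evaluated whenever the scheduler or a worker re-parses the file, so it can still drift between tasks parsed at different moments; a run-scoped value (for example a templated run id, if the xpk helpers render Jinja templates) would be sturdier. The seconds=30 provision timeout in patch 6 also reads as a smoke-test value next to the 300 minutes used in patch 1.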