Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add/Enable vllm inference benchmark run on persistent TPUVM #625

Merged
merged 49 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
c97e13e
Create vllm-nightly.py
ManfeiBai Mar 19, 2025
f71d664
Update vllm-nightly.py
ManfeiBai Mar 19, 2025
e762b60
Update vllm-nightly.py
ManfeiBai Mar 19, 2025
53c9db5
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
377224f
Update run_mantaray_jobs.py
ManfeiBai Mar 20, 2025
34f4355
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
bd7e97e
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
265ab41
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
abd67c7
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
95fb795
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
7d4e8dc
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
34c9ea1
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
1c6b1cd
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
a16d2d8
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
77aa55a
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
7563599
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
1c085e0
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
9c4d8de
Create vllm_nightly_experimental.libsonnet
ManfeiBai Mar 20, 2025
cf3790e
Create vllm_nightly_experimental_2.libsonnet
ManfeiBai Mar 21, 2025
dc476e2
Create vllm-nightly-try2.py
ManfeiBai Mar 21, 2025
b69e195
Update tpu.py
ManfeiBai Mar 24, 2025
edcf677
Create vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
a54fa70
Update vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
064558e
Update vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
876f0f1
Update tpu.py
ManfeiBai Mar 24, 2025
bffed1a
Create vllm-nightly-4.py
ManfeiBai Mar 24, 2025
747446d
Update run_mantaray_jobs.py
ManfeiBai Mar 24, 2025
9c7fe62
Update vllm-nightly-4.py
ManfeiBai Mar 24, 2025
e79725e
Update vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
8da1d81
Update vllm-nightly.py
ManfeiBai Mar 24, 2025
a78d38c
Update run_mantaray_jobs.py
ManfeiBai Mar 27, 2025
e3caa60
Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental.libso…
ManfeiBai Mar 27, 2025
23b8f0f
Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental_2.lib…
ManfeiBai Mar 27, 2025
aa47112
Update tpu.py
ManfeiBai Mar 27, 2025
813082a
Delete dags/pytorch_xla/vllm-nightly.py
ManfeiBai Mar 27, 2025
c885d43
Delete dags/pytorch_xla/vllm-nightly-try3.py
ManfeiBai Mar 27, 2025
f98a338
Delete dags/pytorch_xla/vllm-nightly-try2.py
ManfeiBai Mar 27, 2025
ce8478f
Delete dags/pytorch_xla/vllm-nightly-4.py
ManfeiBai Mar 27, 2025
ccc652f
Update run_mantaray_jobs.py
ManfeiBai Mar 27, 2025
32d485d
Update run_mantaray_jobs.py
ManfeiBai Mar 31, 2025
72959ca
Update run_mantaray_jobs.py
ManfeiBai Mar 31, 2025
1bc7cc4
Update run_mantaray_jobs.py
ManfeiBai Apr 10, 2025
468379a
Update run_mantaray_jobs.py
ManfeiBai Apr 10, 2025
de440e8
format
ManfeiBai Apr 10, 2025
dad22b3
Update run_mantaray_jobs.py
ManfeiBai Apr 10, 2025
afdc077
format
ManfeiBai Apr 10, 2025
9d6ac58
format
ManfeiBai Apr 10, 2025
3b3f23b
Update run_mantaray_jobs.py
ManfeiBai Apr 10, 2025
1f30873
Merge branch 'master' into ManfeiBai-patch-11
ManfeiBai Apr 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions dags/mantaray/run_mantaray_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from xlml.utils import mantaray
import yaml
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.common.vm_resource as resource
import re

# Skip running this script in unit test because gcs loading will fail.
Expand All @@ -38,6 +40,54 @@
if re.match(pattern, job["task_name"]):
workload_file_name_list.append(job["file_name"])


def run_test_code_on_persistent_TPUVM():
  """Build the gcloud ssh command that runs the vLLM serving benchmark.

  The command ssh-es into the persistent TPU VM ``manfei-2025-v6e-4``, starts
  a vLLM TPU docker container (``testooo``) that launches the OpenAI API
  server in the background, sleeps 1200s to let it come up, downloads the
  ShareGPT dataset, installs the gcloud SDK inside the container, runs
  inference-benchmark's ``benchmark_serving.py`` against the server, and
  finally stops and removes the container.

  Returns:
    A 1-tuple holding the single command string.  The trailing comma below is
    load-bearing: the caller concatenates this tuple with the tuple returned
    by make_sure_docker_container_cleaned_on_persistent_TPUVM() before
    joining the commands with ";".
  """
  # NOTE(review): HF_TOKEN below is a hardcoded credential committed to
  # source; it should be revoked and injected via a secret store or an
  # Airflow connection/variable instead.
  gcloud_command = (
      f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t' --worker=all \
      --command=\"sudo docker run -it --privileged --net host --shm-size=16G --name testooo docker.io/vllm/vllm-tpu:270a5da495d24e947a71e2fa0c56635f4fad2dc3 \
      bash -c 'export HF_TOKEN=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 && \
      VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B --disable-log-requests \
      --max-num-seq=320 --gpu-memory-utilization=0.95 --tensor-parallel-size=4 --max-model-len=8192 --port 8009 & sleep 1200 && \
      wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && \
      pip install --upgrade google-cloud-storage && rm -rf inference-benchmark && git clone https://github.com/AI-Hypercomputer/inference-benchmark && \
      echo \"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main\" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
      curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
      apt-get update && apt-get install -y google-cloud-sdk && apt-get -y install jq && export HF_TOKEN=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 && \
      export PJRT_DEVICE=TPU && \
      python inference-benchmark/benchmark_serving.py --save-json-results --port=8009 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json \
      --tokenizer=meta-llama/Meta-Llama-3-8B --request-rate=1 --backend=vllm --num-prompts=300 --max-input-length=1024 --max-output-length=1024 \
      --file-prefix=benchmark --models=meta-llama/Meta-Llama-3-8B \"--output-bucket=gs://manfeipublic\"' && sudo docker stop testooo && sudo docker rm testooo\" \
      ",
  )
  return gcloud_command


def make_sure_docker_container_cleaned_on_persistent_TPUVM():
  """Build the gcloud ssh command that stops and removes the benchmark container.

  Stops and removes the ``testooo`` container on the persistent TPU VM so a
  later run can reuse the container name.

  Returns:
    A 1-tuple with the command string; the tuple form lets callers
    concatenate it with other command tuples before joining with ";".
  """
  cleanup_shell = "sudo docker stop testooo && sudo docker rm testooo"
  command = (
      "gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4"
      " --zone=us-east5-b --project=cloud-ml-benchmarking"
      " --ssh-flag='-t -4 -L 6009:localhost:6009' --worker=all"
      f' --command="{cleanup_shell}"'
  )
  return (command,)


@task
def run_on_v6e_4_persistant_TPUVM():
  """Run the vLLM benchmark on the persistent v6e-4 TPU VM, then clean up.

  Concatenates the benchmark command tuple and the container-cleanup command
  tuple, joins them with ";" (so the cleanup command still runs after a
  benchmark failure), and executes them in a single bash invocation from a
  throwaway working directory.
  """
  commands = (
      run_test_code_on_persistent_TPUVM()
      + make_sure_docker_container_cleaned_on_persistent_TPUVM()
  )
  with tempfile.TemporaryDirectory() as workdir:
    result = SubprocessHook().run_command(
        ["bash", "-c", ";".join(commands)],
        cwd=workdir,
    )
    assert result.exit_code == 0, f"Command failed with code {result.exit_code}"


# Merge all PyTorch/XLA tests into one DAG
with models.DAG(
dag_id="pytorch_xla_model_regression_test_on_trillium",
Expand All @@ -53,6 +103,7 @@
workload_file_name=workload_file_name,
)
run_workload
run_on_v6e_4_persistant_TPUVM()

# Create a DAG for each job from maxtext
for job in xlml_jobs:
Expand Down
154 changes: 154 additions & 0 deletions dags/pytorch_xla/vllm-nightly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A DAG to run all PyTorch/XLA tests with nightly version."""

import datetime
import tempfile
from airflow import models
from airflow.decorators import task
from airflow.hooks.subprocess import SubprocessHook
from xlml.utils import mantaray
import yaml
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.common.vm_resource as resource
import re


# Schedule the job to run every day at 3:00 AM PST (11:00 AM UTC).
SCHEDULED_TIME = "0 11 * * *" if composer_env.is_prod_env() else None

# def run_test_code_on_persistent_TPUVM():
# gcloud_command = (
# f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t' --worker=all \
# --command=\"sudo docker run -it --privileged --net host --shm-size=16G --name testooo docker.io/vllm/vllm-tpu:270a5da495d24e947a71e2fa0c56635f4fad2dc3 \
# bash -c 'export HF_TOKEN=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 && \
# VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B --disable-log-requests \
# --max-num-seq=320 --gpu-memory-utilization=0.95 --tensor-parallel-size=4 --max-model-len=8192 --port 8009 & sleep 1200 && \
# wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && \
# pip install --upgrade google-cloud-storage && rm -rf inference-benchmark && git clone https://github.com/AI-Hypercomputer/inference-benchmark && \
# echo \"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main\" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
# curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
# apt-get update && apt-get install -y google-cloud-sdk && apt-get -y install jq && export HF_TOKEN=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 && \
# export PJRT_DEVICE=TPU && \
# python inference-benchmark/benchmark_serving.py --save-json-results --port=8009 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json \
# --tokenizer=meta-llama/Meta-Llama-3-8B --request-rate=1 --backend=vllm --num-prompts=300 --max-input-length=1024 --max-output-length=1024 \
# --file-prefix=benchmark --models=meta-llama/Meta-Llama-3-8B \"--output-bucket=gs://manfeipublic\"' && sudo docker stop testooo && sudo docker rm testooo\"
# ",
# )
# return gcloud_command


def run_test_code_on_persistent_TPUVM():
  """Build the gcloud ssh command that runs the vLLM nightly serving benchmark.

  The command ssh-es into the persistent TPU VM ``manfei-2025-v6e-4``, starts
  a vLLM TPU docker container (``testooo``) that launches the OpenAI API
  server in the background, sleeps 1200s to let it come up, downloads the
  ShareGPT dataset, installs the gcloud SDK inside the container, runs
  inference-benchmark's ``benchmark_serving.py`` against the server, and
  finally stops and removes the container.

  Returns:
    A 1-tuple containing the command string.  The tuple form matches
    make_sure_docker_container_cleaned_on_persistent_TPUVM(), so the caller
    can concatenate the two results and join the commands with ";".
  """
  # NOTE(review): HF_TOKEN below is a hardcoded credential committed to
  # source; it should be revoked and injected via a secret store or an
  # Airflow connection/variable instead.
  gcloud_command = (
      "gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 "
      "--zone=us-east5-b "
      "--project=cloud-ml-benchmarking "
      "--ssh-flag='-t' "
      "--worker=all "
      "--command=\"sudo docker run -it --privileged --net host --shm-size=16G --name testooo "
      "docker.io/vllm/vllm-tpu:270a5da495d24e947a71e2fa0c56635f4fad2dc3 bash -c '"
      "export HF_TOKEN=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 && "
      "VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server "
      "--model meta-llama/Meta-Llama-3-8B --disable-log-requests "
      "--max-num-seq=320 --gpu-memory-utilization=0.95 --tensor-parallel-size=4 "
      "--max-model-len=8192 --port 8009 & sleep 1200 && "
      "wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && "
      "pip install --upgrade google-cloud-storage && rm -rf inference-benchmark && "
      "git clone https://github.com/AI-Hypercomputer/inference-benchmark && "
      "echo \\\"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main\\\" > /etc/apt/sources.list.d/google-cloud-sdk.list && "
      "curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && "
      "apt-get update && apt-get install -y google-cloud-sdk && apt-get -y install jq && "
      "export HF_TOKEN=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 && export PJRT_DEVICE=TPU && "
      "python inference-benchmark/benchmark_serving.py --save-json-results --port=8009 "
      "--dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=meta-llama/Meta-Llama-3-8B "
      "--request-rate=1 --backend=vllm --num-prompts=300 --max-input-length=1024 "
      "--max-output-length=1024 --file-prefix=benchmark --models=meta-llama/Meta-Llama-3-8B "
      # The final stop/rm run without sudo (original comment, translated
      # from Polish: "removed sudo").
      "\\\"--output-bucket=gs://manfeipublic\\\"' && docker stop testooo && docker rm testooo\""
  )
  # Bug fix: return a 1-tuple (not a bare string).  The caller does
  # ``run_test_code_on_persistent_TPUVM() + make_sure_docker_...()`` and the
  # sibling returns a tuple, so a bare-string return made that "+" raise
  # TypeError before any command ran.
  return (gcloud_command,)

def make_sure_docker_container_cleaned_on_persistent_TPUVM():
  """Build the gcloud ssh command that force-cleans the benchmark container.

  Stops and removes the ``testooo`` container on the persistent TPU VM so a
  later run can reuse the container name.

  Returns:
    A 1-tuple holding the command string.  The trailing comma below is
    load-bearing: callers concatenate this tuple with other command tuples
    before joining the commands with ";".
  """
  gcloud_command = (
      f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t -4 -L 6009:localhost:6009' --worker=all --command=\"sudo docker stop testooo && sudo docker rm testooo\"",
  )
  return gcloud_command


@task
def run_on_v6e_4_persistant_TPUVM():
  """Run the vLLM benchmark and the container cleanup on the persistent TPU VM.

  Bug fix: run_test_code_on_persistent_TPUVM() returns a bare string while
  make_sure_docker_container_cleaned_on_persistent_TPUVM() returns a 1-tuple,
  so the original ``a + b`` concatenation raised TypeError before any command
  ran.  Both results are normalized to command tuples here, then joined with
  ";" (so the cleanup still runs after a benchmark failure) and executed in a
  single bash invocation from a throwaway working directory.

  Raises:
    RuntimeError: if the combined bash command exits non-zero.
  """

  def _as_command_tuple(commands):
    # Accept either a bare command string or an iterable of command strings.
    return (commands,) if isinstance(commands, str) else tuple(commands)

  commands = _as_command_tuple(
      run_test_code_on_persistent_TPUVM()
  ) + _as_command_tuple(make_sure_docker_container_cleaned_on_persistent_TPUVM())
  with tempfile.TemporaryDirectory() as tmpdir:
    hook = SubprocessHook()
    result = hook.run_command(
        ["bash", "-c", ";".join(commands)],
        cwd=tmpdir,
    )
    # Raise (not assert) so the check survives ``python -O`` and still fails
    # the Airflow task.
    if result.exit_code != 0:
      raise RuntimeError(f"Command failed with code {result.exit_code}")


# Nightly DAG: runs the vLLM serving benchmark on the persistent v6e-4 TPU VM.
with models.DAG(
    dag_id="pytorchxla-vllm-nightly",
    schedule=SCHEDULED_TIME,  # 11:00 UTC daily in prod; manual-only otherwise
    tags=["pytorchxla", "nightly", "torchbench"],
    start_date=datetime.datetime(2024, 1, 1),
    catchup=False,  # do not backfill runs for past dates
) as dag:
  # follow example in https://github.com/GoogleCloudPlatform/ml-auto-solutions/blob/bda4d59ed7fd9dd3b244a8b2612385c4f5c9a8a9/dags/multipod/maxtext_gpu_end_to_end.py#L41
  run_on_v6e_4_persistant_TPUVM()


# # Running on V6E
# config.get_torchbench_tpu_config(
# tpu_version=resource.TpuVersion.TRILLIUM,
# tpu_cores=8,
# project=resource.Project.CLOUD_ML_BENCHMARKING,
# tpu_zone=resource.Zone.US_CENTRAL2_B,
# runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV6,
# network=resource.BM_NETWORKS,
# subnetwork=resource.V4_BM_SUBNETWORKS,
# time_out_in_min=1600,
# model_name=model,
# reserved=False,
# preemptible=False,
# extraFlags=" ".join(torchbench_extra_flags),
# )


# # merge all PyTorch/XLA tests ino one Dag
# with models.DAG(
# dag_id="pytorch_xla_model_regression_test_on_trillium",
# schedule="0 0 * * *", # everyday at midnight # job["schedule"],
# tags=["mantaray", "pytorchxla", "xlml"],
# start_date=datetime.datetime(2024, 4, 22),
# catchup=False,
# ) as dag:
# for workload_file_name in workload_file_name_list:
# run_workload = mantaray.run_workload.override(
# task_id=workload_file_name.split(".")[0]
# )(
# workload_file_name=workload_file_name,
# )
# run_workload
Loading