Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable vllm inference benchmark run on persistent TPUVM #625

Draft
wants to merge 41 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
c97e13e
Create vllm-nightly.py
ManfeiBai Mar 19, 2025
f71d664
Update vllm-nightly.py
ManfeiBai Mar 19, 2025
e762b60
Update vllm-nightly.py
ManfeiBai Mar 19, 2025
53c9db5
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
377224f
Update run_mantaray_jobs.py
ManfeiBai Mar 20, 2025
34f4355
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
bd7e97e
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
265ab41
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
abd67c7
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
95fb795
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
7d4e8dc
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
34c9ea1
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
1c6b1cd
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
a16d2d8
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
77aa55a
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
7563599
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
1c085e0
Update vllm-nightly.py
ManfeiBai Mar 20, 2025
9c4d8de
Create vllm_nightly_experimental.libsonnet
ManfeiBai Mar 20, 2025
cf3790e
Create vllm_nightly_experimental_2.libsonnet
ManfeiBai Mar 21, 2025
dc476e2
Create vllm-nightly-try2.py
ManfeiBai Mar 21, 2025
b69e195
Update tpu.py
ManfeiBai Mar 24, 2025
edcf677
Create vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
a54fa70
Update vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
064558e
Update vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
876f0f1
Update tpu.py
ManfeiBai Mar 24, 2025
bffed1a
Create vllm-nightly-4.py
ManfeiBai Mar 24, 2025
747446d
Update run_mantaray_jobs.py
ManfeiBai Mar 24, 2025
9c7fe62
Update vllm-nightly-4.py
ManfeiBai Mar 24, 2025
e79725e
Update vllm-nightly-try3.py
ManfeiBai Mar 24, 2025
8da1d81
Update vllm-nightly.py
ManfeiBai Mar 24, 2025
a78d38c
Update run_mantaray_jobs.py
ManfeiBai Mar 27, 2025
e3caa60
Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental.libso…
ManfeiBai Mar 27, 2025
23b8f0f
Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental_2.lib…
ManfeiBai Mar 27, 2025
aa47112
Update tpu.py
ManfeiBai Mar 27, 2025
813082a
Delete dags/pytorch_xla/vllm-nightly.py
ManfeiBai Mar 27, 2025
c885d43
Delete dags/pytorch_xla/vllm-nightly-try3.py
ManfeiBai Mar 27, 2025
f98a338
Delete dags/pytorch_xla/vllm-nightly-try2.py
ManfeiBai Mar 27, 2025
ce8478f
Delete dags/pytorch_xla/vllm-nightly-4.py
ManfeiBai Mar 27, 2025
ccc652f
Update run_mantaray_jobs.py
ManfeiBai Mar 27, 2025
32d485d
Update run_mantaray_jobs.py
ManfeiBai Mar 31, 2025
72959ca
Update run_mantaray_jobs.py
ManfeiBai Mar 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 85 additions & 1 deletion dags/mantaray/run_mantaray_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,17 @@
from xlml.utils import mantaray
import yaml
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.common.vm_resource as resource
import re
import tempfile
from airflow.decorators import task
from airflow.decorators import task_group
from airflow.hooks.subprocess import SubprocessHook
from dags.common import test_owner
from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke
from airflow.models import Variable


# Skip running this script in unit test because gcs loading will fail.
if composer_env.is_prod_env() or composer_env.is_dev_env():
Expand All @@ -38,11 +48,84 @@
if re.match(pattern, job["task_name"]):
workload_file_name_list.append(job["file_name"])

HF_TOKEN_LLaMA3_8B = Variable.get("HF_TOKEN_LLaMA3_8B", None)

def run_test_code_on_persistent_TPUVM(hf_token=None):
  """Build the bash command that runs the nightly vLLM inference benchmark
  on the persistent v6e-4 TPU VM.

  The command: generates a fresh SSH keypair, SSHes into the persistent TPU
  VM via gcloud, starts a privileged PyTorch/XLA nightly docker container,
  installs vLLM from source, serves meta-llama/Meta-Llama-3-8B, runs the
  inference benchmark script, then tears the container down.

  Args:
    hf_token: Hugging Face access token exported as HF_TOKEN inside the
      benchmark container (needed to download the gated LLaMA-3 weights).
      Defaults to the module-level HF_TOKEN_LLaMA3_8B Airflow Variable.

  Returns:
    A single bash command string suitable for `bash -c`.
  """
  if hf_token is None:
    hf_token = HF_TOKEN_LLaMA3_8B
  gcloud_command = (
      "set -x && "
      "set -u && "
      "project=$(curl -sS \"http://metadata.google.internal/computeMetadata/v1/project/project-id\" -H \"Metadata-Flavor: Google\") && "
      "zone=europe-west4-a && "
      "tpu_name=manfei-2025-v6e-4-cloud-ml-auto-solu && "
      # Regenerate the SSH keypair each run so a stale/changed key never blocks the job.
      "[ -f /scripts/id_rsa ] && sudo rm /scripts/id_rsa && sudo rm /scripts/id_rsa.pub; sudo ssh-keygen -t rsa -f /scripts/id_rsa -q -N \"\" && "
      "echo \"xl-ml-test:$(cat /scripts/id_rsa.pub)\" > ssh-keys.txt && "
      "echo 'echo Running startup script' > startup-script.txt && "
      "sudo apt-get -y update && "
      "sudo apt-get -y install lsof && "
      "sudo dpkg --configure -a && "
      "sudo apt-get -y install nfs-common && "
      "yes '' | gcloud compute config-ssh && "
      "ls /home/airflow/.ssh/ && "
      "echo ${project} && "
      "echo ${zone} && "
      "echo ${tpu_name} && "
      "yes 'y' | sudo gcloud alpha compute tpus tpu-vm ssh manfei-2025-v6e-4-cloud-ml-auto-solu --zone=europe-west4-a "
      "--project=cloud-ml-auto-solutions --ssh-key-file=/home/airflow/.ssh/google_compute_engine --strict-host-key-checking=no "
      # BUG FIX: this segment must be an f-string — previously the literal
      # text "{HF_TOKEN_LLaMA3_8B}" was exported instead of the token value.
      f"--internal-ip --worker=all --command ' \
      sudo docker ps -a --filter \"name=testooo\" -q | grep -q . && sudo docker rm -f testooo; \
      sudo docker run --privileged --net host --shm-size=16G --name testooo \
      us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm bash -c \" \
      export HF_TOKEN={hf_token} && \
      pip uninstall -y torch torchvision torch_xla jax jaxlib libtpu && \
      git clone https://github.com/vllm-project/vllm.git && \
      cd vllm && \
      pip install -r requirements/tpu.txt && \
      VLLM_TARGET_DEVICE=\'tpu\' python setup.py develop && \
      export PJRT_DEVICE=TPU && \
      VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B --disable-log-requests \
      --max-num-seq=320 --gpu-memory-utilization=0.95 --tensor-parallel-size=4 --max-model-len=8192 --port 8009 & sleep 900 && \
      git clone -b inference-benchmark-script https://github.com/ManfeiBai/vllm.git vllmscript && \
      bash vllmscript/benchmarks/inference_benchmark_script.sh \
      \" && sudo docker stop testooo && sudo docker rm testooo' "
  )
  return gcloud_command


@task
def run_on_v6e_4_persistant_TPUVM():
  """Run the nightly vLLM benchmark command on the persistent v6e-4 TPU VM.

  Executes the gcloud/ssh/docker pipeline built by
  run_test_code_on_persistent_TPUVM() via `bash -c`, using a throwaway
  temporary directory as the working directory so any files the command
  drops locally (ssh-keys.txt, startup-script.txt) are cleaned up.

  Raises:
    RuntimeError: if the benchmark command exits non-zero. An explicit
      raise is used instead of `assert`, which is silently stripped when
      Python runs with -O and would let failures pass unnoticed.
  """
  with tempfile.TemporaryDirectory() as tmpdir:
    hook = SubprocessHook()
    result = hook.run_command(
        [
            "bash",
            "-c",
            run_test_code_on_persistent_TPUVM(),
        ],
        cwd=tmpdir,
    )
    if result.exit_code != 0:
      raise RuntimeError(f"Command failed with code {result.exit_code}")


@task_group(prefix_group_id=False)
def run_vllm_nightly_on_v6e_4_persistant_TPUVM():
  # Task group wrapping the nightly vLLM benchmark run on the persistent
  # v6e-4 TPU VM.
  GCS_SUBFOLDER_PREFIX_PYTORCH_XLA = test_owner.Team.PYTORCH_XLA.value
  # NOTE(review): output_location is never consumed below — presumably it
  # was meant to be passed to the benchmark task for metric/artifact upload.
  # Left in place because generate_gcs_folder_location may register a task
  # with side effects on the DAG; confirm intent before removing.
  output_location = name_format.generate_gcs_folder_location(
      f"{GCS_SUBFOLDER_PREFIX_PYTORCH_XLA}/vllm_benchmark_nightly",
      f'vllm-nightly-v6e-4',
  )
  # Calling the @task function here is what instantiates it inside this
  # task group.
  run_on_v6e_4_persistant_TPUVM_func = run_on_v6e_4_persistant_TPUVM()
  # NOTE(review): this bare-name expression is a no-op — the task was
  # already created by the call above; it likely mimics a dependency-chain
  # line and could be dropped.
  run_on_v6e_4_persistant_TPUVM_func


# merge all PyTorch/XLA tests into one DAG
with models.DAG(
dag_id="pytorch_xla_model_regression_test_on_trillium",
schedule="0 0 * * *", # everyday at midnight # job["schedule"],
tags=["mantaray", "pytorchxla", "xlml"],
tags=["mantaray", "pytorchxla", "xlml", "vllm"],
start_date=datetime.datetime(2024, 4, 22),
catchup=False,
) as dag:
Expand All @@ -53,6 +136,7 @@
workload_file_name=workload_file_name,
)
run_workload
run_vllm_nightly_on_v6e_4_persistant_TPUVM()

# Create a DAG for each job from maxtext
for job in xlml_jobs:
Expand Down
Loading