Add/Enable vllm inference benchmark run on persistent TPUVM #625

Merged on Apr 10, 2025 (49 commits).

The diff below shows changes from 22 of the 49 commits.

Commits (49)
All 49 commits are by ManfeiBai.

c97e13e - Create vllm-nightly.py (Mar 19, 2025)
f71d664 - Update vllm-nightly.py (Mar 19, 2025)
e762b60 - Update vllm-nightly.py (Mar 19, 2025)
53c9db5 - Update vllm-nightly.py (Mar 20, 2025)
377224f - Update run_mantaray_jobs.py (Mar 20, 2025)
34f4355 - Update vllm-nightly.py (Mar 20, 2025)
bd7e97e - Update vllm-nightly.py (Mar 20, 2025)
265ab41 - Update vllm-nightly.py (Mar 20, 2025)
abd67c7 - Update vllm-nightly.py (Mar 20, 2025)
95fb795 - Update vllm-nightly.py (Mar 20, 2025)
7d4e8dc - Update vllm-nightly.py (Mar 20, 2025)
34c9ea1 - Update vllm-nightly.py (Mar 20, 2025)
1c6b1cd - Update vllm-nightly.py (Mar 20, 2025)
a16d2d8 - Update vllm-nightly.py (Mar 20, 2025)
77aa55a - Update vllm-nightly.py (Mar 20, 2025)
7563599 - Update vllm-nightly.py (Mar 20, 2025)
1c085e0 - Update vllm-nightly.py (Mar 20, 2025)
9c4d8de - Create vllm_nightly_experimental.libsonnet (Mar 20, 2025)
cf3790e - Create vllm_nightly_experimental_2.libsonnet (Mar 21, 2025)
dc476e2 - Create vllm-nightly-try2.py (Mar 21, 2025)
b69e195 - Update tpu.py (Mar 24, 2025)
edcf677 - Create vllm-nightly-try3.py (Mar 24, 2025)
a54fa70 - Update vllm-nightly-try3.py (Mar 24, 2025)
064558e - Update vllm-nightly-try3.py (Mar 24, 2025)
876f0f1 - Update tpu.py (Mar 24, 2025)
bffed1a - Create vllm-nightly-4.py (Mar 24, 2025)
747446d - Update run_mantaray_jobs.py (Mar 24, 2025)
9c7fe62 - Update vllm-nightly-4.py (Mar 24, 2025)
e79725e - Update vllm-nightly-try3.py (Mar 24, 2025)
8da1d81 - Update vllm-nightly.py (Mar 24, 2025)
a78d38c - Update run_mantaray_jobs.py (Mar 27, 2025)
e3caa60 - Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental.libso… (Mar 27, 2025)
23b8f0f - Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental_2.lib… (Mar 27, 2025)
aa47112 - Update tpu.py (Mar 27, 2025)
813082a - Delete dags/pytorch_xla/vllm-nightly.py (Mar 27, 2025)
c885d43 - Delete dags/pytorch_xla/vllm-nightly-try3.py (Mar 27, 2025)
f98a338 - Delete dags/pytorch_xla/vllm-nightly-try2.py (Mar 27, 2025)
ce8478f - Delete dags/pytorch_xla/vllm-nightly-4.py (Mar 27, 2025)
ccc652f - Update run_mantaray_jobs.py (Mar 27, 2025)
32d485d - Update run_mantaray_jobs.py (Mar 31, 2025)
72959ca - Update run_mantaray_jobs.py (Mar 31, 2025)
1bc7cc4 - Update run_mantaray_jobs.py (Apr 10, 2025)
468379a - Update run_mantaray_jobs.py (Apr 10, 2025)
de440e8 - format (Apr 10, 2025)
dad22b3 - Update run_mantaray_jobs.py (Apr 10, 2025)
afdc077 - format (Apr 10, 2025)
9d6ac58 - format (Apr 10, 2025)
3b3f23b - Update run_mantaray_jobs.py (Apr 10, 2025)
1f30873 - Merge branch 'master' into ManfeiBai-patch-11 (Apr 10, 2025)
dags/legacy_test/tests/pytorch/vllm_nightly_experimental.libsonnet (new file, 190 additions)
@@ -0,0 +1,190 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

local timeouts = import 'templates/timeouts.libsonnet';
local utils = import 'templates/utils.libsonnet';
local volumes = import 'templates/volumes.libsonnet';

{
BaseTpuVmTest:: {
local config = self,
local cleanupHook = {
preStop: {
exec: {
command: [
'bash',
'/scripts/cleanup.sh',
],
},
},
},

volumeMap+: {
scripts: volumes.MemoryVolumeSpec {
name: 'scripts',
mountPath: '/scripts',
},
},

testName+: '-1vm',

tpuSettings+: {
local tpuSettings = self,

softwareVersion: 'v2-nightly',

// Startup script in TPU VM metadata.
tpuVmStartupScript: 'echo Running startup script',

// Amount of time to sleep after TPU is READY.
tpuVmCreateSleepSeconds:
if config.accelerator.version <= 3 then
60
else
90,

// Additional arguments for test Docker container.
tpuVmDockerArgs: '',
},
podTemplate+:: {
spec+: {
containerMap+:: {
monitor: null,
train+: {
lifecycle: cleanupHook,
resources+: {
// HACK: remove standard Cloud TPU resource.
local originalLimits = super.limits,
limits: {
[field]: originalLimits[field]
for field in std.objectFields(originalLimits)
if !std.startsWith(field, 'cloud-tpus.google.com')
},
},
},
},
initContainerMap+:: {
'create-tpu': {
image: 'google/cloud-sdk',
local tpuCreateSettings = {
acceleratorName: std.escapeStringBash(config.accelerator.name),
softwareVersion: std.escapeStringBash(config.tpuSettings.softwareVersion),
startupScript: std.escapeStringBash(config.tpuSettings.tpuVmStartupScript),
sleepTime: config.tpuSettings.tpuVmCreateSleepSeconds,
testName: std.strReplace(config.testName, '.', '-'),
},
command: utils.scriptCommand(|||
project=cloud-ml-benchmarking
zone=us-east5-b
tpu_name=manfei-2025-v6e-4
ssh-keygen -t rsa -f /scripts/id_rsa -q -N ""

# echo "
# gcloud alpha compute tpus tpu-vm delete -q --async ${tpu_name} --zone=${zone}
# sleep 60
# " > /scripts/cleanup.sh

echo "xl-ml-test:$(cat /scripts/id_rsa.pub)" > ssh-keys.txt
echo %(startupScript)s > startup-script.txt

# Retry every 30 seconds for up to 10 minutes
# start_time="$(date -u +%%s)"
# for i in {1..40}; do
# set +e
# gcloud alpha compute tpus tpu-vm create ${tpu_name} \
# --accelerator-type=%(acceleratorName)s \
# --version=%(softwareVersion)s \
# --metadata-from-file='ssh-keys=ssh-keys.txt,startup-script=startup-script.txt' \
# --labels='test-name=%(testName)s' \
# --zone=${zone}

# exit_code=$?
# set -e

# current_time="$(date -u +%%s)"
# elapsed_seconds=$(($current_time-$start_time))
# # Break if command passed or 10-minute limit reached
# test $exit_code = 0 && break
# test $elapsed_seconds -gt 600 && break
# sleep 30
# done

# if [ $exit_code -ne 0 ]; then
# exit $exit_code
# fi

echo ${zone} > /scripts/zone
echo ${tpu_name} > /scripts/tpu_name
# gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --format="value(networkEndpoints[0].ipAddress)" > /scripts/tpu_ip
# gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --flatten="networkEndpoints[]" --format="csv[no-heading](networkEndpoints.ipAddress)" > /scripts/all_tpu_ips

sleep %(sleepTime)d
||| % tpuCreateSettings),
env: [
{
name: 'POD_UID',
valueFrom: {
fieldRef: {
fieldPath: 'metadata.uid',
},
},
},
],
volumeMounts: [
{
mountPath: '/scripts',
name: 'scripts',
},
],
},
},
},
},
},
// `BaseTpuVmMixin` is used to convert a 2VM target to 1VM.
BaseTpuVmMixin:: self.BaseTpuVmTest {
local config = self,

// Disable retries
jobTemplate+:: {
spec+: {
activeDeadlineSeconds: std.max(2 * config.timeout, 24 * timeouts.one_hour),
backoffLimit: 0,
},
},

// TPU VM tests don't run the models directly
cpu: 1,
memory: '2Gi',

// Pass TPU VM name to test container
podTemplate+:: {
spec+: {
activeDeadlineSeconds: config.timeout,
containerMap+:: {
train+: {
image: 'google/cloud-sdk',
envMap+:: {
LOCAL_OUTPUT_DIR: '/tmp/model_dir',
KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS: if config.accelerator.replicas == 1 then
'local'
else
'tpu-$(POD_UID)',
},
},
},
},
},
},
}
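For reference, the commented-out block in the create-tpu init container above implements a create-with-retry loop: attempt gcloud alpha compute tpus tpu-vm create every 30 seconds and give up after 10 minutes. A minimal Python sketch of that pattern follows; the helper name and the default interval/deadline values are illustrative assumptions, not part of this PR.

import subprocess
import time


def create_tpu_with_retry(tpu_name, zone, accelerator_type, software_version,
                          interval_s=30, deadline_s=600):
    """Retry TPU VM creation every `interval_s` seconds for up to `deadline_s` seconds."""
    start = time.monotonic()
    while True:
        # Same create call as the init container script, minus the metadata/label flags.
        result = subprocess.run(
            [
                "gcloud", "alpha", "compute", "tpus", "tpu-vm", "create", tpu_name,
                f"--accelerator-type={accelerator_type}",
                f"--version={software_version}",
                f"--zone={zone}",
            ],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            return  # TPU VM created; the caller can now sleep tpuVmCreateSleepSeconds.
        if time.monotonic() - start > deadline_s:
            raise RuntimeError(f"TPU VM creation kept failing: {result.stderr}")
        time.sleep(interval_s)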
dags/legacy_test/tests/pytorch/vllm_nightly_experimental_2.libsonnet (new file, 132 additions)
@@ -0,0 +1,132 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

local experimental = import '../experimental.libsonnet';
local utils = import 'templates/utils.libsonnet';

{
PyTorchTpuVmMixin:: experimental.BaseTpuVmMixin {
local config = self,

// Don't need to mount datasets within Kubernetes for TPU VM.
volumeMap+: { datasets: null },

tpuSettings+: {
tpuVmPytorchSetup: |||
echo No PyTorch setup required.
|||,
tpuVmExtraSetup: |||
echo No extra setup required.
|||,
// XRT_TPU_CONFIG set up by xla_dist on pods
tpuVmExports:
if config.accelerator.replicas == 1 then
|||
export XRT_TPU_CONFIG='localservice;0;localhost:51011'
export TPU_NUM_DEVICES=%d
||| % config.accelerator.numCores
else
'',
tpuVmCreateSleepSeconds:
if config.accelerator.replicas == 1 then
super.tpuVmCreateSleepSeconds
else
180,
tpuVmXlaDistPrefix:
if config.accelerator.replicas == 1 then
null
else
[
'python3',
'-m',
'torch_xla.distributed.xla_dist',
'--tpu=tpu-$(POD_UID)',
'--',
],
tpuVmMainCommandWorkers: '0',
},
podTemplate+:: {
spec+: {
containerMap+:: {
monitor: null,
train+: {
local scriptSettings = {
// Distribute command with xla_dist on pods
testCommand: if config.tpuSettings.tpuVmXlaDistPrefix == null then
utils.toCommandString(config.command)
else
utils.toCommandString(
config.tpuSettings.tpuVmXlaDistPrefix + config.command
),
commandWorkers: config.tpuSettings.tpuVmMainCommandWorkers,
pytorchSetup: config.tpuSettings.tpuVmPytorchSetup,
extraSetup: config.tpuSettings.tpuVmExtraSetup,
exports: config.tpuSettings.tpuVmExports,
},
args: null,
// PyTorch tests are structured as bash scripts that run directly
// on the Cloud TPU VM instead of using docker images.
command: [
'bash',
'-c',
|||
set -x
set -u

cat > workersetup.sh << TEST_SCRIPT_EOF
sudo apt-get -y update
# Ensure the apt lock is released after the update
sudo kill -9 $(lsof /var/lib/dpkg/lock-frontend | awk '{print $2}')
sudo dpkg --configure -a
sudo apt-get -y install nfs-common
sudo mkdir /datasets && sudo mount.nfs $(PYTORCH_DATA_LOCATION) /datasets

yes '' | gcloud compute config-ssh

cd
%(pytorchSetup)s

cd
%(extraSetup)s
TEST_SCRIPT_EOF
gcloud alpha compute tpus tpu-vm ssh xl-ml-test@$(cat /scripts/tpu_name) --zone=$(cat /scripts/zone) --ssh-key-file=/scripts/id_rsa --strict-host-key-checking=no --internal-ip --worker=all --command "$(cat workersetup.sh)"

# cat > testscript.sh << 'TEST_SCRIPT_EOF'
# %(exports)s
# %(testCommand)s
# TEST_SCRIPT_EOF
# gcloud alpha compute tpus tpu-vm ssh xl-ml-test@$(cat /scripts/tpu_name) --zone=$(cat /scripts/zone) --ssh-key-file=/scripts/id_rsa --strict-host-key-checking=no --internal-ip --worker=%(commandWorkers)s --command "$(cat testscript.sh)"

exit_code=$?
# bash /scripts/cleanup.sh

exit $exit_code
||| % scriptSettings,
],
},
},
},
},
},
PjRt:: {
tpuSettings+: {
tpuVmExports: |||
export PJRT_DEVICE=TPU
|||,
tpuVmXlaDistPrefix: null,
tpuVmMainCommandWorkers: 'all',
},
},
}
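The mixin above drives the TPU VM over SSH: it writes a workersetup.sh script, then runs it on every worker with gcloud alpha compute tpus tpu-vm ssh, using the key and the TPU name/zone that the create-tpu init container stored under /scripts. A rough Python equivalent of that single step is sketched below; the SSH flags and file paths come from the template, while the helper function itself is only illustrative.

import subprocess


def run_worker_setup(setup_script: str) -> None:
    """Run a setup script on all workers of the TPU VM recorded under /scripts."""
    with open("/scripts/tpu_name") as f:
        tpu_name = f.read().strip()
    with open("/scripts/zone") as f:
        zone = f.read().strip()

    # Mirrors the gcloud ssh invocation used by the train container.
    subprocess.run(
        [
            "gcloud", "alpha", "compute", "tpus", "tpu-vm", "ssh",
            f"xl-ml-test@{tpu_name}",
            f"--zone={zone}",
            "--ssh-key-file=/scripts/id_rsa",
            "--strict-host-key-checking=no",
            "--internal-ip",
            "--worker=all",
            f"--command={setup_script}",
        ],
        check=True,
    )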
dags/mantaray/run_mantaray_jobs.py (51 additions, 0 deletions)
@@ -20,6 +20,8 @@
from xlml.utils import mantaray
import yaml
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.common.vm_resource as resource
import re

# Skip running this script in unit tests because GCS loading will fail.
@@ -38,6 +40,54 @@
if re.match(pattern, job["task_name"]):
workload_file_name_list.append(job["file_name"])


def run_test_code_on_persistent_TPUVM():
gcloud_command = (
f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t' --worker=all \
--command=\"sudo docker run -it --privileged --net host --shm-size=16G --name testooo docker.io/vllm/vllm-tpu:270a5da495d24e947a71e2fa0c56635f4fad2dc3 \
bash -c 'export HF_TOKEN=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 && \
VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B --disable-log-requests \
--max-num-seq=320 --gpu-memory-utilization=0.95 --tensor-parallel-size=4 --max-model-len=8192 --port 8009 & sleep 1200 && \
wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && \
pip install --upgrade google-cloud-storage && rm -rf inference-benchmark && git clone https://github.com/AI-Hypercomputer/inference-benchmark && \
echo \"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main\" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt-get update && apt-get install -y google-cloud-sdk && apt-get -y install jq && export HF_TOKEN=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 && \
export PJRT_DEVICE=TPU && \
python inference-benchmark/benchmark_serving.py --save-json-results --port=8009 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json \
--tokenizer=meta-llama/Meta-Llama-3-8B --request-rate=1 --backend=vllm --num-prompts=300 --max-input-length=1024 --max-output-length=1024 \
--file-prefix=benchmark --models=meta-llama/Meta-Llama-3-8B \"--output-bucket=gs://manfeipublic\"' && sudo docker stop testooo && sudo docker rm testooo\" \
",
)
return gcloud_command


def make_sure_docker_container_cleaned_on_persistent_TPUVM():
gcloud_command = (
f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t -4 -L 6009:localhost:6009' --worker=all --command=\"sudo docker stop testooo && sudo docker rm testooo\"",
)
return gcloud_command


@task
def run_on_v6e_4_persistant_TPUVM():
with tempfile.TemporaryDirectory() as tmpdir:
hook = SubprocessHook()

result = hook.run_command(
[
"bash",
"-c",
";".join(
run_test_code_on_persistent_TPUVM()
+ make_sure_docker_container_cleaned_on_persistent_TPUVM()
),
],
cwd=tmpdir,
)
assert result.exit_code == 0, f"Command failed with code {result.exit_code}"


# merge all PyTorch/XLA tests into one DAG
with models.DAG(
dag_id="pytorch_xla_model_regression_test_on_trillium",
@@ -53,6 +103,7 @@
workload_file_name=workload_file_name,
)
run_workload
run_on_v6e_4_persistant_TPUVM()

# Create a DAG for each job from maxtext
for job in xlml_jobs:
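In short, the new task builds two shell commands (run the vLLM benchmark container on the persistent TPU VM, then stop and remove that container), joins them with ";", and runs the result through Airflow's SubprocessHook inside bash -c; the "+" works because both helpers return one-element tuples. A stripped-down sketch of that control flow is below, with the long remote commands replaced by placeholders and the SubprocessHook import path assumed for Airflow 2.

from airflow.hooks.subprocess import SubprocessHook  # import path assumed for Airflow 2

# Placeholders for the real gcloud/docker command strings defined in the diff above.
BENCHMARK_CMD = "echo run vllm benchmark container on the persistent TPU VM"
CLEANUP_CMD = "echo stop and remove the benchmark docker container"


def run_benchmark_then_cleanup() -> None:
    """Same control flow as run_on_v6e_4_persistant_TPUVM, without the TPU specifics."""
    hook = SubprocessHook()
    result = hook.run_command(
        ["bash", "-c", ";".join((BENCHMARK_CMD, CLEANUP_CMD))]
    )
    assert result.exit_code == 0, f"Command failed with code {result.exit_code}"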