Add/Enable vllm inference benchmark run on persistent TPUVM #625

Merged on Apr 10, 2025 (49 commits).

The diff below shows changes from 22 of the 49 commits.

Commits (49)
All 49 commits are by ManfeiBai.

c97e13e - Create vllm-nightly.py (Mar 19, 2025)
f71d664 - Update vllm-nightly.py (Mar 19, 2025)
e762b60 - Update vllm-nightly.py (Mar 19, 2025)
53c9db5 - Update vllm-nightly.py (Mar 20, 2025)
377224f - Update run_mantaray_jobs.py (Mar 20, 2025)
34f4355 - Update vllm-nightly.py (Mar 20, 2025)
bd7e97e - Update vllm-nightly.py (Mar 20, 2025)
265ab41 - Update vllm-nightly.py (Mar 20, 2025)
abd67c7 - Update vllm-nightly.py (Mar 20, 2025)
95fb795 - Update vllm-nightly.py (Mar 20, 2025)
7d4e8dc - Update vllm-nightly.py (Mar 20, 2025)
34c9ea1 - Update vllm-nightly.py (Mar 20, 2025)
1c6b1cd - Update vllm-nightly.py (Mar 20, 2025)
a16d2d8 - Update vllm-nightly.py (Mar 20, 2025)
77aa55a - Update vllm-nightly.py (Mar 20, 2025)
7563599 - Update vllm-nightly.py (Mar 20, 2025)
1c085e0 - Update vllm-nightly.py (Mar 20, 2025)
9c4d8de - Create vllm_nightly_experimental.libsonnet (Mar 20, 2025)
cf3790e - Create vllm_nightly_experimental_2.libsonnet (Mar 21, 2025)
dc476e2 - Create vllm-nightly-try2.py (Mar 21, 2025)
b69e195 - Update tpu.py (Mar 24, 2025)
edcf677 - Create vllm-nightly-try3.py (Mar 24, 2025)
a54fa70 - Update vllm-nightly-try3.py (Mar 24, 2025)
064558e - Update vllm-nightly-try3.py (Mar 24, 2025)
876f0f1 - Update tpu.py (Mar 24, 2025)
bffed1a - Create vllm-nightly-4.py (Mar 24, 2025)
747446d - Update run_mantaray_jobs.py (Mar 24, 2025)
9c7fe62 - Update vllm-nightly-4.py (Mar 24, 2025)
e79725e - Update vllm-nightly-try3.py (Mar 24, 2025)
8da1d81 - Update vllm-nightly.py (Mar 24, 2025)
a78d38c - Update run_mantaray_jobs.py (Mar 27, 2025)
e3caa60 - Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental.libso… (Mar 27, 2025)
23b8f0f - Delete dags/legacy_test/tests/pytorch/vllm_nightly_experimental_2.lib… (Mar 27, 2025)
aa47112 - Update tpu.py (Mar 27, 2025)
813082a - Delete dags/pytorch_xla/vllm-nightly.py (Mar 27, 2025)
c885d43 - Delete dags/pytorch_xla/vllm-nightly-try3.py (Mar 27, 2025)
f98a338 - Delete dags/pytorch_xla/vllm-nightly-try2.py (Mar 27, 2025)
ce8478f - Delete dags/pytorch_xla/vllm-nightly-4.py (Mar 27, 2025)
ccc652f - Update run_mantaray_jobs.py (Mar 27, 2025)
32d485d - Update run_mantaray_jobs.py (Mar 31, 2025)
72959ca - Update run_mantaray_jobs.py (Mar 31, 2025)
1bc7cc4 - Update run_mantaray_jobs.py (Apr 10, 2025)
468379a - Update run_mantaray_jobs.py (Apr 10, 2025)
de440e8 - format (Apr 10, 2025)
dad22b3 - Update run_mantaray_jobs.py (Apr 10, 2025)
afdc077 - format (Apr 10, 2025)
9d6ac58 - format (Apr 10, 2025)
3b3f23b - Update run_mantaray_jobs.py (Apr 10, 2025)
1f30873 - Merge branch 'master' into ManfeiBai-patch-11 (Apr 10, 2025)
dags/legacy_test/tests/pytorch/vllm_nightly_experimental.libsonnet (new file, 190 additions)
@@ -0,0 +1,190 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

local timeouts = import 'templates/timeouts.libsonnet';
local utils = import 'templates/utils.libsonnet';
local volumes = import 'templates/volumes.libsonnet';

{
BaseTpuVmTest:: {
local config = self,
local cleanupHook = {
preStop: {
exec: {
command: [
'bash',
'/scripts/cleanup.sh',
],
},
},
},

volumeMap+: {
scripts: volumes.MemoryVolumeSpec {
name: 'scripts',
mountPath: '/scripts',
},
},

testName+: '-1vm',

tpuSettings+: {
local tpuSettings = self,

softwareVersion: 'v2-nightly',

// Startup script in TPU VM metadata.
tpuVmStartupScript: 'echo Running startup script',

// Amount of time to sleep after TPU is READY.
tpuVmCreateSleepSeconds:
if config.accelerator.version <= 3 then
60
else
90,

// Additional arguments for test Docker container.
tpuVmDockerArgs: '',
},
podTemplate+:: {
spec+: {
containerMap+:: {
monitor: null,
train+: {
lifecycle: cleanupHook,
resources+: {
// HACK: remove standard Cloud TPU resource.
local originalLimits = super.limits,
limits: {
[field]: originalLimits[field]
for field in std.objectFields(originalLimits)
if !std.startsWith(field, 'cloud-tpus.google.com')
},
},
},
},
initContainerMap+:: {
'create-tpu': {
image: 'google/cloud-sdk',
local tpuCreateSettings = {
acceleratorName: std.escapeStringBash(config.accelerator.name),
softwareVersion: std.escapeStringBash(config.tpuSettings.softwareVersion),
startupScript: std.escapeStringBash(config.tpuSettings.tpuVmStartupScript),
sleepTime: config.tpuSettings.tpuVmCreateSleepSeconds,
testName: std.strReplace(config.testName, '.', '-'),
},
command: utils.scriptCommand(|||
project=cloud-ml-benchmarking
zone=us-east5-b
tpu_name=manfei-2025-v6e-4
ssh-keygen -t rsa -f /scripts/id_rsa -q -N ""

# echo "
# gcloud alpha compute tpus tpu-vm delete -q --async ${tpu_name} --zone=${zone}
# sleep 60
# " > /scripts/cleanup.sh

echo "xl-ml-test:$(cat /scripts/id_rsa.pub)" > ssh-keys.txt
echo %(startupScript)s > startup-script.txt

# Retry every 30 seconds for up to 10 minutes
# start_time="$(date -u +%%s)"
# for i in {1..40}; do
# set +e
# gcloud alpha compute tpus tpu-vm create ${tpu_name} \
# --accelerator-type=%(acceleratorName)s \
# --version=%(softwareVersion)s \
# --metadata-from-file='ssh-keys=ssh-keys.txt,startup-script=startup-script.txt' \
# --labels='test-name=%(testName)s' \
# --zone=${zone}

# exit_code=$?
# set -e

# current_time="$(date -u +%%s)"
# elapsed_seconds=$(($current_time-$start_time))
# # Break if command passed or 10-minute limit reached
# test $exit_code = 0 && break
# test $elapsed_seconds -gt 600 && break
# sleep 30
# done

# if [ $exit_code -ne 0 ]; then
# exit $exit_code
# fi

echo ${zone} > /scripts/zone
echo ${tpu_name} > /scripts/tpu_name
# gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --format="value(networkEndpoints[0].ipAddress)" > /scripts/tpu_ip
# gcloud compute tpus describe ${tpu_name} --project=${project} --zone=${zone} --flatten="networkEndpoints[]" --format="csv[no-heading](networkEndpoints.ipAddress)" > /scripts/all_tpu_ips

sleep %(sleepTime)d
||| % tpuCreateSettings),
env: [
{
name: 'POD_UID',
valueFrom: {
fieldRef: {
fieldPath: 'metadata.uid',
},
},
},
],
volumeMounts: [
{
mountPath: '/scripts',
name: 'scripts',
},
],
},
},
},
},
},
// `BaseTpuVmMixin` is used to convert a 2VM target to 1VM.
BaseTpuVmMixin:: self.BaseTpuVmTest {
local config = self,

// Disable retries
jobTemplate+:: {
spec+: {
activeDeadlineSeconds: std.max(2 * config.timeout, 24 * timeouts.one_hour),
backoffLimit: 0,
},
},

// TPU VM tests don't run the models directly
cpu: 1,
memory: '2Gi',

// Pass TPU VM name to test container
podTemplate+:: {
spec+: {
activeDeadlineSeconds: config.timeout,
containerMap+:: {
train+: {
image: 'google/cloud-sdk',
envMap+:: {
LOCAL_OUTPUT_DIR: '/tmp/model_dir',
KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS: if config.accelerator.replicas == 1 then
'local'
else
'tpu-$(POD_UID)',
},
},
},
},
},
},
}
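For reference, the commented-out block in the create-tpu init container above implements a create-with-retry loop: attempt gcloud alpha compute tpus tpu-vm create every 30 seconds and give up after 10 minutes. A minimal Python sketch of that pattern follows; the helper name and the default interval/deadline values are illustrative assumptions, not part of this PR.

import subprocess
import time


def create_tpu_with_retry(tpu_name, zone, accelerator_type, software_version,
                          interval_s=30, deadline_s=600):
    """Retry TPU VM creation every `interval_s` seconds for up to `deadline_s` seconds."""
    start = time.monotonic()
    while True:
        # Same create call as the init container script, minus the metadata/label flags.
        result = subprocess.run(
            [
                "gcloud", "alpha", "compute", "tpus", "tpu-vm", "create", tpu_name,
                f"--accelerator-type={accelerator_type}",
                f"--version={software_version}",
                f"--zone={zone}",
            ],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            return  # TPU VM created; the caller can now sleep tpuVmCreateSleepSeconds.
        if time.monotonic() - start > deadline_s:
            raise RuntimeError(f"TPU VM creation kept failing: {result.stderr}")
        time.sleep(interval_s)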
dags/legacy_test/tests/pytorch/vllm_nightly_experimental_2.libsonnet (new file, 132 additions)
@@ -0,0 +1,132 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

local experimental = import '../experimental.libsonnet';
local utils = import 'templates/utils.libsonnet';

{
PyTorchTpuVmMixin:: experimental.BaseTpuVmMixin {
local config = self,

// Don't need to mount datasets within Kubernetes for TPU VM.
volumeMap+: { datasets: null },

tpuSettings+: {
tpuVmPytorchSetup: |||
echo No PyTorch setup required.
|||,
tpuVmExtraSetup: |||
echo No extra setup required.
|||,
// XRT_TPU_CONFIG set up by xla_dist on pods
tpuVmExports:
if config.accelerator.replicas == 1 then
|||
export XRT_TPU_CONFIG='localservice;0;localhost:51011'
export TPU_NUM_DEVICES=%d
||| % config.accelerator.numCores
else
'',
tpuVmCreateSleepSeconds:
if config.accelerator.replicas == 1 then
super.tpuVmCreateSleepSeconds
else
180,
tpuVmXlaDistPrefix:
if config.accelerator.replicas == 1 then
null
else
[
'python3',
'-m',
'torch_xla.distributed.xla_dist',
'--tpu=tpu-$(POD_UID)',
'--',
],
tpuVmMainCommandWorkers: '0',
},
podTemplate+:: {
spec+: {
containerMap+:: {
monitor: null,
train+: {
local scriptSettings = {
// Distribute command with xla_dist on pods
testCommand: if config.tpuSettings.tpuVmXlaDistPrefix == null then
utils.toCommandString(config.command)
else
utils.toCommandString(
config.tpuSettings.tpuVmXlaDistPrefix + config.command
),
commandWorkers: config.tpuSettings.tpuVmMainCommandWorkers,
pytorchSetup: config.tpuSettings.tpuVmPytorchSetup,
extraSetup: config.tpuSettings.tpuVmExtraSetup,
exports: config.tpuSettings.tpuVmExports,
},
args: null,
// PyTorch tests are structured as bash scripts that run directly
// on the Cloud TPU VM instead of using docker images.
command: [
'bash',
'-c',
|||
set -x
set -u

cat > workersetup.sh << TEST_SCRIPT_EOF
sudo apt-get -y update
# Ensure the apt lock is released after the update
sudo kill -9 $(lsof /var/lib/dpkg/lock-frontend | awk '{print $2}')
sudo dpkg --configure -a
sudo apt-get -y install nfs-common
sudo mkdir /datasets && sudo mount.nfs $(PYTORCH_DATA_LOCATION) /datasets

yes '' | gcloud compute config-ssh

cd
%(pytorchSetup)s

cd
%(extraSetup)s
TEST_SCRIPT_EOF
gcloud alpha compute tpus tpu-vm ssh xl-ml-test@$(cat /scripts/tpu_name) --zone=$(cat /scripts/zone) --ssh-key-file=/scripts/id_rsa --strict-host-key-checking=no --internal-ip --worker=all --command "$(cat workersetup.sh)"

# cat > testscript.sh << 'TEST_SCRIPT_EOF'
# %(exports)s
# %(testCommand)s
# TEST_SCRIPT_EOF
# gcloud alpha compute tpus tpu-vm ssh xl-ml-test@$(cat /scripts/tpu_name) --zone=$(cat /scripts/zone) --ssh-key-file=/scripts/id_rsa --strict-host-key-checking=no --internal-ip --worker=%(commandWorkers)s --command "$(cat testscript.sh)"

exit_code=$?
# bash /scripts/cleanup.sh

exit $exit_code
||| % scriptSettings,
],
},
},
},
},
},
PjRt:: {
tpuSettings+: {
tpuVmExports: |||
export PJRT_DEVICE=TPU
|||,
tpuVmXlaDistPrefix: null,
tpuVmMainCommandWorkers: 'all',
},
},
}
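The mixin above drives the TPU VM over SSH: it writes a workersetup.sh script, then runs it on every worker with gcloud alpha compute tpus tpu-vm ssh, using the key and the TPU name/zone that the create-tpu init container stored under /scripts. A rough Python equivalent of that single step is sketched below; the SSH flags and file paths come from the template, while the helper function itself is only illustrative.

import subprocess


def run_worker_setup(setup_script: str) -> None:
    """Run a setup script on all workers of the TPU VM recorded under /scripts."""
    with open("/scripts/tpu_name") as f:
        tpu_name = f.read().strip()
    with open("/scripts/zone") as f:
        zone = f.read().strip()

    # Mirrors the gcloud ssh invocation used by the train container.
    subprocess.run(
        [
            "gcloud", "alpha", "compute", "tpus", "tpu-vm", "ssh",
            f"xl-ml-test@{tpu_name}",
            f"--zone={zone}",
            "--ssh-key-file=/scripts/id_rsa",
            "--strict-host-key-checking=no",
            "--internal-ip",
            "--worker=all",
            f"--command={setup_script}",
        ],
        check=True,
    )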
dags/mantaray/run_mantaray_jobs.py (51 additions, 0 deletions)
@@ -20,6 +20,8 @@
from xlml.utils import mantaray
import yaml
from dags import composer_env
from dags.pytorch_xla.configs import pytorchxla_torchbench_config as config
import dags.common.vm_resource as resource
import re

# Skip running this script in unit tests because GCS loading will fail.
@@ -38,6 +40,54 @@
if re.match(pattern, job["task_name"]):
workload_file_name_list.append(job["file_name"])


def run_test_code_on_persistent_TPUVM():
gcloud_command = (
f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t' --worker=all \
--command=\"sudo docker run -it --privileged --net host --shm-size=16G --name testooo docker.io/vllm/vllm-tpu:270a5da495d24e947a71e2fa0c56635f4fad2dc3 \
bash -c 'export HF_TOKEN=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 && \
VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-8B --disable-log-requests \
--max-num-seq=320 --gpu-memory-utilization=0.95 --tensor-parallel-size=4 --max-model-len=8192 --port 8009 & sleep 1200 && \
wget --no-verbose https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json && \
pip install --upgrade google-cloud-storage && rm -rf inference-benchmark && git clone https://github.com/AI-Hypercomputer/inference-benchmark && \
echo \"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main\" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt-get update && apt-get install -y google-cloud-sdk && apt-get -y install jq && export HF_TOKEN=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 && \
export PJRT_DEVICE=TPU && \
python inference-benchmark/benchmark_serving.py --save-json-results --port=8009 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json \
--tokenizer=meta-llama/Meta-Llama-3-8B --request-rate=1 --backend=vllm --num-prompts=300 --max-input-length=1024 --max-output-length=1024 \
--file-prefix=benchmark --models=meta-llama/Meta-Llama-3-8B \"--output-bucket=gs://manfeipublic\"' && sudo docker stop testooo && sudo docker rm testooo\" \
",
)
return gcloud_command


def make_sure_docker_container_cleaned_on_persistent_TPUVM():
gcloud_command = (
f"gcloud compute tpus tpu-vm ssh manfei-2025-v6e-4 --zone=us-east5-b --project=cloud-ml-benchmarking --ssh-flag='-t -4 -L 6009:localhost:6009' --worker=all --command=\"sudo docker stop testooo && sudo docker rm testooo\"",
)
return gcloud_command


@task
def run_on_v6e_4_persistant_TPUVM():
with tempfile.TemporaryDirectory() as tmpdir:
hook = SubprocessHook()

result = hook.run_command(
[
"bash",
"-c",
";".join(
run_test_code_on_persistent_TPUVM()
+ make_sure_docker_container_cleaned_on_persistent_TPUVM()
),
],
cwd=tmpdir,
)
assert result.exit_code == 0, f"Command failed with code {result.exit_code}"


# merge all PyTorch/XLA tests into one DAG
with models.DAG(
dag_id="pytorch_xla_model_regression_test_on_trillium",
@@ -53,6 +103,7 @@
workload_file_name=workload_file_name,
)
run_workload
run_on_v6e_4_persistant_TPUVM()

# Create a DAG for each job from maxtext
for job in xlml_jobs:
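In short, the new task builds two shell commands (run the vLLM benchmark container on the persistent TPU VM, then stop and remove that container), joins them with ";", and runs the result through Airflow's SubprocessHook inside bash -c; the "+" works because both helpers return one-element tuples. A stripped-down sketch of that control flow is below, with the long remote commands replaced by placeholders and the SubprocessHook import path assumed for Airflow 2.

from airflow.hooks.subprocess import SubprocessHook  # import path assumed for Airflow 2

# Placeholders for the real gcloud/docker command strings defined in the diff above.
BENCHMARK_CMD = "echo run vllm benchmark container on the persistent TPU VM"
CLEANUP_CMD = "echo stop and remove the benchmark docker container"


def run_benchmark_then_cleanup() -> None:
    """Same control flow as run_on_v6e_4_persistant_TPUVM, without the TPU specifics."""
    hook = SubprocessHook()
    result = hook.run_command(
        ["bash", "-c", ";".join((BENCHMARK_CMD, CLEANUP_CMD))]
    )
    assert result.exit_code == 0, f"Command failed with code {result.exit_code}"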