[Do Not Merge] Test for cloud runners.

shralex · shralex · commit e2c6c2b318f5 · 2025-05-02T17:35:40.000Z
This PR is a test for using cloud runners.
diff --git a/.github/workflows/demo.yml b/.github/workflows/demo.yml
@@ -0,0 +1,12 @@
+name: Actions Runner Controller Demo
+on:
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  Explore-GitHub-Actions:
+    runs-on: linux-x86-ct4p-240-4tpu
+    container:
+      image: us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest
+    steps:
+    - run: echo "🎉 This job uses runner scale set runners!"
diff --git a/.github/workflows/run_tests_internal.yml b/.github/workflows/run_tests_internal.yml
@@ -43,29 +43,29 @@ on:
 
 jobs:
   run:
-    runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
+    #strategy:
+    #  matrix:
+    #    include:
+    #      - runner_label: ${{
+    #          inputs.device_type == 'tpu' && inputs.device_name == 'v4-8'
+    #          && 'linux-x86-ct4p-240-8tpu' ||
+    #          inputs.device_type == 'gpu' && inputs.device_name == 'a100-40gb-4'
+    #          && 'linux-x86-a2-48-a100-4gpu'
+    #        }}
+#   runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
+    #runs-on: ${{ matrix.runner_label }}
+    runs-on: linux-x86-ct4p-240-4tpu
+    container:
+      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
+      #volumes:
+      #  - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
+      env:
+        XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
+        TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
+      options: ${{ inputs.container_resource_option }}
     steps:
-      - name: Authenticate gcloud
-        continue-on-error: true
-        run: |
-          # configure registries as root and as runner
-          sudo gcloud auth configure-docker --quiet
-          gcloud auth configure-docker --quiet
-          sudo gcloud auth configure-docker us-docker.pkg.dev --quiet
-          gcloud auth configure-docker us-docker.pkg.dev --quiet
-      - name: Pull Docker image
-        run: |
-          docker pull gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
       - uses: actions/checkout@v4
-      - name: Run tests inside container
+      - name: Run Tests
         run: |
-          docker run --rm \
-            -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps \
-            -e XLA_PYTHON_CLIENT_MEM_FRACTION=${{ inputs.xla_python_client_mem_fraction }} \
-            -e TF_FORCE_GPU_ALLOW_GROWTH=${{ inputs.tf_force_gpu_allow_growth }} \
-            ${{ inputs.container_resource_option }} \
-            gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }} \
-            bash -c "
-              python3 -m pip install -e . &&
-              python3 -m pytest --pyargs MaxText.tests -m '${{ inputs.pytest_marker }}' --durations=0
-            "
+          python3 -m pip install -e .
+          python3 -m pytest --pyargs MaxText.tests -m "${{ inputs.pytest_marker }}" --durations=0
diff --git a/MaxText/tests/integration_tests/gradient_accumulation_test.py b/MaxText/tests/integration_tests/gradient_accumulation_test.py
@@ -49,6 +49,7 @@ def test_grad_accumulate_same_loss(self):
         r"dataset_path=gs://maxtext-dataset",
         "gradient_clipping_threshold=0",  # Ensures we are testing raw scales of gradients (clipping off)
         "enable_checkpointing=False",
+        "enable_goodput_recording=false",
         "base_emb_dim=256",
         "base_num_decoder_layers=4",
         rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
diff --git a/MaxText/tests/integration_tests/standalone_dl_ckpt_test.py b/MaxText/tests/integration_tests/standalone_dl_ckpt_test.py
@@ -49,6 +49,7 @@ def test_standalone_dataloader(self):
             "dataset_path=gs://maxtext-dataset",
             "steps=100",
             "enable_checkpointing=false",
+            "enable_goodput_recording=false",
             rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
         )
     )  # need to pass relative path to tokenizer
@@ -74,6 +75,7 @@ def test_standalone_checkpointer(self):
             "enable_checkpointing=True",
             "checkpoint_period=50",
             "async_checkpointing=False",
+            "enable_goodput_recording=false",
         )
     )
     # restore at 50 and checkpoint at 100
@@ -93,6 +95,7 @@ def test_standalone_checkpointer(self):
             "enable_checkpointing=True",
             "checkpoint_period=50",
             "async_checkpointing=False",
+            "enable_goodput_recording=false",
         )
     )
 
diff --git a/MaxText/tests/integration_tests/train_tests.py b/MaxText/tests/integration_tests/train_tests.py
@@ -35,6 +35,7 @@ class TrainTests(unittest.TestCase):
           r"dataset_path=gs://maxtext-dataset",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
       ],
       "synthetic": [  # tests base config with synthtic dataset
@@ -45,6 +46,7 @@ class TrainTests(unittest.TestCase):
           r"dataset_path=gs://maxtext-dataset",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           "dataset_type=synthetic",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
       ],
@@ -56,6 +58,7 @@ class TrainTests(unittest.TestCase):
           r"dataset_path=gs://maxtext-dataset",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           "per_device_batch_size=0.25",
           "ici_tensor_parallelism=4",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
@@ -68,6 +71,7 @@ class TrainTests(unittest.TestCase):
           r"dataset_path=gs://maxtext-dataset",
           "steps=2",
           "ici_tensor_transpose_parallelism=4",
+          "enable_goodput_recording=false",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
       ],
       "int8": [  # tests base config with int8
@@ -79,6 +83,7 @@ class TrainTests(unittest.TestCase):
           "quantization=int8",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
       ],
       "fp8": [  # tests base config with fp8
@@ -90,6 +95,7 @@ class TrainTests(unittest.TestCase):
           "quantization=fp8",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
       ],
       "nanoo_fp8": [  # tests base config with nanoo_fp8
@@ -101,6 +107,7 @@ class TrainTests(unittest.TestCase):
           "quantization=nanoo_fp8",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
       ],
       "dropout": [  # tests base config with dropout
@@ -111,6 +118,7 @@ class TrainTests(unittest.TestCase):
           r"dataset_path=gs://maxtext-dataset",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           "max_target_length=128",
           "per_device_batch_size=1",
           "dropout_rate=0.02",
@@ -123,6 +131,7 @@ class TrainTests(unittest.TestCase):
           "run_name=runner_test",
           "steps=2",
           "enable_checkpointing=False",
+          "enable_goodput_recording=false",
           "dataset_type=hf",
           "hf_path=parquet",
           r"hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet",
@@ -217,6 +226,7 @@ def test_gpu_cudnn_flash_te(self):
         r"dataset_path=gs://maxtext-dataset",
         "steps=2",
         "enable_checkpointing=False",
+        "enable_goodput_recording=false",
         "attention=cudnn_flash_te",
         "packing=False",
         rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
@@ -235,6 +245,7 @@ def test_gpu_context_parallelism(self):
         r"dataset_path=gs://maxtext-dataset",
         "steps=10",
         "enable_checkpointing=False",
+        "enable_goodput_recording=false",
         "attention=cudnn_flash_te",
         "ici_fsdp_parallelism=2",
         "ici_context_parallelism=2",
diff --git a/MaxText/tests/pipeline_parallelism_test.py b/MaxText/tests/pipeline_parallelism_test.py
@@ -157,6 +157,7 @@ def test_circular_minimum_microbatches_same_output_and_grad(self):
     config = pyconfig.initialize(
         [sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
         enable_checkpointing=False,
+        enable_goodput_recording=False,
         run_name="circular_minimum_microbatches",
         max_target_length=128,
         base_emb_dim=28,
@@ -173,6 +174,7 @@ def test_circular_extra_microbatches_same_output_and_grad(self):
     config = pyconfig.initialize(
         [sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
         enable_checkpointing=False,
+        enable_goodput_recording=False,
         run_name="circular_extra_microbatches",
         max_target_length=128,
         base_emb_dim=28,
@@ -189,6 +191,7 @@ def test_circular_ag_once(self):
     config = pyconfig.initialize(
         [sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
         enable_checkpointing=False,
+        enable_goodput_recording=False,
         run_name="circular_ag_once",
         max_target_length=128,
         base_emb_dim=28,
@@ -206,6 +209,7 @@ def test_non_circular_same_output_and_grad(self):
     config = pyconfig.initialize(
         [sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
         enable_checkpointing=False,
+        enable_goodput_recording=False,
         run_name="non_circular",
         max_target_length=128,
         base_emb_dim=28,
@@ -239,6 +243,7 @@ def test_full_train_circular(self):
             "dataset_type=synthetic",
             "steps=3",
             "enable_checkpointing=False",
+            "enable_goodput_recording=False",
             "ici_pipeline_parallelism=4",
             "num_layers_per_pipeline_stage=2",
             "num_pipeline_microbatches=8",
@@ -253,6 +258,7 @@ def test_delay_activation_forwarding_same_output_and_grad(self):
     config = pyconfig.initialize(
         [sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
         enable_checkpointing=False,
+        enable_goodput_recording=False,
         run_name="activation_forwarding",
         max_target_length=128,
         base_emb_dim=28,
@@ -287,6 +293,7 @@ def test_full_train_non_circular(self):
             "dataset_type=synthetic",
             "steps=3",
             "enable_checkpointing=False",
+            "enable_goodput_recording=False",
             "ici_pipeline_parallelism=4",
             "num_layers_per_pipeline_stage=8",
             "num_pipeline_microbatches=8",
@@ -318,6 +325,7 @@ def test_subset_layers(self):
             "dataset_type=synthetic",
             "steps=3",
             "enable_checkpointing=False",
+            "enable_goodput_recording=False",
             "ici_pipeline_parallelism=4",
             "num_layers_per_pipeline_stage=1",
             "num_pipeline_repeats=2",
@@ -351,6 +359,7 @@ def test_full_train_fp8(self):
             "dataset_type=synthetic",
             "steps=3",
             "enable_checkpointing=False",
+            "enable_goodput_recording=False",
             "ici_pipeline_parallelism=4",
             rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
             "quantization=fp8",
@@ -382,6 +391,7 @@ def test_full_train_nanoo_fp8(self):
             "dataset_type=synthetic",
             "steps=3",
             "enable_checkpointing=False",
+            "enable_goodput_recording=False",
             "ici_pipeline_parallelism=4",
             rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
             "quantization=nanoo_fp8",
diff --git a/MaxText/tests/simple_decoder_layer_test.py b/MaxText/tests/simple_decoder_layer_test.py
@@ -32,6 +32,7 @@ def test_simple_decoder_layer(self):
             r"dataset_path=gs://maxtext-dataset",
             "decoder_block=simple",
             "enable_checkpointing=False",
+            "enable_goodput_recording=false",
             rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
             "steps=3",
         ]
@@ -48,6 +49,7 @@ def test_mlp_decoder_layer(self):
             r"dataset_path=gs://maxtext-dataset",
             "decoder_block=simple_mlp",
             "enable_checkpointing=False",
+            "enable_goodput_recording=false",
             rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
             "steps=3",
         ]

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,7 @@ def test_standalone_dataloader(self):`
`49`	`49`	`"dataset_path=gs://maxtext-dataset",`
`50`	`50`	`"steps=100",`
`51`	`51`	`"enable_checkpointing=false",`
	`52`	`+ "enable_goodput_recording=false",`
`52`	`53`	`rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",`
`53`	`54`	`)`
`54`	`55`	`) # need to pass relative path to tokenizer`
`@@ -74,6 +75,7 @@ def test_standalone_checkpointer(self):`
`74`	`75`	`"enable_checkpointing=True",`
`75`	`76`	`"checkpoint_period=50",`
`76`	`77`	`"async_checkpointing=False",`
	`78`	`+ "enable_goodput_recording=false",`
`77`	`79`	`)`
`78`	`80`	`)`
`79`	`81`	`# restore at 50 and checkpoint at 100`
`@@ -93,6 +95,7 @@ def test_standalone_checkpointer(self):`
`93`	`95`	`"enable_checkpointing=True",`
`94`	`96`	`"checkpoint_period=50",`
`95`	`97`	`"async_checkpointing=False",`
	`98`	`+ "enable_goodput_recording=false",`
`96`	`99`	`)`
`97`	`100`	`)`
`98`	`101`