Skip to content

Commit e2c6c2b

Browse files
committed
[Do Not Merge] Test for cloud runners.
This PR is a test for using cloud runners.
1 parent 2fa13e2 commit e2c6c2b

File tree

7 files changed

+62
-23
lines changed

7 files changed

+62
-23
lines changed

.github/workflows/demo.yml

+12
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
name: Actions Runner Controller Demo
2+
on:
3+
pull_request:
4+
workflow_dispatch:
5+
6+
jobs:
7+
Explore-GitHub-Actions:
8+
runs-on: linux-x86-ct4p-240-4tpu
9+
container:
10+
image: us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest
11+
steps:
12+
- run: echo "🎉 This job uses runner scale set runners!"

.github/workflows/run_tests_internal.yml

+23-23
Original file line number | Diff line number | Diff line change
@@ -43,29 +43,29 @@ on:
4343

4444
jobs:
4545
run:
46-
runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
46+
#strategy:
47+
# matrix:
48+
# include:
49+
# - runner_label: ${{
50+
# inputs.device_type == 'tpu' && inputs.device_name == 'v4-8'
51+
# && 'linux-x86-ct4p-240-8tpu' ||
52+
# inputs.device_type == 'gpu' && inputs.device_name == 'a100-40gb-4'
53+
# && 'linux-x86-a2-48-a100-4gpu'
54+
# }}
55+
# runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
56+
#runs-on: ${{ matrix.runner_label }}
57+
runs-on: linux-x86-ct4p-240-4tpu
58+
container:
59+
image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
60+
#volumes:
61+
# - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
62+
env:
63+
XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
64+
TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
65+
options: ${{ inputs.container_resource_option }}
4766
steps:
48-
- name: Authenticate gcloud
49-
continue-on-error: true
50-
run: |
51-
# configure registries as root and as runner
52-
sudo gcloud auth configure-docker --quiet
53-
gcloud auth configure-docker --quiet
54-
sudo gcloud auth configure-docker us-docker.pkg.dev --quiet
55-
gcloud auth configure-docker us-docker.pkg.dev --quiet
56-
- name: Pull Docker image
57-
run: |
58-
docker pull gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
5967
- uses: actions/checkout@v4
60-
- name: Run tests inside container
68+
- name: Run Tests
6169
run: |
62-
docker run --rm \
63-
-v /home/runner/actions-runner/_work/maxtext/maxtext:/deps \
64-
-e XLA_PYTHON_CLIENT_MEM_FRACTION=${{ inputs.xla_python_client_mem_fraction }} \
65-
-e TF_FORCE_GPU_ALLOW_GROWTH=${{ inputs.tf_force_gpu_allow_growth }} \
66-
${{ inputs.container_resource_option }} \
67-
gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }} \
68-
bash -c "
69-
python3 -m pip install -e . &&
70-
python3 -m pytest --pyargs MaxText.tests -m '${{ inputs.pytest_marker }}' --durations=0
71-
"
70+
python3 -m pip install -e .
71+
python3 -m pytest --pyargs MaxText.tests -m "${{ inputs.pytest_marker }}" --durations=0

MaxText/tests/integration_tests/gradient_accumulation_test.py

+1
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ def test_grad_accumulate_same_loss(self):
4949
r"dataset_path=gs://maxtext-dataset",
5050
"gradient_clipping_threshold=0", # Ensures we are testing raw scales of gradients (clipping off)
5151
"enable_checkpointing=False",
52+
"enable_goodput_recording=false",
5253
"base_emb_dim=256",
5354
"base_num_decoder_layers=4",
5455
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",

MaxText/tests/integration_tests/standalone_dl_ckpt_test.py

+3
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ def test_standalone_dataloader(self):
4949
"dataset_path=gs://maxtext-dataset",
5050
"steps=100",
5151
"enable_checkpointing=false",
52+
"enable_goodput_recording=false",
5253
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
5354
)
5455
) # need to pass relative path to tokenizer
@@ -74,6 +75,7 @@ def test_standalone_checkpointer(self):
7475
"enable_checkpointing=True",
7576
"checkpoint_period=50",
7677
"async_checkpointing=False",
78+
"enable_goodput_recording=false",
7779
)
7880
)
7981
# restore at 50 and checkpoint at 100
@@ -93,6 +95,7 @@ def test_standalone_checkpointer(self):
9395
"enable_checkpointing=True",
9496
"checkpoint_period=50",
9597
"async_checkpointing=False",
98+
"enable_goodput_recording=false",
9699
)
97100
)
98101

MaxText/tests/integration_tests/train_tests.py

+11
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ class TrainTests(unittest.TestCase):
3535
r"dataset_path=gs://maxtext-dataset",
3636
"steps=2",
3737
"enable_checkpointing=False",
38+
"enable_goodput_recording=false",
3839
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
3940
],
4041
"synthetic": [ # tests base config with synthetic dataset
@@ -45,6 +46,7 @@ class TrainTests(unittest.TestCase):
4546
r"dataset_path=gs://maxtext-dataset",
4647
"steps=2",
4748
"enable_checkpointing=False",
49+
"enable_goodput_recording=false",
4850
"dataset_type=synthetic",
4951
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
5052
],
@@ -56,6 +58,7 @@ class TrainTests(unittest.TestCase):
5658
r"dataset_path=gs://maxtext-dataset",
5759
"steps=2",
5860
"enable_checkpointing=False",
61+
"enable_goodput_recording=false",
5962
"per_device_batch_size=0.25",
6063
"ici_tensor_parallelism=4",
6164
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
@@ -68,6 +71,7 @@ class TrainTests(unittest.TestCase):
6871
r"dataset_path=gs://maxtext-dataset",
6972
"steps=2",
7073
"ici_tensor_transpose_parallelism=4",
74+
"enable_goodput_recording=false",
7175
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
7276
],
7377
"int8": [ # tests base config with int8
@@ -79,6 +83,7 @@ class TrainTests(unittest.TestCase):
7983
"quantization=int8",
8084
"steps=2",
8185
"enable_checkpointing=False",
86+
"enable_goodput_recording=false",
8287
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
8388
],
8489
"fp8": [ # tests base config with fp8
@@ -90,6 +95,7 @@ class TrainTests(unittest.TestCase):
9095
"quantization=fp8",
9196
"steps=2",
9297
"enable_checkpointing=False",
98+
"enable_goodput_recording=false",
9399
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
94100
],
95101
"nanoo_fp8": [ # tests base config with nanoo_fp8
@@ -101,6 +107,7 @@ class TrainTests(unittest.TestCase):
101107
"quantization=nanoo_fp8",
102108
"steps=2",
103109
"enable_checkpointing=False",
110+
"enable_goodput_recording=false",
104111
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
105112
],
106113
"dropout": [ # tests base config with dropout
@@ -111,6 +118,7 @@ class TrainTests(unittest.TestCase):
111118
r"dataset_path=gs://maxtext-dataset",
112119
"steps=2",
113120
"enable_checkpointing=False",
121+
"enable_goodput_recording=false",
114122
"max_target_length=128",
115123
"per_device_batch_size=1",
116124
"dropout_rate=0.02",
@@ -123,6 +131,7 @@ class TrainTests(unittest.TestCase):
123131
"run_name=runner_test",
124132
"steps=2",
125133
"enable_checkpointing=False",
134+
"enable_goodput_recording=false",
126135
"dataset_type=hf",
127136
"hf_path=parquet",
128137
r"hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet",
@@ -217,6 +226,7 @@ def test_gpu_cudnn_flash_te(self):
217226
r"dataset_path=gs://maxtext-dataset",
218227
"steps=2",
219228
"enable_checkpointing=False",
229+
"enable_goodput_recording=false",
220230
"attention=cudnn_flash_te",
221231
"packing=False",
222232
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
@@ -235,6 +245,7 @@ def test_gpu_context_parallelism(self):
235245
r"dataset_path=gs://maxtext-dataset",
236246
"steps=10",
237247
"enable_checkpointing=False",
248+
"enable_goodput_recording=false",
238249
"attention=cudnn_flash_te",
239250
"ici_fsdp_parallelism=2",
240251
"ici_context_parallelism=2",

MaxText/tests/pipeline_parallelism_test.py

+10
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ def test_circular_minimum_microbatches_same_output_and_grad(self):
157157
config = pyconfig.initialize(
158158
[sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
159159
enable_checkpointing=False,
160+
enable_goodput_recording=False,
160161
run_name="circular_minimum_microbatches",
161162
max_target_length=128,
162163
base_emb_dim=28,
@@ -173,6 +174,7 @@ def test_circular_extra_microbatches_same_output_and_grad(self):
173174
config = pyconfig.initialize(
174175
[sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
175176
enable_checkpointing=False,
177+
enable_goodput_recording=False,
176178
run_name="circular_extra_microbatches",
177179
max_target_length=128,
178180
base_emb_dim=28,
@@ -189,6 +191,7 @@ def test_circular_ag_once(self):
189191
config = pyconfig.initialize(
190192
[sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
191193
enable_checkpointing=False,
194+
enable_goodput_recording=False,
192195
run_name="circular_ag_once",
193196
max_target_length=128,
194197
base_emb_dim=28,
@@ -206,6 +209,7 @@ def test_non_circular_same_output_and_grad(self):
206209
config = pyconfig.initialize(
207210
[sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
208211
enable_checkpointing=False,
212+
enable_goodput_recording=False,
209213
run_name="non_circular",
210214
max_target_length=128,
211215
base_emb_dim=28,
@@ -239,6 +243,7 @@ def test_full_train_circular(self):
239243
"dataset_type=synthetic",
240244
"steps=3",
241245
"enable_checkpointing=False",
246+
"enable_goodput_recording=False",
242247
"ici_pipeline_parallelism=4",
243248
"num_layers_per_pipeline_stage=2",
244249
"num_pipeline_microbatches=8",
@@ -253,6 +258,7 @@ def test_delay_activation_forwarding_same_output_and_grad(self):
253258
config = pyconfig.initialize(
254259
[sys.argv[0], os.path.join(PKG_DIR, "configs", "base.yml")],
255260
enable_checkpointing=False,
261+
enable_goodput_recording=False,
256262
run_name="activation_forwarding",
257263
max_target_length=128,
258264
base_emb_dim=28,
@@ -287,6 +293,7 @@ def test_full_train_non_circular(self):
287293
"dataset_type=synthetic",
288294
"steps=3",
289295
"enable_checkpointing=False",
296+
"enable_goodput_recording=False",
290297
"ici_pipeline_parallelism=4",
291298
"num_layers_per_pipeline_stage=8",
292299
"num_pipeline_microbatches=8",
@@ -318,6 +325,7 @@ def test_subset_layers(self):
318325
"dataset_type=synthetic",
319326
"steps=3",
320327
"enable_checkpointing=False",
328+
"enable_goodput_recording=False",
321329
"ici_pipeline_parallelism=4",
322330
"num_layers_per_pipeline_stage=1",
323331
"num_pipeline_repeats=2",
@@ -351,6 +359,7 @@ def test_full_train_fp8(self):
351359
"dataset_type=synthetic",
352360
"steps=3",
353361
"enable_checkpointing=False",
362+
"enable_goodput_recording=False",
354363
"ici_pipeline_parallelism=4",
355364
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
356365
"quantization=fp8",
@@ -382,6 +391,7 @@ def test_full_train_nanoo_fp8(self):
382391
"dataset_type=synthetic",
383392
"steps=3",
384393
"enable_checkpointing=False",
394+
"enable_goodput_recording=False",
385395
"ici_pipeline_parallelism=4",
386396
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
387397
"quantization=nanoo_fp8",

MaxText/tests/simple_decoder_layer_test.py

+2
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ def test_simple_decoder_layer(self):
3232
r"dataset_path=gs://maxtext-dataset",
3333
"decoder_block=simple",
3434
"enable_checkpointing=False",
35+
"enable_goodput_recording=false",
3536
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
3637
"steps=3",
3738
]
@@ -48,6 +49,7 @@ def test_mlp_decoder_layer(self):
4849
r"dataset_path=gs://maxtext-dataset",
4950
"decoder_block=simple_mlp",
5051
"enable_checkpointing=False",
52+
"enable_goodput_recording=false",
5153
rf"tokenizer_path={os.path.join(os.path.dirname(PKG_DIR), 'assets', 'tokenizer.llama2')}",
5254
"steps=3",
5355
]

0 commit comments

Comments
 (0)