|
36 | 36 | catchup=False,
|
37 | 37 | concurrency=2,
|
38 | 38 | ) as dag:
|
39 |
| - # Testing configurations |
40 |
| - tpu_configs = { |
41 |
| - # accelerator: [(model_size, num_cores), ...], |
42 |
| - "v4": [("22b", 128), ("52b", 384)], |
43 |
| - "v5e": [("16b", 256), ("32b", 256), ("64b", 256), ("128b", 256)], |
44 |
| - "v5p": [ |
45 |
| - ("32b", 128), |
46 |
| - ("64b", 128), |
47 |
| - ("128b", 256), |
48 |
| - ("128b", 512), |
49 |
| - ("256b", 1024), |
50 |
| - ("512b", 1024), |
51 |
| - ("1024b", 2048), |
52 |
| - ("1024b", 4096), |
53 |
| - ], |
54 |
| - } |
55 |
| - num_slices = [1, 2] |
56 |
| - docker_images = [ |
57 |
| - (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), |
58 |
| - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), |
59 |
| - ] |
60 |
| - |
61 |
| - run_model_cmds_dict = {} |
62 |
| - for tpu, models in tpu_configs.items(): |
63 |
| - run_model_cmds = [] |
64 |
| - for model_size, num_cores in models: |
65 |
| - for n in num_slices: |
66 |
| - cmd = f"bash MaxText/configs/{tpu}/{model_size}.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY={tpu}-{num_cores} M_COMPILE_TOPOLOGY_NUM_SLICES={n}" |
67 |
| - run_model_cmds.append(cmd) |
68 |
| - run_model_cmds_dict[tpu] = run_model_cmds |
69 | 39 |
|
70 | 40 | quarantine_task_group = TaskGroup(
|
71 | 41 | group_id="Quarantine", dag=dag, prefix_group_id=False
|
72 | 42 | )
|
73 | 43 |
|
74 |
| - for mode, image in docker_images: |
75 |
| - maxtext_v4_configs_test = gke_config.get_gke_config( |
76 |
| - time_out_in_min=60, |
77 |
| - test_name=f"maxtext-aot-v4-{mode.value}", |
78 |
| - run_model_cmds=run_model_cmds_dict["v4"], |
79 |
| - docker_image=image.value, |
80 |
| - test_owner=test_owner.RAYMOND_Z, |
81 |
| - ).run_with_quarantine(quarantine_task_group) |
82 |
| - |
83 |
| - maxtext_v5e_configs_test = gke_config.get_gke_config( |
84 |
| - time_out_in_min=60, |
85 |
| - test_name=f"maxtext-aot-v5e-{mode.value}", |
86 |
| - run_model_cmds=run_model_cmds_dict["v5e"], |
87 |
| - docker_image=image.value, |
88 |
| - test_owner=test_owner.RAYMOND_Z, |
89 |
| - ).run_with_quarantine(quarantine_task_group) |
90 |
| - |
91 |
| - maxtext_v5p_configs_test = gke_config.get_gke_config( |
92 |
| - time_out_in_min=60, |
93 |
| - test_name=f"maxtext-aot-v5p-{mode.value}", |
94 |
| - run_model_cmds=run_model_cmds_dict["v5p"], |
95 |
| - docker_image=image.value, |
96 |
| - test_owner=test_owner.RAYMOND_Z, |
97 |
| - ).run_with_quarantine(quarantine_task_group) |
98 |
| - |
99 |
| - ( |
100 |
| - maxtext_v4_configs_test |
101 |
| - >> maxtext_v5e_configs_test |
102 |
| - >> maxtext_v5p_configs_test |
103 |
| - ) |
104 |
| - |
105 | 44 | # GPU AoT tests
|
106 | 45 | cmd = f"bash MaxText/configs/a3/llama_2_7b/16vm.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY=a3 M_COMPILE_TOPOLOGY_NUM_SLICES=16"
|
107 | 46 | stable_a3_gpu = gke_config.get_maxtext_end_to_end_gpu_gke_test_config(
|
|
0 commit comments