Skip to content

Commit 2b1eca7

Browse files
committed
Remove tpu configs from AOT test and update HybridSim test name
1 parent 2d2ce84 commit 2b1eca7

File tree

3 files changed: 3 additions (+3) and 64 deletions (-64).

dags/common/quarantined_tests.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -212,7 +212,7 @@ class QuarantineTests:
212212
# DAG: maxtext_configs_aot
213213
"maxtext-aot-v5e-stable-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"),
214214
"maxtext-aot-v5e-nightly-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"),
215-
# DAG: maxtext_configs_aot_hybridsim
215+
# DAG: maxtext_configs_hybridsim
216216
"16b-1xv5litepod-256-aot-hybridsim": TestInfo(
217217
team.PERFORMANCE, "2024-11-12"
218218
),

dags/multipod/maxtext_configs_aot.py

-61
Original file line number | Diff line number | Diff line change
@@ -36,72 +36,11 @@
3636
catchup=False,
3737
concurrency=2,
3838
) as dag:
39-
# Testing configurations
40-
tpu_configs = {
41-
# accelerator: [(model_size, num_cores), ...],
42-
"v4": [("22b", 128), ("52b", 384)],
43-
"v5e": [("16b", 256), ("32b", 256), ("64b", 256), ("128b", 256)],
44-
"v5p": [
45-
("32b", 128),
46-
("64b", 128),
47-
("128b", 256),
48-
("128b", 512),
49-
("256b", 1024),
50-
("512b", 1024),
51-
("1024b", 2048),
52-
("1024b", 4096),
53-
],
54-
}
55-
num_slices = [1, 2]
56-
docker_images = [
57-
(SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK),
58-
(SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY),
59-
]
60-
61-
run_model_cmds_dict = {}
62-
for tpu, models in tpu_configs.items():
63-
run_model_cmds = []
64-
for model_size, num_cores in models:
65-
for n in num_slices:
66-
cmd = f"bash MaxText/configs/{tpu}/{model_size}.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY={tpu}-{num_cores} M_COMPILE_TOPOLOGY_NUM_SLICES={n}"
67-
run_model_cmds.append(cmd)
68-
run_model_cmds_dict[tpu] = run_model_cmds
6939

7040
quarantine_task_group = TaskGroup(
7141
group_id="Quarantine", dag=dag, prefix_group_id=False
7242
)
7343

74-
for mode, image in docker_images:
75-
maxtext_v4_configs_test = gke_config.get_gke_config(
76-
time_out_in_min=60,
77-
test_name=f"maxtext-aot-v4-{mode.value}",
78-
run_model_cmds=run_model_cmds_dict["v4"],
79-
docker_image=image.value,
80-
test_owner=test_owner.RAYMOND_Z,
81-
).run_with_quarantine(quarantine_task_group)
82-
83-
maxtext_v5e_configs_test = gke_config.get_gke_config(
84-
time_out_in_min=60,
85-
test_name=f"maxtext-aot-v5e-{mode.value}",
86-
run_model_cmds=run_model_cmds_dict["v5e"],
87-
docker_image=image.value,
88-
test_owner=test_owner.RAYMOND_Z,
89-
).run_with_quarantine(quarantine_task_group)
90-
91-
maxtext_v5p_configs_test = gke_config.get_gke_config(
92-
time_out_in_min=60,
93-
test_name=f"maxtext-aot-v5p-{mode.value}",
94-
run_model_cmds=run_model_cmds_dict["v5p"],
95-
docker_image=image.value,
96-
test_owner=test_owner.RAYMOND_Z,
97-
).run_with_quarantine(quarantine_task_group)
98-
99-
(
100-
maxtext_v4_configs_test
101-
>> maxtext_v5e_configs_test
102-
>> maxtext_v5p_configs_test
103-
)
104-
10544
# GPU AoT tests
10645
cmd = f"bash MaxText/configs/a3/llama_2_7b/16vm.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY=a3 M_COMPILE_TOPOLOGY_NUM_SLICES=16"
10746
stable_a3_gpu = gke_config.get_maxtext_end_to_end_gpu_gke_test_config(

dags/multipod/maxtext_configs_aot_hybridsim.py → dags/multipod/maxtext_configs_hybridsim.py (renamed)

+2-2
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ def hybridsim_compile_and_run(test_group_id):
3838
shared_gcs_location = name_format.generate_gcs_folder_location.override(
3939
task_id=f"{test_group_id}_generate_gcs_folder_location"
4040
)(
41-
f"{gcs_subfolder}/maxtext_configs_aot_hybridsim/v{tpu.value}",
41+
f"{gcs_subfolder}/maxtext_configs_hybridsim/v{tpu.value}",
4242
test_group_id,
4343
)
4444

@@ -83,7 +83,7 @@ def hybridsim_compile_and_run(test_group_id):
8383

8484

8585
with models.DAG(
86-
dag_id="maxtext_configs_aot_hybridsim",
86+
dag_id="maxtext_configs_hybridsim",
8787
schedule=SCHEDULED_TIME,
8888
tags=["multipod_team", "maxtext", "nightly", "mlscale_onduty"],
8989
start_date=datetime.datetime(2024, 2, 19),

0 commit comments

Comments
 (0)