diff --git a/dags/common/quarantined_tests.py b/dags/common/quarantined_tests.py index 3b480f43e..2b4d37b60 100644 --- a/dags/common/quarantined_tests.py +++ b/dags/common/quarantined_tests.py @@ -212,7 +212,7 @@ class QuarantineTests: # DAG: maxtext_configs_aot "maxtext-aot-v5e-stable-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"), "maxtext-aot-v5e-nightly-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"), - # DAG: maxtext_configs_aot_hybridsim + # DAG: maxtext_configs_hybridsim "16b-1xv5litepod-256-aot-hybridsim": TestInfo( team.PERFORMANCE, "2024-11-12" ), diff --git a/dags/multipod/maxtext_configs_aot.py b/dags/multipod/maxtext_configs_aot.py index d9d463a50..ab6467844 100644 --- a/dags/multipod/maxtext_configs_aot.py +++ b/dags/multipod/maxtext_configs_aot.py @@ -36,72 +36,10 @@ catchup=False, concurrency=2, ) as dag: - # Testing configurations - tpu_configs = { - # accelerator: [(model_size, num_cores), ...], - "v4": [("22b", 128), ("52b", 384)], - "v5e": [("16b", 256), ("32b", 256), ("64b", 256), ("128b", 256)], - "v5p": [ - ("32b", 128), - ("64b", 128), - ("128b", 256), - ("128b", 512), - ("256b", 1024), - ("512b", 1024), - ("1024b", 2048), - ("1024b", 4096), - ], - } - num_slices = [1, 2] - docker_images = [ - (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK), - (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY), - ] - - run_model_cmds_dict = {} - for tpu, models in tpu_configs.items(): - run_model_cmds = [] - for model_size, num_cores in models: - for n in num_slices: - cmd = f"bash MaxText/configs/{tpu}/{model_size}.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY={tpu}-{num_cores} M_COMPILE_TOPOLOGY_NUM_SLICES={n}" - run_model_cmds.append(cmd) - run_model_cmds_dict[tpu] = run_model_cmds - quarantine_task_group = TaskGroup( group_id="Quarantine", dag=dag, prefix_group_id=False ) - for mode, image in docker_images: - maxtext_v4_configs_test = gke_config.get_gke_config( - time_out_in_min=60, - test_name=f"maxtext-aot-v4-{mode.value}", - run_model_cmds=run_model_cmds_dict["v4"], - docker_image=image.value, - test_owner=test_owner.RAYMOND_Z, - ).run_with_quarantine(quarantine_task_group) - - maxtext_v5e_configs_test = gke_config.get_gke_config( - time_out_in_min=60, - test_name=f"maxtext-aot-v5e-{mode.value}", - run_model_cmds=run_model_cmds_dict["v5e"], - docker_image=image.value, - test_owner=test_owner.RAYMOND_Z, - ).run_with_quarantine(quarantine_task_group) - - maxtext_v5p_configs_test = gke_config.get_gke_config( - time_out_in_min=60, - test_name=f"maxtext-aot-v5p-{mode.value}", - run_model_cmds=run_model_cmds_dict["v5p"], - docker_image=image.value, - test_owner=test_owner.RAYMOND_Z, - ).run_with_quarantine(quarantine_task_group) - - ( - maxtext_v4_configs_test - >> maxtext_v5e_configs_test - >> maxtext_v5p_configs_test - ) - # GPU AoT tests cmd = f"bash MaxText/configs/a3/llama_2_7b/16vm.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY=a3 M_COMPILE_TOPOLOGY_NUM_SLICES=16" stable_a3_gpu = gke_config.get_maxtext_end_to_end_gpu_gke_test_config( diff --git a/dags/multipod/maxtext_configs_aot_hybridsim.py b/dags/multipod/maxtext_configs_hybridsim.py similarity index 97% rename from dags/multipod/maxtext_configs_aot_hybridsim.py rename to dags/multipod/maxtext_configs_hybridsim.py index 9ec1fab5c..f4b8c1d74 100644 --- a/dags/multipod/maxtext_configs_aot_hybridsim.py +++ b/dags/multipod/maxtext_configs_hybridsim.py @@ -38,7 +38,7 @@ def hybridsim_compile_and_run(test_group_id): shared_gcs_location = name_format.generate_gcs_folder_location.override( task_id=f"{test_group_id}_generate_gcs_folder_location" )( - f"{gcs_subfolder}/maxtext_configs_aot_hybridsim/v{tpu.value}", + f"{gcs_subfolder}/maxtext_configs_hybridsim/v{tpu.value}", test_group_id, ) @@ -83,7 +83,7 @@ def hybridsim_compile_and_run(test_group_id): with models.DAG( - dag_id="maxtext_configs_aot_hybridsim", + dag_id="maxtext_configs_hybridsim", schedule=SCHEDULED_TIME, tags=["multipod_team", "maxtext", "nightly", "mlscale_onduty"], start_date=datetime.datetime(2024, 2, 19),