Remove TPU configs from AOT test and rename HybridSim DAG to maxtext_configs_hybridsim

raymondzouu · raymondzouu · commit 2b1eca7867cb · 2025-01-14T16:47:35.000Z
diff --git a/dags/common/quarantined_tests.py b/dags/common/quarantined_tests.py
@@ -212,7 +212,7 @@ class QuarantineTests:
       # DAG: maxtext_configs_aot
       "maxtext-aot-v5e-stable-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"),
       "maxtext-aot-v5e-nightly-v4-8": TestInfo(team.PERFORMANCE, "2024-11-12"),
-      # DAG: maxtext_configs_aot_hybridsim
+      # DAG: maxtext_configs_hybridsim
       "16b-1xv5litepod-256-aot-hybridsim": TestInfo(
           team.PERFORMANCE, "2024-11-12"
       ),
diff --git a/dags/multipod/maxtext_configs_aot.py b/dags/multipod/maxtext_configs_aot.py
@@ -36,72 +36,11 @@
     catchup=False,
     concurrency=2,
 ) as dag:
-  # Testing configurations
-  tpu_configs = {
-      # accelerator: [(model_size, num_cores), ...],
-      "v4": [("22b", 128), ("52b", 384)],
-      "v5e": [("16b", 256), ("32b", 256), ("64b", 256), ("128b", 256)],
-      "v5p": [
-          ("32b", 128),
-          ("64b", 128),
-          ("128b", 256),
-          ("128b", 512),
-          ("256b", 1024),
-          ("512b", 1024),
-          ("1024b", 2048),
-          ("1024b", 4096),
-      ],
-  }
-  num_slices = [1, 2]
-  docker_images = [
-      (SetupMode.STABLE, DockerImage.MAXTEXT_TPU_JAX_STABLE_STACK),
-      (SetupMode.NIGHTLY, DockerImage.MAXTEXT_TPU_JAX_NIGHTLY),
-  ]
-
-  run_model_cmds_dict = {}
-  for tpu, models in tpu_configs.items():
-    run_model_cmds = []
-    for model_size, num_cores in models:
-      for n in num_slices:
-        cmd = f"bash MaxText/configs/{tpu}/{model_size}.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY={tpu}-{num_cores} M_COMPILE_TOPOLOGY_NUM_SLICES={n}"
-        run_model_cmds.append(cmd)
-    run_model_cmds_dict[tpu] = run_model_cmds
 
   quarantine_task_group = TaskGroup(
       group_id="Quarantine", dag=dag, prefix_group_id=False
   )
 
-  for mode, image in docker_images:
-    maxtext_v4_configs_test = gke_config.get_gke_config(
-        time_out_in_min=60,
-        test_name=f"maxtext-aot-v4-{mode.value}",
-        run_model_cmds=run_model_cmds_dict["v4"],
-        docker_image=image.value,
-        test_owner=test_owner.RAYMOND_Z,
-    ).run_with_quarantine(quarantine_task_group)
-
-    maxtext_v5e_configs_test = gke_config.get_gke_config(
-        time_out_in_min=60,
-        test_name=f"maxtext-aot-v5e-{mode.value}",
-        run_model_cmds=run_model_cmds_dict["v5e"],
-        docker_image=image.value,
-        test_owner=test_owner.RAYMOND_Z,
-    ).run_with_quarantine(quarantine_task_group)
-
-    maxtext_v5p_configs_test = gke_config.get_gke_config(
-        time_out_in_min=60,
-        test_name=f"maxtext-aot-v5p-{mode.value}",
-        run_model_cmds=run_model_cmds_dict["v5p"],
-        docker_image=image.value,
-        test_owner=test_owner.RAYMOND_Z,
-    ).run_with_quarantine(quarantine_task_group)
-
-    (
-        maxtext_v4_configs_test
-        >> maxtext_v5e_configs_test
-        >> maxtext_v5p_configs_test
-    )
-
   # GPU AoT tests
   cmd = f"bash MaxText/configs/a3/llama_2_7b/16vm.sh EXECUTABLE=train_compile.py M_COMPILE_TOPOLOGY=a3 M_COMPILE_TOPOLOGY_NUM_SLICES=16"
   stable_a3_gpu = gke_config.get_maxtext_end_to_end_gpu_gke_test_config(
diff --git a/dags/multipod/maxtext_configs_hybridsim.py b/dags/multipod/maxtext_configs_hybridsim.py
@@ -38,7 +38,7 @@ def hybridsim_compile_and_run(test_group_id):
     shared_gcs_location = name_format.generate_gcs_folder_location.override(
         task_id=f"{test_group_id}_generate_gcs_folder_location"
     )(
-        f"{gcs_subfolder}/maxtext_configs_aot_hybridsim/v{tpu.value}",
+        f"{gcs_subfolder}/maxtext_configs_hybridsim/v{tpu.value}",
         test_group_id,
     )
 
@@ -83,7 +83,7 @@ def hybridsim_compile_and_run(test_group_id):
 
 
 with models.DAG(
-    dag_id="maxtext_configs_aot_hybridsim",
+    dag_id="maxtext_configs_hybridsim",
     schedule=SCHEDULED_TIME,
     tags=["multipod_team", "maxtext", "nightly", "mlscale_onduty"],
     start_date=datetime.datetime(2024, 2, 19),