
Commit 0d90675

Fix TP, enable test, silence noisy logs (#2761)
1 parent fa92c96 commit 0d90675

File tree

4 files changed, +12 -6 lines changed


recipes/full_finetune_distributed.py

Lines changed: 5 additions & 1 deletion
@@ -231,7 +231,11 @@ def __init__(self, cfg: DictConfig) -> None:
         self._activation_offloading_use_streams = cfg.get(
             "activation_offloading_use_streams", True
         )
-        if self._activation_offloading_use_streams and self.parallel_dims.tp_enabled:
+        if (
+            self._enable_activation_offloading
+            and self._activation_offloading_use_streams
+            and self.parallel_dims.tp_enabled
+        ):
             warn(
                 message=(
                     "Using activation offloading with streams is not advised in tensor parallel, and may "

tests/recipes/test_full_finetune_distributed.py

Lines changed: 3 additions & 1 deletion
@@ -11,6 +11,7 @@

 import pytest
 import torch
+from packaging import version
 from tests.common import TUNE_PATH

 from tests.recipes.utils import (
@@ -130,7 +131,8 @@ def test_loss(
         )

     @pytest.mark.skipif(
-        torch.__version__ < "2.8.0", reason="2D parallel test requires PyTorch >= 2.8"
+        version.parse(torch.__version__).base_version < "2.8.0",
+        reason="2D parallel test requires PyTorch >= 2.8",
     )
     @pytest.mark.integration_test
     @pytest.mark.parametrize(
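This is the "enable test" part of the commit: under version-aware ordering, a 2.8 pre-release or source build (e.g. `2.8.0a0+git...`) sorts before `2.8.0`, so the old condition could skip the 2D parallel test on builds that support it. Taking `base_version` strips the pre-release/local suffix first. A small illustration of the `packaging.version` semantics involved (the version string is illustrative, not from the commit):

from packaging import version

nightly = version.parse("2.8.0a0+gitabc123")  # illustrative source/nightly build

# PEP 440: pre-releases sort before the final release, so this build looks
# "older" than 2.8.0 and would have matched the old skip condition.
print(nightly < version.parse("2.8.0"))        # True

# base_version drops pre-release and local segments, keeping only the
# release number, so 2.8 nightlies count as 2.8.0.
print(nightly.base_version)                    # "2.8.0"
print(nightly.base_version < "2.8.0")          # False -> test is no longer skipped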

torchtune/models/llama3/_parallelism.py

Lines changed: 2 additions & 2 deletions
@@ -37,8 +37,8 @@ def _get_base_llama_tp_training_plan(
         "norm": SequenceParallel(),
         "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()),
         "layers.*.attn": layerwise_prepare_module_input_cls(
-            input_layouts=(Shard(1), None),
-            desired_input_layouts=(Replicate(), None),
+            input_layouts=(Shard(1), Shard(1)),
+            desired_input_layouts=(Replicate(), Replicate()),
         ),
         "layers.*.mlp": layerwise_prepare_module_input_cls(
             input_layouts=(Shard(1),),

torchtune/models/llama4/_parallelism.py

Lines changed: 2 additions & 2 deletions
@@ -63,8 +63,8 @@ def decoder_only_tp_training_plan(model: nn.Module) -> dict[str, ParallelStyle]:
        layer_plan = {
            f"decoder.layers.{layer_id}.sa_norm": SequenceParallel(),
            f"decoder.layers.{layer_id}.attn": PrepareModuleInput(
-                input_layouts=(Shard(1), None),
-                desired_input_layouts=(Replicate(), None),
+                input_layouts=(Shard(1), Shard(1)),
+                desired_input_layouts=(Replicate(), Replicate()),
            ),
            f"decoder.layers.{layer_id}.attn.q_proj": ColwiseParallel(),
            f"decoder.layers.{layer_id}.attn.k_proj": ColwiseParallel(),
