
Commit 1daa26b

Feat: address comments
1 parent 1188e8b commit 1daa26b

9 files changed, +60 -51 lines changed

docs/source/concept_guides/context_parallelism.md

Lines changed: 20 additions & 3 deletions

@@ -91,16 +91,33 @@ This can scale your context size to 1M+ sequence length potentially. Below, we s
 </p>

 > [!Tip]
-> These examples were created with a script you can find [in the examples folder](https://github.com/huggingface/accelerate/blob/main/examples/fsdp2/nd_parallel.py). For instructions on how to run it, see the [README](https://github.com/huggingface/accelerate/blob/main/examples/fsdp2/README.md) in the same folder.
+> These examples were created with a script you can find [in the examples folder](https://github.com/huggingface/accelerate/blob/main/examples/fsdp2/nd_parallel.py). To run the example on 8 H100 GPUs (128k sequence length), you can use the following command:
+> ```bash
+> accelerate launch --use-fsdp --fsdp-activation-checkpointing=TRUE examples/fsdp2/nd_parallel.py --cp-size=8 --sequence-length=128000
+> ```


 ## Accelerate's interface

 The context manager takes a few arguments that are used to configure context parallelism.

 - `buffers`: This is a list of tensors that are to be sharded across the sequence dimension. These tensors are usually input ids, labels and attention mask.
-- `buffer_seq_dims`: This is a list of integers, that specify the sequence dimension of the buffers, in the order of the `buffers` list.
-- `no_restore_buffers`: The implementation of context parallelism modifies the buffers in-place, converting them to `torch.distributed.tensor.Dtensor`s. After the context manager is exited, a communication kernel would need to be launched to restore the buffers to their original state (usually all-gather). This takes some time, so it is recommended to pass the same tensors as in the `buffers` argument, to avoid unnecessary communication, unless you are sure that you need to use the buffers after the context manager is exited.
+- `buffer_seq_dims`: This is a list of integers that specify the sequence dimension of each buffer, in the order of the `buffers` list. For example, if you pass `buffers=[input_ids, shift_labels]` with both having shape `[batch_size, sequence_length]`, you would pass `buffer_seq_dims=[1, 1]`, as the sequence dimension is the second dimension of the tensors. This is required for correct computation of the model outputs.
+- `no_restore_buffers`: The implementation of context parallelism modifies the buffers in-place, converting them to `torch.distributed.tensor.DTensor`s. After the context manager exits, a communication kernel would need to be launched to restore the buffers to their original state (usually an all-gather). This takes some time, so it is recommended to pass the same tensors as in the `buffers` argument to avoid unnecessary communication, unless you are sure that you need to use the buffers after the context manager exits.
+
+> [!Warning]
+> Context parallelism is not compatible with `labels` that are a copy of `input_ids`, which models from 🤗 transformers can shift themselves to enable causal language modeling.
+> Imagine this case:
+> labels = [l1, l2, l3, l4, ..., li]
+> If we apply context parallelism, each rank ends up with only a part of the labels, such as:
+> labels_rank_0 = [l1, l2], labels_rank_1 = [l3, l4], ...
+> After the transformers modeling code shifts the labels, each rank ends up with:
+> labels_rank_0 = [l2, PAD], labels_rank_1 = [l4, PAD], ...
+> where `PAD` is a padding token. This results in incorrect loss computation, as the labels are no longer aligned with the inputs.
+> Because of this, you need to manually shift the labels before passing them to the model.
+

 ## Configurable options
 Accelerate provides only a single option to configure context parallelism (apart from `cp_size`).
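
Putting the documented arguments together, here is a minimal sketch of the intended usage, based on the example script changed below. It assumes `model`, `batch`, and `accelerator` already exist and that context parallelism was enabled via the `Accelerator`'s `parallelism_config`:

```python
import torch

# Hypothetical training-step snippet; `batch` holds tensors of shape [batch_size, sequence_length].
input_ids = batch["input_ids"]
# Shift the labels ourselves: position t is supervised by token t + 1; the last position
# has no target, so it gets -100 (the ignore index of the loss).
shift_labels = torch.nn.functional.pad(input_ids[:, 1:], (0, 1), value=-100)

with accelerator.maybe_context_parallel(
    buffers=[input_ids, shift_labels],  # tensors to shard along the sequence dimension
    buffer_seq_dims=[1, 1],  # the sequence dimension is dim 1 for both buffers
    no_restore_buffers={input_ids, shift_labels},  # we don't reuse them afterwards, so skip the restore
):
    outputs = model(input_ids=input_ids)
    # compute the loss against `shift_labels` here, without any further shifting
```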

examples/fsdp2/nd_parallel.py

Lines changed: 8 additions & 5 deletions

@@ -23,6 +23,7 @@
 import torch.distributed as dist
 from torch.utils.data import DataLoader
 from transformers import AutoModelForCausalLM
+from transformers.loss.loss_utils import ForCausalLMLoss

 from accelerate import Accelerator
 from accelerate.parallelism_config import ParallelismConfig
@@ -54,9 +55,9 @@ def parse_args():


 def forward(model, batch, optimizer, accelerator: Accelerator):
-    input_ids, labels = batch["input_ids"], batch["labels"]
+    input_ids, shift_labels = batch["input_ids"], batch["shift_labels"]
     with accelerator.maybe_context_parallel(
-        buffers=[input_ids, labels], buffer_seq_dims=[1, 1], no_restore_buffers={input_ids, labels}
+        buffers=[input_ids, shift_labels], buffer_seq_dims=[1, 1], no_restore_buffers={input_ids, shift_labels}
     ):
         # To get the proper loss value, we need to average across devices that are participating in data parallel/context parallel training
         loss_reduce_grp = (
@@ -65,7 +66,10 @@ def forward(model, batch, optimizer, accelerator: Accelerator):
             else None
         )
         outputs = model(**batch)
-        loss = outputs.loss
+        # With shift labels we need to compute loss ourselves
+        loss = ForCausalLMLoss(
+            logits=outputs.logits, labels=None, shift_labels=shift_labels, vocab_size=model.config.vocab_size
+        )
         accelerator.backward(loss)
         optimizer.step()
         optimizer.zero_grad()
@@ -90,7 +94,6 @@ def train(args):
         auto_wrap_policy="transformer_based_wrap",
         transformer_cls_names_to_wrap=["LlamaDecoderLayer"],
         state_dict_type="SHARDED_STATE_DICT",
-        activation_checkpointing=True,
     )

     accelerator = Accelerator(
@@ -155,7 +158,7 @@ def train(args):
 if __name__ == "__main__":
     set_seed(42)
     args = parse_args()
-    if args.dp_shard_size == 1:
+    if args.dp_shard_size == 1 and args.tp_size > 1:
         # We currently don't support saving with `save_state` when using only
         # tensor parallelism, fsdp must be enabled
         warnings.warn(
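
The `ForCausalLMLoss(..., shift_labels=...)` call above skips the internal label shifting that transformers models normally perform. As a minimal sketch (not the transformers implementation), the quantity it boils down to is a plain token-level cross-entropy under the usual `-100` ignore-index convention:

```python
import torch
import torch.nn.functional as F


def causal_lm_loss_with_shift_labels(
    logits: torch.Tensor,  # [batch, seq, vocab]
    shift_labels: torch.Tensor,  # [batch, seq], already shifted by one position
    ignore_index: int = -100,
) -> torch.Tensor:
    # No shifting here: the logits at position t are scored directly against shift_labels[:, t].
    return F.cross_entropy(
        logits.float().reshape(-1, logits.size(-1)),
        shift_labels.reshape(-1),
        ignore_index=ignore_index,
    )
```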

examples/fsdp2/utils.py

Lines changed: 3 additions & 3 deletions

@@ -69,7 +69,7 @@ def create_packed_sequences(examples):
             packed_input_ids.append(full_sequence[:-1])
             packed_labels.append(full_sequence[1:])

-        return {"input_ids": packed_input_ids, "labels": packed_labels}
+        return {"input_ids": packed_input_ids, "shift_labels": packed_labels}

     with accelerator.main_process_first():
         packed_dataset = tokenized_dataset.map(
@@ -111,8 +111,8 @@ def create_collate_fn():

     def collate_fn(batch):
         input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
-        labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
-        return {"input_ids": input_ids, "labels": labels}
+        shift_labels = torch.tensor([item["shift_labels"] for item in batch], dtype=torch.long)
+        return {"input_ids": input_ids, "shift_labels": shift_labels}

     return collate_fn
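
Renaming the field to `shift_labels` makes the pre-shifting explicit. A tiny illustration of what `create_packed_sequences` returns (the token ids are hypothetical):

```python
full_sequence = [101, 7592, 2088, 2003, 102]  # hypothetical token ids

input_ids = full_sequence[:-1]    # [101, 7592, 2088, 2003]
shift_labels = full_sequence[1:]  # [7592, 2088, 2003, 102]

# input_ids[t] is trained to predict shift_labels[t], so the model must not
# shift the labels again internally.
```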

src/accelerate/accelerator.py

Lines changed: 7 additions & 15 deletions

@@ -35,7 +35,7 @@

 from accelerate.utils.dataclasses import FP8BackendType

-from .big_modeling import attach_context_parallel_hooks
+from .big_modeling import _attach_context_parallel_hooks
 from .checkpointing import load_accelerator_state, load_custom_state, save_accelerator_state, save_custom_state
 from .data_loader import DataLoaderDispatcher, prepare_data_loader, skip_first_batches
 from .logging import get_logger
@@ -449,12 +449,6 @@ def __init__(

         parallelism_config = self._setup_parallelism_config(parallelism_config, torch_tp_plugin)

-        # TODO: Siro - figure out a better place where this can go (needs to be above AcceleratorState init)
-        if parallelism_config and parallelism_config.cp_enabled and fsdp_plugin is None:
-            raise ValueError(
-                "`cp_enabled` is set to `True` in the `parallelism_config`, but no `fsdp_plugin` was provided. We need a `fsdp_plugin` to use `cp_enabled=True`, as we also shard the model across the device mesh to save more memory"
-            )
-
         kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {}
         kwargs["parallelism_config"] = parallelism_config
         self.state = AcceleratorState(
@@ -1585,10 +1579,8 @@ def _prepare_cp(self, *args):
         self._cp_context = functools.partial(context_parallel, mesh=self.torch_device_mesh["cp"])

         for arg in args:
-            if not isinstance(arg, torch.nn.Module):
-                continue
-
-            attach_context_parallel_hooks(arg)
+            if isinstance(arg, torch.nn.Module):
+                _attach_context_parallel_hooks(arg)

         return args

@@ -3991,10 +3983,10 @@ def maybe_context_parallel(
         ):
             yield
         else:
-            if not getattr(self, "_warned_about_cp", False) and self.is_main_process:
-                logger.warning("Context parallel training is not enabled. This context manager will have no effect.")
-                # As this context manager is recreated each training step, we only warn once
-                self._warned_about_cp = True
+            logger.warning_once(
+                "Context parallel training is not enabled. This context manager will have no effect. "
+                "To enable it, set `parallelism_config.cp_size` > 1 in the `Accelerator` constructor."
+            )
             yield

     @contextmanager

src/accelerate/big_modeling.py

Lines changed: 1 addition & 1 deletion

@@ -749,7 +749,7 @@ def _attach_layerwise_casting_hooks(
     )


-def attach_context_parallel_hooks(
+def _attach_context_parallel_hooks(
     model: nn.Module,
 ):
     """

src/accelerate/commands/launch.py

Lines changed: 2 additions & 9 deletions

@@ -1039,15 +1039,8 @@ def _validate_launch_command(args):
     if args.multi_gpu and (args.num_processes is not None) and (args.num_processes < 2):
         raise ValueError("You need to use at least 2 processes to use `--multi_gpu`.")

-    # TODO: Merge into 1 if
-    if not args.use_fsdp and args.use_parallelism_config:
-        raise ValueError(
-            "You cannot use `--use_parallelism_config` without `--use_fsdp`. Please set `--use_fsdp` to True if you want to use parallelism config."
-        )
-    elif args.fsdp_version == 1 and args.use_parallelism_config:
-        raise ValueError(
-            "You cannot use `--use_parallelism_config` with FSDP version 1. Please set `--fsdp_version=2` if you want to use parallelism config."
-        )
+    if (not args.use_fsdp or args.fsdp_version == 1) and args.use_parallelism_config:
+        raise ValueError("You cannot use `--use_parallelism_config` without `--use_fsdp` and `--fsdp_version=2`.")

     defaults = None
     warned = []
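
The two separate checks collapse into one condition here. A quick standalone check (not part of the codebase) that the merged predicate rejects exactly the same flag combinations as the original `if`/`elif` pair:

```python
def old_invalid(use_fsdp: bool, fsdp_version: int, use_parallelism_config: bool) -> bool:
    # original logic: two branches, each raising for its own case
    return (not use_fsdp and use_parallelism_config) or (fsdp_version == 1 and use_parallelism_config)


def new_invalid(use_fsdp: bool, fsdp_version: int, use_parallelism_config: bool) -> bool:
    # merged condition from this commit
    return (not use_fsdp or fsdp_version == 1) and use_parallelism_config


assert all(
    old_invalid(f, v, p) == new_invalid(f, v, p)
    for f in (True, False)
    for v in (1, 2)
    for p in (True, False)
)
```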

src/accelerate/parallelism_config.py

Lines changed: 8 additions & 8 deletions

@@ -57,10 +57,10 @@ class ParallelismConfig:
     """

-    dp_replicate_size: int = 1
-    dp_shard_size: int = 1
-    tp_size: int = 1
-    cp_size: int = 1
+    dp_replicate_size: int = None
+    dp_shard_size: int = None
+    tp_size: int = None
+    cp_size: int = None

     # we use Union because we might support other x parallel plugins (i.e. deepspeed, etc)
     tp_handler: Union[None, TorchTensorParallelConfig] = None
@@ -210,13 +210,13 @@ def _get_mesh(self) -> tuple[tuple[int, ...], tuple[str, ...]]:

     def __post_init__(self):
         # Basic size validation
-        if self.dp_replicate_size == 1:
+        if self.dp_replicate_size is None:
             self.dp_replicate_size = int(os.environ.get("PARALLELISM_CONFIG_DP_REPLICATE_SIZE", "1"))
-        if self.dp_shard_size == 1:
+        if self.dp_shard_size is None:
             self.dp_shard_size = int(os.environ.get("PARALLELISM_CONFIG_DP_SHARD_SIZE", "1"))
-        if self.tp_size == 1:
+        if self.tp_size is None:
             self.tp_size = int(os.environ.get("PARALLELISM_CONFIG_TP_SIZE", "1"))
-        if self.cp_size == 1:
+        if self.cp_size is None:
             self.cp_size = int(os.environ.get("PARALLELISM_CONFIG_CP_SIZE", "1"))

         if self.tp_size > 1:
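
With the defaults changed from `1` to `None`, the environment variables only fill in sizes the user did not set explicitly, and an explicit value is never silently overridden. A small sketch of the resulting behavior (the env variable names come from `__post_init__` above):

```python
import os

from accelerate.parallelism_config import ParallelismConfig

# e.g. exported by a launcher; consulted only for fields left as None
os.environ["PARALLELISM_CONFIG_DP_SHARD_SIZE"] = "4"

pc_from_env = ParallelismConfig()  # dp_shard_size -> 4, unset sizes fall back to 1
pc_explicit = ParallelismConfig(dp_shard_size=2)  # explicit value wins; the env var is ignored
```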

src/accelerate/state.py

Lines changed: 4 additions & 0 deletions

@@ -981,6 +981,10 @@ def __init__(
                 DistributedType.MULTI_XPU,
                 DistributedType.MULTI_HPU,
             ]:
+                if self.parallelism_config and self.parallelism_config.cp_enabled and fsdp_plugin is None:
+                    raise ValueError(
+                        "`cp_size > 1` in the `parallelism_config`, but no `fsdp_plugin` was provided. We need an `fsdp_plugin` to use `cp_enabled=True`, as we also shard the model across the device mesh to save more memory."
+                    )
                 if (os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" or fsdp_plugin is not None) or (
                     self.parallelism_config is not None and self.parallelism_config.cp_enabled
                 ):

tests/test_dataclasses.py

Lines changed: 7 additions & 7 deletions

@@ -105,10 +105,10 @@ def test_get_mesh(
     ):
         # Skip tests based on version requirements
         if _should_skip_cp_test(cp_size):
-            pytest.skip(f"CP tests require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
+            pytest.skip(f"tests with `cp_size>1` require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
         if _should_skip_tp_test(tp_size):
             pytest.skip(
-                f"TP tests require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
+                f"tests with `tp_size>1` require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
             )

         config = ParallelismConfig(
@@ -145,10 +145,10 @@ def test_build_device_mesh(
         """Test build_device_mesh creates correct mesh and applies flattening."""
         # Skip tests based on version requirements
         if _should_skip_cp_test(cp_size):
-            pytest.skip(f"CP tests require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
+            pytest.skip(f"tests with `cp_size>1` require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
         if _should_skip_tp_test(tp_size):
             pytest.skip(
-                f"TP tests require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
+                f"tests with `tp_size>1` require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
             )

         config = ParallelismConfig(
@@ -194,10 +194,10 @@ def test_from_env(
         cp_size,
     ):
         if _should_skip_cp_test(cp_size):
-            pytest.skip(f"CP tests require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
+            pytest.skip(f"tests with `cp_size>1` require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
         if _should_skip_tp_test(tp_size):
             pytest.skip(
-                f"TP tests require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
+                f"tests with `tp_size>1` require torch >= {BETA_TP_AVAILABLE_PYTORCH_VERSION}, transformers available and >= {BETA_TP_AVAILABLE_TRANSFORMERS_VERSION}"
             )

         new_env = {
@@ -217,7 +217,7 @@ def test_cp_handler(self):

         # Any cp_size > 1 requires torch >= BETA_CP_AVAILABLE_PYTORCH_VERSION, we use placeholder for this check as this test doesn't depend on a specific size
         if _should_skip_cp_test(2):
-            pytest.skip(f"CP tests require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")
+            pytest.skip(f"tests with `cp_size>1` require torch >= {BETA_CP_AVAILABLE_PYTORCH_VERSION}")

         from accelerate.utils import TorchContextParallelConfig
