Commit 314ccc7

Cleanup: context parallel
1 parent 9359a01 commit 314ccc7

10 files changed: +300 −19 lines changed

examples/fsdp2/nd_parallel.py

Lines changed: 45 additions & 16 deletions

```diff
@@ -43,6 +43,7 @@ def parse_args():
     parser.add_argument("--dp-replicate-size", type=int, default=1)
     parser.add_argument("--dp-shard-size", type=int, default=1)
     parser.add_argument("--tp-size", type=int, default=1)
+    parser.add_argument("--cp-size", type=int, default=1)
     parser.add_argument("--sequence-length", type=int, default=1024)
     parser.add_argument("--num-steps", type=int, default=1000)
     parser.add_argument("--save-dir", type=str, default="./outputs")
@@ -52,17 +53,42 @@ def parse_args():
     return parser.parse_args()
 
 
-def forward(model, batch, optimizer, accelerator):
-    # To get the proper loss value, we need to average across devices that are participating in data parallel/context parallel training
-    loss_reduce_grp = (
-        accelerator.torch_device_mesh["dp_cp"].get_group() if accelerator.parallelism_config.dp_cp_dim_names else None
-    )
-    outputs = model(**batch)
-    loss = outputs.loss
-    accelerator.backward(loss)
-    optimizer.step()
-    optimizer.zero_grad()
-    dist.all_reduce(loss, op=dist.ReduceOp.AVG, group=loss_reduce_grp)
+def _self_attn_pre_forward_hook(module, *args, **kwargs):
+    kwargs = args[1]
+    kwargs["attention_mask"] = None
+    kwargs["is_causal"] = True
+    args = list(args)
+    args[1] = kwargs
+    return tuple(args)
+
+
+def fix_model(model):
+    for name, module in model.named_modules():
+        if name.endswith("self_attn"):
+            module: torch.nn.Module
+            module.register_forward_pre_hook(_self_attn_pre_forward_hook, with_kwargs=True)
+
+    return model
+
+
+def forward(model, batch, optimizer, accelerator: Accelerator):
+    input_ids, labels = batch["input_ids"], batch["labels"]
+    with accelerator.maybe_context_parallel(
+        buffers=[input_ids, labels], buffer_seq_dims=[1, 1], no_restore_buffers={input_ids, labels}
+    ):
+        # To get the proper loss value, we need to average across devices that are participating in data parallel/context parallel training
+        loss_reduce_grp = (
+            accelerator.torch_device_mesh["dp_cp"].get_group()
+            if accelerator.parallelism_config.dp_cp_dim_names
+            else None
+        )
+        outputs = model(**batch)
+        loss = outputs.loss
+        accelerator.backward(loss)
+        optimizer.step()
+        optimizer.zero_grad()
+        dist.all_reduce(loss, op=dist.ReduceOp.AVG, group=loss_reduce_grp)
+
     return loss
 
 
@@ -71,21 +97,22 @@ def train(args):
         dp_replicate_size=args.dp_replicate_size,
         dp_shard_size=args.dp_shard_size,
         tp_size=args.tp_size,
+        cp_size=args.cp_size,
     )
 
     # FSDP needs extra configuration, so we properly shard the model
-    if parallelism_config.dp_shard_enabled:
+    fsdp2_plugin = None
+    if parallelism_config.dp_shard_enabled or parallelism_config.cp_enabled:
         fsdp2_plugin = FullyShardedDataParallelPlugin(
             fsdp_version=2,
             auto_wrap_policy="transformer_based_wrap",
             transformer_cls_names_to_wrap=["LlamaDecoderLayer"],
+            state_dict_type="SHARDED_STATE_DICT",
+            activation_checkpointing=True,
         )
 
     accelerator = Accelerator(
-        log_with=["wandb"],
-        mixed_precision="bf16",
-        parallelism_config=parallelism_config,
-        fsdp_plugin=fsdp2_plugin if parallelism_config.dp_shard_enabled else None,
+        log_with=["wandb"], mixed_precision="bf16", parallelism_config=parallelism_config, fsdp_plugin=fsdp2_plugin
     )
     accelerator.init_trackers("nd_parallel_training")
 
@@ -107,6 +134,8 @@ def train(args):
     dataloader = DataLoader(dataset, batch_size=1, collate_fn=create_collate_fn())
 
     model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+    if parallelism_config.cp_enabled:
+        model = fix_model(model)
 
     total_num_steps = min(args.num_steps, len(dataloader))
     performance_tracker = PerformanceTracker(warmup_steps=5)
```
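
When context parallelism is enabled, the example routes every `self_attn` module through a pre-forward hook that drops the explicit attention mask and forces the causal flag, which is what `fix_model` does after `prepare()`. Below is a minimal, self-contained sketch of that `with_kwargs=True` hook pattern on a toy module; `ToyAttention` and `_pre_hook` are illustrative names, not part of this commit.

```python
# Minimal sketch (not from this commit) of the with_kwargs=True pre-forward hook
# pattern that `fix_model` relies on: the hook receives (module, args, kwargs)
# and may return a replacement (args, kwargs) pair.
import torch


class ToyAttention(torch.nn.Module):
    def forward(self, hidden_states, attention_mask=None, is_causal=False):
        # A real self-attention block would run attention here; we only report the flags it received.
        return {"attention_mask": attention_mask, "is_causal": is_causal}


def _pre_hook(module, args, kwargs):
    # Drop the explicit mask and force the causal path, mirroring the hook in the example above.
    kwargs["attention_mask"] = None
    kwargs["is_causal"] = True
    return args, kwargs


attn = ToyAttention()
attn.register_forward_pre_hook(_pre_hook, with_kwargs=True)
out = attn(torch.zeros(1, 4, 8), attention_mask=torch.ones(1, 4))
print(out)  # {'attention_mask': None, 'is_causal': True}
```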

src/accelerate/accelerator.py

Lines changed: 84 additions & 0 deletions

```diff
@@ -448,6 +448,12 @@ def __init__(
 
         parallelism_config = self._setup_parallelism_config(parallelism_config, torch_tp_plugin)
 
+        # TODO: Siro - figure out a better place where this can go (needs to be above AcceleratorState init)
+        if parallelism_config and parallelism_config.cp_enabled and fsdp_plugin is None:
+            raise ValueError(
+                "`cp_enabled` is set to `True` in the `parallelism_config`, but no `fsdp_plugin` was provided. We need a `fsdp_plugin` to use `cp_enabled=True`, as we also shard the model across the device mesh to save more memory"
+            )
+
         kwargs = self.init_handler.to_kwargs() if self.init_handler is not None else {}
         kwargs["parallelism_config"] = parallelism_config
         self.state = AcceleratorState(
@@ -776,6 +782,10 @@ def _setup_parallelism_config(
     ):
         if parallelism_config is None:
             if PartialState._shared_state != {} and PartialState().parallelism_config is not None:
+                if os.environ.get("ACCELERATE_USE_PARALLELISM_CONFIG", "false") == "true":
+                    raise ValueError(
+                        "Partial state contains a `parallelism_config` which is not None, but you configured `parallelism_config` from the `accelerate launch` CLI. We don't know which to use, please remove one of those configuration methods."
+                    )
                 parallelism_config = PartialState().parallelism_config
             else:
                 # TODO: Remove after deprecating tp_plugin
@@ -1497,6 +1507,9 @@ def prepare(self, *args, device_placement=None):
         if self.parallelism_config and self.parallelism_config.tp_enabled:
             args = self._prepare_tp(*args)
 
+        if self.parallelism_config and self.parallelism_config.cp_enabled:
+            self._prepare_cp()
+
         if self.fp8_backend == FP8BackendType.TE:
             args = self._prepare_te(*args)
         elif self.fp8_backend == FP8BackendType.AO:
@@ -1561,6 +1574,15 @@ def _prepare_tp(self, *args):
 
         return args
 
+    def _prepare_cp(self):
+        from torch.distributed.tensor.experimental import context_parallel
+        from torch.distributed.tensor.experimental._attention import set_rotate_method
+
+        cp_comm_strategy = self.parallelism_config.cp_handler.cp_comm_stategy
+        set_rotate_method(cp_comm_strategy)
+
+        self._cp_context = functools.partial(context_parallel, mesh=self.torch_device_mesh["cp"])
+
     def _prepare_fsdp2(self, *args):
         # First pass: prepare everything except schedulers (and model, which is prepared separately below)
         result = [
```
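
`_prepare_cp` wires up two experimental PyTorch APIs: `set_rotate_method` selects how key/value shards rotate between ranks ("allgather" or "alltoall", the same `cp_comm_strategy` choices exposed by the config and CLI changes further down), and `context_parallel` is the context manager that `maybe_context_parallel` (next hunk) ultimately enters. A rough sketch of the equivalent direct usage, assuming a `device_mesh` with a "cp" dimension and `input_ids`/`labels` tensors already exist:

```python
# Sketch only -- mirrors the calls made in `_prepare_cp`/`maybe_context_parallel`;
# `device_mesh`, `input_ids` and `labels` are assumed to exist already.
import functools

from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import set_rotate_method

set_rotate_method("allgather")  # or "alltoall"
cp_context = functools.partial(context_parallel, mesh=device_mesh["cp"])

# Shards the listed buffers along their sequence dimension for the duration of the block.
with cp_context(buffers=[input_ids, labels], buffer_seq_dims=[1, 1], no_restore_buffers={input_ids, labels}):
    ...  # run one training step's forward/backward on the sequence-sharded buffers
```
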
````diff
@@ -3903,6 +3925,68 @@ def register_for_checkpointing(self, *objects):
             raise ValueError(err)
         self._custom_objects.extend(objects)
 
+    @contextmanager
+    def maybe_context_parallel(
+        self,
+        buffers: list[torch.Tensor] | None = None,
+        buffer_seq_dims: list[int] | None = None,
+        no_restore_buffers: set[torch.Tensor] | None = None,
+    ):
+        """
+        A context manager that enables context parallel training.
+
+        Args:
+            buffers (`list[torch.Tensor]`, `optional`):
+                Buffers, which are going to be sharded along the sequence dimension. Common examples are inputs, labels
+                or positional embedding buffers. This context manager will modify these buffers in-place, and after
+                exiting the context, the buffers will be restored to their original state. To avoid unnecessary
+                restores, you can use `no_restore_buffers` to specify which buffers don't need to be restored.
+            buffer_seq_dims (`list[int]`, `optional`):
+                Sequence dimensions of `buffers`.
+            no_restore_buffers (`set[torch.Tensor]`, `optional`):
+                This set must be a subset of `buffers`. Specifies which buffers from `buffers` argument won't be
+                restored after the context exits. These buffers will be then kept in sharded state.
+
+        <Tip warning={true}>
+
+        `context_parallel` is currently only supported together with FSDP2, and requires `parallelism_config.cp_size` > 1. If either
+        of these conditions are not met, this context manager will have no effect, though to enable fewer code changes it will not raise an Exception.
+
+        </Tip>
+
+        <Tip warning={true}>
+
+        This context manager has to be recreated with each training step, as shown in the example below.
+
+        </Tip>
+
+        Example:
+
+        ```python
+        >>> for batch in dataloader:
+        ...     with accelerator.maybe_context_parallel(
+        ...         buffers=[batch["input_ids"], batch["attention_mask"]],
+        ...         buffer_seq_dims=[1, 1],
+        ...         no_restore_buffers={batch["input_ids"]},
+        ...     ):
+        ...         outputs = model(batch)
+        ...         ...
+        ```
+        """
+        # We don't need to check FSDP2 as parallelism_config does that for us
+        # Invariant: in this branch self._cp_context is set, as it was set by `self._prepare_cp`
+        if self.parallelism_config and self.parallelism_config.cp_enabled:
+            with self._cp_context(
+                buffers=buffers, buffer_seq_dims=buffer_seq_dims, no_restore_buffers=no_restore_buffers
+            ):
+                yield
+        else:
+            if not getattr(self, "_warned_about_cp", False) and self.is_main_process:
+                logger.warning("Context parallel training is not enabled. This context manager will have no effect.")
+                # As this context manager is recreated each training step, we only warn once
+                self._warned_about_cp = True
+            yield
+
     @contextmanager
     def autocast(self, autocast_handler: AutocastKwargs = None):
         """
````

src/accelerate/commands/config/cluster.py

Lines changed: 48 additions & 0 deletions

```diff
@@ -505,6 +505,53 @@ def get_cluster_input():
             error_message="Please enter yes or no.",
         )
 
+    parallelism_config = {}
+
+    if fsdp_config.get("fsdp_version", 1) == 2:
+        use_parallelism_config = _ask_field(
+            "Do you want to use the parallelism config? [yes/NO]: ",
+            _convert_yes_no_to_bool,
+            default=False,
+            error_message="Please enter yes or no.",
+        )
+
+        if use_parallelism_config:
+            prefix = "parallelism_config_"
+            parallelism_config[prefix + "dp_replicate_size"] = _ask_field(
+                "What is the data parallelism replicate size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+
+            parallelism_config[prefix + "dp_shard_size"] = _ask_field(
+                "What is the FSDP shard size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+
+            parallelism_config[prefix + "tp_size"] = _ask_field(
+                "What is the tensor parallelism size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+
+            parallelism_config[prefix + "cp_size"] = _ask_field(
+                "What is the context parallelism size? [1]: ",
+                int,
+                default=1,
+                error_message="Please enter an integer.",
+            )
+            if parallelism_config[prefix + "cp_size"] > 1:
+                parallelism_config[prefix + "cp_comm_strategy"] = _ask_options(
+                    "What is the compute parallelism communication strategy?",
+                    ["allgather", "alltoall"],
+                    lambda x: ["allgather", "alltoall"][int(x)],
+                    default=0,
+                )
+
     megatron_lm_config = {}
     if distributed_type in [DistributedType.MULTI_GPU]:
         use_megatron_lm = _ask_field(
@@ -849,6 +896,7 @@ def get_cluster_input():
         fp8_config=fp8_config,
         deepspeed_config=deepspeed_config,
         fsdp_config=fsdp_config,
+        parallelism_config=parallelism_config,
         megatron_lm_config=megatron_lm_config,
         ipex_config=ipex_config,
         mpirun_config=mpirun_config,
```
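
For reference, answering the new prompts yields a flat dict keyed with the `parallelism_config_` prefix, which is then stored on `ClusterConfig` (next file). A hypothetical set of answers:

```python
# Illustrative values only -- the keys are the ones written by the prompts above.
parallelism_config = {
    "parallelism_config_dp_replicate_size": 1,
    "parallelism_config_dp_shard_size": 2,
    "parallelism_config_tp_size": 1,
    "parallelism_config_cp_size": 4,
    # Only asked when cp_size > 1; chosen from ["allgather", "alltoall"].
    "parallelism_config_cp_comm_strategy": "allgather",
}
```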

src/accelerate/commands/config/config_args.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -194,6 +194,8 @@ class ClusterConfig(BaseConfig):
     deepspeed_config: dict = None
     # args for fsdp
     fsdp_config: dict = None
+    # args for parallelism config
+    parallelism_config: dict = None
     # args for megatron_lm
     megatron_lm_config: dict = None
     # args for ipex
@@ -229,6 +231,8 @@ def __post_init__(self):
             self.mpirun_config = {}
         if self.fp8_config is None:
            self.fp8_config = {}
+        if self.parallelism_config is None:
+            self.parallelism_config = {}
         return super().__post_init__()
 
 
```

src/accelerate/commands/launch.py

Lines changed: 56 additions & 0 deletions

```diff
@@ -269,6 +269,12 @@ def launch_command_parser(subparsers=None):
         action="store_true",
         help="Whether to use fsdp.",
     )
+    paradigm_args.add_argument(
+        "--use_parallelism_config",
+        default=False,
+        action="store_true",
+        help="Whether to use the parallelism config to configure the N-d distributed training.",
+    )
     paradigm_args.add_argument(
         "--use_megatron_lm",
         default=False,
@@ -767,6 +773,45 @@ def launch_command_parser(subparsers=None):
         help="The number of oneCCL worker threads when using Accelerate to launch multi-CPU training with mpirun.",
     )
 
+    # ParallelismConfig arguments
+    parallelism_config_args = parser.add_argument_group(
+        "ParallelismConfig Arguments",
+        "Arguments related to the ParallelismConfig used for distributed training.",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_dp_replicate_size",
+        type=int,
+        default=1,
+        help="The number of processes for data parallel training. Defaults to 1 (no data parallelism).",
+    )
+
+    parallelism_config_args.add_argument(
+        "--parallelism_config_dp_shard_size",
+        type=int,
+        default=-1,
+        help="The number of processes for FSDP sharding. Defaults to 1 (No FSDP sharding).",
+    )
+
+    parallelism_config_args.add_argument(
+        "--parallelism_config_tp_size",
+        type=int,
+        default=1,
+        help="The number of processes for tensor parallel training. Defaults to 1 (no tensor parallelism).",
+    )
+
+    parallelism_config_args.add_argument(
+        "--parallelism_config_cp_size",
+        type=int,
+        default=1,
+        help="The number of processese for context parallel training. Defaults to 1 (no context parallelism).",
+    )
+    parallelism_config_args.add_argument(
+        "--parallelism_config_cp_comm_strategy",
+        type=str,
+        default="allgather",
+        help="The communication strategy for context parallel training. Defaults to 'allgather'. Other option is alltoall",
+    )
+
     # Other arguments of the training scripts
     parser.add_argument("training_script_args", nargs=argparse.REMAINDER, help="Arguments of the training script.")
 
@@ -994,6 +1039,16 @@ def _validate_launch_command(args):
     if args.multi_gpu and (args.num_processes is not None) and (args.num_processes < 2):
         raise ValueError("You need to use at least 2 processes to use `--multi_gpu`.")
 
+    # TODO: Merge into 1 if
+    if not args.use_fsdp and args.use_parallelism_config:
+        raise ValueError(
+            "You cannot use `--use_parallelism_config` without `--use_fsdp`. Please set `--use_fsdp` to True if you want to use parallelism config."
+        )
+    elif args.fsdp_version == 1 and args.use_parallelism_config:
+        raise ValueError(
+            "You cannot use `--use_parallelism_config` with FSDP version 1. Please set `--fsdp_version=2` if you want to use parallelism config."
+        )
+
     defaults = None
     warned = []
     mp_from_config_flag = False
@@ -1027,6 +1082,7 @@ def _validate_launch_command(args):
         args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
         args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
         args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
+        args.use_parallelism_config = defaults.parallelism_config != {}
         if args.gpu_ids is None:
             if defaults.gpu_ids is not None:
                 args.gpu_ids = defaults.gpu_ids
```
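
The validation above means the new flags only make sense together with `--use_fsdp` and FSDP version 2. A hypothetical invocation is sketched below; the flag names come from the parser above, while `--fsdp_version`, the `train.py` script path and the sizes are placeholders or assumptions. It uses `subprocess` only to keep the example in Python; the equivalent shell command is simply the argument list joined with spaces.

```python
# Hypothetical launch command (placeholder script and sizes; `--fsdp_version` spelling assumed).
import subprocess

subprocess.run(
    [
        "accelerate", "launch",
        "--use_fsdp", "--fsdp_version", "2",  # parallelism_config requires FSDP2
        "--use_parallelism_config",
        "--parallelism_config_dp_shard_size", "2",
        "--parallelism_config_cp_size", "4",
        "--parallelism_config_cp_comm_strategy", "allgather",
        "train.py",  # placeholder training script
    ],
    check=True,
)
```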
