ENH: Allow FSDP ignored modules to be regex #3698

Merged
11 changes: 11 additions & 0 deletions src/accelerate/accelerator.py
@@ -1732,6 +1732,17 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 "limit_all_gathers": fsdp_plugin.limit_all_gathers,
                 "device_id": self.device,
             }
+
+            if isinstance(kwargs["ignored_modules"], str):
+                reg = re.compile(kwargs["ignored_modules"])
+                ignored = []
+                for name, module in model.named_modules():
+                    if reg.fullmatch(name):
+                        # ensure that the device for these modules is still set correctly
+                        module.to(self.device)
+                        ignored.append(module)
+                kwargs["ignored_modules"] = ignored
+
             model = FSDP(model, **kwargs)
             if fsdp_plugin.activation_checkpointing:
                 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
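The block above resolves a regex string into concrete module objects before the model is handed to FSDP. Below is a minimal, self-contained sketch of the same matching strategy outside of Accelerate; the toy model and the pattern are made up for illustration.

import re

import torch.nn as nn

# Toy model standing in for the user's network; the submodule names are illustrative only.
model = nn.Sequential()
model.add_module("embed", nn.Embedding(10, 8))
model.add_module("proj", nn.Linear(8, 8))
model.add_module("norm", nn.LayerNorm(8))

# Same idea as the diff: fullmatch the pattern against every name from
# named_modules() and collect the modules whose names match.
pattern = re.compile(r".*norm")  # hypothetical pattern, not taken from the PR
ignored = [module for name, module in model.named_modules() if pattern.fullmatch(name)]

print([type(m).__name__ for m in ignored])  # -> ['LayerNorm']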
10 changes: 7 additions & 3 deletions src/accelerate/utils/dataclasses.py
@@ -1516,8 +1516,9 @@ class FullyShardedDataParallelPlugin:
             Whether to offload parameters to CPU. Should be either a `bool` or an instance of
             `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload` or
             `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffloadPolicy` if `fsdp_version` is set to 2.
-        ignored_modules (`Optional[Iterable[torch.nn.Module]]`, defaults to `None`):
-            A list of modules to ignore when wrapping with FSDP.
+        ignored_modules (`Optional[Union[Iterable[torch.nn.Module], str]]`, defaults to `None`):
+            A list of modules to ignore when wrapping with FSDP. When passing a string, will match the modules by name
+            using regex fullmatch.
         state_dict_type (`Union[str, torch.distributed.fsdp.StateDictType]`, defaults to `'FULL_STATE_DICT'`):
             State dict type to use. If a string, it must be one of `full_state_dict`, `local_state_dict`, or
             `sharded_state_dict`.
@@ -1607,7 +1608,7 @@ class FullyShardedDataParallelPlugin:
             "help": "Whether to offload parameters to CPU. Should be either a `bool` or an instance of `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffload` or `torch.distributed.fsdp.fully_sharded_data_parallel.CPUOffloadPolicy` if `fsdp_version` is set to 2. Defaults to `False`"
         },
     )
-    ignored_modules: Optional[Iterable[torch.nn.Module]] = field(
+    ignored_modules: Optional[Union[Iterable[torch.nn.Module], str]] = field(
         default=None,
         metadata={"help": "A list of modules to ignore when wrapping with FSDP."},
     )
@@ -1839,6 +1840,9 @@ def __post_init__(self):
                 str_to_bool(os.environ.get(env_prefix + "ACTIVATION_CHECKPOINTING", "False")) == 1
             )
 
+        if self.ignored_modules is None:
+            self.ignored_modules = os.environ.get(env_prefix + "IGNORED_MODULES", None)
+
         if self.cpu_ram_efficient_loading is None:
             self.cpu_ram_efficient_loading = (
                 str_to_bool(os.environ.get(env_prefix + "CPU_RAM_EFFICIENT_LOADING", "False")) == 1
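With the widened type, `ignored_modules` can be passed directly as a regex string when building the plugin. A hedged usage sketch follows; the pattern `.*\.lm_head` is an assumption made for illustration, while `FullyShardedDataParallelPlugin` and `Accelerator(fsdp_plugin=...)` are existing Accelerate APIs.

from accelerate import Accelerator
from accelerate.utils import FullyShardedDataParallelPlugin

# Assumption: every module whose qualified name ends in ".lm_head" should be
# excluded from FSDP wrapping; the string is matched with re.fullmatch.
fsdp_plugin = FullyShardedDataParallelPlugin(ignored_modules=r".*\.lm_head")

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
# model = accelerator.prepare(model)  # matching submodules stay unwrapped by FSDP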
2 changes: 2 additions & 0 deletions src/accelerate/utils/launch.py
@@ -328,6 +328,8 @@ def prepare_multi_gpu_env(args: argparse.Namespace) -> dict[str, str]:
        current_env["FSDP_CPU_RAM_EFFICIENT_LOADING"] = str(args.fsdp_cpu_ram_efficient_loading).lower()
        current_env["FSDP_SYNC_MODULE_STATES"] = str(args.fsdp_sync_module_states).lower()
        current_env["FSDP_ACTIVATION_CHECKPOINTING"] = str(args.fsdp_activation_checkpointing).lower()
+        if getattr(args, "fsdp_ignored_modules", None) is not None:
+            current_env["FSDP_IGNORED_MODULES"] = str(args.fsdp_ignored_modules)
 
     if args.use_megatron_lm:
         prefix = "MEGATRON_LM_"
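The launcher forwards the value to each worker through the `FSDP_IGNORED_MODULES` environment variable (populated from `args.fsdp_ignored_modules`, which the `fsdp_ignored_modules` config entry below maps to), and `__post_init__` falls back to that variable when `ignored_modules` is not set explicitly. A small sketch of that environment-driven path, assuming the default FSDP env prefix and an illustrative pattern:

import os

from accelerate.utils import FullyShardedDataParallelPlugin

# Simulate what the launcher does for a worker process (the pattern is assumed).
os.environ["FSDP_IGNORED_MODULES"] = r"model\.vision_tower\..*"

# ignored_modules is left unset, so __post_init__ falls back to the env var.
plugin = FullyShardedDataParallelPlugin()
print(plugin.ignored_modules)  # -> the raw pattern string; resolved to modules in prepare_model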
1 change: 1 addition & 0 deletions tests/test_configs/latest_fsdp.yaml
@@ -15,6 +15,7 @@ fsdp_config:
   fsdp_sync_module_states: true
   fsdp_transformer_layer_cls_to_wrap: BertLayer
   fsdp_use_orig_params: true
+  fsdp_ignored_modules: null
 machine_rank: 0
 main_training_function: main
 mixed_precision: 'no'