
Commit 1ac8643

xpu enablement on left cases (#3654)
* 1. enable xpu for launcher 2. expand cuda only ds uts to xpu 3. expand profiler example to xpu
  Signed-off-by: YAO Matrix <[email protected]>
* fix style
  Signed-off-by: YAO Matrix <[email protected]>
* rename
  Signed-off-by: YAO Matrix <[email protected]>
* Update profiler.py
* Apply style fixes

---------

Signed-off-by: YAO Matrix <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 07ce748 commit 1ac8643

File tree

3 files changed: +30 additions, -26 deletions

examples/by_feature/profiler.py

Lines changed: 5 additions & 4 deletions
@@ -31,8 +31,8 @@
 #
 # This example trains a Bert base model on GLUE MRPC
 # in any of the following settings (with the same script):
-# - single CPU or single GPU
-# - multi GPUS (using PyTorch distributed mode)
+# - single CPU or single device (CUDA GPU, Intel XPU etc.)
+# - multi devices (using PyTorch distributed mode)
 # - (multi) TPUs
 # - fp16 (mixed-precision) or fp32 (normal precision)
 #
@@ -183,7 +183,8 @@ def training_function(config, args):
     # New Code #
     accelerator.print(
         prof.key_averages().table(
-            sort_by="self_cpu_time_total" if args.cpu else "self_cuda_time_total", row_limit=-1
+            sort_by="self_cpu_time_total" if args.cpu else f"self_{accelerator.device.type}_time_total",
+            row_limit=-1,
         )
     )
 
@@ -215,7 +216,7 @@ def main():
        choices=["no", "fp16", "bf16", "fp8"],
        help="Whether to use mixed precision. Choose"
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-        "and an Nvidia Ampere GPU.",
+        "and an Nvidia Ampere GPU or an Intel XPU.",
     )
     # New Code #
     parser.add_argument(
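The profiler change above replaces the hard-coded `self_cuda_time_total` sort key with one derived from `accelerator.device.type`, so the same script sorts the profile table by XPU time on Intel hardware and by CUDA time on NVIDIA hardware. A minimal standalone sketch of that pattern (not the full example script; it assumes a PyTorch recent enough to expose `ProfilerActivity.XPU` when running on XPU, and it only targets CPU/CUDA/XPU devices):

```python
# Minimal sketch of the device-agnostic profiler pattern used above.
# ProfilerActivity.XPU only exists in newer PyTorch releases, hence the hasattr guard.
import torch
from torch.profiler import ProfilerActivity, profile

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(128, 128).to(accelerator.device)

activities = [ProfilerActivity.CPU]
if accelerator.device.type == "cuda":
    activities.append(ProfilerActivity.CUDA)
elif accelerator.device.type == "xpu" and hasattr(ProfilerActivity, "XPU"):
    activities.append(ProfilerActivity.XPU)

with profile(activities=activities) as prof:
    model(torch.randn(8, 128, device=accelerator.device))

# "self_cuda_time_total" on NVIDIA GPUs, "self_xpu_time_total" on Intel XPUs,
# "self_cpu_time_total" when running on CPU.
sort_key = f"self_{accelerator.device.type}_time_total"
accelerator.print(prof.key_averages().table(sort_by=sort_key, row_limit=10))
```

When the selected device is the CPU, the derived key is simply `self_cpu_time_total`, which matches the explicit `args.cpu` branch kept in the example script.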

src/accelerate/launchers.py

Lines changed: 24 additions & 20 deletions
@@ -60,8 +60,8 @@ def notebook_launcher(
 
     <Tip warning={true}>
 
-    To use this function absolutely zero calls to a CUDA device must be made in the notebook session before calling. If
-    any have been made, you will need to restart the notebook and make sure no cells use any CUDA capability.
+    To use this function absolutely zero calls to a device must be made in the notebook session before calling. If any
+    have been made, you will need to restart the notebook and make sure no cells use any device capability.
 
     Setting `ACCELERATE_DEBUG_MODE="1"` in your environment will run a test before truly launching to ensure that none
     of those calls have been made.
@@ -76,11 +76,11 @@ def notebook_launcher(
             Tuple of arguments to pass to the function (it will receive `*args`).
         num_processes (`int`, *optional*):
             The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
-            the number of GPUs available otherwise.
+            the number of devices available otherwise.
         mixed_precision (`str`, *optional*, defaults to `"no"`):
-            If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
+            If `fp16` or `bf16`, will use mixed precision training on multi-device.
         use_port (`str`, *optional*, defaults to `"29500"`):
-            The port to use to communicate between processes when launching a multi-GPU training.
+            The port to use to communicate between processes when launching a multi-device training.
         master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
             The address to use for communication between processes.
         node_rank (`int`, *optional*, defaults to 0):
@@ -105,7 +105,7 @@ def notebook_launcher(
     Example:
 
     ```python
-    # Assume this is defined in a Jupyter Notebook on an instance with two GPUs
+    # Assume this is defined in a Jupyter Notebook on an instance with two devices
     from accelerate import notebook_launcher
 
 
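The docstring edits above generalize the wording from GPUs to devices; the calling pattern itself is unchanged. A hedged usage sketch on a machine with two CUDA GPUs or two Intel XPUs (the toy training function, `num_processes=2`, and `mixed_precision="bf16"` are illustrative choices, not part of this commit):

```python
# Illustrative only: launching a tiny training function from a notebook on
# whatever accelerator is present. num_processes=2 assumes two devices are visible.
import torch

from accelerate import Accelerator, notebook_launcher


def train():
    # The Accelerator is created only inside the training function, as the
    # warning in the docstring requires.
    accelerator = Accelerator()
    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = accelerator.prepare(model, optimizer)

    for _ in range(3):
        loss = model(torch.randn(4, 10, device=accelerator.device)).sum()
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
    accelerator.print("done")


notebook_launcher(train, args=(), num_processes=2, mixed_precision="bf16")
```

Nothing in the cell touches a device before `notebook_launcher` runs, which is the precondition the tip at the top of the docstring describes.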
@@ -158,27 +158,27 @@ def train(*args):
     else:
         if num_processes is None:
             raise ValueError(
-                "You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
+                "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
             )
         if node_rank >= num_nodes:
             raise ValueError("The node_rank must be less than the number of nodes.")
         if num_processes > 1:
-            # Multi-GPU launch
+            # Multi-device launch
             from torch.distributed.launcher.api import LaunchConfig, elastic_launch
             from torch.multiprocessing import start_processes
             from torch.multiprocessing.spawn import ProcessRaisedException
 
             if len(AcceleratorState._shared_state) > 0:
                 raise ValueError(
-                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
+                    "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
                     "inside your training function. Restart your notebook and make sure no cells initializes an "
                     "`Accelerator`."
                 )
-            # Check for specific libraries known to initialize CUDA that users constantly use
+            # Check for specific libraries known to initialize device that users constantly use
             problematic_imports = are_libraries_initialized("bitsandbytes")
             if len(problematic_imports) > 0:
                 err = (
-                    "Could not start distributed process. Libraries known to initialize CUDA upon import have been "
+                    "Could not start distributed process. Libraries known to initialize device upon import have been "
                     "imported already. Please keep these imports inside your training function to try and help with this:"
                 )
                 for lib_name in problematic_imports:
@@ -203,24 +203,26 @@ def train(*args):
             # process here (the other ones will be set be the launcher).
             with patch_environment(**patched_env):
                 # First dummy launch
+                device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+                distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
                 if os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower() == "true":
-                    launcher = PrepareForLaunch(test_launch, distributed_type="MULTI_GPU")
+                    launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
                     try:
                         start_processes(launcher, args=(), nprocs=num_processes, start_method="fork")
                     except ProcessRaisedException as e:
                         err = "An issue was found when verifying a stable environment for the notebook launcher."
-                        if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                        if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                             raise RuntimeError(
                                 f"{err}"
                                 "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                                 "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                                "which one is problematic and causing CUDA to be initialized."
+                                f"which one is problematic and causing {device_type.upper()} to be initialized."
                             ) from e
                         else:
                             raise RuntimeError(f"{err} The following error was raised: {e}") from e
                 # Now the actual launch
-                launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
-                print(f"Launching training on {num_processes} GPUs.")
+                launcher = PrepareForLaunch(function, distributed_type=distributed_type)
+                print(f"Launching training on {num_processes} {device_type.upper()}s.")
                 try:
                     if rdzv_conf is None:
                         rdzv_conf = {}
@@ -244,23 +246,25 @@ def train(*args):
                         launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
                     elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
-                    if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                    if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                         raise RuntimeError(
-                            "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
+                            f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
                             "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                             "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                            "which one is problematic and causing CUDA to be initialized."
+                            f"which one is problematic and causing {device_type.upper()} to be initialized."
                         ) from e
                     else:
                         raise RuntimeError(f"An issue was found when launching the training: {e}") from e
 
         else:
-            # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
+            # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
             if is_mps_available():
                 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
                 print("Launching training on MPS.")
             elif torch.cuda.is_available():
                 print("Launching training on one GPU.")
+            elif torch.xpu.is_available():
+                print("Launching training on one XPU.")
             else:
                 print("Launching training on CPU.")
             function(*args)
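The core of the launcher change is the `device_type`/`distributed_type` selection added above: `torch.accelerator.current_accelerator()` (present in recent PyTorch releases; the `hasattr` guard falls back to `"cuda"` on older ones) reports the active backend, and that value then drives `PrepareForLaunch` as well as the forked-subprocess re-initialization checks. A standalone sketch of the same selection logic, with an extra `None` check added here so it also degrades gracefully on CPU-only machines:

```python
# Standalone sketch of the selection logic added to notebook_launcher above.
# torch.accelerator only exists in recent PyTorch releases, hence the guard.
import torch


def detect_device_and_distributed_type() -> tuple[str, str]:
    """Return (device_type, distributed_type), mirroring the patched launcher."""
    if hasattr(torch, "accelerator") and torch.accelerator.current_accelerator() is not None:
        device_type = torch.accelerator.current_accelerator().type
    else:
        device_type = "cuda"  # the launcher's fallback for older PyTorch
    distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
    return device_type, distributed_type


device_type, distributed_type = detect_device_and_distributed_type()
print(device_type, distributed_type)  # e.g. "cuda MULTI_GPU" or "xpu MULTI_XPU"
```

Mapping everything that is not `xpu` to `MULTI_GPU` mirrors the patched code, where CUDA remains the default multi-device backend.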

tests/deepspeed/test_deepspeed_gradient_accumulation.py

Lines changed: 1 addition & 2 deletions
@@ -22,7 +22,7 @@
 from transformers.trainer_utils import set_seed
 
 from accelerate.accelerator import Accelerator
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda, require_deepspeed
+from accelerate.test_utils.testing import AccelerateTestCase, require_deepspeed
 from accelerate.test_utils.training import RegressionDataset
 from accelerate.utils import patch_environment
 from accelerate.utils.dataclasses import DeepSpeedPlugin
@@ -37,7 +37,6 @@
 
 
 @require_deepspeed
-@require_cuda
 class DeepSpeedGradientAccumulationTest(AccelerateTestCase):
     def setUp(self):
         super().setUp()
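Dropping `@require_cuda` lets this DeepSpeed test run on any supported accelerator instead of only NVIDIA GPUs, with `@require_deepspeed` still gating on DeepSpeed itself. As a hedged illustration of the general pattern, and not the decorator accelerate actually ships in `accelerate.test_utils.testing`, a device-agnostic guard could look like this hypothetical `require_accelerator`:

```python
# Hypothetical device-agnostic test guard, shown only to illustrate replacing a
# CUDA-only skip with one that also accepts Intel XPUs. Accelerate's own
# decorators live in accelerate.test_utils.testing and may differ.
import unittest

import torch


def _accelerator_available() -> bool:
    # True on CUDA GPUs or Intel XPUs; False on CPU-only machines.
    return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())


require_accelerator = unittest.skipUnless(_accelerator_available(), "test requires CUDA or XPU")


@require_accelerator
class ExampleDeviceTest(unittest.TestCase):
    def test_tensor_on_accelerator(self):
        device = "cuda" if torch.cuda.is_available() else "xpu"
        x = torch.ones(2, 2, device=device)
        self.assertEqual(x.device.type, device)
```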
