
Commit 1ac8643

xpu enablement on left cases (#3654)
* 1. enable xpu for launcher 2. expand cuda only ds uts to xpu 3. expand profiler example to xpu
  Signed-off-by: YAO Matrix <[email protected]>
* fix style
  Signed-off-by: YAO Matrix <[email protected]>
* rename
  Signed-off-by: YAO Matrix <[email protected]>
* Update profiler.py
* Apply style fixes

---------

Signed-off-by: YAO Matrix <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 07ce748 commit 1ac8643

File tree

3 files changed: +30 additions, -26 deletions

examples/by_feature/profiler.py

Lines changed: 5 additions & 4 deletions
@@ -31,8 +31,8 @@
 #
 # This example trains a Bert base model on GLUE MRPC
 # in any of the following settings (with the same script):
-# - single CPU or single GPU
-# - multi GPUS (using PyTorch distributed mode)
+# - single CPU or single device (CUDA GPU, Intel XPU etc.)
+# - multi devices (using PyTorch distributed mode)
 # - (multi) TPUs
 # - fp16 (mixed-precision) or fp32 (normal precision)
 #
@@ -183,7 +183,8 @@ def training_function(config, args):
     # New Code #
     accelerator.print(
         prof.key_averages().table(
-            sort_by="self_cpu_time_total" if args.cpu else "self_cuda_time_total", row_limit=-1
+            sort_by="self_cpu_time_total" if args.cpu else f"self_{accelerator.device.type}_time_total",
+            row_limit=-1,
         )
     )
 
@@ -215,7 +216,7 @@ def main():
        choices=["no", "fp16", "bf16", "fp8"],
        help="Whether to use mixed precision. Choose"
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-        "and an Nvidia Ampere GPU.",
+        "and an Nvidia Ampere GPU or an Intel XPU.",
     )
     # New Code #
     parser.add_argument(
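The profiler change above replaces the hard-coded `self_cuda_time_total` sort key with one derived from `accelerator.device.type`, so the same script sorts the profile table by XPU time on Intel hardware and by CUDA time on NVIDIA hardware. A minimal standalone sketch of that pattern (not the full example script; it assumes a PyTorch recent enough to expose `ProfilerActivity.XPU` when running on XPU, and it only targets CPU/CUDA/XPU devices):

```python
# Minimal sketch of the device-agnostic profiler pattern used above.
# ProfilerActivity.XPU only exists in newer PyTorch releases, hence the hasattr guard.
import torch
from torch.profiler import ProfilerActivity, profile

from accelerate import Accelerator

accelerator = Accelerator()
model = torch.nn.Linear(128, 128).to(accelerator.device)

activities = [ProfilerActivity.CPU]
if accelerator.device.type == "cuda":
    activities.append(ProfilerActivity.CUDA)
elif accelerator.device.type == "xpu" and hasattr(ProfilerActivity, "XPU"):
    activities.append(ProfilerActivity.XPU)

with profile(activities=activities) as prof:
    model(torch.randn(8, 128, device=accelerator.device))

# "self_cuda_time_total" on NVIDIA GPUs, "self_xpu_time_total" on Intel XPUs,
# "self_cpu_time_total" when running on CPU.
sort_key = f"self_{accelerator.device.type}_time_total"
accelerator.print(prof.key_averages().table(sort_by=sort_key, row_limit=10))
```

When the selected device is the CPU, the derived key is simply `self_cpu_time_total`, which matches the explicit `args.cpu` branch kept in the example script.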

src/accelerate/launchers.py

Lines changed: 24 additions & 20 deletions
@@ -60,8 +60,8 @@ def notebook_launcher(
 
     <Tip warning={true}>
 
-    To use this function absolutely zero calls to a CUDA device must be made in the notebook session before calling. If
-    any have been made, you will need to restart the notebook and make sure no cells use any CUDA capability.
+    To use this function absolutely zero calls to a device must be made in the notebook session before calling. If any
+    have been made, you will need to restart the notebook and make sure no cells use any device capability.
 
     Setting `ACCELERATE_DEBUG_MODE="1"` in your environment will run a test before truly launching to ensure that none
     of those calls have been made.
@@ -76,11 +76,11 @@ def notebook_launcher(
             Tuple of arguments to pass to the function (it will receive `*args`).
         num_processes (`int`, *optional*):
             The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
-            the number of GPUs available otherwise.
+            the number of devices available otherwise.
         mixed_precision (`str`, *optional*, defaults to `"no"`):
-            If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
+            If `fp16` or `bf16`, will use mixed precision training on multi-device.
         use_port (`str`, *optional*, defaults to `"29500"`):
-            The port to use to communicate between processes when launching a multi-GPU training.
+            The port to use to communicate between processes when launching a multi-device training.
         master_addr (`str`, *optional*, defaults to `"127.0.0.1"`):
             The address to use for communication between processes.
         node_rank (`int`, *optional*, defaults to 0):
@@ -105,7 +105,7 @@ def notebook_launcher(
     Example:
 
     ```python
-    # Assume this is defined in a Jupyter Notebook on an instance with two GPUs
+    # Assume this is defined in a Jupyter Notebook on an instance with two devices
     from accelerate import notebook_launcher
 
 
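The docstring edits above generalize the wording from GPUs to devices; the calling pattern itself is unchanged. A hedged usage sketch on a machine with two CUDA GPUs or two Intel XPUs (the toy training function, `num_processes=2`, and `mixed_precision="bf16"` are illustrative choices, not part of this commit):

```python
# Illustrative only: launching a tiny training function from a notebook on
# whatever accelerator is present. num_processes=2 assumes two devices are visible.
import torch

from accelerate import Accelerator, notebook_launcher


def train():
    # The Accelerator is created only inside the training function, as the
    # warning in the docstring requires.
    accelerator = Accelerator()
    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = accelerator.prepare(model, optimizer)

    for _ in range(3):
        loss = model(torch.randn(4, 10, device=accelerator.device)).sum()
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
    accelerator.print("done")


notebook_launcher(train, args=(), num_processes=2, mixed_precision="bf16")
```

Nothing in the cell touches a device before `notebook_launcher` runs, which is the precondition the tip at the top of the docstring describes.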
@@ -158,27 +158,27 @@ def train(*args):
     else:
         if num_processes is None:
             raise ValueError(
-                "You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
+                "You have to specify the number of devices you would like to use, add `num_processes=...` to your call."
             )
         if node_rank >= num_nodes:
             raise ValueError("The node_rank must be less than the number of nodes.")
         if num_processes > 1:
-            # Multi-GPU launch
+            # Multi-device launch
             from torch.distributed.launcher.api import LaunchConfig, elastic_launch
             from torch.multiprocessing import start_processes
             from torch.multiprocessing.spawn import ProcessRaisedException
 
             if len(AcceleratorState._shared_state) > 0:
                 raise ValueError(
-                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
+                    "To launch a multi-device training from your notebook, the `Accelerator` should only be initialized "
                     "inside your training function. Restart your notebook and make sure no cells initializes an "
                     "`Accelerator`."
                 )
-            # Check for specific libraries known to initialize CUDA that users constantly use
+            # Check for specific libraries known to initialize device that users constantly use
             problematic_imports = are_libraries_initialized("bitsandbytes")
             if len(problematic_imports) > 0:
                 err = (
-                    "Could not start distributed process. Libraries known to initialize CUDA upon import have been "
+                    "Could not start distributed process. Libraries known to initialize device upon import have been "
                     "imported already. Please keep these imports inside your training function to try and help with this:"
                 )
                 for lib_name in problematic_imports:
@@ -203,24 +203,26 @@ def train(*args):
             # process here (the other ones will be set be the launcher).
             with patch_environment(**patched_env):
                 # First dummy launch
+                device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
+                distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
                 if os.environ.get("ACCELERATE_DEBUG_MODE", "false").lower() == "true":
-                    launcher = PrepareForLaunch(test_launch, distributed_type="MULTI_GPU")
+                    launcher = PrepareForLaunch(test_launch, distributed_type=distributed_type)
                     try:
                         start_processes(launcher, args=(), nprocs=num_processes, start_method="fork")
                     except ProcessRaisedException as e:
                         err = "An issue was found when verifying a stable environment for the notebook launcher."
-                        if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                        if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                             raise RuntimeError(
                                 f"{err}"
                                 "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                                 "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                                "which one is problematic and causing CUDA to be initialized."
+                                f"which one is problematic and causing {device_type.upper()} to be initialized."
                             ) from e
                         else:
                             raise RuntimeError(f"{err} The following error was raised: {e}") from e
                 # Now the actual launch
-                launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
-                print(f"Launching training on {num_processes} GPUs.")
+                launcher = PrepareForLaunch(function, distributed_type=distributed_type)
+                print(f"Launching training on {num_processes} {device_type.upper()}s.")
                 try:
                     if rdzv_conf is None:
                         rdzv_conf = {}
@@ -244,23 +246,25 @@ def train(*args):
                         launch_config_kwargs["log_line_prefix_template"] = log_line_prefix_template
                     elastic_launch(config=LaunchConfig(**launch_config_kwargs), entrypoint=function)(*args)
                 except ProcessRaisedException as e:
-                    if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
+                    if f"Cannot re-initialize {device_type.upper()} in forked subprocess" in e.args[0]:
                         raise RuntimeError(
-                            "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
+                            f"{device_type.upper()} has been initialized before the `notebook_launcher` could create a forked subprocess. "
                             "This likely stems from an outside import causing issues once the `notebook_launcher()` is called. "
                             "Please review your imports and test them when running the `notebook_launcher()` to identify "
-                            "which one is problematic and causing CUDA to be initialized."
+                            f"which one is problematic and causing {device_type.upper()} to be initialized."
                         ) from e
                     else:
                         raise RuntimeError(f"An issue was found when launching the training: {e}") from e
 
         else:
-            # No need for a distributed launch otherwise as it's either CPU, GPU or MPS.
+            # No need for a distributed launch otherwise as it's either CPU, GPU, XPU or MPS.
             if is_mps_available():
                 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
                 print("Launching training on MPS.")
             elif torch.cuda.is_available():
                 print("Launching training on one GPU.")
+            elif torch.xpu.is_available():
+                print("Launching training on one XPU.")
             else:
                 print("Launching training on CPU.")
             function(*args)
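The core of the launcher change is the `device_type`/`distributed_type` selection added above: `torch.accelerator.current_accelerator()` (present in recent PyTorch releases; the `hasattr` guard falls back to `"cuda"` on older ones) reports the active backend, and that value then drives `PrepareForLaunch` as well as the forked-subprocess re-initialization checks. A standalone sketch of the same selection logic, with an extra `None` check added here so it also degrades gracefully on CPU-only machines:

```python
# Standalone sketch of the selection logic added to notebook_launcher above.
# torch.accelerator only exists in recent PyTorch releases, hence the guard.
import torch


def detect_device_and_distributed_type() -> tuple[str, str]:
    """Return (device_type, distributed_type), mirroring the patched launcher."""
    if hasattr(torch, "accelerator") and torch.accelerator.current_accelerator() is not None:
        device_type = torch.accelerator.current_accelerator().type
    else:
        device_type = "cuda"  # the launcher's fallback for older PyTorch
    distributed_type = "MULTI_XPU" if device_type == "xpu" else "MULTI_GPU"
    return device_type, distributed_type


device_type, distributed_type = detect_device_and_distributed_type()
print(device_type, distributed_type)  # e.g. "cuda MULTI_GPU" or "xpu MULTI_XPU"
```

Mapping everything that is not `xpu` to `MULTI_GPU` mirrors the patched code, where CUDA remains the default multi-device backend.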

tests/deepspeed/test_deepspeed_gradient_accumulation.py

Lines changed: 1 addition & 2 deletions
@@ -22,7 +22,7 @@
 from transformers.trainer_utils import set_seed
 
 from accelerate.accelerator import Accelerator
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda, require_deepspeed
+from accelerate.test_utils.testing import AccelerateTestCase, require_deepspeed
 from accelerate.test_utils.training import RegressionDataset
 from accelerate.utils import patch_environment
 from accelerate.utils.dataclasses import DeepSpeedPlugin
@@ -37,7 +37,6 @@
 
 
 @require_deepspeed
-@require_cuda
 class DeepSpeedGradientAccumulationTest(AccelerateTestCase):
     def setUp(self):
         super().setUp()
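Dropping `@require_cuda` lets this DeepSpeed test run on any supported accelerator instead of only NVIDIA GPUs, with `@require_deepspeed` still gating on DeepSpeed itself. As a hedged illustration of the general pattern, and not the decorator accelerate actually ships in `accelerate.test_utils.testing`, a device-agnostic guard could look like this hypothetical `require_accelerator`:

```python
# Hypothetical device-agnostic test guard, shown only to illustrate replacing a
# CUDA-only skip with one that also accepts Intel XPUs. Accelerate's own
# decorators live in accelerate.test_utils.testing and may differ.
import unittest

import torch


def _accelerator_available() -> bool:
    # True on CUDA GPUs or Intel XPUs; False on CPU-only machines.
    return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())


require_accelerator = unittest.skipUnless(_accelerator_available(), "test requires CUDA or XPU")


@require_accelerator
class ExampleDeviceTest(unittest.TestCase):
    def test_tensor_on_accelerator(self):
        device = "cuda" if torch.cuda.is_available() else "xpu"
        x = torch.ones(2, 2, device=device)
        self.assertEqual(x.device.type, device)
```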
