Update the `concatenate` docstring to document string support and rewrap long lines in operations.py.

TitouanCh · TitouanCh · commit 9d493564cc99 · 2025-07-22T15:02:21.000+02:00
diff --git a/src/accelerate/utils/operations.py b/src/accelerate/utils/operations.py
@@ -67,7 +67,11 @@ def is_namedtuple(data):
     Checks if `data` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
     `namedtuple` perfectly.
     """
-    return isinstance(data, tuple) and hasattr(data, "_asdict") and hasattr(data, "_fields")
+    return (
+        isinstance(data, tuple)
+        and hasattr(data, "_asdict")
+        and hasattr(data, "_fields")
+    )
 
 
 def honor_type(obj, generator):
@@ -81,7 +85,9 @@ def honor_type(obj, generator):
         return type(obj)(generator)
 
 
-def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_other_type=False, **kwargs):
+def recursively_apply(
+    func, data, *args, test_type=is_torch_tensor, error_on_other_type=False, **kwargs
+):
     """
     Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.
 
@@ -108,7 +114,12 @@ def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_oth
             data,
             (
                 recursively_apply(
-                    func, o, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
+                    func,
+                    o,
+                    *args,
+                    test_type=test_type,
+                    error_on_other_type=error_on_other_type,
+                    **kwargs,
                 )
                 for o in data
             ),
@@ -117,7 +128,12 @@ def recursively_apply(func, data, *args, test_type=is_torch_tensor, error_on_oth
         return type(data)(
             {
                 k: recursively_apply(
-                    func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
+                    func,
+                    v,
+                    *args,
+                    test_type=test_type,
+                    error_on_other_type=error_on_other_type,
+                    **kwargs,
                 )
                 for k, v in data.items()
             }
@@ -167,7 +183,13 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
             return tensor.to(device)
     elif isinstance(tensor, (tuple, list)):
         return honor_type(
-            tensor, (send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys) for t in tensor)
+            tensor,
+            (
+                send_to_device(
+                    t, device, non_blocking=non_blocking, skip_keys=skip_keys
+                )
+                for t in tensor
+            ),
         )
     elif isinstance(tensor, Mapping):
         if isinstance(skip_keys, str):
@@ -176,7 +198,13 @@ def send_to_device(tensor, device, non_blocking=False, skip_keys=None):
             skip_keys = []
         return type(tensor)(
             {
-                k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
+                k: (
+                    t
+                    if k in skip_keys
+                    else send_to_device(
+                        t, device, non_blocking=non_blocking, skip_keys=skip_keys
+                    )
+                )
                 for k, t in tensor.items()
             }
         )
@@ -231,7 +259,9 @@ def initialize_tensors(data_structure):
     def _initialize_tensor(tensor_info):
         return torch.empty(*tensor_info.shape, dtype=tensor_info.dtype)
 
-    return recursively_apply(_initialize_tensor, data_structure, test_type=is_tensor_information)
+    return recursively_apply(
+        _initialize_tensor, data_structure, test_type=is_tensor_information
+    )
 
 
 def find_batch_size(data):
@@ -253,7 +283,9 @@ def find_batch_size(data):
         for k in data.keys():
             return find_batch_size(data[k])
     elif not isinstance(data, torch.Tensor):
-        raise TypeError(f"Can only find the batch size of tensors but got {type(data)}.")
+        raise TypeError(
+            f"Can only find the batch size of tensors but got {type(data)}."
+        )
     return data.shape[0]
 
 
@@ -344,7 +376,9 @@ def _gpu_gather_one(tensor):
             # a backend of `None` is always CPU
             # also gloo does not support `all_gather_into_tensor`,
             # which will result in a larger memory overhead for the op
-            output_tensors = [torch.empty_like(tensor) for _ in range(state.num_processes)]
+            output_tensors = [
+                torch.empty_like(tensor) for _ in range(state.num_processes)
+            ]
             torch.distributed.all_gather(output_tensors, tensor)
             return torch.cat(output_tensors, dim=0)
 
@@ -367,7 +401,10 @@ def verify_operation(function):
 
     @wraps(function)
     def wrapper(*args, **kwargs):
-        if PartialState().distributed_type == DistributedType.NO or not PartialState().debug:
+        if (
+            PartialState().distributed_type == DistributedType.NO
+            or not PartialState().debug
+        ):
             return function(*args, **kwargs)
         operation = f"{function.__module__}.{function.__name__}"
         if "tensor" in kwargs:
@@ -384,7 +421,9 @@ def wrapper(*args, **kwargs):
         if output[0] is not None:
             are_same = output.count(output[0]) == len(output)
             if not are_same:
-                process_shape_str = "\n  - ".join([f"Process {i}: {shape}" for i, shape in enumerate(output)])
+                process_shape_str = "\n  - ".join(
+                    [f"Process {i}: {shape}" for i, shape in enumerate(output)]
+                )
                 raise DistributedOperationException(
                     f"Cannot apply desired operation due to shape mismatches. "
                     "All shapes across devices must be valid."
@@ -465,14 +504,21 @@ def _gpu_broadcast_one(tensor, src=0):
         torch.distributed.broadcast(tensor, src=src)
         return tensor
 
-    return recursively_apply(_gpu_broadcast_one, data, error_on_other_type=True, src=src)
+    return recursively_apply(
+        _gpu_broadcast_one, data, error_on_other_type=True, src=src
+    )
 
 
 def _tpu_broadcast(tensor, src=0, name="broadcast tensor"):
     if isinstance(tensor, (list, tuple)):
-        return honor_type(tensor, (_tpu_broadcast(t, name=f"{name}_{i}") for i, t in enumerate(tensor)))
+        return honor_type(
+            tensor,
+            (_tpu_broadcast(t, name=f"{name}_{i}") for i, t in enumerate(tensor)),
+        )
     elif isinstance(tensor, Mapping):
-        return type(tensor)({k: _tpu_broadcast(v, name=f"{name}_{k}") for k, v in tensor.items()})
+        return type(tensor)(
+            {k: _tpu_broadcast(v, name=f"{name}_{k}") for k, v in tensor.items()}
+        )
     return xm.mesh_reduce(name, tensor, lambda x: x[src])
 
 
@@ -499,15 +545,19 @@ def gather_tensor_shape(tensor):
     # Allocate 80 bytes to store the shape
     max_tensor_dimension = 2**20
     state = PartialState()
-    base_tensor = torch.empty(max_tensor_dimension, dtype=torch.int, device=state.device)
+    base_tensor = torch.empty(
+        max_tensor_dimension, dtype=torch.int, device=state.device
+    )
 
     # Since PyTorch can't just send a tensor to another GPU without
     # knowing its size, we store the size of the tensor with data
     # in an allocation
     if tensor is not None:
         shape = tensor.shape
         tensor_dtype = TENSOR_TYPE_TO_INT[tensor.dtype]
-        base_tensor[: len(shape) + 1] = torch.tensor(list(shape) + [tensor_dtype], dtype=int)
+        base_tensor[: len(shape) + 1] = torch.tensor(
+            list(shape) + [tensor_dtype], dtype=int
+        )
     # Perform a reduction to copy the size data onto all GPUs
     base_tensor = reduce(base_tensor, reduction="sum")
     base_tensor = base_tensor[base_tensor.nonzero()]
@@ -549,7 +599,9 @@ def broadcast(tensor, from_process: int = 0):
         The same data structure as `tensor` with all tensors broadcasted to the proper device.
     """
     if PartialState().distributed_type == DistributedType.XLA:
-        return _tpu_broadcast(tensor, src=from_process, name="accelerate.utils.broadcast")
+        return _tpu_broadcast(
+            tensor, src=from_process, name="accelerate.utils.broadcast"
+        )
     elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
         return _gpu_broadcast(tensor, src=from_process)
     else:
@@ -571,7 +623,9 @@ def broadcast_object_list(object_list, from_process: int = 0):
     """
     if PartialState().distributed_type == DistributedType.XLA:
         for i, obj in enumerate(object_list):
-            object_list[i] = xm.mesh_reduce("accelerate.utils.broadcast_object_list", obj, lambda x: x[from_process])
+            object_list[i] = xm.mesh_reduce(
+                "accelerate.utils.broadcast_object_list", obj, lambda x: x[from_process]
+            )
     elif PartialState().distributed_type in TORCH_DISTRIBUTED_OPERATION_TYPES:
         torch.distributed.broadcast_object_list(object_list, src=from_process)
     return object_list
@@ -599,10 +653,14 @@ def _slice_tensor(tensor, tensor_slice):
 
 def concatenate(data, dim=0):
     """
-    Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.
+    Recursively concatenates elements in a nested structure of tensors or strings.
+
+    Supports nested lists, tuples, or dictionaries that contain either:
+    - `torch.Tensor` objects (with the same shape except along `dim`), concatenated along `dim`
+    - strings, whose lists are merged into a single flat list
 
     Args:
-        data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
+        data (nested list/tuple/dictionary of lists of `torch.Tensor` or `str`):
             The data to concatenate.
         dim (`int`, *optional*, defaults to 0):
             The dimension on which to concatenate.
@@ -612,11 +670,17 @@ def concatenate(data, dim=0):
     """
     if isinstance(data[0], (tuple, list)):
         first_inner = data[0][0] if len(data[0]) > 0 else None
-            
+
         if isinstance(first_inner, str):
             return honor_type(data[0], [item for sublist in data for item in sublist])
         else:
-            return honor_type(data[0], (concatenate([d[i] for d in data], dim=dim) for i in range(len(data[0]))))
+            return honor_type(
+                data[0],
+                (
+                    concatenate([d[i] for d in data], dim=dim)
+                    for i in range(len(data[0]))
+                ),
+            )
 
     elif isinstance(data[0], Mapping):
         return type(data[0])(
@@ -675,15 +739,24 @@ def _pad_across_processes(tensor, dim=0, pad_index=0, pad_first=False):
         new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index
         if pad_first:
             indices = tuple(
-                slice(max_size - old_size[dim], max_size) if i == dim else slice(None) for i in range(len(new_size))
+                slice(max_size - old_size[dim], max_size) if i == dim else slice(None)
+                for i in range(len(new_size))
             )
         else:
-            indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
+            indices = tuple(
+                slice(0, old_size[dim]) if i == dim else slice(None)
+                for i in range(len(new_size))
+            )
         new_tensor[indices] = tensor
         return new_tensor
 
     return recursively_apply(
-        _pad_across_processes, tensor, error_on_other_type=True, dim=dim, pad_index=pad_index, pad_first=pad_first
+        _pad_across_processes,
+        tensor,
+        error_on_other_type=True,
+        dim=dim,
+        pad_index=pad_index,
+        pad_first=pad_first,
     )
 
 
@@ -713,7 +786,10 @@ def _pad_input_tensors(tensor, batch_size, num_processes, dim=0):
         new_size = list(old_size)
         new_size[0] = batch_size + to_pad
         new_tensor = tensor.new_zeros(tuple(new_size))
-        indices = tuple(slice(0, old_size[dim]) if i == dim else slice(None) for i in range(len(new_size)))
+        indices = tuple(
+            slice(0, old_size[dim]) if i == dim else slice(None)
+            for i in range(len(new_size))
+        )
         new_tensor[indices] = tensor
         return new_tensor
 
@@ -765,7 +841,11 @@ def _reduce_across_processes(tensor, reduction="mean", scale=1.0):
         return cloned_tensor
 
     return recursively_apply(
-        _reduce_across_processes, tensor, error_on_other_type=True, reduction=reduction, scale=scale
+        _reduce_across_processes,
+        tensor,
+        error_on_other_type=True,
+        reduction=reduction,
+        scale=scale,
     )
 
 
@@ -785,7 +865,9 @@ def _convert_to_fp32(tensor):
         return tensor.float()
 
     def _is_fp16_bf16_tensor(tensor):
-        return (is_torch_tensor(tensor) or hasattr(tensor, "dtype")) and tensor.dtype in (
+        return (
+            is_torch_tensor(tensor) or hasattr(tensor, "dtype")
+        ) and tensor.dtype in (
             torch.float16,
             torch.bfloat16,
         )