Speedup model loading by 4-5x in Diffusers ⚡ (#3674)

a-r-r-o-w · web-flow · commit 524e5f9828b1 · 2025-07-11T16:58:35.000+02:00
* update

* update

* make style

* update

* merge if statements
diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
@@ -222,6 +222,8 @@ def set_module_tensor_to_device(
     dtype: Optional[Union[str, torch.dtype]] = None,
     fp16_statistics: Optional[torch.HalfTensor] = None,
     tied_params_map: Optional[dict[int, dict[torch.device, torch.Tensor]]] = None,
+    non_blocking: bool = False,
+    clear_cache: bool = True,
 ):
     """
     A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing
@@ -245,6 +247,10 @@ def set_module_tensor_to_device(
             A map of current data pointers to dictionaries of devices to already dispatched tied weights. For a given
             execution device, this parameter is useful to reuse the first available pointer of a shared weight on the
             device for all others, instead of duplicating memory.
+        non_blocking (`bool`, *optional*, defaults to `False`):
+            If `True`, the device transfer will be asynchronous with respect to the host, if possible.
+        clear_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not to clear the device cache after setting the tensor on the device.
     """
     # Recurse if needed
     if "." in tensor_name:
@@ -295,9 +301,9 @@ def set_module_tensor_to_device(
 
         if dtype is None:
             # For compatibility with PyTorch load_state_dict which converts state dict dtype to existing dtype in model
-            value = value.to(old_value.dtype)
+            value = value.to(old_value.dtype, non_blocking=non_blocking)
         elif not str(value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
-            value = value.to(dtype)
+            value = value.to(dtype, non_blocking=non_blocking)
 
     device_quantization = None
     with torch.no_grad():
@@ -326,15 +332,15 @@ def set_module_tensor_to_device(
         if "xpu" in str(device) and not is_xpu_available():
             raise ValueError(f'{device} is not available, you should use device="cpu" instead')
         if value is None:
-            new_value = old_value.to(device)
+            new_value = old_value.to(device, non_blocking=non_blocking)
             if dtype is not None and device in ["meta", torch.device("meta")]:
                 if not str(old_value.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
-                    new_value = new_value.to(dtype)
+                    new_value = new_value.to(dtype, non_blocking=non_blocking)
 
                 if not is_buffer:
                     module._parameters[tensor_name] = param_cls(new_value, requires_grad=old_value.requires_grad)
         elif isinstance(value, torch.Tensor):
-            new_value = value.to(device)
+            new_value = value.to(device, non_blocking=non_blocking)
         else:
             new_value = torch.tensor(value, device=device)
         if device_quantization is not None:
@@ -347,24 +353,30 @@ def set_module_tensor_to_device(
             if param_cls.__name__ in ["Int8Params", "FP4Params", "Params4bit"]:
                 if param_cls.__name__ == "Int8Params" and new_value.dtype == torch.float32:
                     # downcast to fp16 if any - needed for 8bit serialization
-                    new_value = new_value.to(torch.float16)
+                    new_value = new_value.to(torch.float16, non_blocking=non_blocking)
                 # quantize module that are going to stay on the cpu so that we offload quantized weights
                 if device == "cpu" and param_cls.__name__ == "Int8Params":
                     new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(0).to("cpu")
                     new_value.CB = new_value.CB.to("cpu")
                     new_value.SCB = new_value.SCB.to("cpu")
                 else:
-                    new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(device)
+                    new_value = param_cls(new_value, requires_grad=old_value.requires_grad, **kwargs).to(
+                        device, non_blocking=non_blocking
+                    )
             elif param_cls.__name__ in ["QTensor", "QBitsTensor"]:
-                new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad).to(device)
+                new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad).to(
+                    device, non_blocking=non_blocking
+                )
             elif param_cls.__name__ in ["AffineQuantizedTensor"]:
-                new_value = new_value.to(device)
+                new_value = new_value.to(device, non_blocking=non_blocking)
             else:
-                new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
+                new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(
+                    device, non_blocking=non_blocking
+                )
 
             module._parameters[tensor_name] = new_value
             if fp16_statistics is not None:
-                module._parameters[tensor_name].SCB = fp16_statistics.to(device)
+                module._parameters[tensor_name].SCB = fp16_statistics.to(device, non_blocking=non_blocking)
                 del fp16_statistics
             # as we put the weight to meta, it doesn't have SCB attr anymore. make sure that it is not a meta weight
             if (
@@ -390,8 +402,9 @@ def set_module_tensor_to_device(
                 device_index = torch.device(device).index if torch.device(device).type == "cuda" else None
                 if not getattr(module.weight, "quant_state", None) and device_index is not None:
                     module.weight = module.weight.cuda(device_index)
+
     # clean pre and post forward hook
-    if device != "cpu":
+    if clear_cache and device != "cpu":
         clear_device_cache()
 
     # When handling tied weights, we update tied_params_map to keep track of the tied weights that have already been allocated on the device in