 from diffusers.configuration_utils import ConfigMixin
 from diffusers.models.modeling_utils import ModelMixin

-# require xformers!
-import xformers
-import xformers.ops
-
 from kiui.cam import orbit_camera

+def memory_efficient_attention(q, k, v, attn_bias=None, op=None):  # op is accepted (and ignored) so existing xformers-style call sites keep working
+    """
+    Plain-PyTorch drop-in for xformers.ops.memory_efficient_attention.
+
+    Parameters:
+    - q: query tensor of shape (batch_size, seq_len_q, dim); k, v: key/value tensors of shape (batch_size, seq_len_k, dim)
+    - attn_bias: optional additive bias, broadcastable to (batch_size, seq_len_q, seq_len_k)
+
+    Returns:
+    - output tensor of shape (batch_size, seq_len_q, dim)
+    """
+    # Calculate attention scores
+    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(k.size(-1))
+    if attn_bias is not None:
+        scores += attn_bias
+
+    # Apply softmax to get attention weights
+    attention_weights = torch.nn.functional.softmax(scores, dim=-1)
+
+    # Compute the weighted sum of values
+    output = torch.matmul(attention_weights, v)
+
+    return output
+
 def get_camera(
     num_frames, elevation=0, azimuth_start=0, azimuth_span=360, blender_coord=True, extra_view=False,
 ):
     angle_gap = azimuth_span / num_frames
     cameras = []
     for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
-
+
         pose = orbit_camera(elevation, azimuth, radius=1)  # [4, 4]

         # opengl to blender
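
The helper above reproduces the math of the removed xformers kernel with plain PyTorch ops, but it materializes the full (seq_len_q, seq_len_k) score matrix, trading memory for one less dependency. A minimal sanity-check sketch (assuming PyTorch 2.x so that torch.nn.functional.scaled_dot_product_attention is available; the tensor shapes are illustrative, not taken from this file):

import math
import torch
import torch.nn.functional as F

# Illustrative shapes: (batch*heads, seq_len, dim_head).
q = torch.randn(16, 32, 64)
k = torch.randn(16, 48, 64)
v = torch.randn(16, 48, 64)

# Naive formulation used by the new helper above.
scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(k.size(-1))
naive = torch.matmul(torch.softmax(scores, dim=-1), v)

# PyTorch's fused kernel computes the same quantity.
fused = F.scaled_dot_product_attention(q, k, v)
print(torch.allclose(naive, fused, atol=1e-5))  # expected: True
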
@@ -140,17 +160,17 @@ def forward(self, x):
 class MemoryEfficientCrossAttention(nn.Module):
     # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
     def __init__(
-        self,
-        query_dim,
-        context_dim=None,
-        heads=8,
-        dim_head=64,
+        self,
+        query_dim,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
         dropout=0.0,
         ip_dim=0,
         ip_weight=1,
     ):
         super().__init__()
-
+
         inner_dim = dim_head * heads
         context_dim = default(context_dim, query_dim)

@@ -199,7 +219,7 @@ def forward(self, x, context=None):
         )

         # actually compute the attention, what we cannot get enough of
-        out = xformers.ops.memory_efficient_attention(
+        out = memory_efficient_attention(
             q, k, v, attn_bias=None, op=self.attention_op
         )

@@ -213,7 +233,7 @@ def forward(self, x, context=None):
                 (k_ip, v_ip),
             )
             # actually compute the attention, what we cannot get enough of
-            out_ip = xformers.ops.memory_efficient_attention(
+            out_ip = memory_efficient_attention(
                 q, k_ip, v_ip, attn_bias=None, op=self.attention_op
             )
             out = out + self.ip_weight * out_ip
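
Since the hard xformers requirement is removed, a possible follow-up (a sketch only, not part of this commit) is to keep the fused xformers kernel when the package happens to be installed and fall back to the pure-PyTorch helper otherwise; xformers.ops.memory_efficient_attention accepts the same (q, k, v, attn_bias=..., op=...) arguments used at these call sites:

import math
import torch

# Prefer the fused xformers kernel when available, otherwise use the
# plain-PyTorch implementation introduced in this commit.
# (Sketch; the try/except import is an assumption, not part of the diff.)
try:
    from xformers.ops import memory_efficient_attention as _xformers_attention
except ImportError:
    _xformers_attention = None

def memory_efficient_attention(q, k, v, attn_bias=None, op=None):
    if _xformers_attention is not None:
        return _xformers_attention(q, k, v, attn_bias=attn_bias, op=op)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(k.size(-1))
    if attn_bias is not None:
        scores += attn_bias
    return torch.matmul(torch.softmax(scores, dim=-1), v)
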
@@ -228,7 +248,7 @@ def forward(self, x, context=None):


 class BasicTransformerBlock3D(nn.Module):
-
+
     def __init__(
         self,
         dim,
@@ -259,7 +279,7 @@ def __init__(
             # ip only applies to cross-attention
             ip_dim=ip_dim,
             ip_weight=ip_weight,
-        )
+        )
         self.norm1 = nn.LayerNorm(dim)
         self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
@@ -311,9 +331,9 @@ def __init__(
                 for d in range(depth)
             ]
         )
-
+
         self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
-
+

     def forward(self, x, context=None, num_frames=1):
         # note: if no context is given, cross-attention defaults to self-attention
@@ -328,7 +348,7 @@ def forward(self, x, context=None, num_frames=1):
             x = block(x, context=context[i], num_frames=num_frames)
         x = self.proj_out(x)
         x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
-
+
         return x + x_in


@@ -672,7 +692,7 @@ def __init__(
     ):
         super().__init__()
         assert context_dim is not None
-
+
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads

@@ -699,7 +719,7 @@ def __init__(
                     "as a list/tuple (per-level) with the same length as channel_mult"
                 )
             self.num_res_blocks = num_res_blocks
-
+
         if num_attention_blocks is not None:
             assert len(num_attention_blocks) == len(self.num_res_blocks)
             assert all(
@@ -848,7 +868,7 @@ def __init__(
         else:
             num_heads = ch // num_head_channels
             dim_head = num_head_channels
-
+
         self.middle_block = CondSequential(
             ResBlock(
                 ch,
@@ -865,7 +885,7 @@ def __init__(
                 depth=transformer_depth,
                 ip_dim=self.ip_dim,
                 ip_weight=self.ip_weight,
-            ),
+            ),
             ResBlock(
                 ch,
                 time_embed_dim,
@@ -983,7 +1003,7 @@ def forward(
         # Add camera embeddings
         if camera is not None:
             emb = emb + self.camera_embed(camera)
-
+
         # imagedream variant
         if self.ip_dim > 0:
             x[(num_frames - 1) :: num_frames, :, :, :] = ip_img  # place at [4, 9]
@@ -1002,4 +1022,4 @@ def forward(
         if self.predict_codebook_ids:
             return self.id_predictor(h)
         else:
-            return self.out(h)
+            return self.out(h)