Commit 4d0276f — Motion Predictor
1 parent: 5941433

File tree: 4 files changed (+201 −62 lines)

__init__.py (2 additions, 0 deletions)

@@ -25,6 +25,7 @@
     "IG Path Join": IG_PathJoin,
     "IG Cross Fade Images": IG_CrossFadeImages,
     "IG Interpolate": IG_Interpolate,
+    "IG MotionPredictor": IG_MotionPredictor,
     "IG ZFill": IG_ZFill,
     "IG String List": IG_StringList,
     "IG Float List": IG_FloatList,
@@ -43,6 +44,7 @@
     "IG Path Join": "📂 IG Path Join",
     "IG Cross Fade Images": "🧑🏻‍🧑🏿‍🧒🏽 IG Cross Fade Images",
     "IG Interpolate": "🧑🏻‍🧑🏿‍🧒🏽 IG Interpolate",
+    "IG MotionPredictor": "🏃‍♀️ IG Motion Predictor",
     "IG ZFill": "⌨️ IG ZFill",
     "IG String List": "📃 IG String List",
     "IG Float List": "📃 IG Float List",

motion_predictor/__init__.py (new file, 2 additions)

# __init__.py
from .motion_predictor import MotionPredictor

motion_predictor/motion_predictor.py (new file, 95 additions)

import logging
import math

import torch
from diffusers import DiffusionPipeline
from diffusers.configuration_utils import ConfigMixin
from diffusers.models import ModelMixin
from einops import rearrange
from torch import nn

logger = logging.getLogger(__name__)

def generate_positional_encodings(length, hidden_dim):
    # Precompute positional encodings once in log space
    position = torch.arange(length).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, hidden_dim, 2) * -(math.log(10000.0) / hidden_dim))
    pe = torch.zeros(length, hidden_dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

class MotionPredictor(ModelMixin, ConfigMixin):
    def __init__(self, token_dim: int = 768, hidden_dim: int = 1024, num_heads: int = 16, num_layers: int = 8, total_frames: int = 16, tokens_per_frame: int = 16):
        super(MotionPredictor, self).__init__()
        self.total_frames = total_frames
        self.tokens_per_frame = tokens_per_frame

        # Initialize layers
        self.input_projection = nn.Linear(token_dim, hidden_dim)    # Project tokens to the hidden dimension
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads),
            num_layers=num_layers
        )
        self.output_projection = nn.Linear(hidden_dim, token_dim)   # Project back to the token dimension
        # Initialize positional encodings; registered as a non-trainable parameter so they move to the module's device automatically
        self.positional_encodings = generate_positional_encodings(total_frames, hidden_dim)
        self.positional_encodings = nn.Parameter(self.positional_encodings, requires_grad=False)

    def create_attention_mask(self, total_frames, num_tokens):
        # Boolean memory mask: True blocks attention, False allows it.
        # Block everything by default, then allow every position to attend
        # to the first-frame and last-frame tokens (the conditioning frames).
        mask = torch.ones((total_frames * num_tokens, total_frames * num_tokens), dtype=torch.bool, device=self.device)

        # Indices for the first frame tokens and the last frame tokens
        first_frame_indices = torch.arange(0, num_tokens, device=self.device)
        last_frame_indices = torch.arange((total_frames - 1) * num_tokens, total_frames * num_tokens, device=self.device)

        # Allow attention to the first and last frame tokens
        mask[:, first_frame_indices] = False
        mask[:, last_frame_indices] = False

        return mask

    def interpolate_tokens(self, start_tokens: torch.Tensor, end_tokens: torch.Tensor):
        # Linear interpolation in the token space
        interpolation_steps = torch.linspace(0, 1, steps=self.total_frames, device=start_tokens.device, dtype=torch.float16)[:, None, None]
        start_tokens_expanded = start_tokens.unsqueeze(1)  # Shape becomes [batch_size, 1, tokens, token_dim]
        end_tokens_expanded = end_tokens.unsqueeze(1)      # Shape becomes [batch_size, 1, tokens, token_dim]
        interpolated_tokens = start_tokens_expanded * (1 - interpolation_steps) + end_tokens_expanded * interpolation_steps
        return interpolated_tokens  # Shape: [batch_size, total_frames, tokens, token_dim]

    def predict_motion(self, start_tokens: torch.Tensor, end_tokens: torch.Tensor):
        start_tokens = start_tokens.to(self.device)
        end_tokens = end_tokens.to(self.device)

        # Get interpolated tokens
        interpolated_tokens = self.interpolate_tokens(start_tokens, end_tokens).to(self.dtype)

        # Unpack dimensions (frames and tokens are flattened into one sequence below)
        batch_size, total_frames, num_tokens, token_dim = interpolated_tokens.shape
        logger.debug(f"Interpolated tokens {interpolated_tokens.shape}")

        # Apply input projection
        projected_tokens = self.input_projection(interpolated_tokens)

        # Add positional encodings (one per frame, broadcast over the token dimension)
        projected_tokens += self.positional_encodings[:total_frames].unsqueeze(0).unsqueeze(2)

        # Reshape to match the transformer expected input [seq_len, batch_size, hidden_dim]
        projected_tokens = rearrange(projected_tokens, 'b f t d -> (f t) b d')

        # Create an attention mask that only allows attending to the first and last frame
        attention_mask = self.create_attention_mask(total_frames, num_tokens)

        # Transformer predicts the motion along the new sequence dimension
        logger.debug(f"projected_tokens {projected_tokens.shape} attention_mask {attention_mask.shape}")
        motion_tokens = self.transformer(projected_tokens, projected_tokens, memory_mask=attention_mask)

        # Reshape back and apply output projection
        motion_tokens = rearrange(motion_tokens, '(f t) b d -> b f t d', t=num_tokens, f=total_frames)
        motion_tokens = self.output_projection(motion_tokens)

        return motion_tokens

    def forward(self, start_tokens: torch.Tensor, end_tokens: torch.Tensor):
        return self.predict_motion(start_tokens, end_tokens)
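A minimal sketch of exercising the new module on its own (the import path, batch size, and the [16, 768] token shape are assumptions taken from the defaults above, not part of the commit; in the node below the weights come from a checkpoint rather than random initialization):

import torch
from motion_predictor import MotionPredictor  # assumed standalone import path

predictor = MotionPredictor(token_dim=768, total_frames=16, tokens_per_frame=16).eval()
start = torch.randn(1, 16, 768)  # [batch, tokens_per_frame, token_dim] for the first keyframe
end = torch.randn(1, 16, 768)    # same shape for the last keyframe
with torch.inference_mode():
    frames = predictor(start, end)
print(frames.shape)  # torch.Size([1, 16, 16, 768]) -> [batch, total_frames, tokens, token_dim]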

nodes/interpolate.py (102 additions, 62 deletions)

@@ -6,9 +6,12 @@
 import json

 import sys
+from comfy import model_management
 import folder_paths
 from ..common.tree import *
 from ..common.constants import *
+from ..motion_predictor import MotionPredictor
+import comfy.utils

 def crossfade(images_1, images_2, alpha):
     crossfade = (1 - alpha) * images_1 + alpha * images_2
@@ -42,6 +45,99 @@ def exponential_ease_out(t):
     "exponential_ease_out": exponential_ease_out,
 }

+def tensor_to_size(source, dest_size):
+    if isinstance(dest_size, torch.Tensor):
+        dest_size = dest_size.shape[0]
+    source_size = source.shape[0]
+
+    if source_size < dest_size:
+        shape = [dest_size - source_size] + [1] * (source.dim() - 1)
+        source = torch.cat((source, source[-1:].repeat(shape)), dim=0)
+    elif source_size > dest_size:
+        source = source[:dest_size]
+
+    return source
+
+class IG_MotionPredictor:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "pos_embeds": ("PROJ_EMBEDS",),
+                "neg_embeds": ("PROJ_EMBEDS",),
+                "transitioning_frames": ("INT", {"default": 16, "min": 0, "max": 4096, "step": 1}),
+                "repeat_count": ("INT", {"default": 1, "min": 1, "max": 4096, "step": 1}),
+                "mode": (["motion_predict", "interpolate_linear"], ),
+                "motion_predictor_file": (folder_paths.get_filename_list("ipadapter"),),
+            },
+            "optional": {
+                "positive_prompts": ("STRING", {"default": [], "forceInput": True}),
+                "negative_prompts": ("STRING", {"default": [], "forceInput": True}),
+            }
+        }
+
+    RETURN_TYPES = ("PROJ_EMBEDS", "PROJ_EMBEDS", "STRING", "STRING", "INT",)
+    RETURN_NAMES = ("pos_embeds", "neg_embeds", "positive_string", "negative_string", "BATCH_SIZE", )
+    FUNCTION = "main"
+    CATEGORY = TREE_INTERP
+
+    @torch.inference_mode()
+    def main(self, pos_embeds, neg_embeds, transitioning_frames, repeat_count, mode, motion_predictor_file, positive_prompts=None, negative_prompts=None):
+
+        torch_device = model_management.get_torch_device()
+        dtype = model_management.unet_dtype()
+
+        easing_function = easing_functions["linear"]
+
+        print(f"Embed shape {pos_embeds.shape}")
+
+        inbetween_embeds = []
+        # Make sure we have 2 images
+        if len(pos_embeds) > 1:
+            if mode == "motion_predict":
+                motion_predictor = MotionPredictor(total_frames=transitioning_frames).to(torch_device, dtype=dtype)
+                motion_predictor_path = folder_paths.get_full_path("ipadapter", motion_predictor_file)
+                checkpoint = comfy.utils.load_torch_file(motion_predictor_path, safe_load=True)
+                motion_predictor.load_state_dict(checkpoint)
+                for i in range(len(pos_embeds) - 1):
+                    embed1 = pos_embeds[i]
+                    embed2 = pos_embeds[i + 1]
+                    embed1 = embed1.unsqueeze(0)
+                    embed2 = embed2.unsqueeze(0)
+                    inbetween_embeds.extend(motion_predictor(embed1, embed2).squeeze(0))  # collect the predicted frames for this keyframe pair
+            elif mode == "interpolate_linear":
+                # Interpolate embeds
+                for i in range(len(pos_embeds) - 1):
+                    embed1 = pos_embeds[i]
+                    embed2 = pos_embeds[i + 1]
+                    alphas = torch.linspace(0, 1, transitioning_frames)
+                    for alpha in alphas:
+                        eased_alpha = easing_function(alpha.item())
+                        print(f"eased alpha {eased_alpha}")
+                        inbetween_embed = (1 - eased_alpha) * embed1 + eased_alpha * embed2
+                        inbetween_embeds.extend([inbetween_embed])
+
+        inbetween_embeds = [embed for embed in inbetween_embeds for _ in range(repeat_count)]
+        # Find size of batch
+        batch_size = len(inbetween_embeds)
+
+        inbetween_embeds = torch.stack(inbetween_embeds, dim=0)
+
+        # Ensure that cond and uncond have the same batch size
+        neg_embeds = tensor_to_size(neg_embeds, inbetween_embeds.shape[0])
+
+        # Combine and format prompt strings
+        def format_text_prompts(text_prompts):
+            string = ""
+            for i, prompt in enumerate(text_prompts):
+                string += f"\"{i * transitioning_frames * repeat_count - 1}\":\"{prompt}\",\n"
+            return string
+
+        positive_string = format_text_prompts(positive_prompts) if positive_prompts is not None and len(positive_prompts) > 0 else "\"0\":\"\",\n"
+        negative_string = format_text_prompts(negative_prompts) if negative_prompts is not None and len(negative_prompts) > 0 else "\"0\":\"\",\n"
+
+        return (inbetween_embeds, neg_embeds, positive_string, negative_string, batch_size,)
+
 class IG_Interpolate:
     @classmethod
     def INPUT_TYPES(s):
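For reference, a small sketch of how tensor_to_size reconciles the negative embeds with the generated batch (shapes are illustrative assumptions, not taken from the commit): it repeats the last element to pad, or truncates, so cond and uncond end up the same size:

import torch
neg = torch.randn(2, 16, 768)          # two negative embeds
print(tensor_to_size(neg, 33).shape)   # torch.Size([33, 16, 768]) - last embed repeated to pad
print(tensor_to_size(neg, 1).shape)    # torch.Size([1, 16, 768])  - extra embeds truncated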
@@ -68,6 +164,7 @@ def INPUT_TYPES(s):
     FUNCTION = "main"
     CATEGORY = TREE_INTERP

+    @torch.inference_mode()
     def main(self, ipadapter, clip_vision, transitioning_frames, repeat_count, interpolation, buffer, input_images1=None, input_images2=None, input_images3=None, positive_prompts=None, negative_prompts=None):
         if 'ipadapter' in ipadapter:
             ipadapter_model = ipadapter['ipadapter']['model']
@@ -89,12 +186,16 @@ def main(self, ipadapter, clip_vision, transitioning_frames, repeat_count, inter
                 continue
             # Create pos embeds
             img_cond_embeds = clip_vision.encode_image(input_images)
-
+            print(f"penultimate_hidden_states shape {img_cond_embeds.penultimate_hidden_states.shape}")
+            print(f"last_hidden_state shape {img_cond_embeds.last_hidden_state.shape}")
+            print(f"image_embeds shape {img_cond_embeds.image_embeds.shape}")
+
             if is_plus:
                 img_cond_embeds = img_cond_embeds.penultimate_hidden_states
             else:
                 img_cond_embeds = img_cond_embeds.image_embeds
             print(f"Embed shape {img_cond_embeds.shape}")
+
             inbetween_embeds = []
             # Make sure we have 2 images
             if len(img_cond_embeds) > 1:
@@ -187,65 +288,4 @@ def main(self, input_images, transitioning_frames, interpolation, repeat_count):
         # crossfade_images.append(last_image)

         crossfade_images = torch.stack(crossfade_images, dim=0)
-
-        # If not at end, transition image
-
-
-        # for i in range(transitioning_frames):
-        #     alpha = alphas[i]
-        #     image1 = images_1[i + transition_start_index]
-        #     image2 = images_2[i + transition_start_index]
-        #     easing_function = easing_functions.get(interpolation)
-        #     alpha = easing_function(alpha)  # Apply the easing function to the alpha value
-
-        #     crossfade_image = crossfade(image1, image2, alpha)
-        #     crossfade_images.append(crossfade_image)
-
-        # # Convert crossfade_images to tensor
-        # crossfade_images = torch.stack(crossfade_images, dim=0)
-        # # Get the last frame result of the interpolation
-        # last_frame = crossfade_images[-1]
-        # # Calculate the number of remaining frames from images_2
-        # remaining_frames = len(images_2) - (transition_start_index + transitioning_frames)
-        # # Crossfade the remaining frames with the last used alpha value
-        # for i in range(remaining_frames):
-        #     alpha = alphas[-1]
-        #     image1 = images_1[i + transition_start_index + transitioning_frames]
-        #     image2 = images_2[i + transition_start_index + transitioning_frames]
-        #     easing_function = easing_functions.get(interpolation)
-        #     alpha = easing_function(alpha)  # Apply the easing function to the alpha value
-
-        #     crossfade_image = crossfade(image1, image2, alpha)
-        #     crossfade_images = torch.cat([crossfade_images, crossfade_image.unsqueeze(0)], dim=0)
-        # # Append the beginning of images_1
-        # beginning_images_1 = images_1[:transition_start_index]
-        # crossfade_images = torch.cat([beginning_images_1, crossfade_images], dim=0)
         return (crossfade_images, )
-
-
-# class IG_ParseqToWeights:
-
-#     FUNCTION = "main"
-#     CATEGORY = TREE_INTERP
-#     RETURN_TYPES = ("FLOAT",)
-#     RETURN_NAMES = ("weights",)
-
-#     @classmethod
-#     def INPUT_TYPES(s):
-#         return {
-#             "required": {
-#                 "parseq": ("STRING", {"default": '', "multiline": True}),
-#             },
-#         }
-
-#     def main(self, parseq):
-#         # Load the JSON string into a dictionary
-#         data = json.loads(parseq)
-
-#         # Extract the list of frames
-#         frames = data.get('rendered_frames', [])
-
-#         # Extract the prompt_weight_1 from each frame and store it in a list
-#         prompt_weights = [frame['prompt_weight_1'] for frame in frames]
-
-#         return (prompt_weights, )
