Commit 8e10e52: Merger

1 parent 3907feb

File tree: 6 files changed (+215, -35 lines)

mistralrs-core/src/vision_models/conformer/encoder.rs

Lines changed: 2 additions & 2 deletions

@@ -348,7 +348,7 @@ impl ConvModule {
             None
         };
 
-        let mut fix_len1 = false;
+        let fix_len1;
         let ext_pw_conv_1d = if cfg.causal {
             if cfg.ext_pw_kernel_size > 1 {
                 fix_len1 = true;

@@ -642,7 +642,7 @@ impl ConformerEncoder {
     }
 
     fn unfold_tensor(xs_pad: &Tensor, max_seq_len: usize) -> Result<Tensor> {
-        let (n, t, d) = xs_pad.dims3()?;
+        let (_n, t, _d) = xs_pad.dims3()?;
 
         // If sequence length is already <= max_seq_len, no need to unfold
         if t <= max_seq_len {
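The fix_len1 change above swaps a `let mut` with a dummy default for Rust's deferred initialization: the binding is declared without a value, and the compiler verifies that every path assigns it exactly once before use. A minimal standalone sketch of the pattern (the names mirror the config flags in the hunk; the function itself is hypothetical):

fn pick_fix_len(causal: bool, ext_pw_kernel_size: usize) -> bool {
    // Deferred initialization: no value and no `mut`, because each
    // branch below assigns exactly once before the binding is read.
    let fix_len1;
    if causal && ext_pw_kernel_size > 1 {
        fix_len1 = true;
    } else {
        fix_len1 = false;
    }
    fix_len1
}

fn main() {
    assert!(pick_fix_len(true, 3));
    assert!(!pick_fix_len(false, 3));
}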

mistralrs-core/src/vision_models/conformer/nemo.rs

Lines changed: 1 addition & 8 deletions

@@ -10,9 +10,7 @@ use super::config::NemoConvConfig;
 
 pub struct NemoConvSubsampling {
     conv: Vec<Arc<dyn Module + Send + Sync>>,
-    conv2d_subsampling: bool,
     out: Linear,
-    subsampling_causal_cond: bool,
     subsampling_factor: usize,
 }
 
@@ -23,8 +21,6 @@ impl NemoConvSubsampling {
         }
 
        let sampling_num = (cfg.subsampling_factor as f32).log2() as usize;
-        let subsampling_causal_cond =
-            ["dw_striding", "striding", "striding_conv1d"].contains(&cfg.subsampling.as_str());
 
        let mut in_channels = 1;
        let mut layers: Vec<Arc<dyn Module + Send + Sync>> = Vec::new();

@@ -114,13 +110,10 @@ impl NemoConvSubsampling {
             true,
             vb.pp("out"),
         )?;
-        let conv2d_subsampling = false;
 
         Ok(Self {
             conv: layers,
-            conv2d_subsampling,
             out,
-            subsampling_causal_cond,
             subsampling_factor: cfg.subsampling_factor,
         })
     }

@@ -135,7 +128,7 @@ impl NemoConvSubsampling {
     ) -> usize {
         let add_pad = all_paddings as f32 - kernel_size as f32;
         let one = 1f32;
-        for i in 0..repeat_num {
+        for _ in 0..repeat_num {
             length = (length + add_pad) / (stride as f32) + one;
             if ceil_mode {
                 length = length.ceil();
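The loop in the last hunk applies the standard strided-convolution output-length formula once per subsampling stage, L_out = (L_in + total_padding - kernel_size) / stride + 1; renaming `i` to `_` only silences the unused-variable lint. A self-contained sketch of the arithmetic as read from the hunk (the free function and its exact signature are assumptions, not the repo's API):

/// Length after applying a strided conv `repeat_num` times:
/// per stage, L_out = (L_in + total_padding - kernel_size) / stride + 1.
fn calc_length(
    mut length: f32,
    all_paddings: usize,
    kernel_size: usize,
    stride: usize,
    ceil_mode: bool,
    repeat_num: usize,
) -> usize {
    let add_pad = all_paddings as f32 - kernel_size as f32;
    let one = 1f32;
    for _ in 0..repeat_num {
        length = (length + add_pad) / (stride as f32) + one;
        length = if ceil_mode { length.ceil() } else { length.floor() };
    }
    length as usize
}

fn main() {
    // 100 frames through two stride-2, kernel-3, padding-2 stages -> 25 frames.
    assert_eq!(calc_length(100.0, 2, 3, 2, false, 2), 25);
}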

mistralrs-core/src/vision_models/phi4/audio_embedding.rs

Lines changed: 124 additions & 4 deletions
@@ -1,8 +1,8 @@
 use std::{collections::HashMap, sync::Arc};
 
-use candle_core::Result;
+use candle_core::{DType, Device, IndexOp, Result, Tensor};
 use candle_nn::Module;
-use mistralrs_quant::ShardedVarBuilder;
+use mistralrs_quant::{NonZeroOp, ShardedVarBuilder};
 
 use crate::{
     layers::{self, Activation},
@@ -16,7 +16,7 @@ use super::Phi4MMConfig;
 
 pub(super) const AUDIO_SPECIAL_TOKEN_ID: f64 = 200011.;
 
-#[derive(Eq, Hash, PartialEq)]
+#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
 pub enum AudioProjectionMode {
     /// If only speech
     Speech,
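The widened derive list is what lets `AudioProjectionMode` key the `proj` map and appear in error messages: `Eq + Hash` for `HashMap` lookup, `Debug` for the `{:?}` in the new error path, and `Clone`/`Copy` so the mode can be passed by value. A toy sketch of those requirements:

use std::collections::HashMap;

#[derive(Eq, Hash, PartialEq, Debug, Clone, Copy)]
enum AudioProjectionMode {
    Speech,
    Vision,
}

fn main() {
    let mut proj: HashMap<AudioProjectionMode, &str> = HashMap::new();
    proj.insert(AudioProjectionMode::Speech, "speech projection");
    let mode = AudioProjectionMode::Vision; // Copy: no clone needed
    match proj.get(&mode) {
        Some(p) => println!("{p}"),
        // Debug makes the {:?} formatting possible
        None => eprintln!("Projection mode {:?} not found", mode),
    }
}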
@@ -25,13 +25,16 @@ pub enum AudioProjectionMode {
 }
 
 pub struct AudioEmbedding {
+    wte: candle_nn::Embedding,
     proj: HashMap<AudioProjectionMode, Vec<Arc<dyn Module + Send + Sync>>>,
     encoder: ConformerEncoder,
+    target_device_dtype: (Device, DType),
 }
 
 impl AudioEmbedding {
     pub fn new(
         cfg: &Phi4MMConfig,
+        wte: candle_nn::Embedding,
         audio_embd_config: &Phi4MMAudioEmbedConfig,
         vb: ShardedVarBuilder,
     ) -> Result<Self> {
@@ -90,6 +93,123 @@ impl AudioEmbedding {
             proj.insert(AudioProjectionMode::Vision, layers_for_vision);
         }
 
-        Ok(Self { proj, encoder })
+        Ok(Self {
+            wte,
+            proj,
+            encoder,
+            target_device_dtype: (vb.device().clone(), vb.dtype()),
+        })
+    }
+
+    fn get_audio_features(
+        &self,
+        input_embeds: &Tensor,
+        audio_attention_mask: &Tensor,
+        audio_projection_mode: &AudioProjectionMode,
+    ) -> Result<Tensor> {
+        // Get audio features from the conformer encoder
+        let (audio_features, _masks) = self
+            .encoder
+            .forward(input_embeds, Some(audio_attention_mask))?;
+
+        // Apply the projection stack that matches the requested mode
+        let projection_layers = self.proj.get(audio_projection_mode).ok_or_else(|| {
+            candle_core::Error::Msg(format!(
+                "Projection mode {:?} not found",
+                audio_projection_mode
+            ))
+        })?;
+
+        let mut audio_set_tensor = audio_features;
+        for layer in projection_layers {
+            audio_set_tensor = layer.forward(&audio_set_tensor)?;
+        }
+
+        Ok(audio_set_tensor)
+    }
+
+    pub fn forward(
+        &self,
+        input_ids: &Tensor,
+        input_embeds: &Tensor,
+        audio_embed_sizes: Vec<usize>,
+        audio_attention_mask: &Tensor,
+        audio_projection_mode: &AudioProjectionMode,
+    ) -> Result<Tensor> {
+        // Reshape input_ids to 2D
+        let input_shape = input_ids.shape();
+        let input_ids = if input_shape.rank() > 2 {
+            input_ids.reshape((
+                input_shape.elem_count() / input_shape.dims()[input_shape.rank() - 1],
+                input_shape.dims()[input_shape.rank() - 1],
+            ))?
+        } else {
+            input_ids.clone()
+        };
+
+        // (row, col) coordinates of every audio special token
+        let positions = input_ids.eq(AUDIO_SPECIAL_TOKEN_ID)?.nonzero()?;
+
+        // Target device and dtype captured from the VarBuilder at construction
+        let (target_device, target_dtype) = self.target_device_dtype.clone();
+
+        let audio_set_tensor = if positions.dim(0)? > 0 {
+            // Convert to the target device/dtype if needed
+            let input_embeds = if !input_embeds.device().same_device(&target_device)
+                || input_embeds.dtype() != target_dtype
+            {
+                input_embeds
+                    .to_device(&target_device)?
+                    .to_dtype(target_dtype)?
+            } else {
+                input_embeds.clone()
+            };
+
+            self.get_audio_features(&input_embeds, audio_attention_mask, audio_projection_mode)?
+        } else {
+            // No audio tokens: plain word embeddings suffice
+            return self.wte.forward(&input_ids);
+        };
+
+        // Initial hidden states from the word embeddings
+        let mut hidden_states = self.wte.forward(&input_ids)?;
+
+        // The audio_embed_sizes must account for every audio token position
+        let total_audio_tokens = audio_embed_sizes.iter().sum::<usize>();
+        if total_audio_tokens != positions.dim(0)? {
+            return Err(candle_core::Error::Msg(format!(
+                "Audio embed sizes sum ({}) doesn't match positions count ({})",
+                total_audio_tokens,
+                positions.dim(0)?
+            )));
+        }
+
+        // Take the first `size` projected embeddings for each audio clip
+        let mut audio_sets = Vec::new();
+        for (i, size) in audio_embed_sizes.into_iter().enumerate() {
+            audio_sets.push(audio_set_tensor.i((i, ..size, ..))?);
+        }
+        let merged_audio_set_tensor = Tensor::cat(&audio_sets, 0)?;
+
+        let original_shape = hidden_states.shape().clone();
+        let (_hs_b, hs_l, hs_d) = hidden_states.dims3()?;
+        let mut hidden_states_flat = hidden_states.reshape(((), hs_d))?;
+
+        // Split the position pairs into their row and column components
+        let positions_transposed = positions.to_dtype(DType::F32)?;
+        let positions_transposed_0 = positions_transposed.i((.., 0))?;
+        let positions_transposed_1 = positions_transposed.i((.., 1))?;
+
+        // Linear index into the flattened (batch * seq_len, hidden) table: row * seq_len + col
+        let mut linear_index = ((positions_transposed_0 * hs_l as f64)? + positions_transposed_1)?;
+        linear_index = linear_index.to_dtype(DType::U32)?;
+        linear_index = linear_index.unsqueeze(1)?.repeat((1, hs_d))?;
+
+        // scatter_add of (new - current) overwrites the gathered rows with the audio embeddings
+        let current_vals = hidden_states_flat.gather(&linear_index, 0)?;
+        let delta = merged_audio_set_tensor.broadcast_sub(&current_vals)?;
+
+        hidden_states_flat = hidden_states_flat.scatter_add(&linear_index, &delta, 0)?;
+
+        hidden_states = hidden_states_flat.reshape(original_shape)?;
+
+        Ok(hidden_states)
     }
 }
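The tail of `forward` above writes the projected audio embeddings into the token grid without a dedicated scatter-write op: it builds a linear row index (row * seq_len + col) into the flattened (batch * seq_len, hidden) table, gathers the rows currently there, and scatter-adds the difference, since current + (new - current) = new. A minimal sketch of the same trick on plain candle tensors (`overwrite_rows` and all names here are mine, not the model's):

use candle_core::{DType, Device, Result, Tensor};

fn overwrite_rows(table: &Tensor, rows: &[u32], new_rows: &Tensor) -> Result<Tensor> {
    let (_n, d) = table.dims2()?;
    // Broadcast each target row index across all D columns.
    let index = Tensor::from_slice(rows, (rows.len(),), table.device())?
        .unsqueeze(1)?
        .repeat((1, d))?;
    let current = table.gather(&index, 0)?;
    // current + (new - current) == new, so scatter_add acts as a scatter-write.
    let delta = (new_rows - &current)?;
    table.scatter_add(&index, &delta, 0)
}

fn main() -> Result<()> {
    let dev = Device::Cpu;
    let table = Tensor::zeros((4, 3), DType::F32, &dev)?;
    let new_rows = Tensor::ones((2, 3), DType::F32, &dev)?;
    // Rows 1 and 3 become ones; rows 0 and 2 stay zero.
    let out = overwrite_rows(&table, &[1, 3], &new_rows)?;
    println!("{out}");
    Ok(())
}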

mistralrs-core/src/vision_models/phi4/inputs_processor.rs

Lines changed: 10 additions & 4 deletions

@@ -203,9 +203,12 @@ impl InputsProcessor for Phi4MMInputsProcessor {
             position_ids,
             pixel_values: None,
             model_specific_args: Box::new(Phi4MMVisionSpecificArgs {
-                image_sizes: None,
-                image_attention_mask: None,
                 input_image_embeds: None,
+                image_attention_mask: None,
+                image_sizes: None,
+                input_audio_embeds: None, // TODO!
+                audio_embed_sizes: None, // TODO!
+                audio_attention_mask: None, // TODO!
             }),
             paged_attn_meta,
             flash_meta,

@@ -326,9 +329,12 @@ impl InputsProcessor for Phi4MMInputsProcessor {
             position_ids,
             pixel_values: pixel_values.clone(),
             model_specific_args: Box::new(Phi4MMVisionSpecificArgs {
-                image_sizes: image_sizes.clone(),
-                image_attention_mask: pixel_attention_mask,
                 input_image_embeds: pixel_values,
+                image_attention_mask: pixel_attention_mask,
+                image_sizes: image_sizes.clone(),
+                input_audio_embeds: None, // TODO!
+                audio_embed_sizes: None, // TODO!
+                audio_attention_mask: None, // TODO!
             }),
             paged_attn_meta,
             flash_meta,
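Both hunks stub the three new audio slots to None, since audio inputs are not wired through the processor yet (the TODOs). From the field names here and the forward signature in mm_embedding.rs below, the widened args struct presumably looks roughly like this; a sketch inferred from the diff, not the repo's actual definition:

use candle_core::Tensor;

// Hypothetical reconstruction of the args struct; the real one lives in the phi4 module.
pub struct Phi4MMVisionSpecificArgs {
    pub input_image_embeds: Option<Tensor>,
    pub image_attention_mask: Option<Tensor>,
    pub image_sizes: Option<Vec<(u32, u32)>>,
    pub input_audio_embeds: Option<Tensor>,
    pub audio_embed_sizes: Option<Vec<usize>>,
    pub audio_attention_mask: Option<Tensor>,
}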

mistralrs-core/src/vision_models/phi4/mm_embedding.rs

Lines changed: 48 additions & 11 deletions

@@ -2,7 +2,13 @@ use candle_core::{Result, Tensor, D};
 use candle_nn::Module;
 use mistralrs_quant::ShardedVarBuilder;
 
-use crate::utils::unvarbuilder::UnVarBuilder;
+use crate::{
+    utils::unvarbuilder::UnVarBuilder,
+    vision_models::phi4::{
+        audio_embedding::{AudioProjectionMode, AUDIO_SPECIAL_TOKEN_ID},
+        image_embedding::IMAGE_SPECIAL_TOKEN_ID,
+    },
+};
 
 use super::{audio_embedding::AudioEmbedding, image_embedding::ImageEmbedding, Phi4MMConfig};

@@ -34,6 +40,7 @@ impl Phi4MMImageAudioEmbedding {
         let audio_embed = if let Some(audio_embd_config) = &cfg.embd_layer.audio_embd_layer {
             Some(AudioEmbedding::new(
                 cfg,
+                wte.clone(),
                 audio_embd_config,
                 vb.pp("audio_embed"),
             )?)

@@ -52,29 +59,59 @@ impl Phi4MMImageAudioEmbedding {
     pub fn forward(
         &self,
         input_ids: &Tensor,
-        input_image_embeds: &Tensor,
+        input_image_embeds: Option<&Tensor>,
         image_attention_mask: Option<&Tensor>,
         image_sizes: Option<Vec<(u32, u32)>>,
+        input_audio_embeds: Option<&Tensor>,
+        audio_embed_sizes: Option<Vec<usize>>,
+        audio_attention_mask: Option<&Tensor>,
+        audio_projection_mode: AudioProjectionMode,
     ) -> Result<Tensor> {
         assert!(-MAX_INPUT_ID < self.image_input_id);
 
         let input_ids = input_ids.reshape(((), input_ids.dim(D::Minus1)?))?;
 
-        let image_hidden_states = if let Some(image_embed) = &self.image_embed {
-            Some(image_embed.forward(
+        let image_hidden_states = match &self.image_embed {
+            Some(image_embed) if input_image_embeds.is_some() => Some(image_embed.forward(
                 &input_ids,
-                input_image_embeds,
+                input_image_embeds.expect("input_image_embeds"),
                 image_attention_mask,
                 image_sizes,
-            )?)
-        } else {
-            None
+            )?),
+            _ => None,
         };
 
-        match image_hidden_states {
-            Some(image_hidden_states) => Ok(image_hidden_states),
+        let audio_hidden_states = match &self.audio_embed {
+            Some(audio_embed) if input_audio_embeds.is_some() => Some(audio_embed.forward(
+                &input_ids,
+                input_audio_embeds.expect("input_audio_embeds"),
+                audio_embed_sizes.expect("audio_embed_sizes"),
+                audio_attention_mask.expect("audio_attention_mask"),
+                &audio_projection_mode,
+            )?),
+            _ => None,
+        };
+
+        let image_position_mask = input_ids.eq(IMAGE_SPECIAL_TOKEN_ID)?;
+        let audio_position_mask = input_ids.eq(AUDIO_SPECIAL_TOKEN_ID)?;
+
+        match (image_hidden_states, audio_hidden_states) {
+            (Some(image_hidden_states), Some(audio_hidden_states)) => {
+                // Merge: each modality contributes only at its own special-token positions
+                image_hidden_states.broadcast_mul(
+                    &image_position_mask
+                        .to_dtype(image_hidden_states.dtype())?
+                        .unsqueeze(D::Minus1)?,
+                )? + audio_hidden_states.broadcast_mul(
+                    &audio_position_mask
+                        .to_dtype(audio_hidden_states.dtype())?
+                        .unsqueeze(D::Minus1)?,
+                )?
+            }
+            (Some(image_hidden_states), None) => Ok(image_hidden_states),
+            (None, Some(audio_hidden_states)) => Ok(audio_hidden_states),
 
-            None => self.wte.forward(&input_ids),
+            (None, None) => self.wte.forward(&input_ids),
         }
     }
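When both modalities are present, the final match arm merges them with position masks: each hidden-state tensor is zeroed everywhere except at its own special-token positions, then the two are summed; since a token id matches at most one special token, the masks never overlap. A standalone sketch of that masking arithmetic with toy token ids (all values below are made up):

use candle_core::{DType, Device, Result, Tensor, D};

fn main() -> Result<()> {
    let dev = Device::Cpu;
    const IMAGE_TOKEN: f64 = 1.0;
    const AUDIO_TOKEN: f64 = 2.0;

    // One sequence of four tokens: [text, image, audio, text].
    let input_ids = Tensor::new(&[[0f32, 1., 2., 0.]], &dev)?;
    let image_hs = Tensor::ones((1, 4, 2), DType::F32, &dev)?; // stand-in image states
    let audio_hs = (Tensor::ones((1, 4, 2), DType::F32, &dev)? * 2.0)?; // stand-in audio states

    let image_mask = input_ids.eq(IMAGE_TOKEN)?.to_dtype(DType::F32)?.unsqueeze(D::Minus1)?;
    let audio_mask = input_ids.eq(AUDIO_TOKEN)?.to_dtype(DType::F32)?.unsqueeze(D::Minus1)?;

    // Each tensor contributes only at its own special-token positions.
    let merged = (image_hs.broadcast_mul(&image_mask)? + audio_hs.broadcast_mul(&audio_mask)?)?;
    println!("{merged}"); // rows: [0,0], [1,1], [2,2], [0,0]
    Ok(())
}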