Fully loading speech stack

EricLBuehler · EricLBuehler · commit 3907feb5dd5d · 2025-06-07T22:14:22.000+02:00
diff --git a/mistralrs-core/src/vision_models/conformer/encoder.rs b/mistralrs-core/src/vision_models/conformer/encoder.rs
@@ -536,15 +536,15 @@ impl EncoderEmbedding {
     }
 }
 
-pub struct Encoder {
+pub struct ConformerEncoder {
     encoder_embedding: EncoderEmbedding,
     embed: NemoConvSubsampling,
     pos_embed: AbsolutePositionalEncoding,
     relative_attention_bias_layer: T5RelativeAttentionLogitBias,
     encoders: Vec<EncoderLayer>,
 }
 
-impl Encoder {
+impl ConformerEncoder {
     pub fn new(mut cfg: ConformerEncoderConfig, vb: ShardedVarBuilder) -> Result<Self> {
         assert_eq!(cfg.input_layer, "nemo_conv");
 
diff --git a/mistralrs-core/src/vision_models/conformer/mod.rs b/mistralrs-core/src/vision_models/conformer/mod.rs
@@ -2,5 +2,3 @@ pub mod config;
 pub mod encoder;
 pub mod nemo;
 pub mod pos_embed;
-
-pub use encoder::Encoder as ConformerEncoder;
diff --git a/mistralrs-core/src/vision_models/conformer/nemo.rs b/mistralrs-core/src/vision_models/conformer/nemo.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use candle_core::{Result, Tensor};
 use candle_nn::{Conv2dConfig, Linear, Module};
 use mistralrs_quant::ShardedVarBuilder;
@@ -7,7 +9,7 @@ use crate::layers;
 use super::config::NemoConvConfig;
 
 pub struct NemoConvSubsampling {
-    conv: Vec<Box<dyn Module>>,
+    conv: Vec<Arc<dyn Module + Send + Sync>>,
     conv2d_subsampling: bool,
     out: Linear,
     subsampling_causal_cond: bool,
@@ -25,7 +27,7 @@ impl NemoConvSubsampling {
             ["dw_striding", "striding", "striding_conv1d"].contains(&cfg.subsampling.as_str());
 
         let mut in_channels = 1;
-        let mut layers: Vec<Box<dyn Module>> = Vec::new();
+        let mut layers: Vec<Arc<dyn Module + Send + Sync>> = Vec::new();
 
         let stride = 2;
         let kernel_size = 3;
@@ -46,7 +48,7 @@ impl NemoConvSubsampling {
             let vb_layers = vb.pp("conv");
 
             let mut idx = 0;
-            layers.push(Box::new(layers::conv2d(
+            layers.push(Arc::new(layers::conv2d(
                 in_channels,
                 cfg.conv_channels,
                 kernel_size,
@@ -61,11 +63,11 @@ impl NemoConvSubsampling {
 
             in_channels = cfg.conv_channels;
             idx += 1;
-            layers.push(Box::new(cfg.activation));
+            layers.push(Arc::new(cfg.activation));
 
             for _ in 0..(sampling_num - 1) {
                 idx += 1;
-                layers.push(Box::new(layers::conv2d(
+                layers.push(Arc::new(layers::conv2d(
                     in_channels,
                     in_channels,
                     kernel_size,
@@ -79,7 +81,7 @@ impl NemoConvSubsampling {
                 )?));
 
                 idx += 1;
-                layers.push(Box::new(layers::conv2d(
+                layers.push(Arc::new(layers::conv2d(
                     in_channels,
                     cfg.conv_channels,
                     1,
@@ -93,7 +95,7 @@ impl NemoConvSubsampling {
                 )?));
 
                 idx += 1;
-                layers.push(Box::new(cfg.activation));
+                layers.push(Arc::new(cfg.activation));
             }
         }
 
diff --git a/mistralrs-core/src/vision_models/phi4/audio_embedding.rs b/mistralrs-core/src/vision_models/phi4/audio_embedding.rs
@@ -0,0 +1,95 @@
+use std::{collections::HashMap, sync::Arc};
+
+use candle_core::Result;
+use candle_nn::Module;
+use mistralrs_quant::ShardedVarBuilder;
+
+use crate::{
+    layers::{self, Activation},
+    vision_models::{
+        conformer::encoder::ConformerEncoder,
+        phi4::config::{Phi4MMAudioConfig, Phi4MMAudioEmbedConfig},
+    },
+};
+
+use super::Phi4MMConfig;
+
+pub(super) const AUDIO_SPECIAL_TOKEN_ID: f64 = 200011.;
+
+#[derive(Eq, Hash, PartialEq)]
+pub enum AudioProjectionMode {
+    /// If only speech
+    Speech,
+    /// If vision + speech or only vision (not sure why that is necessary though)
+    Vision,
+}
+
+pub struct AudioEmbedding {
+    proj: HashMap<AudioProjectionMode, Vec<Arc<dyn Module + Send + Sync>>>,
+    encoder: ConformerEncoder,
+}
+
+impl AudioEmbedding {
+    pub fn new(
+        cfg: &Phi4MMConfig,
+        audio_embd_config: &Phi4MMAudioEmbedConfig,
+        vb: ShardedVarBuilder,
+    ) -> Result<Self> {
+        let hidden_size = audio_embd_config.n_embd.unwrap_or(cfg.hidden_size);
+
+        let conformer_config = match &cfg.audio_processor {
+            Some(Phi4MMAudioConfig { config, name }) if name == "cascades" => config,
+            _ => candle_core::bail!("Must have audio processor (`cascades`)"),
+        };
+        let encoder = ConformerEncoder::new(conformer_config.clone(), vb.pp("encoder"))?;
+
+        // let audio_dim_in = conformer_config.input_size;
+        let audio_dim_out = conformer_config.attention_dim;
+
+        let mut proj = HashMap::new();
+        {
+            assert_eq!(audio_embd_config.projection_cls, "mlp");
+
+            let dim_projection = hidden_size;
+            let depth = 2;
+            let linear_downsample_rate = audio_embd_config.downsample_rate;
+
+            let embedding_cls_vb = vb.pp("audio_projection");
+
+            let mut layers_for_speech: Vec<Arc<dyn Module + Send + Sync>> =
+                vec![Arc::new(layers::linear(
+                    audio_dim_out * linear_downsample_rate,
+                    dim_projection,
+                    embedding_cls_vb.pp("speech").pp(0),
+                )?)];
+            for i in 1..depth {
+                layers_for_speech.push(Arc::new(Activation::Gelu));
+                layers_for_speech.push(Arc::new(layers::linear(
+                    dim_projection,
+                    dim_projection,
+                    embedding_cls_vb.pp("speech").pp(i + 1),
+                )?));
+            }
+
+            let mut layers_for_vision: Vec<Arc<dyn Module + Send + Sync>> =
+                vec![Arc::new(layers::linear(
+                    audio_dim_out * linear_downsample_rate,
+                    dim_projection,
+                    embedding_cls_vb.pp("vision").pp(0),
+                )?)];
+            for i in 1..depth {
+                layers_for_vision.push(Arc::new(Activation::Gelu));
+                layers_for_vision.push(Arc::new(layers::linear(
+                    dim_projection,
+                    dim_projection,
+                    embedding_cls_vb.pp("vision").pp(i + 1),
+                )?));
+            }
+
+            proj.insert(AudioProjectionMode::Speech, layers_for_speech);
+            proj.insert(AudioProjectionMode::Vision, layers_for_vision);
+        }
+
+        Ok(Self { proj, encoder })
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -536,15 +536,15 @@ impl EncoderEmbedding {`
`536`	`536`	`}`
`537`	`537`	`}`
`538`	`538`
`539`		`-pub struct Encoder {`
	`539`	`+pub struct ConformerEncoder {`
`540`	`540`	`encoder_embedding: EncoderEmbedding,`
`541`	`541`	`embed: NemoConvSubsampling,`
`542`	`542`	`pos_embed: AbsolutePositionalEncoding,`
`543`	`543`	`relative_attention_bias_layer: T5RelativeAttentionLogitBias,`
`544`	`544`	`encoders: Vec<EncoderLayer>,`
`545`	`545`	`}`
`546`	`546`
`547`		`-impl Encoder {`
	`547`	`+impl ConformerEncoder {`
`548`	`548`	`pub fn new(mut cfg: ConformerEncoderConfig, vb: ShardedVarBuilder) -> Result<Self> {`
`549`	`549`	`assert_eq!(cfg.input_layer, "nemo_conv");`
`550`	`550`