Skip to content

Commit 34ca7d4

Browse files
committed
It works!
1 parent a0f9bfc commit 34ca7d4

File tree

4 files changed

+12
-16
lines changed

4 files changed

+12
-16
lines changed

mistralrs-core/src/vision_models/conformer/encoder.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -529,6 +529,7 @@ impl EncoderEmbedding {
529529
pub struct ConformerEncoder {
530530
encoder_embedding: EncoderEmbedding,
531531
embed: NemoConvSubsampling,
532+
#[allow(unused)]
532533
pos_embed: AbsolutePositionalEncoding,
533534
relative_attention_bias_layer: T5RelativeAttentionLogitBias,
534535
encoders: Vec<EncoderLayer>,

mistralrs-core/src/vision_models/conformer/pos_embed.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ use crate::layers;
66

77
use super::config::ConformerEncoderConfig;
88

9+
#[allow(unused)]
910
pub struct AbsolutePositionalEncoding {
1011
pe: Tensor,
1112
xscale: f64,
@@ -59,6 +60,7 @@ impl AbsolutePositionalEncoding {
5960
})
6061
}
6162

63+
#[allow(unused)]
6264
pub fn forward(&self, xs: &Tensor) -> Result<Tensor> {
6365
if xs.dim(1)? >= self.pe.dim(1)? {
6466
candle_core::bail!("Need to recompute positional embeds");

mistralrs-core/src/vision_models/phi4/audio_embedding.rs

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -134,9 +134,6 @@ impl AudioEmbedding {
134134
audio_attention_mask: Option<&Tensor>,
135135
audio_projection_mode: &AudioProjectionMode,
136136
) -> Result<Tensor> {
137-
let input_embeds = Tensor::read_npy("input_embeds.npy")?
138-
.to_dtype(input_embeds.dtype())?
139-
.to_device(input_embeds.device())?;
140137
// Reshape input_ids to 2D
141138
let input_shape = input_ids.shape();
142139
let input_ids = if input_shape.rank() > 2 {

mistralrs-core/src/vision_models/phi4/inputs_processor.rs

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ impl ProcessorCreator for Phi4MMProcessor {
7272
audio_feat_stride: pre_processor_config
7373
.audio_feat_stride
7474
.expect("audio_feat_stride"),
75-
eightk_method: "fillzero".to_string(), // Default to fillzero like Python
75+
eightk_method: "fillzero".to_string(), // Default to fillzero
7676
}),
7777
})
7878
}
@@ -424,7 +424,7 @@ impl Phi4MMInputsProcessor {
424424
// Apply mel filterbank
425425
let mel_features = self.apply_mel_filterbank(&spectrogram, sample_rate)?;
426426

427-
// Take log - match Python: clip to minimum 1.0 then log
427+
// Take log: clip to minimum 1.0 then log
428428
let log_features: Vec<Vec<f32>> = mel_features
429429
.iter()
430430
.map(|frame| frame.iter().map(|&x| (x.max(1.0)).ln()).collect())
@@ -459,7 +459,7 @@ impl Phi4MMInputsProcessor {
459459
// Create Hamming window
460460
let window = self.create_hamming_window(win_length);
461461

462-
// Extract frames - match Python logic exactly
462+
// Extract frames
463463
let n_batch = (wav.len() - win_length) / hop_length + 1;
464464
let mut frames = Vec::new();
465465
for i in 0..n_batch {
@@ -470,7 +470,7 @@ impl Phi4MMInputsProcessor {
470470
}
471471
}
472472

473-
// Apply preemphasis - FIXED to match Python
473+
// Apply preemphasis
474474
let preemphasis = 0.97;
475475
self.apply_preemphasis_frames(&mut frames, preemphasis);
476476

@@ -479,7 +479,7 @@ impl Phi4MMInputsProcessor {
479479
let fft = planner.plan_fft_forward(n_fft);
480480

481481
let mut spectrogram = Vec::new();
482-
for (frame_idx, frame) in frames.iter().enumerate() {
482+
for (_frame_idx, frame) in frames.iter().enumerate() {
483483
// Apply window and convert to complex
484484
let mut windowed: Vec<Complex32> = frame
485485
.iter()
@@ -499,7 +499,7 @@ impl Phi4MMInputsProcessor {
499499
.map(|c| c.norm())
500500
.collect();
501501

502-
// Handle 8kHz case - FIXED to match Python padding logic
502+
// Handle 8kHz case
503503
if fs == 8000 && self.eightk_method == "fillzero" {
504504
// Remove nyquist bin and pad with zeros to match 16kHz structure
505505
magnitude.pop(); // Remove nyquist
@@ -513,7 +513,6 @@ impl Phi4MMInputsProcessor {
513513
Ok(spectrogram)
514514
}
515515

516-
// NEW: Fixed preemphasis to match Python frame-level processing
517516
fn apply_preemphasis_frames(&self, frames: &mut [Vec<f32>], preemphasis: f32) {
518517
if frames.is_empty() {
519518
return;
@@ -599,7 +598,6 @@ impl Phi4MMInputsProcessor {
599598
.collect()
600599
}
601600

602-
// FIXED: Apply mel filterbank with proper frequency range matching Python
603601
fn apply_mel_filterbank(
604602
&self,
605603
spectrogram: &[Vec<f32>],
@@ -635,7 +633,6 @@ impl Phi4MMInputsProcessor {
635633
Ok(mel_features)
636634
}
637635

638-
// FIXED: Mel filterbank creation to match Python SpeechLib logic
639636
fn create_mel_filterbank(
640637
&self,
641638
n_mels: usize,
@@ -646,15 +643,14 @@ impl Phi4MMInputsProcessor {
646643
let fmax = sample_rate / 2.0;
647644
let fmin = 0.0;
648645

649-
// Mel scale conversion functions (matching Python)
646+
// Mel scale conversion functions
650647
let hz_to_mel = |f: f32| 1127.0 * (1.0 + f / 700.0).ln();
651-
let mel_to_hz = |mel: f32| 700.0 * (mel / 1127.0).exp() - 700.0;
648+
let _mel_to_hz = |mel: f32| 700.0 * (mel / 1127.0).exp() - 700.0;
652649
let bin_to_mel = |fft_bin: usize| {
653650
1127.0 * (1.0 + (fft_bin as f32 * sample_rate) / (n_fft as f32 * 700.0)).ln()
654651
};
655652
let f_to_bin = |f: f32| ((f * n_fft as f32 / sample_rate) + 0.5) as usize;
656653

657-
// Match Python frequency range logic
658654
let klo = f_to_bin(fmin) + 1; // Skip DC component
659655
let khi = f_to_bin(fmax).max(klo);
660656

@@ -676,7 +672,7 @@ impl Phi4MMInputsProcessor {
676672
let center = mel_centers[m + 1];
677673
let right = mel_centers[m + 2];
678674

679-
// Match Python frequency range: process from klo to khi
675+
// Process from klo to khi
680676
for fft_bin in klo..khi.min(bank_width) {
681677
let mbin = bin_to_mel(fft_bin);
682678
if left < mbin && mbin < right {

0 commit comments

Comments
 (0)