@@ -72,7 +72,7 @@ impl ProcessorCreator for Phi4MMProcessor {
72
72
audio_feat_stride : pre_processor_config
73
73
. audio_feat_stride
74
74
. expect ( "audio_feat_stride" ) ,
75
- eightk_method : "fillzero" . to_string ( ) , // Default to fillzero like Python
75
+ eightk_method : "fillzero" . to_string ( ) , // Default to fillzero
76
76
} ) ,
77
77
} )
78
78
}
@@ -424,7 +424,7 @@ impl Phi4MMInputsProcessor {
424
424
// Apply mel filterbank
425
425
let mel_features = self . apply_mel_filterbank ( & spectrogram, sample_rate) ?;
426
426
427
- // Take log - match Python : clip to minimum 1.0 then log
427
+ // Take log: clip to minimum 1.0 then log
428
428
let log_features: Vec < Vec < f32 > > = mel_features
429
429
. iter ( )
430
430
. map ( |frame| frame. iter ( ) . map ( |& x| ( x. max ( 1.0 ) ) . ln ( ) ) . collect ( ) )
@@ -459,7 +459,7 @@ impl Phi4MMInputsProcessor {
459
459
// Create Hamming window
460
460
let window = self . create_hamming_window ( win_length) ;
461
461
462
- // Extract frames - match Python logic exactly
462
+ // Extract frames
463
463
let n_batch = ( wav. len ( ) - win_length) / hop_length + 1 ;
464
464
let mut frames = Vec :: new ( ) ;
465
465
for i in 0 ..n_batch {
@@ -470,7 +470,7 @@ impl Phi4MMInputsProcessor {
470
470
}
471
471
}
472
472
473
- // Apply preemphasis - FIXED to match Python
473
+ // Apply preemphasis
474
474
let preemphasis = 0.97 ;
475
475
self . apply_preemphasis_frames ( & mut frames, preemphasis) ;
476
476
@@ -479,7 +479,7 @@ impl Phi4MMInputsProcessor {
479
479
let fft = planner. plan_fft_forward ( n_fft) ;
480
480
481
481
let mut spectrogram = Vec :: new ( ) ;
482
- for ( frame_idx , frame) in frames. iter ( ) . enumerate ( ) {
482
+ for ( _frame_idx , frame) in frames. iter ( ) . enumerate ( ) {
483
483
// Apply window and convert to complex
484
484
let mut windowed: Vec < Complex32 > = frame
485
485
. iter ( )
@@ -499,7 +499,7 @@ impl Phi4MMInputsProcessor {
499
499
. map ( |c| c. norm ( ) )
500
500
. collect ( ) ;
501
501
502
- // Handle 8kHz case - FIXED to match Python padding logic
502
+ // Handle 8kHz case
503
503
if fs == 8000 && self . eightk_method == "fillzero" {
504
504
// Remove nyquist bin and pad with zeros to match 16kHz structure
505
505
magnitude. pop ( ) ; // Remove nyquist
@@ -513,7 +513,6 @@ impl Phi4MMInputsProcessor {
513
513
Ok ( spectrogram)
514
514
}
515
515
516
- // NEW: Fixed preemphasis to match Python frame-level processing
517
516
fn apply_preemphasis_frames ( & self , frames : & mut [ Vec < f32 > ] , preemphasis : f32 ) {
518
517
if frames. is_empty ( ) {
519
518
return ;
@@ -599,7 +598,6 @@ impl Phi4MMInputsProcessor {
599
598
. collect ( )
600
599
}
601
600
602
- // FIXED: Apply mel filterbank with proper frequency range matching Python
603
601
fn apply_mel_filterbank (
604
602
& self ,
605
603
spectrogram : & [ Vec < f32 > ] ,
@@ -635,7 +633,6 @@ impl Phi4MMInputsProcessor {
635
633
Ok ( mel_features)
636
634
}
637
635
638
- // FIXED: Mel filterbank creation to match Python SpeechLib logic
639
636
fn create_mel_filterbank (
640
637
& self ,
641
638
n_mels : usize ,
@@ -646,15 +643,14 @@ impl Phi4MMInputsProcessor {
646
643
let fmax = sample_rate / 2.0 ;
647
644
let fmin = 0.0 ;
648
645
649
- // Mel scale conversion functions (matching Python)
646
+ // Mel scale conversion functions
650
647
let hz_to_mel = |f : f32 | 1127.0 * ( 1.0 + f / 700.0 ) . ln ( ) ;
651
- let mel_to_hz = |mel : f32 | 700.0 * ( mel / 1127.0 ) . exp ( ) - 700.0 ;
648
+ let _mel_to_hz = |mel : f32 | 700.0 * ( mel / 1127.0 ) . exp ( ) - 700.0 ;
652
649
let bin_to_mel = |fft_bin : usize | {
653
650
1127.0 * ( 1.0 + ( fft_bin as f32 * sample_rate) / ( n_fft as f32 * 700.0 ) ) . ln ( )
654
651
} ;
655
652
let f_to_bin = |f : f32 | ( ( f * n_fft as f32 / sample_rate) + 0.5 ) as usize ;
656
653
657
- // Match Python frequency range logic
658
654
let klo = f_to_bin ( fmin) + 1 ; // Skip DC component
659
655
let khi = f_to_bin ( fmax) . max ( klo) ;
660
656
@@ -676,7 +672,7 @@ impl Phi4MMInputsProcessor {
676
672
let center = mel_centers[ m + 1 ] ;
677
673
let right = mel_centers[ m + 2 ] ;
678
674
679
- // Match Python frequency range: process from klo to khi
675
+ // Process from klo to khi
680
676
for fft_bin in klo..khi. min ( bank_width) {
681
677
let mbin = bin_to_mel ( fft_bin) ;
682
678
if left < mbin && mbin < right {
0 commit comments