@@ -396,28 +396,6 @@ impl InputsProcessor for Phi4MMInputsProcessor {
396
396
}
397
397
398
398
impl Phi4MMInputsProcessor {
399
- fn load_dummy_audio ( & self ) -> Result < ( Vec < f32 > , u32 ) > {
400
- let mut reader = hound:: WavReader :: open ( "dummy_audio.wav" )
401
- . map_err ( |e| candle_core:: Error :: Msg ( format ! ( "Failed to load audio: {}" , e) ) ) ?;
402
- let spec = reader. spec ( ) ;
403
-
404
- let samples: Vec < f32 > = match spec. sample_format {
405
- hound:: SampleFormat :: Float => reader
406
- . samples :: < f32 > ( )
407
- . map ( |s| s. map_err ( |e| candle_core:: Error :: Msg ( e. to_string ( ) ) ) )
408
- . collect :: < std:: result:: Result < _ , _ > > ( ) ?,
409
-
410
- hound:: SampleFormat :: Int => reader
411
- . samples :: < i16 > ( ) // read as integers
412
- . map ( |s| {
413
- s. map ( |v| v as f32 / i16:: MAX as f32 ) // scale to –1.0…1.0
414
- . map_err ( |e| candle_core:: Error :: Msg ( e. to_string ( ) ) )
415
- } )
416
- . collect :: < std:: result:: Result < _ , _ > > ( ) ?,
417
- } ;
418
- Ok ( ( samples, spec. sample_rate ) )
419
- }
420
-
421
399
fn extract_audio_features (
422
400
& self ,
423
401
audio_data : & [ f32 ] ,
@@ -741,14 +719,26 @@ impl Phi4MMInputsProcessor {
741
719
let has_audio = seq. get_toks ( ) . contains ( & ( AUDIO_SPECIAL_TOKEN_ID as u32 ) ) ;
742
720
743
721
if has_audio {
722
+ // Convert multi-channel audio to mono by averaging channels
744
723
let ( audio_data, sample_rate) = if let Some ( mut audios) = seq. take_audios ( ) {
745
724
if let Some ( audio) = audios. pop ( ) {
746
- ( audio. samples , audio. sample_rate )
725
+ let channels = audio. channels as usize ; // use the actual field name for channel count
726
+ let samples = if channels > 1 && audio. samples . len ( ) % channels == 0 {
727
+ audio
728
+ . samples
729
+ . chunks ( channels)
730
+ . map ( |frame| frame. iter ( ) . copied ( ) . sum :: < f32 > ( ) / channels as f32 )
731
+ . collect :: < Vec < f32 > > ( )
732
+ } else {
733
+ audio. samples
734
+ } ;
735
+
736
+ ( samples, audio. sample_rate )
747
737
} else {
748
- self . load_dummy_audio ( ) ?
738
+ candle_core :: bail! ( "No audios in `process_audio_for_sequences`" ) ;
749
739
}
750
740
} else {
751
- self . load_dummy_audio ( ) ?
741
+ candle_core :: bail! ( "No audios in `process_audio_for_sequences`" ) ;
752
742
} ;
753
743
754
744
// Extract features
0 commit comments