Skip to content

Commit b81ca71

Browse files
committed
Support stereo
1 parent e546c5b commit b81ca71

File tree

2 files changed

+23
-25
lines changed

2 files changed

+23
-25
lines changed

mistralrs-core/src/request.rs

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ pub struct WebSearchOptions {
124124
pub struct AudioInput {
125125
pub samples: Vec<f32>,
126126
pub sample_rate: u32,
127+
pub channels: u16,
127128
}
128129

129130
impl AudioInput {
@@ -150,6 +151,7 @@ impl AudioInput {
150151
Ok(Self {
151152
samples,
152153
sample_rate: spec.sample_rate,
154+
channels: spec.channels,
153155
})
154156
}
155157

@@ -180,6 +182,11 @@ impl AudioInput {
180182
let sample_rate = codec_params
181183
.sample_rate
182184
.ok_or_else(|| anyhow::anyhow!("unknown sample rate"))?;
185+
#[allow(clippy::cast_possible_truncation)]
186+
let channels = codec_params
187+
.channels
188+
.map(|channels| channels.count() as u16)
189+
.unwrap_or(1);
183190

184191
let mut decoder =
185192
symphonia::default::get_codecs().make(codec_params, &DecoderOptions::default())?;
@@ -206,6 +213,7 @@ impl AudioInput {
206213
Ok(Self {
207214
samples,
208215
sample_rate,
216+
channels,
209217
})
210218
}
211219
}

mistralrs-core/src/vision_models/phi4/inputs_processor.rs

Lines changed: 15 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -396,28 +396,6 @@ impl InputsProcessor for Phi4MMInputsProcessor {
396396
}
397397

398398
impl Phi4MMInputsProcessor {
399-
fn load_dummy_audio(&self) -> Result<(Vec<f32>, u32)> {
400-
let mut reader = hound::WavReader::open("dummy_audio.wav")
401-
.map_err(|e| candle_core::Error::Msg(format!("Failed to load audio: {}", e)))?;
402-
let spec = reader.spec();
403-
404-
let samples: Vec<f32> = match spec.sample_format {
405-
hound::SampleFormat::Float => reader
406-
.samples::<f32>()
407-
.map(|s| s.map_err(|e| candle_core::Error::Msg(e.to_string())))
408-
.collect::<std::result::Result<_, _>>()?,
409-
410-
hound::SampleFormat::Int => reader
411-
.samples::<i16>() // read as integers
412-
.map(|s| {
413-
s.map(|v| v as f32 / i16::MAX as f32) // scale to –1.0…1.0
414-
.map_err(|e| candle_core::Error::Msg(e.to_string()))
415-
})
416-
.collect::<std::result::Result<_, _>>()?,
417-
};
418-
Ok((samples, spec.sample_rate))
419-
}
420-
421399
fn extract_audio_features(
422400
&self,
423401
audio_data: &[f32],
@@ -741,14 +719,26 @@ impl Phi4MMInputsProcessor {
741719
let has_audio = seq.get_toks().contains(&(AUDIO_SPECIAL_TOKEN_ID as u32));
742720

743721
if has_audio {
722+
// Convert multi-channel audio to mono by averaging channels
744723
let (audio_data, sample_rate) = if let Some(mut audios) = seq.take_audios() {
745724
if let Some(audio) = audios.pop() {
746-
(audio.samples, audio.sample_rate)
725+
let channels = audio.channels as usize; // use the actual field name for channel count
726+
let samples = if channels > 1 && audio.samples.len() % channels == 0 {
727+
audio
728+
.samples
729+
.chunks(channels)
730+
.map(|frame| frame.iter().copied().sum::<f32>() / channels as f32)
731+
.collect::<Vec<f32>>()
732+
} else {
733+
audio.samples
734+
};
735+
736+
(samples, audio.sample_rate)
747737
} else {
748-
self.load_dummy_audio()?
738+
candle_core::bail!("No audios in `process_audio_for_sequences`");
749739
}
750740
} else {
751-
self.load_dummy_audio()?
741+
candle_core::bail!("No audios in `process_audio_for_sequences`");
752742
};
753743

754744
// Extract features

0 commit comments

Comments
 (0)