Skip to content

Commit 0a38c08

Browse files
committed
AudioStream: Improve >100% volume output quality
1 parent f0a4ceb commit 0a38c08

File tree

1 file changed

+51
-5
lines changed

1 file changed

+51
-5
lines changed

src/util/audio_stream.cpp

Lines changed: 51 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,8 @@ std::vector<std::pair<std::string, std::string>> AudioStream::GetDriverNames(Aud
8888
return ret;
8989
}
9090

91-
std::vector<AudioStream::DeviceInfo> AudioStream::GetOutputDevices(AudioBackend backend, const char* driver, u32 sample_rate)
91+
std::vector<AudioStream::DeviceInfo> AudioStream::GetOutputDevices(AudioBackend backend, const char* driver,
92+
u32 sample_rate)
9293
{
9394
std::vector<AudioStream::DeviceInfo> ret;
9495
switch (backend)
@@ -367,12 +368,57 @@ void AudioStream::ReadFrames(SampleType* samples, u32 num_frames)
367368

368369
if (m_volume != 100)
369370
{
370-
const s32 volume_mult = static_cast<s32>((static_cast<float>(m_volume) / 100.0f) * 32768.0f);
371-
372371
u32 num_samples = num_frames * m_output_channels;
372+
#if defined(CPU_ARCH_SSE)
373+
const u32 aligned_samples = Common::AlignDownPow2(num_samples, 8);
374+
num_samples -= aligned_samples;
375+
376+
const __m128 volume_multv = _mm_set1_ps(m_volume / 100.0f);
377+
const SampleType* const aligned_samples_end = samples + aligned_samples;
378+
for (; samples != aligned_samples_end; samples += 8)
379+
{
380+
__m128i iv = _mm_loadu_si128(reinterpret_cast<const __m128i*>(samples));
381+
__m128i iv1 = _mm_unpacklo_epi16(iv, iv); // [0, 0, 1, 1, 2, 2, 3, 3]
382+
__m128i iv2 = _mm_unpackhi_epi16(iv, iv); // [4, 4, 5, 5, 6, 6, 7, 7]
383+
iv1 = _mm_srai_epi32(iv1, 16); // [0, 1, 2, 3]
384+
iv2 = _mm_srai_epi32(iv2, 16); // [4, 5, 6, 7]
385+
__m128 fv1 = _mm_cvtepi32_ps(iv1); // [f0, f1, f2, f3]
386+
__m128 fv2 = _mm_cvtepi32_ps(iv2); // [f4, f5, f6, f7]
387+
fv1 = _mm_mul_ps(fv1, volume_multv); // [f0, f1, f2, f3]
388+
fv2 = _mm_mul_ps(fv2, volume_multv); // [f4, f5, f6, f7]
389+
iv1 = _mm_cvtps_epi32(fv1); // [0, 1, 2, 3]
390+
iv2 = _mm_cvtps_epi32(fv2); // [4, 5, 6, 7]
391+
iv = _mm_packs_epi32(iv1, iv2); // [0, 1, 2, 3, 4, 5, 6, 7]
392+
_mm_storeu_si128(reinterpret_cast<__m128i*>(samples), iv);
393+
}
394+
#elif defined(CPU_ARCH_NEON)
395+
const u32 aligned_samples = Common::AlignDownPow2(num_samples, 8);
396+
num_samples -= aligned_samples;
397+
398+
const float32x4_t volume_multv = vdupq_n_f32(m_volume / 100.0f);
399+
const SampleType* const aligned_samples_end = samples + aligned_samples;
400+
for (; samples != aligned_samples_end; samples += 8)
401+
{
402+
int16x8_t iv = vld1q_s16(samples);
403+
int32x4_t iv1 = vreinterpretq_s32_s16(vzip1q_s16(iv, iv)); // [0, 0, 1, 1, 2, 2, 3, 3]
404+
int32x4_t iv2 = vreinterpretq_s32_s16(vzip2q_s16(iv, iv)); // [4, 4, 5, 5, 6, 6, 7, 7]
405+
iv1 = vshrq_n_s32(iv1, 16); // [0, 1, 2, 3]
406+
iv2 = vshrq_n_s32(iv2, 16); // [4, 5, 6, 7]
407+
float32x4_t fv1 = vcvtq_f32_s32(iv1); // [f0, f1, f2, f3]
408+
float32x4_t fv2 = vcvtq_f32_s32(iv2); // [f4, f5, f6, f7]
409+
fv1 = vmulq_f32(fv1, volume_multv); // [f0, f1, f2, f3]
410+
fv2 = vmulq_f32(fv2, volume_multv); // [f4, f5, f6, f7]
411+
iv1 = vcvtq_s32_f32(fv1); // [0, 1, 2, 3]
412+
iv2 = vcvtq_s32_f32(fv2); // [4, 5, 6, 7]
413+
iv = vcombine_s16(vqmovn_s32(iv1), vqmovn_s32(iv2)); // [0, 1, 2, 3, 4, 5, 6, 7]
414+
vst1q_s16(samples, iv);
415+
}
416+
#endif
417+
418+
const float volume_mult = static_cast<float>(m_volume) / 100.0f;
373419
while (num_samples > 0)
374420
{
375-
*samples = static_cast<s16>((static_cast<s32>(*samples) * volume_mult) >> 15);
421+
*samples = static_cast<s16>(std::clamp(static_cast<float>(*samples) * volume_mult, -32768.0f, 32767.0f));
376422
samples++;
377423
num_samples--;
378424
}
@@ -572,7 +618,7 @@ static void S16ChunkToFloat(const s16* src, float* dst, u32 num_samples)
572618
const u32 iterations = (num_samples + 7) / 8;
573619
for (u32 i = 0; i < iterations; i++)
574620
{
575-
const int16x8_t sv = vreinterpretq_s16_s32(vld1q_s16(src));
621+
const int16x8_t sv = vld1q_s16(src);
576622
src += 8;
577623

578624
int32x4_t iv1 = vreinterpretq_s32_s16(vzip1q_s16(sv, sv)); // [0, 0, 1, 1, 2, 2, 3, 3]

0 commit comments

Comments
 (0)