@@ -88,7 +88,8 @@ std::vector<std::pair<std::string, std::string>> AudioStream::GetDriverNames(Aud
88
88
return ret;
89
89
}
90
90
91
- std::vector<AudioStream::DeviceInfo> AudioStream::GetOutputDevices (AudioBackend backend, const char * driver, u32 sample_rate)
91
+ std::vector<AudioStream::DeviceInfo> AudioStream::GetOutputDevices (AudioBackend backend, const char * driver,
92
+ u32 sample_rate)
92
93
{
93
94
std::vector<AudioStream::DeviceInfo> ret;
94
95
switch (backend)
@@ -367,12 +368,57 @@ void AudioStream::ReadFrames(SampleType* samples, u32 num_frames)
367
368
368
369
if (m_volume != 100 )
369
370
{
370
- const s32 volume_mult = static_cast <s32>((static_cast <float >(m_volume) / 100 .0f ) * 32768 .0f );
371
-
372
371
u32 num_samples = num_frames * m_output_channels;
372
+ #if defined(CPU_ARCH_SSE)
373
+ const u32 aligned_samples = Common::AlignDownPow2 (num_samples, 8 );
374
+ num_samples -= aligned_samples;
375
+
376
+ const __m128 volume_multv = _mm_set1_ps (m_volume / 100 .0f );
377
+ const SampleType* const aligned_samples_end = samples + aligned_samples;
378
+ for (; samples != aligned_samples_end; samples += 8 )
379
+ {
380
+ __m128i iv = _mm_loadu_si128 (reinterpret_cast <const __m128i*>(samples));
381
+ __m128i iv1 = _mm_unpacklo_epi16 (iv, iv); // [0, 0, 1, 1, 2, 2, 3, 3]
382
+ __m128i iv2 = _mm_unpackhi_epi16 (iv, iv); // [4, 4, 5, 5, 6, 6, 7, 7]
383
+ iv1 = _mm_srai_epi32 (iv1, 16 ); // [0, 1, 2, 3]
384
+ iv2 = _mm_srai_epi32 (iv2, 16 ); // [4, 5, 6, 7]
385
+ __m128 fv1 = _mm_cvtepi32_ps (iv1); // [f0, f1, f2, f3]
386
+ __m128 fv2 = _mm_cvtepi32_ps (iv2); // [f4, f5, f6, f7]
387
+ fv1 = _mm_mul_ps (fv1, volume_multv); // [f0, f1, f2, f3]
388
+ fv2 = _mm_mul_ps (fv2, volume_multv); // [f4, f5, f6, f7]
389
+ iv1 = _mm_cvtps_epi32 (fv1); // [0, 1, 2, 3]
390
+ iv2 = _mm_cvtps_epi32 (fv2); // [4, 5, 6, 7]
391
+ iv = _mm_packs_epi32 (iv1, iv2); // [0, 1, 2, 3, 4, 5, 6, 7]
392
+ _mm_storeu_si128 (reinterpret_cast <__m128i*>(samples), iv);
393
+ }
394
+ #elif defined(CPU_ARCH_NEON)
395
+ const u32 aligned_samples = Common::AlignDownPow2 (num_samples, 8 );
396
+ num_samples -= aligned_samples;
397
+
398
+ const float32x4_t volume_multv = vdupq_n_f32 (m_volume / 100 .0f );
399
+ const SampleType* const aligned_samples_end = samples + aligned_samples;
400
+ for (; samples != aligned_samples_end; samples += 8 )
401
+ {
402
+ int16x8_t iv = vld1q_s16 (samples);
403
+ int32x4_t iv1 = vreinterpretq_s32_s16 (vzip1q_s16 (iv, iv)); // [0, 0, 1, 1, 2, 2, 3, 3]
404
+ int32x4_t iv2 = vreinterpretq_s32_s16 (vzip2q_s16 (iv, iv)); // [4, 4, 5, 5, 6, 6, 7, 7]
405
+ iv1 = vshrq_n_s32 (iv1, 16 ); // [0, 1, 2, 3]
406
+ iv2 = vshrq_n_s32 (iv2, 16 ); // [4, 5, 6, 7]
407
+ float32x4_t fv1 = vcvtq_f32_s32 (iv1); // [f0, f1, f2, f3]
408
+ float32x4_t fv2 = vcvtq_f32_s32 (iv2); // [f4, f5, f6, f7]
409
+ fv1 = vmulq_f32 (fv1, volume_multv); // [f0, f1, f2, f3]
410
+ fv2 = vmulq_f32 (fv2, volume_multv); // [f4, f5, f6, f7]
411
+ iv1 = vcvtq_s32_f32 (fv1); // [0, 1, 2, 3]
412
+ iv2 = vcvtq_s32_f32 (fv2); // [4, 5, 6, 7]
413
+ iv = vcombine_s16 (vqmovn_s32 (iv1), vqmovn_s32 (iv2)); // [0, 1, 2, 3, 4, 5, 6, 7]
414
+ vst1q_s16 (samples, iv);
415
+ }
416
+ #endif
417
+
418
+ const float volume_mult = static_cast <float >(m_volume) / 100 .0f ;
373
419
while (num_samples > 0 )
374
420
{
375
- *samples = static_cast <s16>((static_cast <s32 >(*samples) * volume_mult) >> 15 );
421
+ *samples = static_cast <s16>(std::clamp (static_cast <float >(*samples) * volume_mult, - 32768 . 0f , 32767 . 0f ) );
376
422
samples++;
377
423
num_samples--;
378
424
}
@@ -572,7 +618,7 @@ static void S16ChunkToFloat(const s16* src, float* dst, u32 num_samples)
572
618
const u32 iterations = (num_samples + 7 ) / 8 ;
573
619
for (u32 i = 0 ; i < iterations; i++)
574
620
{
575
- const int16x8_t sv = vreinterpretq_s16_s32 ( vld1q_s16 (src) );
621
+ const int16x8_t sv = vld1q_s16 (src);
576
622
src += 8 ;
577
623
578
624
int32x4_t iv1 = vreinterpretq_s32_s16 (vzip1q_s16 (sv, sv)); // [0, 0, 1, 1, 2, 2, 3, 3]
0 commit comments