From 5c358a75f323b5c60e464561f2445ef905d441e5 Mon Sep 17 00:00:00 2001 From: David Reid Date: Wed, 23 Nov 2022 19:54:28 +1000 Subject: [PATCH] Some vectorization improvements to ma_gainer. --- miniaudio.h | 131 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 86 insertions(+), 45 deletions(-) diff --git a/miniaudio.h b/miniaudio.h index f04ecea9..28058fd5 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -3899,6 +3899,14 @@ typedef ma_uint16 wchar_t; #endif #endif +#ifndef MA_RESTRICT + #if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER) + #define MA_RESTRICT __restrict + #else + #define MA_RESTRICT + #endif +#endif + /* SIMD alignment in bytes. Currently set to 32 bytes in preparation for future AVX optimizations. */ #define MA_SIMD_ALIGNMENT 32 @@ -11520,7 +11528,8 @@ static MA_INLINE ma_bool32 ma_has_neon(void) #define MA_SIMD_NEON 3 #ifndef MA_PREFERRED_SIMD - # if defined(MA_SUPPORT_SSE2) && defined(MA_PREFER_SSE2) + /* Prefer SSE2 over AVX2 if AVX2 has not bee explicitly requested. */ + # if defined(MA_SUPPORT_SSE2) && (defined(MA_PREFER_SSE2) || !defined(MA_PREFER_AVX2)) #define MA_PREFERRED_SIMD MA_SIMD_SSE2 #elif defined(MA_SUPPORT_AVX2) && defined(MA_PREFER_AVX2) #define MA_PREFERRED_SIMD MA_SIMD_AVX2 @@ -11549,14 +11558,6 @@ static MA_INLINE ma_bool32 ma_has_neon(void) #endif #endif -#ifndef MA_RESTRICT - #if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER) - #define MA_RESTRICT __restrict - #else - #define MA_RESTRICT - #endif -#endif - #if defined(_MSC_VER) && _MSC_VER >= 1400 #define MA_HAS_BYTESWAP16_INTRINSIC #define MA_HAS_BYTESWAP32_INTRINSIC @@ -48209,27 +48210,44 @@ MA_API ma_result ma_gainer_process_pcm_frames(ma_gainer* pGainer, void* pFramesO } iFrame = unrolledLoopCount << 1; - } else if (pGainer->config.channels == 8) { + } + else if (pGainer->config.channels == 8) { /* For 8 channels we can just go over frame by frame and do all eight channels as 2 separate 4x SIMD operations. */ - for (; iFrame < interpolatedFrameCount; iFrame += 1) { - pFramesOutF32[iFrame*8 + 0] = pFramesInF32[iFrame*8 + 0] * pRunningGain[0]; - pFramesOutF32[iFrame*8 + 1] = pFramesInF32[iFrame*8 + 1] * pRunningGain[1]; - pFramesOutF32[iFrame*8 + 2] = pFramesInF32[iFrame*8 + 2] * pRunningGain[2]; - pFramesOutF32[iFrame*8 + 3] = pFramesInF32[iFrame*8 + 3] * pRunningGain[3]; - pFramesOutF32[iFrame*8 + 4] = pFramesInF32[iFrame*8 + 4] * pRunningGain[4]; - pFramesOutF32[iFrame*8 + 5] = pFramesInF32[iFrame*8 + 5] * pRunningGain[5]; - pFramesOutF32[iFrame*8 + 6] = pFramesInF32[iFrame*8 + 6] * pRunningGain[6]; - pFramesOutF32[iFrame*8 + 7] = pFramesInF32[iFrame*8 + 7] * pRunningGain[7]; + #if defined(MA_SUPPORT_SSE2) + if (ma_has_sse2()) { + __m128 runningGain0 = _mm_loadu_ps(&pRunningGain[0]); + __m128 runningGain1 = _mm_loadu_ps(&pRunningGain[4]); + __m128 runningGainDelta0 = _mm_loadu_ps(&pRunningGainDelta[0]); + __m128 runningGainDelta1 = _mm_loadu_ps(&pRunningGainDelta[4]); - /* Move the running gain forward towards the new gain. */ - pRunningGain[0] += pRunningGainDelta[0]; - pRunningGain[1] += pRunningGainDelta[1]; - pRunningGain[2] += pRunningGainDelta[2]; - pRunningGain[3] += pRunningGainDelta[3]; - pRunningGain[4] += pRunningGainDelta[4]; - pRunningGain[5] += pRunningGainDelta[5]; - pRunningGain[6] += pRunningGainDelta[6]; - pRunningGain[7] += pRunningGainDelta[7]; + for (; iFrame < interpolatedFrameCount; iFrame += 1) { + _mm_storeu_ps(&pFramesOutF32[iFrame*8 + 0], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*8 + 0]), runningGain0)); + _mm_storeu_ps(&pFramesOutF32[iFrame*8 + 4], _mm_mul_ps(_mm_loadu_ps(&pFramesInF32[iFrame*8 + 4]), runningGain1)); + + runningGain0 = _mm_add_ps(runningGain0, runningGainDelta0); + runningGain1 = _mm_add_ps(runningGain1, runningGainDelta1); + } + } + else + #endif + { + /* This is crafted so that it auto-vectorizes when compiled with Clang. */ + for (; iFrame < interpolatedFrameCount; iFrame += 1) { + /* This temp buffer is required to allow Clang to generate efficient auto-vectorized code. */ + float temp[8]; + for (iChannel = 0; iChannel < 8; iChannel += 1) { + temp[iChannel] = pFramesInF32[iFrame*8 + iChannel]; + } + + for (iChannel = 0; iChannel < 8; iChannel += 1) { + pFramesOutF32[iFrame*8 + iChannel] = temp[iChannel] * pRunningGain[iChannel]; + } + + /* Move the running gain forward towards the new gain. */ + for (iChannel = 0; iChannel < 8; iChannel += 1) { + pRunningGain[iChannel] += pRunningGainDelta[iChannel]; + } + } } } @@ -52165,15 +52183,14 @@ static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChann accumulation[6] += pFramesIn[iFrame*2 + 1] * weights[6][1]; accumulation[7] += pFramesIn[iFrame*2 + 1] * weights[7][1]; - - pFramesOut[iFrame * 8 + 0] = accumulation[0]; - pFramesOut[iFrame * 8 + 1] = accumulation[1]; - pFramesOut[iFrame * 8 + 2] = accumulation[2]; - pFramesOut[iFrame * 8 + 3] = accumulation[3]; - pFramesOut[iFrame * 8 + 4] = accumulation[4]; - pFramesOut[iFrame * 8 + 5] = accumulation[5]; - pFramesOut[iFrame * 8 + 6] = accumulation[6]; - pFramesOut[iFrame * 8 + 7] = accumulation[7]; + pFramesOut[iFrame*8 + 0] = accumulation[0]; + pFramesOut[iFrame*8 + 1] = accumulation[1]; + pFramesOut[iFrame*8 + 2] = accumulation[2]; + pFramesOut[iFrame*8 + 3] = accumulation[3]; + pFramesOut[iFrame*8 + 4] = accumulation[4]; + pFramesOut[iFrame*8 + 5] = accumulation[5]; + pFramesOut[iFrame*8 + 6] = accumulation[6]; + pFramesOut[iFrame*8 + 7] = accumulation[7]; } } else { /* When outputting to 8 channels, we can do everything in groups of two 4x SIMD operations. */ @@ -52191,16 +52208,40 @@ static void ma_channel_map_apply_f32(float* pFramesOut, const ma_channel* pChann accumulation[7] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[7][iChannelIn]; } - pFramesOut[iFrame * 8 + 0] = accumulation[0]; - pFramesOut[iFrame * 8 + 1] = accumulation[1]; - pFramesOut[iFrame * 8 + 2] = accumulation[2]; - pFramesOut[iFrame * 8 + 3] = accumulation[3]; - pFramesOut[iFrame * 8 + 4] = accumulation[4]; - pFramesOut[iFrame * 8 + 5] = accumulation[5]; - pFramesOut[iFrame * 8 + 6] = accumulation[6]; - pFramesOut[iFrame * 8 + 7] = accumulation[7]; + pFramesOut[iFrame*8 + 0] = accumulation[0]; + pFramesOut[iFrame*8 + 1] = accumulation[1]; + pFramesOut[iFrame*8 + 2] = accumulation[2]; + pFramesOut[iFrame*8 + 3] = accumulation[3]; + pFramesOut[iFrame*8 + 4] = accumulation[4]; + pFramesOut[iFrame*8 + 5] = accumulation[5]; + pFramesOut[iFrame*8 + 6] = accumulation[6]; + pFramesOut[iFrame*8 + 7] = accumulation[7]; } } + } else if (channelsOut == 6) { + /* + When outputting to 6 channels we unfortunately don't have a nice multiple of 4 to do 4x SIMD operations. Instead we'll + expand our weights and do two frames at a time. + */ + for (; iFrame < frameCount; iFrame += 1) { + float accumulation[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + for (iChannelIn = 0; iChannelIn < channelsIn; iChannelIn += 1) { + accumulation[0] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[0][iChannelIn]; + accumulation[1] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[1][iChannelIn]; + accumulation[2] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[2][iChannelIn]; + accumulation[3] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[3][iChannelIn]; + accumulation[4] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[4][iChannelIn]; + accumulation[5] += pFramesIn[iFrame*channelsIn + iChannelIn] * weights[5][iChannelIn]; + } + + pFramesOut[iFrame*6 + 0] = accumulation[0]; + pFramesOut[iFrame*6 + 1] = accumulation[1]; + pFramesOut[iFrame*6 + 2] = accumulation[2]; + pFramesOut[iFrame*6 + 3] = accumulation[3]; + pFramesOut[iFrame*6 + 4] = accumulation[4]; + pFramesOut[iFrame*6 + 5] = accumulation[5]; + } } /* Leftover frames. */