From 22d7b7403aa64d2fb55c602ba8343cc5966dc93e Mon Sep 17 00:00:00 2001 From: David Reid Date: Sat, 26 May 2018 16:29:14 +1000 Subject: [PATCH] Initial work on SSE2 optimizations for sample rate conversion. --- mini_al.h | 143 ++++++++++++++++++-- tests/mal_profiling.c | 308 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 430 insertions(+), 21 deletions(-) diff --git a/mini_al.h b/mini_al.h index 7ec0a405..d1f45220 100644 --- a/mini_al.h +++ b/mini_al.h @@ -3211,9 +3211,20 @@ static MAL_INLINE float mal_mix_f32(float x, float y, float a) } static MAL_INLINE float mal_mix_f32_fast(float x, float y, float a) { - return x + (y - x)*a; + float r0 = (y - x); + float r1 = r0*a; + return x + r1; + //return x + (y - x)*a; } +#if defined(MAL_SUPPORT_SSE2) +static MAL_INLINE __m128 mal_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a) +{ + return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a)); +} +#endif + + static MAL_INLINE double mal_mix_f64(double x, double y, double a) { return x*(1-a) + y*a; @@ -3384,7 +3395,7 @@ void mal_timer_init(mal_timer* pTimer) LARGE_INTEGER counter; QueryPerformanceCounter(&counter); - pTimer->counter = (mal_uint64)counter.QuadPart; + pTimer->counter = counter.QuadPart; } double mal_timer_get_time_in_seconds(mal_timer* pTimer) @@ -3394,7 +3405,7 @@ double mal_timer_get_time_in_seconds(mal_timer* pTimer) return 0; } - return (counter.QuadPart - pTimer->counter) / (double)g_mal_TimerFrequency.QuadPart; + return (double)(counter.QuadPart - pTimer->counter) / g_mal_TimerFrequency.QuadPart; } #elif defined(MAL_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200) uint64_t g_mal_TimerFrequency = 0; @@ -19677,7 +19688,7 @@ void mal_src__build_sinc_table__sinc(mal_src* pSRC) mal_assert(pSRC != NULL); pSRC->sinc.table[0] = 1.0f; - for (int i = 1; i < mal_countof(pSRC->sinc.table); i += 1) { + for (mal_uint32 i = 1; i < mal_countof(pSRC->sinc.table); i += 1) { double x = i*MAL_PI_D / MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION; pSRC->sinc.table[i] = (float)(sin(x)/x); } @@ -19693,7 +19704,7 @@ void mal_src__build_sinc_table__hann(mal_src* pSRC) { mal_src__build_sinc_table__sinc(pSRC); - for (int i = 0; i < mal_countof(pSRC->sinc.table); i += 1) { + for (mal_uint32 i = 0; i < mal_countof(pSRC->sinc.table); i += 1) { double x = pSRC->sinc.table[i]; double N = MAL_SRC_SINC_MAX_WINDOW_WIDTH*2; double n = ((double)(i) / MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION) + MAL_SRC_SINC_MAX_WINDOW_WIDTH; @@ -20070,7 +20081,7 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC, float xabs = (float)fabs(x); if (xabs >= MAL_SRC_SINC_MAX_WINDOW_WIDTH /*pSRC->config.sinc.windowWidth*/) { - return 0; + xabs = 1; // <-- A non-zero integer will always return 0. } xabs = xabs * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION; @@ -20084,6 +20095,60 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC, #endif } +#if defined(MAL_SUPPORT_SSE2) +static MAL_INLINE __m128 mal_fabsf_sse2(__m128 x) +{ + static MAL_ALIGN(16) mal_uint32 mask[4] = { + 0x7FFFFFFF, + 0x7FFFFFFF, + 0x7FFFFFFF, + 0x7FFFFFFF + }; + + return _mm_and_ps(*(__m128*)mask, x); +} + +static MAL_INLINE __m128 mal_truncf_sse2(__m128 x) +{ + return _mm_cvtepi32_ps(_mm_cvttps_epi32(x)); +} + +static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* pSRC, __m128* x) +{ + __m128 windowWidth128 = _mm_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH); + __m128 resolution128 = _mm_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION); + __m128 one = _mm_set1_ps(1); + + __m128 xabs = mal_fabsf_sse2(*x); + + // if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs; + __m128 xcmp = _mm_cmp_ps(windowWidth128, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps. + xabs = _mm_or_ps(_mm_and_ps(one, xcmp), _mm_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs; + + xabs = _mm_mul_ps(xabs, resolution128); + __m128i ixabs = _mm_cvttps_epi32(xabs); + + __m128 lo = _mm_set_ps( + pSRC->sinc.table[((int*)&ixabs)[3]], + pSRC->sinc.table[((int*)&ixabs)[2]], + pSRC->sinc.table[((int*)&ixabs)[1]], + pSRC->sinc.table[((int*)&ixabs)[0]] + ); + + __m128 hi = _mm_set_ps( + pSRC->sinc.table[((int*)&ixabs)[3]+1], + pSRC->sinc.table[((int*)&ixabs)[2]+1], + pSRC->sinc.table[((int*)&ixabs)[1]+1], + pSRC->sinc.table[((int*)&ixabs)[0]+1] + ); + + __m128 a = _mm_sub_ps(xabs, _mm_cvtepi32_ps(ixabs)); + __m128 r = mal_mix_f32_fast__sse2(lo, hi, a); + + return r; +} +#endif + mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount, void** ppSamplesOut, void* pUserData) { mal_assert(pSRC != NULL); @@ -20122,21 +20187,66 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount outputFramesToRead = maxOutputFramesToRead; } + float _windowSamplesUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT]; + float* windowSamples = (float*)(((mal_uintptr)_windowSamplesUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1)); + + float _iWindowFUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT]; + float* iWindowF = (float*)(((mal_uintptr)_iWindowFUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1)); + for (mal_int32 i = 0; i < windowWidth2; ++i) { + iWindowF[i] = (float)(i - windowWidth); + } + for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) { // Do SRC. float timeIn = timeInBeg; for (mal_uint32 iSample = 0; iSample < outputFramesToRead; iSample += 1) { - mal_int32 iTimeIn = (mal_int32)timeIn; + float sampleOut = 0; + float iTimeInF = mal_floorf(timeIn); + mal_uint32 iTimeIn = (mal_uint32)iTimeInF; + + //mal_int32 iWindowBeg = -windowWidth+1; + //mal_int32 iWindowEnd = windowWidth; + mal_int32 iWindow = 0; + + // Pre-load the window samples into an aligned buffer to begin with. Need to put these into an aligned buffer to make SIMD easier. + windowSamples[0] = 0; // <-- The first sample is always zero. + for (mal_int32 i = 1; i < windowWidth2; ++i) { + windowSamples[i] = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, i - windowWidth); + } + +#if defined(MAL_SUPPORT_SSE2) + if (pSRC->useSSE2) { + __m128 t = _mm_set1_ps((timeIn - iTimeInF)); + + mal_int32 windowWidth4 = windowWidth2 >> 2; + for (mal_int32 iWindow4 = 0; iWindow4 < windowWidth4; iWindow4 += 1) { + __m128* s = (__m128*)windowSamples + iWindow4; + __m128* w = (__m128*)iWindowF + iWindow4; + + __m128 x = _mm_sub_ps(t, *w); + __m128 a = mal_src_sinc__interpolation_factor__sse2(pSRC, &x); + __m128 r = _mm_mul_ps(*s, a); + + sampleOut += ((float*)(&r))[0]; + sampleOut += ((float*)(&r))[1]; + sampleOut += ((float*)(&r))[2]; + sampleOut += ((float*)(&r))[3]; + } + + iWindow += windowWidth4 * 4; + } +#endif + + // Non-SIMD/Reference implementation. + for (; iWindow < windowWidth2; iWindow += 1) { + float s = windowSamples[iWindow]; - float sampleOut = 0; - for (mal_int32 iWindow = -windowWidth+1; iWindow < windowWidth; iWindow += 1) { float t = (timeIn - iTimeIn); - float w = (float)(iWindow); - + float w = iWindowF[iWindow]; float a = mal_src_sinc__interpolation_factor(pSRC, (t - w)); - float s = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, iWindow); + float r = s * a; - sampleOut += s * a; + sampleOut += r; } ppNextSamplesOut[iChannel][iSample] = (float)sampleOut; @@ -21902,6 +22012,13 @@ mal_result mal_decoder_init__internal(mal_decoder_read_proc onRead, mal_decoder_ mal_assert(pConfig != NULL); mal_assert(pDecoder != NULL); + // Silence some warnings in the case that we don't have any decoder backends enabled. + (void)onRead; + (void)onSeek; + (void)pUserData; + (void)pConfig; + (void)pDecoder; + // We use trial and error to open a decoder. mal_result result = MAL_NO_BACKEND; diff --git a/tests/mal_profiling.c b/tests/mal_profiling.c index 08938c80..006d74ca 100644 --- a/tests/mal_profiling.c +++ b/tests/mal_profiling.c @@ -1,6 +1,40 @@ #define MINI_AL_IMPLEMENTATION #include "../mini_al.h" +typedef enum +{ + simd_mode_scalar = 0, + simd_mode_sse2, + simd_mode_avx, + simd_mode_avx512, + simd_mode_neon +} simd_mode; + +const char* simd_mode_to_string(simd_mode mode) +{ + switch (mode) { + case simd_mode_scalar: return "Reference"; + case simd_mode_sse2: return "SSE2"; + case simd_mode_avx: return "AVX"; + case simd_mode_avx512: return "AVX-512"; + case simd_mode_neon: return "NEON"; + } + + return "Unknown"; +} + +const char* mal_src_algorithm_to_string(mal_src_algorithm algorithm) +{ + switch (algorithm) { + case mal_src_algorithm_none: return "Passthrough"; + case mal_src_algorithm_linear: return "Linear"; + case mal_src_algorithm_sinc: return "Sinc"; + } + + return "Unknown"; +} + + float g_ChannelRouterProfilingOutputBenchmark[8][48000]; float g_ChannelRouterProfilingOutput[8][48000]; double g_ChannelRouterTime_Reference = 0; @@ -9,7 +43,7 @@ double g_ChannelRouterTime_AVX = 0; double g_ChannelRouterTime_AVX512 = 0; double g_ChannelRouterTime_NEON = 0; -mal_sine_wave sineWave; +mal_sine_wave g_sineWave; mal_bool32 channel_router_test(mal_uint32 channels, mal_uint64 frameCount, float** ppFramesA, float** ppFramesB) { @@ -32,8 +66,8 @@ mal_uint32 channel_router_on_read(mal_channel_router* pRouter, mal_uint32 frameC float** ppSamplesOutF = (float**)ppSamplesOut; for (mal_uint32 iChannel = 0; iChannel < pRouter->config.channelsIn; ++iChannel) { - mal_sine_wave_init(1/(iChannel+1), 400, 48000, &sineWave); - mal_sine_wave_read(&sineWave, frameCount, ppSamplesOutF[iChannel]); + mal_sine_wave_init(1/(iChannel+1), 400, 48000, &g_sineWave); + mal_sine_wave_read(&g_sineWave, frameCount, ppSamplesOutF[iChannel]); } return frameCount; @@ -75,7 +109,7 @@ int do_profiling__channel_routing() ppOutBenchmark[i] = (void*)g_ChannelRouterProfilingOutputBenchmark[i]; } - mal_sine_wave_init(1, 400, 48000, &sineWave); + mal_sine_wave_init(1, 400, 48000, &g_sineWave); mal_uint64 framesRead = mal_channel_router_read_deinterleaved(&router, framesToRead, ppOutBenchmark, NULL); if (framesRead != framesToRead) { printf("Channel Router: An error occurred while reading benchmark data.\n"); @@ -183,9 +217,263 @@ int do_profiling__channel_routing() printf("NEON: %.4fms (%.2f%%)\n", g_ChannelRouterTime_NEON*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_NEON*100); } - return 1; + return 0; } + +/////////////////////////////////////////////////////////////////////////////// +// +// SRC +// +/////////////////////////////////////////////////////////////////////////////// + +typedef struct +{ + float* pFrameData[MAL_MAX_CHANNELS]; + mal_uint64 frameCount; + mal_uint32 channels; + double timeTaken; +} src_reference_data; + +typedef struct +{ + float* pFrameData[MAL_MAX_CHANNELS]; + mal_uint64 frameCount; + mal_uint64 iNextFrame; + mal_uint32 channels; +} src_data; + +mal_uint32 do_profiling__src__on_read(mal_src* pSRC, mal_uint32 frameCount, void** ppSamplesOut, void* pUserData) +{ + src_data* pBaseData = (src_data*)pUserData; + mal_assert(pBaseData != NULL); + mal_assert(pBaseData->iNextFrame <= pBaseData->frameCount); + + mal_uint64 framesToRead = frameCount; + + mal_uint64 framesAvailable = pBaseData->frameCount - pBaseData->iNextFrame; + if (framesToRead > framesAvailable) { + framesToRead = framesAvailable; + } + + if (framesToRead > 0) { + for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) { + mal_copy_memory(ppSamplesOut[iChannel], pBaseData->pFrameData[iChannel], (size_t)(framesToRead * sizeof(float))); + } + } + + pBaseData->iNextFrame += framesToRead; + return (mal_uint32)framesToRead; +} + +mal_result init_src(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sampleRateOut, mal_src_algorithm algorithm, simd_mode mode, mal_src* pSRC) +{ + mal_assert(pBaseData != NULL); + mal_assert(pSRC != NULL); + + mal_src_config srcConfig = mal_src_config_init(sampleRateIn, sampleRateOut, pBaseData->channels, do_profiling__src__on_read, pBaseData); + srcConfig.sinc.windowWidth = 17; // <-- Make this an odd number to test unaligned section in the SIMD implementations. + srcConfig.algorithm = algorithm; + srcConfig.noSSE2 = MAL_TRUE; + srcConfig.noAVX = MAL_TRUE; + srcConfig.noAVX512 = MAL_TRUE; + srcConfig.noNEON = MAL_TRUE; + switch (mode) { + case simd_mode_sse2: srcConfig.noSSE2 = MAL_FALSE; break; + case simd_mode_avx: srcConfig.noAVX = MAL_FALSE; break; + case simd_mode_avx512: srcConfig.noAVX512 = MAL_FALSE; break; + case simd_mode_neon: srcConfig.noNEON = MAL_FALSE; break; + case simd_mode_scalar: + default: break; + } + + mal_result result = mal_src_init(&srcConfig, pSRC); + if (result != MAL_SUCCESS) { + printf("Failed to initialize sample rate converter.\n"); + return (int)result; + } + + return result; +} + +int do_profiling__src__profile_individual(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sampleRateOut, mal_src_algorithm algorithm, simd_mode mode, src_reference_data* pReferenceData) +{ + mal_assert(pBaseData != NULL); + mal_assert(pReferenceData != NULL); + + mal_result result = MAL_ERROR; + + // Make sure the base data is moved back to the start. + pBaseData->iNextFrame = 0; + + mal_src src; + result = init_src(pBaseData, sampleRateIn, sampleRateOut, algorithm, mode, &src); + if (result != MAL_SUCCESS) { + return (int)result; + } + + + // Profiling. + mal_uint64 sz = pReferenceData->frameCount * sizeof(float); + mal_assert(sz <= SIZE_MAX); + + float* pFrameData[MAL_MAX_CHANNELS]; + for (mal_uint32 iChannel = 0; iChannel < pBaseData->channels; iChannel += 1) { + pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)sz, MAL_SIMD_ALIGNMENT); + if (pFrameData[iChannel] == NULL) { + printf("Out of memory.\n"); + return -2; + } + mal_zero_memory(pFrameData[iChannel], (size_t)sz); + } + + mal_timer timer; + mal_timer_init(&timer); + double startTime = mal_timer_get_time_in_seconds(&timer); + { + mal_src_read_deinterleaved(&src, pReferenceData->frameCount, (void**)pFrameData, pBaseData); + } + double timeTaken = mal_timer_get_time_in_seconds(&timer) - startTime; + + + // Correctness test. + mal_bool32 passed = MAL_TRUE; + for (mal_uint32 iChannel = 0; iChannel < pReferenceData->channels; iChannel += 1) { + for (mal_uint32 iFrame = 0; iFrame < pReferenceData->frameCount; iFrame += 1) { + float s0 = pReferenceData->pFrameData[iChannel][iFrame]; + float s1 = pFrameData[iChannel][iFrame]; + if (s0 != s1) { + printf("(Channel %d, Sample %d) %f != %f\n", iChannel, iFrame, s0, s1); + passed = MAL_FALSE; + } + } + } + + + // Print results. + if (passed) { + printf(" [PASSED] "); + } else { + printf(" [FAILED] "); + } + printf("%s %d -> %d (%s): %.4fms (%.2f%%)\n", mal_src_algorithm_to_string(algorithm), sampleRateIn, sampleRateOut, simd_mode_to_string(mode), timeTaken*1000, pReferenceData->timeTaken/timeTaken*100); + + + for (mal_uint32 iChannel = 0; iChannel < pBaseData->channels; iChannel += 1) { + mal_aligned_free(pFrameData[iChannel]); + } + + return (int)result; +} + +int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sampleRateOut, mal_src_algorithm algorithm) +{ + mal_assert(pBaseData != NULL); + + // Make sure the base data is back at the start. + pBaseData->iNextFrame = 0; + + src_reference_data referenceData; + mal_zero_object(&referenceData); + referenceData.channels = pBaseData->channels; + + // The first thing to do is to perform a sample rate conversion using the scalar/reference implementation. This reference is used to compare + // the results of the optimized implementation. + referenceData.frameCount = mal_calculate_frame_count_after_src(sampleRateOut, sampleRateIn, pBaseData->frameCount); + if (referenceData.frameCount == 0) { + printf("Failed to calculate output frame count.\n"); + return -1; + } + + mal_uint64 sz = referenceData.frameCount * sizeof(float); + mal_assert(sz <= SIZE_MAX); + + for (mal_uint32 iChannel = 0; iChannel < referenceData.channels; iChannel += 1) { + referenceData.pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)sz, MAL_SIMD_ALIGNMENT); + if (referenceData.pFrameData[iChannel] == NULL) { + printf("Out of memory.\n"); + return -2; + } + mal_zero_memory(referenceData.pFrameData[iChannel], (size_t)sz); + } + + + // Generate the reference data. + mal_src src; + mal_result result = init_src(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_scalar, &src); + if (result != MAL_SUCCESS) { + return (int)result; + } + + mal_timer timer; + mal_timer_init(&timer); + double startTime = mal_timer_get_time_in_seconds(&timer); + { + mal_src_read_deinterleaved(&src, referenceData.frameCount, (void**)referenceData.pFrameData, pBaseData); + } + referenceData.timeTaken = mal_timer_get_time_in_seconds(&timer) - startTime; + + + // Now that we have the reference data to compare against we can go ahead and measure the SIMD optimizations. + if (mal_has_sse2()) { + do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData); + } + if (mal_has_avx()) { + do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx, &referenceData); + } + if (mal_has_avx512f()) { + do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx512, &referenceData); + } + if (mal_has_neon()) { + do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_neon, &referenceData); + } + + + for (mal_uint32 iChannel = 0; iChannel < referenceData.channels; iChannel += 1) { + mal_aligned_free(referenceData.pFrameData[iChannel]); + } + + return 0; +} + +int do_profiling__src() +{ + printf("Sample Rate Conversion\n"); + printf("======================\n"); + + // Set up base data. + src_data baseData; + mal_zero_object(&baseData); + baseData.channels = 8; + baseData.frameCount = 10000; + for (mal_uint32 iChannel = 0; iChannel < baseData.channels; ++iChannel) { + baseData.pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)(baseData.frameCount * sizeof(float)), MAL_SIMD_ALIGNMENT); + if (baseData.pFrameData[iChannel] == NULL) { + printf("Out of memory.\n"); + return -1; + } + + mal_sine_wave sineWave; + mal_sine_wave_init(1.0f, 400 + (iChannel*50), 48000, &sineWave); + mal_sine_wave_read(&sineWave, baseData.frameCount, baseData.pFrameData[iChannel]); + } + + + // Upsampling. + do_profiling__src__profile_set(&baseData, 44100, 48000, mal_src_algorithm_sinc); + + // Downsampling. + do_profiling__src__profile_set(&baseData, 48000, 44100, mal_src_algorithm_sinc); + + + for (mal_uint32 iChannel = 0; iChannel < baseData.channels; iChannel += 1) { + mal_aligned_free(baseData.pFrameData[iChannel]); + } + + return 0; +} + + int main(int argc, char** argv) { (void)argc; @@ -197,19 +485,16 @@ int main(int argc, char** argv) } else { printf("Has SSE: NO\n"); } - if (mal_has_avx()) { printf("Has AVX: YES\n"); } else { printf("Has AVX: NO\n"); } - if (mal_has_avx512f()) { printf("Has AVX-512F: YES\n"); } else { printf("Has AVX-512F: NO\n"); } - if (mal_has_neon()) { printf("Has NEON: YES\n"); } else { @@ -221,7 +506,14 @@ int main(int argc, char** argv) // Channel routing. do_profiling__channel_routing(); + printf("\n\n"); + // Sample rate conversion. + do_profiling__src(); + printf("\n\n"); + + + printf("Press any key to quit...\n"); getchar(); return 0;