From dfe27a64a81b605df8ab83b5bef25c9a17722512 Mon Sep 17 00:00:00 2001 From: David Reid Date: Sun, 27 May 2018 08:56:43 +1000 Subject: [PATCH] Experimental work on AVX. As of this commit there's no significant benefit. --- mini_al.h | 223 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 194 insertions(+), 29 deletions(-) diff --git a/mini_al.h b/mini_al.h index 4c938441..be57ef1a 100644 --- a/mini_al.h +++ b/mini_al.h @@ -926,7 +926,7 @@ MAL_ALIGNED_STRUCT(MAL_SIMD_ALIGNMENT) mal_src float timeIn; mal_uint32 inputFrameCount; // The number of frames sitting in the input buffer, not including the first half of the window. mal_uint32 windowPosInSamples; // An offset of . - float table[MAL_SRC_SINC_MAX_WINDOW_WIDTH * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION]; // Precomputed lookup table. + float table[MAL_SRC_SINC_MAX_WINDOW_WIDTH*1 * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION]; // Precomputed lookup table. The +1 is used to avoid the need for an overflow check. } sinc; }; @@ -3223,6 +3223,18 @@ static MAL_INLINE __m128 mal_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a) return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a)); } #endif +#if defined(MAL_SUPPORT_AVX) +static MAL_INLINE __m256 mal_mix_f32_fast__avx(__m256 x, __m256 y, __m256 a) +{ + return _mm256_add_ps(x, _mm256_mul_ps(_mm256_sub_ps(y, x), a)); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +static MAL_INLINE __m512 mal_mix_f32_fast__avx512(__m512 x, __m512 y, __m512 a) +{ + return _mm512_add_ps(x, _mm512_mul_ps(_mm512_sub_ps(y, x), a)); +} +#endif static MAL_INLINE double mal_mix_f64(double x, double y, double a) @@ -20116,17 +20128,17 @@ static MAL_INLINE __m128 mal_truncf_sse2(__m128 x) return _mm_cvtepi32_ps(_mm_cvttps_epi32(x)); } -static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* pSRC, __m128* x) +static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* pSRC, __m128 x) { - __m128 windowWidth128 = _mm_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH); + //__m128 windowWidth128 = _mm_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH); __m128 resolution128 = _mm_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION); - __m128 one = _mm_set1_ps(1); + //__m128 one = _mm_set1_ps(1); - __m128 xabs = mal_fabsf_sse2(*x); + __m128 xabs = mal_fabsf_sse2(x); // if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs; - __m128 xcmp = _mm_cmp_ps(windowWidth128, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps. - xabs = _mm_or_ps(_mm_and_ps(one, xcmp), _mm_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs; + //__m128 xcmp = _mm_cmp_ps(windowWidth128, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps. + //xabs = _mm_or_ps(_mm_and_ps(one, xcmp), _mm_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs; xabs = _mm_mul_ps(xabs, resolution128); __m128i ixabs = _mm_cvttps_epi32(xabs); @@ -20154,6 +20166,63 @@ static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* } #endif +#if defined(MAL_SUPPORT_AVX) +static MAL_INLINE __m256 mal_fabsf_avx(__m256 x) +{ + return _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)), x); +} + +#if 0 +static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx(const mal_src* pSRC, __m256 x) +{ + __m256 windowWidth256 = _mm256_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH); + __m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION); + __m256 one = _mm256_set1_ps(1); + + __m256 xabs = mal_fabsf_avx(x); + + // if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs; + __m256 xcmp = _mm256_cmp_ps(windowWidth256, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps. + xabs = _mm256_or_ps(_mm256_and_ps(one, xcmp), _mm256_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs; + + xabs = _mm256_mul_ps(xabs, resolution256); + + __m256i ixabs = _mm256_cvttps_epi32(xabs); + __m256 a = _mm256_sub_ps(xabs, _mm256_cvtepi32_ps(ixabs)); + + + int* ixabsv = (int*)&ixabs; + + __m256 lo = _mm256_set_ps( + pSRC->sinc.table[ixabsv[7]], + pSRC->sinc.table[ixabsv[6]], + pSRC->sinc.table[ixabsv[5]], + pSRC->sinc.table[ixabsv[4]], + pSRC->sinc.table[ixabsv[3]], + pSRC->sinc.table[ixabsv[2]], + pSRC->sinc.table[ixabsv[1]], + pSRC->sinc.table[ixabsv[0]] + ); + + __m256 hi = _mm256_set_ps( + pSRC->sinc.table[ixabsv[7]+1], + pSRC->sinc.table[ixabsv[6]+1], + pSRC->sinc.table[ixabsv[5]+1], + pSRC->sinc.table[ixabsv[4]+1], + pSRC->sinc.table[ixabsv[3]+1], + pSRC->sinc.table[ixabsv[2]+1], + pSRC->sinc.table[ixabsv[1]+1], + pSRC->sinc.table[ixabsv[0]+1] + ); + + __m256 r = mal_mix_f32_fast__avx(lo, hi, a); + + return r; +} +#endif + +#endif + mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount, void** ppSamplesOut, void* pUserData) { mal_assert(pSRC != NULL); @@ -20166,9 +20235,48 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount mal_int32 windowWidth = (mal_int32)pSRC->config.sinc.windowWidth; mal_int32 windowWidth2 = windowWidth*2; + // There are cases where it's actually more efficient to increase the window width so that it's aligned with the respective + // SIMD pipeline being used. + mal_int32 windowWidthSIMD = windowWidth; +#if defined(MAL_SUPPORT_NEON) + if (pSRC->useNEON) { + windowWidthSIMD = (windowWidthSIMD + 1) & ~(1); + } +#endif +#if defined(MAL_SUPPORT_AVX512) + if (pSRC->useAVX512) { + windowWidthSIMD = (windowWidthSIMD + 7) & ~(7); + } + else +#endif +#if defined(MAL_SUPPORT_AVX) + if (pSRC->useAVX) { + windowWidthSIMD = (windowWidthSIMD + 3) & ~(3); + } + else +#endif +#if defined(MAL_SUPPORT_SSE2) + if (pSRC->useSSE2) { + windowWidthSIMD = (windowWidthSIMD + 1) & ~(1); + } +#endif + mal_int32 windowWidthSIMD2 = windowWidthSIMD*2; + + float* ppNextSamplesOut[MAL_MAX_CHANNELS]; mal_copy_memory(ppNextSamplesOut, ppSamplesOut, sizeof(void*) * pSRC->config.channels); + float _windowSamplesUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT]; + float* windowSamples = (float*)(((mal_uintptr)_windowSamplesUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1)); + mal_zero_memory(windowSamples, MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 * sizeof(float)); + + float _iWindowFUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT]; + float* iWindowF = (float*)(((mal_uintptr)_iWindowFUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1)); + mal_zero_memory(iWindowF, MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 * sizeof(float)); + for (mal_int32 i = 0; i < windowWidth2; ++i) { + iWindowF[i] = (float)(i - windowWidth); + } + mal_uint64 totalOutputFramesRead = 0; while (totalOutputFramesRead < frameCount) { // The maximum number of frames we can read this iteration depends on how many input samples we have available to us. This is the number @@ -20192,15 +20300,6 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount outputFramesToRead = maxOutputFramesToRead; } - float _windowSamplesUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT]; - float* windowSamples = (float*)(((mal_uintptr)_windowSamplesUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1)); - - float _iWindowFUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT]; - float* iWindowF = (float*)(((mal_uintptr)_iWindowFUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1)); - for (mal_int32 i = 0; i < windowWidth2; ++i) { - iWindowF[i] = (float)(i - windowWidth); - } - for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) { // Do SRC. float timeIn = timeInBeg; @@ -20216,38 +20315,104 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount // Pre-load the window samples into an aligned buffer to begin with. Need to put these into an aligned buffer to make SIMD easier. windowSamples[0] = 0; // <-- The first sample is always zero. for (mal_int32 i = 1; i < windowWidth2; ++i) { - windowSamples[i] = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, i - windowWidth); + windowSamples[i] = pSRC->sinc.input[iChannel][iTimeIn + i]; } +#if defined(MAL_SUPPORT_AVX) + if (pSRC->useAVX) { + __m256i ixabs[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8]; + __m256 a[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8]; + __m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION); + + __m256 t = _mm256_set1_ps((timeIn - iTimeInF)); + __m256 r = _mm256_set1_ps(0); + + mal_int32 windowWidth8 = windowWidthSIMD2 >> 3; + for (mal_int32 iWindow8 = 0; iWindow8 < windowWidth8; iWindow8 += 1) { + __m256 w = *((__m256*)iWindowF + iWindow8); + + __m256 xabs = _mm256_sub_ps(t, w); + xabs = mal_fabsf_avx(xabs); + xabs = _mm256_mul_ps(xabs, resolution256); + + ixabs[iWindow8] = _mm256_cvttps_epi32(xabs); + a[iWindow8] = _mm256_sub_ps(xabs, _mm256_cvtepi32_ps(ixabs[iWindow8])); + } + + __m256 lo[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8]; + __m256 hi[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8]; + for (mal_int32 iWindow8 = 0; iWindow8 < windowWidth8; iWindow8 += 1) { + int* ixabsv = (int*)&ixabs[iWindow8]; + + lo[iWindow8] = _mm256_set_ps( + pSRC->sinc.table[ixabsv[7]], + pSRC->sinc.table[ixabsv[6]], + pSRC->sinc.table[ixabsv[5]], + pSRC->sinc.table[ixabsv[4]], + pSRC->sinc.table[ixabsv[3]], + pSRC->sinc.table[ixabsv[2]], + pSRC->sinc.table[ixabsv[1]], + pSRC->sinc.table[ixabsv[0]] + ); + + hi[iWindow8] = _mm256_set_ps( + pSRC->sinc.table[ixabsv[7]+1], + pSRC->sinc.table[ixabsv[6]+1], + pSRC->sinc.table[ixabsv[5]+1], + pSRC->sinc.table[ixabsv[4]+1], + pSRC->sinc.table[ixabsv[3]+1], + pSRC->sinc.table[ixabsv[2]+1], + pSRC->sinc.table[ixabsv[1]+1], + pSRC->sinc.table[ixabsv[0]+1] + ); + + __m256 s = *((__m256*)windowSamples + iWindow8); + r = _mm256_add_ps(r, _mm256_mul_ps(s, mal_mix_f32_fast__avx(lo[iWindow8], hi[iWindow8], a[iWindow8]))); + } + + // Horizontal add. + __m256 x = _mm256_hadd_ps(r, _mm256_permute2f128_ps(r, r, 1)); + x = _mm256_hadd_ps(x, x); + x = _mm256_hadd_ps(x, x); + sampleOut += _mm_cvtss_f32(_mm256_castps256_ps128(x)); + + iWindow += windowWidth8 * 8; + } + else +#endif #if defined(MAL_SUPPORT_SSE2) if (pSRC->useSSE2) { __m128 t = _mm_set1_ps((timeIn - iTimeInF)); + __m128 r = _mm_set1_ps(0); - mal_int32 windowWidth4 = windowWidth2 >> 2; + mal_int32 windowWidth4 = windowWidthSIMD2 >> 2; for (mal_int32 iWindow4 = 0; iWindow4 < windowWidth4; iWindow4 += 1) { __m128* s = (__m128*)windowSamples + iWindow4; __m128* w = (__m128*)iWindowF + iWindow4; - __m128 x = _mm_sub_ps(t, *w); - __m128 a = mal_src_sinc__interpolation_factor__sse2(pSRC, &x); - __m128 r = _mm_mul_ps(*s, a); - - sampleOut += ((float*)(&r))[0]; - sampleOut += ((float*)(&r))[1]; - sampleOut += ((float*)(&r))[2]; - sampleOut += ((float*)(&r))[3]; + __m128 a = mal_src_sinc__interpolation_factor__sse2(pSRC, _mm_sub_ps(t, *w)); + r = _mm_add_ps(r, _mm_mul_ps(*s, a)); } + sampleOut += ((float*)(&r))[0]; + sampleOut += ((float*)(&r))[1]; + sampleOut += ((float*)(&r))[2]; + sampleOut += ((float*)(&r))[3]; + iWindow += windowWidth4 * 4; } + else #endif + { + iWindow += 1; // The first one is a dummy for SIMD alignment purposes. Skip it. + } // Non-SIMD/Reference implementation. + float t = (timeIn - iTimeIn); for (; iWindow < windowWidth2; iWindow += 1) { float s = windowSamples[iWindow]; - - float t = (timeIn - iTimeIn); float w = iWindowF[iWindow]; + float a = mal_src_sinc__interpolation_factor(pSRC, (t - w)); float r = s * a; @@ -21888,7 +22053,7 @@ mal_uint32 mal_decoder_internal_on_read_frames__raw(mal_dsp* pDSP, mal_uint32 fr // For raw decoding we just read directly from the decoder's callbacks. mal_uint32 bpf = mal_get_bytes_per_frame(pDecoder->internalFormat, pDecoder->internalChannels); - return pDecoder->onRead(pDecoder, pSamplesOut, frameCount * bpf) / bpf; + return (mal_uint32)pDecoder->onRead(pDecoder, pSamplesOut, frameCount * bpf) / bpf; } mal_result mal_decoder_init_raw__internal(const mal_decoder_config* pConfigIn, const mal_decoder_config* pConfigOut, mal_decoder* pDecoder)