mirror of
https://github.com/mackron/miniaudio.git
synced 2026-04-23 00:34:03 +02:00
Experimental work on AVX.
As of this commit there's no significant benefit.
This commit is contained in:
@@ -926,7 +926,7 @@ MAL_ALIGNED_STRUCT(MAL_SIMD_ALIGNMENT) mal_src
|
||||
float timeIn;
|
||||
mal_uint32 inputFrameCount; // The number of frames sitting in the input buffer, not including the first half of the window.
|
||||
mal_uint32 windowPosInSamples; // An offset of <input>.
|
||||
float table[MAL_SRC_SINC_MAX_WINDOW_WIDTH * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION]; // Precomputed lookup table.
|
||||
float table[MAL_SRC_SINC_MAX_WINDOW_WIDTH*1 * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION]; // Precomputed lookup table. The +1 is used to avoid the need for an overflow check.
|
||||
} sinc;
|
||||
};
|
||||
|
||||
@@ -3223,6 +3223,18 @@ static MAL_INLINE __m128 mal_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a)
|
||||
return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a));
|
||||
}
|
||||
#endif
|
||||
#if defined(MAL_SUPPORT_AVX)
|
||||
static MAL_INLINE __m256 mal_mix_f32_fast__avx(__m256 x, __m256 y, __m256 a)
|
||||
{
|
||||
return _mm256_add_ps(x, _mm256_mul_ps(_mm256_sub_ps(y, x), a));
|
||||
}
|
||||
#endif
|
||||
#if defined(MAL_SUPPORT_AVX512)
|
||||
static MAL_INLINE __m512 mal_mix_f32_fast__avx512(__m512 x, __m512 y, __m512 a)
|
||||
{
|
||||
return _mm512_add_ps(x, _mm512_mul_ps(_mm512_sub_ps(y, x), a));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static MAL_INLINE double mal_mix_f64(double x, double y, double a)
|
||||
@@ -20116,17 +20128,17 @@ static MAL_INLINE __m128 mal_truncf_sse2(__m128 x)
|
||||
return _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
|
||||
}
|
||||
|
||||
static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* pSRC, __m128* x)
|
||||
static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* pSRC, __m128 x)
|
||||
{
|
||||
__m128 windowWidth128 = _mm_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
|
||||
//__m128 windowWidth128 = _mm_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
|
||||
__m128 resolution128 = _mm_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
||||
__m128 one = _mm_set1_ps(1);
|
||||
//__m128 one = _mm_set1_ps(1);
|
||||
|
||||
__m128 xabs = mal_fabsf_sse2(*x);
|
||||
__m128 xabs = mal_fabsf_sse2(x);
|
||||
|
||||
// if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
|
||||
__m128 xcmp = _mm_cmp_ps(windowWidth128, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps.
|
||||
xabs = _mm_or_ps(_mm_and_ps(one, xcmp), _mm_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs;
|
||||
//__m128 xcmp = _mm_cmp_ps(windowWidth128, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps.
|
||||
//xabs = _mm_or_ps(_mm_and_ps(one, xcmp), _mm_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs;
|
||||
|
||||
xabs = _mm_mul_ps(xabs, resolution128);
|
||||
__m128i ixabs = _mm_cvttps_epi32(xabs);
|
||||
@@ -20154,6 +20166,63 @@ static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src*
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(MAL_SUPPORT_AVX)
|
||||
static MAL_INLINE __m256 mal_fabsf_avx(__m256 x)
|
||||
{
|
||||
return _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)), x);
|
||||
}
|
||||
|
||||
#if 0
|
||||
static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx(const mal_src* pSRC, __m256 x)
|
||||
{
|
||||
__m256 windowWidth256 = _mm256_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
|
||||
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
||||
__m256 one = _mm256_set1_ps(1);
|
||||
|
||||
__m256 xabs = mal_fabsf_avx(x);
|
||||
|
||||
// if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
|
||||
__m256 xcmp = _mm256_cmp_ps(windowWidth256, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps.
|
||||
xabs = _mm256_or_ps(_mm256_and_ps(one, xcmp), _mm256_andnot_ps(xcmp, xabs)); // xabs = (xcmp) ? 1 : xabs;
|
||||
|
||||
xabs = _mm256_mul_ps(xabs, resolution256);
|
||||
|
||||
__m256i ixabs = _mm256_cvttps_epi32(xabs);
|
||||
__m256 a = _mm256_sub_ps(xabs, _mm256_cvtepi32_ps(ixabs));
|
||||
|
||||
|
||||
int* ixabsv = (int*)&ixabs;
|
||||
|
||||
__m256 lo = _mm256_set_ps(
|
||||
pSRC->sinc.table[ixabsv[7]],
|
||||
pSRC->sinc.table[ixabsv[6]],
|
||||
pSRC->sinc.table[ixabsv[5]],
|
||||
pSRC->sinc.table[ixabsv[4]],
|
||||
pSRC->sinc.table[ixabsv[3]],
|
||||
pSRC->sinc.table[ixabsv[2]],
|
||||
pSRC->sinc.table[ixabsv[1]],
|
||||
pSRC->sinc.table[ixabsv[0]]
|
||||
);
|
||||
|
||||
__m256 hi = _mm256_set_ps(
|
||||
pSRC->sinc.table[ixabsv[7]+1],
|
||||
pSRC->sinc.table[ixabsv[6]+1],
|
||||
pSRC->sinc.table[ixabsv[5]+1],
|
||||
pSRC->sinc.table[ixabsv[4]+1],
|
||||
pSRC->sinc.table[ixabsv[3]+1],
|
||||
pSRC->sinc.table[ixabsv[2]+1],
|
||||
pSRC->sinc.table[ixabsv[1]+1],
|
||||
pSRC->sinc.table[ixabsv[0]+1]
|
||||
);
|
||||
|
||||
__m256 r = mal_mix_f32_fast__avx(lo, hi, a);
|
||||
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount, void** ppSamplesOut, void* pUserData)
|
||||
{
|
||||
mal_assert(pSRC != NULL);
|
||||
@@ -20166,9 +20235,48 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
||||
mal_int32 windowWidth = (mal_int32)pSRC->config.sinc.windowWidth;
|
||||
mal_int32 windowWidth2 = windowWidth*2;
|
||||
|
||||
// There are cases where it's actually more efficient to increase the window width so that it's aligned with the respective
|
||||
// SIMD pipeline being used.
|
||||
mal_int32 windowWidthSIMD = windowWidth;
|
||||
#if defined(MAL_SUPPORT_NEON)
|
||||
if (pSRC->useNEON) {
|
||||
windowWidthSIMD = (windowWidthSIMD + 1) & ~(1);
|
||||
}
|
||||
#endif
|
||||
#if defined(MAL_SUPPORT_AVX512)
|
||||
if (pSRC->useAVX512) {
|
||||
windowWidthSIMD = (windowWidthSIMD + 7) & ~(7);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if defined(MAL_SUPPORT_AVX)
|
||||
if (pSRC->useAVX) {
|
||||
windowWidthSIMD = (windowWidthSIMD + 3) & ~(3);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if defined(MAL_SUPPORT_SSE2)
|
||||
if (pSRC->useSSE2) {
|
||||
windowWidthSIMD = (windowWidthSIMD + 1) & ~(1);
|
||||
}
|
||||
#endif
|
||||
mal_int32 windowWidthSIMD2 = windowWidthSIMD*2;
|
||||
|
||||
|
||||
float* ppNextSamplesOut[MAL_MAX_CHANNELS];
|
||||
mal_copy_memory(ppNextSamplesOut, ppSamplesOut, sizeof(void*) * pSRC->config.channels);
|
||||
|
||||
float _windowSamplesUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT];
|
||||
float* windowSamples = (float*)(((mal_uintptr)_windowSamplesUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1));
|
||||
mal_zero_memory(windowSamples, MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 * sizeof(float));
|
||||
|
||||
float _iWindowFUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT];
|
||||
float* iWindowF = (float*)(((mal_uintptr)_iWindowFUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1));
|
||||
mal_zero_memory(iWindowF, MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 * sizeof(float));
|
||||
for (mal_int32 i = 0; i < windowWidth2; ++i) {
|
||||
iWindowF[i] = (float)(i - windowWidth);
|
||||
}
|
||||
|
||||
mal_uint64 totalOutputFramesRead = 0;
|
||||
while (totalOutputFramesRead < frameCount) {
|
||||
// The maximum number of frames we can read this iteration depends on how many input samples we have available to us. This is the number
|
||||
@@ -20192,15 +20300,6 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
||||
outputFramesToRead = maxOutputFramesToRead;
|
||||
}
|
||||
|
||||
float _windowSamplesUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT];
|
||||
float* windowSamples = (float*)(((mal_uintptr)_windowSamplesUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1));
|
||||
|
||||
float _iWindowFUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT];
|
||||
float* iWindowF = (float*)(((mal_uintptr)_iWindowFUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1));
|
||||
for (mal_int32 i = 0; i < windowWidth2; ++i) {
|
||||
iWindowF[i] = (float)(i - windowWidth);
|
||||
}
|
||||
|
||||
for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) {
|
||||
// Do SRC.
|
||||
float timeIn = timeInBeg;
|
||||
@@ -20216,38 +20315,104 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
||||
// Pre-load the window samples into an aligned buffer to begin with. Need to put these into an aligned buffer to make SIMD easier.
|
||||
windowSamples[0] = 0; // <-- The first sample is always zero.
|
||||
for (mal_int32 i = 1; i < windowWidth2; ++i) {
|
||||
windowSamples[i] = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, i - windowWidth);
|
||||
windowSamples[i] = pSRC->sinc.input[iChannel][iTimeIn + i];
|
||||
}
|
||||
|
||||
#if defined(MAL_SUPPORT_AVX)
|
||||
if (pSRC->useAVX) {
|
||||
__m256i ixabs[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
||||
__m256 a[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
||||
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
||||
|
||||
__m256 t = _mm256_set1_ps((timeIn - iTimeInF));
|
||||
__m256 r = _mm256_set1_ps(0);
|
||||
|
||||
mal_int32 windowWidth8 = windowWidthSIMD2 >> 3;
|
||||
for (mal_int32 iWindow8 = 0; iWindow8 < windowWidth8; iWindow8 += 1) {
|
||||
__m256 w = *((__m256*)iWindowF + iWindow8);
|
||||
|
||||
__m256 xabs = _mm256_sub_ps(t, w);
|
||||
xabs = mal_fabsf_avx(xabs);
|
||||
xabs = _mm256_mul_ps(xabs, resolution256);
|
||||
|
||||
ixabs[iWindow8] = _mm256_cvttps_epi32(xabs);
|
||||
a[iWindow8] = _mm256_sub_ps(xabs, _mm256_cvtepi32_ps(ixabs[iWindow8]));
|
||||
}
|
||||
|
||||
__m256 lo[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
||||
__m256 hi[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
||||
for (mal_int32 iWindow8 = 0; iWindow8 < windowWidth8; iWindow8 += 1) {
|
||||
int* ixabsv = (int*)&ixabs[iWindow8];
|
||||
|
||||
lo[iWindow8] = _mm256_set_ps(
|
||||
pSRC->sinc.table[ixabsv[7]],
|
||||
pSRC->sinc.table[ixabsv[6]],
|
||||
pSRC->sinc.table[ixabsv[5]],
|
||||
pSRC->sinc.table[ixabsv[4]],
|
||||
pSRC->sinc.table[ixabsv[3]],
|
||||
pSRC->sinc.table[ixabsv[2]],
|
||||
pSRC->sinc.table[ixabsv[1]],
|
||||
pSRC->sinc.table[ixabsv[0]]
|
||||
);
|
||||
|
||||
hi[iWindow8] = _mm256_set_ps(
|
||||
pSRC->sinc.table[ixabsv[7]+1],
|
||||
pSRC->sinc.table[ixabsv[6]+1],
|
||||
pSRC->sinc.table[ixabsv[5]+1],
|
||||
pSRC->sinc.table[ixabsv[4]+1],
|
||||
pSRC->sinc.table[ixabsv[3]+1],
|
||||
pSRC->sinc.table[ixabsv[2]+1],
|
||||
pSRC->sinc.table[ixabsv[1]+1],
|
||||
pSRC->sinc.table[ixabsv[0]+1]
|
||||
);
|
||||
|
||||
__m256 s = *((__m256*)windowSamples + iWindow8);
|
||||
r = _mm256_add_ps(r, _mm256_mul_ps(s, mal_mix_f32_fast__avx(lo[iWindow8], hi[iWindow8], a[iWindow8])));
|
||||
}
|
||||
|
||||
// Horizontal add.
|
||||
__m256 x = _mm256_hadd_ps(r, _mm256_permute2f128_ps(r, r, 1));
|
||||
x = _mm256_hadd_ps(x, x);
|
||||
x = _mm256_hadd_ps(x, x);
|
||||
sampleOut += _mm_cvtss_f32(_mm256_castps256_ps128(x));
|
||||
|
||||
iWindow += windowWidth8 * 8;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
#if defined(MAL_SUPPORT_SSE2)
|
||||
if (pSRC->useSSE2) {
|
||||
__m128 t = _mm_set1_ps((timeIn - iTimeInF));
|
||||
__m128 r = _mm_set1_ps(0);
|
||||
|
||||
mal_int32 windowWidth4 = windowWidth2 >> 2;
|
||||
mal_int32 windowWidth4 = windowWidthSIMD2 >> 2;
|
||||
for (mal_int32 iWindow4 = 0; iWindow4 < windowWidth4; iWindow4 += 1) {
|
||||
__m128* s = (__m128*)windowSamples + iWindow4;
|
||||
__m128* w = (__m128*)iWindowF + iWindow4;
|
||||
|
||||
__m128 x = _mm_sub_ps(t, *w);
|
||||
__m128 a = mal_src_sinc__interpolation_factor__sse2(pSRC, &x);
|
||||
__m128 r = _mm_mul_ps(*s, a);
|
||||
|
||||
sampleOut += ((float*)(&r))[0];
|
||||
sampleOut += ((float*)(&r))[1];
|
||||
sampleOut += ((float*)(&r))[2];
|
||||
sampleOut += ((float*)(&r))[3];
|
||||
__m128 a = mal_src_sinc__interpolation_factor__sse2(pSRC, _mm_sub_ps(t, *w));
|
||||
r = _mm_add_ps(r, _mm_mul_ps(*s, a));
|
||||
}
|
||||
|
||||
sampleOut += ((float*)(&r))[0];
|
||||
sampleOut += ((float*)(&r))[1];
|
||||
sampleOut += ((float*)(&r))[2];
|
||||
sampleOut += ((float*)(&r))[3];
|
||||
|
||||
iWindow += windowWidth4 * 4;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
iWindow += 1; // The first one is a dummy for SIMD alignment purposes. Skip it.
|
||||
}
|
||||
|
||||
// Non-SIMD/Reference implementation.
|
||||
float t = (timeIn - iTimeIn);
|
||||
for (; iWindow < windowWidth2; iWindow += 1) {
|
||||
float s = windowSamples[iWindow];
|
||||
|
||||
float t = (timeIn - iTimeIn);
|
||||
float w = iWindowF[iWindow];
|
||||
|
||||
float a = mal_src_sinc__interpolation_factor(pSRC, (t - w));
|
||||
float r = s * a;
|
||||
|
||||
@@ -21888,7 +22053,7 @@ mal_uint32 mal_decoder_internal_on_read_frames__raw(mal_dsp* pDSP, mal_uint32 fr
|
||||
|
||||
// For raw decoding we just read directly from the decoder's callbacks.
|
||||
mal_uint32 bpf = mal_get_bytes_per_frame(pDecoder->internalFormat, pDecoder->internalChannels);
|
||||
return pDecoder->onRead(pDecoder, pSamplesOut, frameCount * bpf) / bpf;
|
||||
return (mal_uint32)pDecoder->onRead(pDecoder, pSamplesOut, frameCount * bpf) / bpf;
|
||||
}
|
||||
|
||||
mal_result mal_decoder_init_raw__internal(const mal_decoder_config* pConfigIn, const mal_decoder_config* pConfigOut, mal_decoder* pDecoder)
|
||||
|
||||
Reference in New Issue
Block a user