diff --git a/miniaudio.h b/miniaudio.h index 1d7e1659..4fc438d3 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -60517,40 +60517,223 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r /* Experimental loop unrolling to make it easier for SIMD-ification. */ #if 1 { - while (framesProcessedOut + 4 <= frameCountOut) { - ma_uint32 inTimeIntTemp; - ma_uint32 inTimeFracTemp; - ma_uint32 inTimeInt4[4]; - ma_uint32 inTimeFrac4[4]; - int i; + if (channels == 1) { + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt4[4]; + ma_uint32 inTimeFrac4[4]; + ma_int32 x[4]; + ma_int32 y[4]; + ma_int32 a[4]; + ma_int32 d[4]; + ma_int32 n[4]; + ma_int32 r[4]; + int i; - inTimeIntTemp = inTimeInt; - inTimeFracTemp = inTimeFrac; + inTimeIntTemp = inTimeInt; + inTimeFracTemp = inTimeFrac; - for (i = 0; i < 4; i += 1) { - inTimeInt4[i] = inTimeIntTemp; - inTimeFrac4[i] = inTimeFracTemp; + for (i = 0; i < 4; i += 1) { + inTimeInt4[i] = inTimeIntTemp; + inTimeFrac4[i] = inTimeFracTemp; - inTimeIntTemp += pResampler->inAdvanceInt; - inTimeFracTemp += pResampler->inAdvanceFrac; - if (inTimeFracTemp >= pResampler->sampleRateOut) { - inTimeFracTemp -= pResampler->sampleRateOut; - inTimeIntTemp += 1; + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt4[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + inTimeInt = inTimeIntTemp; + inTimeFrac = inTimeFracTemp; + + x[0] = pFramesInS16[inTimeInt4[0] + 0]; + x[1] = pFramesInS16[inTimeInt4[1] + 0]; + x[2] = pFramesInS16[inTimeInt4[2] + 0]; + x[3] = pFramesInS16[inTimeInt4[3] + 0]; + + y[0] = pFramesInS16[inTimeInt4[0] + 1]; + y[1] = pFramesInS16[inTimeInt4[1] + 1]; + y[2] = pFramesInS16[inTimeInt4[2] + 1]; + y[3] = pFramesInS16[inTimeInt4[3] + 1]; + + a[0] = inTimeFrac4[0] * invSampleRateOut; + a[1] = inTimeFrac4[1] * invSampleRateOut; + a[2] = inTimeFrac4[2] * invSampleRateOut; + a[3] = inTimeFrac4[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + ma_linear_resampler_filter_s32_4_mono(pLPF, lpfCount, r); + + pFramesOutS16[0] = (ma_int16)r[0]; + pFramesOutS16[1] = (ma_int16)r[1]; + pFramesOutS16[2] = (ma_int16)r[2]; + pFramesOutS16[3] = (ma_int16)r[3]; + + pFramesOutS16 += 4; + framesProcessedOut += 4; } + } else if (channels == 2) { + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt4[4]; + ma_uint32 inTimeFrac4[4]; + ma_int32 x[8]; + ma_int32 y[8]; + ma_int32 a[8]; + ma_int32 d[8]; + ma_int32 n[8]; + ma_int32 r[8]; + int i; - /* Check that we have one extra sample at the end for doing the interpolation. */ - if (inTimeInt4[3] + 1 >= frameCountIn) { - break; /* Not enough input frames. */ + inTimeIntTemp = inTimeInt; + inTimeFracTemp = inTimeFrac; + + for (i = 0; i < 4; i += 1) { + inTimeInt4[i] = inTimeIntTemp; + inTimeFrac4[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt4[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + inTimeInt = inTimeIntTemp; + inTimeFrac = inTimeFracTemp; + + x[0] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 0]; + x[1] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 1]; + x[2] = pFramesInS16[((inTimeInt4[1] + 0) * 2) + 0]; + x[3] = pFramesInS16[((inTimeInt4[1] + 0) * 2) + 1]; + x[4] = pFramesInS16[((inTimeInt4[2] + 0) * 2) + 0]; + x[5] = pFramesInS16[((inTimeInt4[2] + 0) * 2) + 1]; + x[6] = pFramesInS16[((inTimeInt4[3] + 0) * 2) + 0]; + x[7] = pFramesInS16[((inTimeInt4[3] + 0) * 2) + 1]; + + y[0] = pFramesInS16[((inTimeInt4[0] + 1) * 2) + 0]; + y[1] = pFramesInS16[((inTimeInt4[0] + 1) * 2) + 1]; + y[2] = pFramesInS16[((inTimeInt4[1] + 1) * 2) + 0]; + y[3] = pFramesInS16[((inTimeInt4[1] + 1) * 2) + 1]; + y[4] = pFramesInS16[((inTimeInt4[2] + 1) * 2) + 0]; + y[5] = pFramesInS16[((inTimeInt4[2] + 1) * 2) + 1]; + y[6] = pFramesInS16[((inTimeInt4[3] + 1) * 2) + 0]; + y[7] = pFramesInS16[((inTimeInt4[3] + 1) * 2) + 1]; + + a[0] = inTimeFrac4[0] * invSampleRateOut; + a[1] = inTimeFrac4[0] * invSampleRateOut; + a[2] = inTimeFrac4[1] * invSampleRateOut; + a[3] = inTimeFrac4[1] * invSampleRateOut; + a[4] = inTimeFrac4[2] * invSampleRateOut; + a[5] = inTimeFrac4[2] * invSampleRateOut; + a[6] = inTimeFrac4[3] * invSampleRateOut; + a[7] = inTimeFrac4[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + d[4] = y[4] - x[4]; + d[5] = y[5] - x[5]; + d[6] = y[6] - x[6]; + d[7] = y[7] - x[7]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + n[4] = d[4] * a[4]; + n[5] = d[5] * a[5]; + n[6] = d[6] * a[6]; + n[7] = d[7] * a[7]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[4] = x[4] + (n[4] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[5] = x[5] + (n[5] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + ma_linear_resampler_filter_s32_4_stereo(pLPF, lpfCount, r); + + pFramesOutS16[0] = (ma_int16)r[0]; + pFramesOutS16[1] = (ma_int16)r[1]; + pFramesOutS16[2] = (ma_int16)r[2]; + pFramesOutS16[3] = (ma_int16)r[3]; + pFramesOutS16[4] = (ma_int16)r[4]; + pFramesOutS16[5] = (ma_int16)r[5]; + pFramesOutS16[6] = (ma_int16)r[6]; + pFramesOutS16[7] = (ma_int16)r[7]; + + pFramesOutS16 += 8; + framesProcessedOut += 4; } + } else { + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt4[4]; + ma_uint32 inTimeFrac4[4]; + int i; - /* Advance the timer. */ - inTimeInt = inTimeIntTemp; - inTimeFrac = inTimeFracTemp; + inTimeIntTemp = inTimeInt; + inTimeFracTemp = inTimeFrac; - /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */ - { - if (channels == 1) { + for (i = 0; i < 4; i += 1) { + inTimeInt4[i] = inTimeIntTemp; + inTimeFrac4[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt4[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + inTimeInt = inTimeIntTemp; + inTimeFrac = inTimeFracTemp; + + for (c = 0; c < channels; c += 1) { ma_int32 x[4]; ma_int32 y[4]; ma_int32 a[4]; @@ -60558,15 +60741,15 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r ma_int32 n[4]; ma_int32 r[4]; - x[0] = pFramesInS16[inTimeInt4[0] + 0]; - x[1] = pFramesInS16[inTimeInt4[1] + 0]; - x[2] = pFramesInS16[inTimeInt4[2] + 0]; - x[3] = pFramesInS16[inTimeInt4[3] + 0]; - - y[0] = pFramesInS16[inTimeInt4[0] + 1]; - y[1] = pFramesInS16[inTimeInt4[1] + 1]; - y[2] = pFramesInS16[inTimeInt4[2] + 1]; - y[3] = pFramesInS16[inTimeInt4[3] + 1]; + x[0] = pFramesInS16[((inTimeInt4[0] + 0) * channels) + c]; + x[1] = pFramesInS16[((inTimeInt4[1] + 0) * channels) + c]; + x[2] = pFramesInS16[((inTimeInt4[2] + 0) * channels) + c]; + x[3] = pFramesInS16[((inTimeInt4[3] + 0) * channels) + c]; + + y[0] = pFramesInS16[((inTimeInt4[0] + 1) * channels) + c]; + y[1] = pFramesInS16[((inTimeInt4[1] + 1) * channels) + c]; + y[2] = pFramesInS16[((inTimeInt4[2] + 1) * channels) + c]; + y[3] = pFramesInS16[((inTimeInt4[3] + 1) * channels) + c]; a[0] = inTimeFrac4[0] * invSampleRateOut; a[1] = inTimeFrac4[1] * invSampleRateOut; @@ -60588,132 +60771,14 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - ma_linear_resampler_filter_s32_4_mono(pLPF, lpfCount, r); - - pFramesOutS16[0] = (ma_int16)r[0]; - pFramesOutS16[1] = (ma_int16)r[1]; - pFramesOutS16[2] = (ma_int16)r[2]; - pFramesOutS16[3] = (ma_int16)r[3]; - } else if (channels == 2) { - ma_int32 x[8]; - ma_int32 y[8]; - ma_int32 a[8]; - ma_int32 d[8]; - ma_int32 n[8]; - ma_int32 r[8]; - - x[0] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 0]; - x[1] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 1]; - x[2] = pFramesInS16[((inTimeInt4[1] + 0) * 2) + 0]; - x[3] = pFramesInS16[((inTimeInt4[1] + 0) * 2) + 1]; - x[4] = pFramesInS16[((inTimeInt4[2] + 0) * 2) + 0]; - x[5] = pFramesInS16[((inTimeInt4[2] + 0) * 2) + 1]; - x[6] = pFramesInS16[((inTimeInt4[3] + 0) * 2) + 0]; - x[7] = pFramesInS16[((inTimeInt4[3] + 0) * 2) + 1]; - - y[0] = pFramesInS16[((inTimeInt4[0] + 1) * 2) + 0]; - y[1] = pFramesInS16[((inTimeInt4[0] + 1) * 2) + 1]; - y[2] = pFramesInS16[((inTimeInt4[1] + 1) * 2) + 0]; - y[3] = pFramesInS16[((inTimeInt4[1] + 1) * 2) + 1]; - y[4] = pFramesInS16[((inTimeInt4[2] + 1) * 2) + 0]; - y[5] = pFramesInS16[((inTimeInt4[2] + 1) * 2) + 1]; - y[6] = pFramesInS16[((inTimeInt4[3] + 1) * 2) + 0]; - y[7] = pFramesInS16[((inTimeInt4[3] + 1) * 2) + 1]; - - a[0] = inTimeFrac4[0] * invSampleRateOut; - a[1] = inTimeFrac4[0] * invSampleRateOut; - a[2] = inTimeFrac4[1] * invSampleRateOut; - a[3] = inTimeFrac4[1] * invSampleRateOut; - a[4] = inTimeFrac4[2] * invSampleRateOut; - a[5] = inTimeFrac4[2] * invSampleRateOut; - a[6] = inTimeFrac4[3] * invSampleRateOut; - a[7] = inTimeFrac4[3] * invSampleRateOut; - - d[0] = y[0] - x[0]; - d[1] = y[1] - x[1]; - d[2] = y[2] - x[2]; - d[3] = y[3] - x[3]; - d[4] = y[4] - x[4]; - d[5] = y[5] - x[5]; - d[6] = y[6] - x[6]; - d[7] = y[7] - x[7]; - - n[0] = d[0] * a[0]; - n[1] = d[1] * a[1]; - n[2] = d[2] * a[2]; - n[3] = d[3] * a[3]; - n[4] = d[4] * a[4]; - n[5] = d[5] * a[5]; - n[6] = d[6] * a[6]; - n[7] = d[7] * a[7]; - - r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[4] = x[4] + (n[4] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[5] = x[5] + (n[5] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - - ma_linear_resampler_filter_s32_4_stereo(pLPF, lpfCount, r); - - pFramesOutS16[0] = (ma_int16)r[0]; - pFramesOutS16[1] = (ma_int16)r[1]; - pFramesOutS16[2] = (ma_int16)r[2]; - pFramesOutS16[3] = (ma_int16)r[3]; - pFramesOutS16[4] = (ma_int16)r[4]; - pFramesOutS16[5] = (ma_int16)r[5]; - pFramesOutS16[6] = (ma_int16)r[6]; - pFramesOutS16[7] = (ma_int16)r[7]; - } else { - for (c = 0; c < channels; c += 1) { - ma_int32 x[4]; - ma_int32 y[4]; - ma_int32 a[4]; - ma_int32 d[4]; - ma_int32 n[4]; - ma_int32 r[4]; - - x[0] = pFramesInS16[((inTimeInt4[0] + 0) * channels) + c]; - x[1] = pFramesInS16[((inTimeInt4[1] + 0) * channels) + c]; - x[2] = pFramesInS16[((inTimeInt4[2] + 0) * channels) + c]; - x[3] = pFramesInS16[((inTimeInt4[3] + 0) * channels) + c]; - - y[0] = pFramesInS16[((inTimeInt4[0] + 1) * channels) + c]; - y[1] = pFramesInS16[((inTimeInt4[1] + 1) * channels) + c]; - y[2] = pFramesInS16[((inTimeInt4[2] + 1) * channels) + c]; - y[3] = pFramesInS16[((inTimeInt4[3] + 1) * channels) + c]; - - a[0] = inTimeFrac4[0] * invSampleRateOut; - a[1] = inTimeFrac4[1] * invSampleRateOut; - a[2] = inTimeFrac4[2] * invSampleRateOut; - a[3] = inTimeFrac4[3] * invSampleRateOut; - - d[0] = y[0] - x[0]; - d[1] = y[1] - x[1]; - d[2] = y[2] - x[2]; - d[3] = y[3] - x[3]; - - n[0] = d[0] * a[0]; - n[1] = d[1] * a[1]; - n[2] = d[2] * a[2]; - n[3] = d[3] * a[3]; - - r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); - - pFramesOutS16[(0 * channels) + c] = (ma_int16)r[0]; - pFramesOutS16[(1 * channels) + c] = (ma_int16)r[1]; - pFramesOutS16[(2 * channels) + c] = (ma_int16)r[2]; - pFramesOutS16[(3 * channels) + c] = (ma_int16)r[3]; - } - - ma_linear_resampler_filter_s16_4(pLPF, lpfCount, channels, pFramesOutS16); + pFramesOutS16[(0 * channels) + c] = (ma_int16)r[0]; + pFramesOutS16[(1 * channels) + c] = (ma_int16)r[1]; + pFramesOutS16[(2 * channels) + c] = (ma_int16)r[2]; + pFramesOutS16[(3 * channels) + c] = (ma_int16)r[3]; } + ma_linear_resampler_filter_s16_4(pLPF, lpfCount, channels, pFramesOutS16); + pFramesOutS16 += 4 * channels; framesProcessedOut += 4; } @@ -60961,40 +61026,223 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r /* Experimental loop unrolling to make it easier for SIMD-ification. */ #if 1 { - while (framesProcessedOut + 4 <= frameCountOut) { - ma_uint32 inTimeIntTemp; - ma_uint32 inTimeFracTemp; - ma_uint32 inTimeInt4[4]; - ma_uint32 inTimeFrac4[4]; - int i; + if (channels == 1) { + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt4[4]; + ma_uint32 inTimeFrac4[4]; + float x[4]; + float y[4]; + float a[4]; + float d[4]; + float n[4]; + float r[4]; + int i; - inTimeIntTemp = inTimeInt; - inTimeFracTemp = inTimeFrac; + inTimeIntTemp = inTimeInt; + inTimeFracTemp = inTimeFrac; - for (i = 0; i < 4; i += 1) { - inTimeInt4[i] = inTimeIntTemp; - inTimeFrac4[i] = inTimeFracTemp; + for (i = 0; i < 4; i += 1) { + inTimeInt4[i] = inTimeIntTemp; + inTimeFrac4[i] = inTimeFracTemp; - inTimeIntTemp += pResampler->inAdvanceInt; - inTimeFracTemp += pResampler->inAdvanceFrac; - if (inTimeFracTemp >= pResampler->sampleRateOut) { - inTimeFracTemp -= pResampler->sampleRateOut; - inTimeIntTemp += 1; + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt4[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + inTimeInt = inTimeIntTemp; + inTimeFrac = inTimeFracTemp; + + x[0] = pFramesInF32[inTimeInt4[0] + 0]; + x[1] = pFramesInF32[inTimeInt4[1] + 0]; + x[2] = pFramesInF32[inTimeInt4[2] + 0]; + x[3] = pFramesInF32[inTimeInt4[3] + 0]; + + y[0] = pFramesInF32[inTimeInt4[0] + 1]; + y[1] = pFramesInF32[inTimeInt4[1] + 1]; + y[2] = pFramesInF32[inTimeInt4[2] + 1]; + y[3] = pFramesInF32[inTimeInt4[3] + 1]; + + a[0] = inTimeFrac4[0] * invSampleRateOut; + a[1] = inTimeFrac4[1] * invSampleRateOut; + a[2] = inTimeFrac4[2] * invSampleRateOut; + a[3] = inTimeFrac4[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; + r[3] = x[3] + n[3]; + + ma_linear_resampler_filter_f32_4_mono(pLPF, lpfCount, r); + + pFramesOutF32[0] = r[0]; + pFramesOutF32[1] = r[1]; + pFramesOutF32[2] = r[2]; + pFramesOutF32[3] = r[3]; + + pFramesOutF32 += 4; + framesProcessedOut += 4; } + } else if (channels == 2) { + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt4[4]; + ma_uint32 inTimeFrac4[4]; + float x[8]; + float y[8]; + float a[8]; + float d[8]; + float n[8]; + float r[8]; + int i; - /* Check that we have one extra sample at the end for doing the interpolation. */ - if (inTimeInt4[3] + 1 >= frameCountIn) { - break; /* Not enough input frames. */ + inTimeIntTemp = inTimeInt; + inTimeFracTemp = inTimeFrac; + + for (i = 0; i < 4; i += 1) { + inTimeInt4[i] = inTimeIntTemp; + inTimeFrac4[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt4[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + inTimeInt = inTimeIntTemp; + inTimeFrac = inTimeFracTemp; + + x[0] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 0]; + x[1] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 1]; + x[2] = pFramesInF32[((inTimeInt4[1] + 0) * 2) + 0]; + x[3] = pFramesInF32[((inTimeInt4[1] + 0) * 2) + 1]; + x[4] = pFramesInF32[((inTimeInt4[2] + 0) * 2) + 0]; + x[5] = pFramesInF32[((inTimeInt4[2] + 0) * 2) + 1]; + x[6] = pFramesInF32[((inTimeInt4[3] + 0) * 2) + 0]; + x[7] = pFramesInF32[((inTimeInt4[3] + 0) * 2) + 1]; + + y[0] = pFramesInF32[((inTimeInt4[0] + 1) * 2) + 0]; + y[1] = pFramesInF32[((inTimeInt4[0] + 1) * 2) + 1]; + y[2] = pFramesInF32[((inTimeInt4[1] + 1) * 2) + 0]; + y[3] = pFramesInF32[((inTimeInt4[1] + 1) * 2) + 1]; + y[4] = pFramesInF32[((inTimeInt4[2] + 1) * 2) + 0]; + y[5] = pFramesInF32[((inTimeInt4[2] + 1) * 2) + 1]; + y[6] = pFramesInF32[((inTimeInt4[3] + 1) * 2) + 0]; + y[7] = pFramesInF32[((inTimeInt4[3] + 1) * 2) + 1]; + + a[0] = inTimeFrac4[0] * invSampleRateOut; + a[1] = inTimeFrac4[0] * invSampleRateOut; + a[2] = inTimeFrac4[1] * invSampleRateOut; + a[3] = inTimeFrac4[1] * invSampleRateOut; + a[4] = inTimeFrac4[2] * invSampleRateOut; + a[5] = inTimeFrac4[2] * invSampleRateOut; + a[6] = inTimeFrac4[3] * invSampleRateOut; + a[7] = inTimeFrac4[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + d[4] = y[4] - x[4]; + d[5] = y[5] - x[5]; + d[6] = y[6] - x[6]; + d[7] = y[7] - x[7]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + n[4] = d[4] * a[4]; + n[5] = d[5] * a[5]; + n[6] = d[6] * a[6]; + n[7] = d[7] * a[7]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; + r[3] = x[3] + n[3]; + r[4] = x[4] + n[4]; + r[5] = x[5] + n[5]; + r[6] = x[6] + n[6]; + r[7] = x[7] + n[7]; + + ma_linear_resampler_filter_f32_4_stereo(pLPF, lpfCount, r); + + pFramesOutF32[0] = r[0]; + pFramesOutF32[1] = r[1]; + pFramesOutF32[2] = r[2]; + pFramesOutF32[3] = r[3]; + pFramesOutF32[4] = r[4]; + pFramesOutF32[5] = r[5]; + pFramesOutF32[6] = r[6]; + pFramesOutF32[7] = r[7]; + + pFramesOutF32 += 8; + framesProcessedOut += 4; } + } else { + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt4[4]; + ma_uint32 inTimeFrac4[4]; + int i; - /* Advance the timer. */ - inTimeInt = inTimeIntTemp; - inTimeFrac = inTimeFracTemp; + inTimeIntTemp = inTimeInt; + inTimeFracTemp = inTimeFrac; - /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */ - { - if (channels == 1) { + for (i = 0; i < 4; i += 1) { + inTimeInt4[i] = inTimeIntTemp; + inTimeFrac4[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt4[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + inTimeInt = inTimeIntTemp; + inTimeFrac = inTimeFracTemp; + + for (c = 0; c < channels; c += 1) { float x[4]; float y[4]; float a[4]; @@ -61002,15 +61250,15 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r float n[4]; float r[4]; - x[0] = pFramesInF32[inTimeInt4[0] + 0]; - x[1] = pFramesInF32[inTimeInt4[1] + 0]; - x[2] = pFramesInF32[inTimeInt4[2] + 0]; - x[3] = pFramesInF32[inTimeInt4[3] + 0]; + x[0] = pFramesInF32[((inTimeInt4[0] + 0) * channels) + c]; + x[1] = pFramesInF32[((inTimeInt4[1] + 0) * channels) + c]; + x[2] = pFramesInF32[((inTimeInt4[2] + 0) * channels) + c]; + x[3] = pFramesInF32[((inTimeInt4[3] + 0) * channels) + c]; - y[0] = pFramesInF32[inTimeInt4[0] + 1]; - y[1] = pFramesInF32[inTimeInt4[1] + 1]; - y[2] = pFramesInF32[inTimeInt4[2] + 1]; - y[3] = pFramesInF32[inTimeInt4[3] + 1]; + y[0] = pFramesInF32[((inTimeInt4[0] + 1) * channels) + c]; + y[1] = pFramesInF32[((inTimeInt4[1] + 1) * channels) + c]; + y[2] = pFramesInF32[((inTimeInt4[2] + 1) * channels) + c]; + y[3] = pFramesInF32[((inTimeInt4[3] + 1) * channels) + c]; a[0] = inTimeFrac4[0] * invSampleRateOut; a[1] = inTimeFrac4[1] * invSampleRateOut; @@ -61032,132 +61280,14 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r r[2] = x[2] + n[2]; r[3] = x[3] + n[3]; - ma_linear_resampler_filter_f32_4_mono(pLPF, lpfCount, r); - - pFramesOutF32[0] = r[0]; - pFramesOutF32[1] = r[1]; - pFramesOutF32[2] = r[2]; - pFramesOutF32[3] = r[3]; - } else if (channels == 2) { - float x[8]; - float y[8]; - float a[8]; - float d[8]; - float n[8]; - float r[8]; - - x[0] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 0]; - x[1] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 1]; - x[2] = pFramesInF32[((inTimeInt4[1] + 0) * 2) + 0]; - x[3] = pFramesInF32[((inTimeInt4[1] + 0) * 2) + 1]; - x[4] = pFramesInF32[((inTimeInt4[2] + 0) * 2) + 0]; - x[5] = pFramesInF32[((inTimeInt4[2] + 0) * 2) + 1]; - x[6] = pFramesInF32[((inTimeInt4[3] + 0) * 2) + 0]; - x[7] = pFramesInF32[((inTimeInt4[3] + 0) * 2) + 1]; - - y[0] = pFramesInF32[((inTimeInt4[0] + 1) * 2) + 0]; - y[1] = pFramesInF32[((inTimeInt4[0] + 1) * 2) + 1]; - y[2] = pFramesInF32[((inTimeInt4[1] + 1) * 2) + 0]; - y[3] = pFramesInF32[((inTimeInt4[1] + 1) * 2) + 1]; - y[4] = pFramesInF32[((inTimeInt4[2] + 1) * 2) + 0]; - y[5] = pFramesInF32[((inTimeInt4[2] + 1) * 2) + 1]; - y[6] = pFramesInF32[((inTimeInt4[3] + 1) * 2) + 0]; - y[7] = pFramesInF32[((inTimeInt4[3] + 1) * 2) + 1]; - - a[0] = inTimeFrac4[0] * invSampleRateOut; - a[1] = inTimeFrac4[0] * invSampleRateOut; - a[2] = inTimeFrac4[1] * invSampleRateOut; - a[3] = inTimeFrac4[1] * invSampleRateOut; - a[4] = inTimeFrac4[2] * invSampleRateOut; - a[5] = inTimeFrac4[2] * invSampleRateOut; - a[6] = inTimeFrac4[3] * invSampleRateOut; - a[7] = inTimeFrac4[3] * invSampleRateOut; - - d[0] = y[0] - x[0]; - d[1] = y[1] - x[1]; - d[2] = y[2] - x[2]; - d[3] = y[3] - x[3]; - d[4] = y[4] - x[4]; - d[5] = y[5] - x[5]; - d[6] = y[6] - x[6]; - d[7] = y[7] - x[7]; - - n[0] = d[0] * a[0]; - n[1] = d[1] * a[1]; - n[2] = d[2] * a[2]; - n[3] = d[3] * a[3]; - n[4] = d[4] * a[4]; - n[5] = d[5] * a[5]; - n[6] = d[6] * a[6]; - n[7] = d[7] * a[7]; - - r[0] = x[0] + n[0]; - r[1] = x[1] + n[1]; - r[2] = x[2] + n[2]; - r[3] = x[3] + n[3]; - r[4] = x[4] + n[4]; - r[5] = x[5] + n[5]; - r[6] = x[6] + n[6]; - r[7] = x[7] + n[7]; - - ma_linear_resampler_filter_f32_4_stereo(pLPF, lpfCount, r); - - pFramesOutF32[0] = r[0]; - pFramesOutF32[1] = r[1]; - pFramesOutF32[2] = r[2]; - pFramesOutF32[3] = r[3]; - pFramesOutF32[4] = r[4]; - pFramesOutF32[5] = r[5]; - pFramesOutF32[6] = r[6]; - pFramesOutF32[7] = r[7]; - } else { - for (c = 0; c < channels; c += 1) { - float x[4]; - float y[4]; - float a[4]; - float d[4]; - float n[4]; - float r[4]; - - x[0] = pFramesInF32[((inTimeInt4[0] + 0) * channels) + c]; - x[1] = pFramesInF32[((inTimeInt4[1] + 0) * channels) + c]; - x[2] = pFramesInF32[((inTimeInt4[2] + 0) * channels) + c]; - x[3] = pFramesInF32[((inTimeInt4[3] + 0) * channels) + c]; - - y[0] = pFramesInF32[((inTimeInt4[0] + 1) * channels) + c]; - y[1] = pFramesInF32[((inTimeInt4[1] + 1) * channels) + c]; - y[2] = pFramesInF32[((inTimeInt4[2] + 1) * channels) + c]; - y[3] = pFramesInF32[((inTimeInt4[3] + 1) * channels) + c]; - - a[0] = inTimeFrac4[0] * invSampleRateOut; - a[1] = inTimeFrac4[1] * invSampleRateOut; - a[2] = inTimeFrac4[2] * invSampleRateOut; - a[3] = inTimeFrac4[3] * invSampleRateOut; - - d[0] = y[0] - x[0]; - d[1] = y[1] - x[1]; - d[2] = y[2] - x[2]; - d[3] = y[3] - x[3]; - - n[0] = d[0] * a[0]; - n[1] = d[1] * a[1]; - n[2] = d[2] * a[2]; - n[3] = d[3] * a[3]; - - r[0] = x[0] + n[0]; - r[1] = x[1] + n[1]; - r[2] = x[2] + n[2]; - r[3] = x[3] + n[3]; - - pFramesOutF32[(0 * channels) + c] = r[0]; - pFramesOutF32[(1 * channels) + c] = r[1]; - pFramesOutF32[(2 * channels) + c] = r[2]; - pFramesOutF32[(3 * channels) + c] = r[3]; - } - - ma_linear_resampler_filter_f32_4(pLPF, lpfCount, channels, pFramesOutF32); + pFramesOutF32[(0 * channels) + c] = r[0]; + pFramesOutF32[(1 * channels) + c] = r[1]; + pFramesOutF32[(2 * channels) + c] = r[2]; + pFramesOutF32[(3 * channels) + c] = r[3]; } + ma_linear_resampler_filter_f32_4(pLPF, lpfCount, channels, pFramesOutF32); + pFramesOutF32 += 4 * channels; framesProcessedOut += 4; }