From 172f8beae6beaac8bab4d3fce8164b1c4c2cd2fa Mon Sep 17 00:00:00 2001 From: David Reid Date: Sun, 8 Feb 2026 22:01:23 +1000 Subject: [PATCH] Resampler: Optimization for floating point stereo. This applies only to the f32 no-LPF code path. Other code paths will be integrated later. --- miniaudio.h | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/miniaudio.h b/miniaudio.h index 0c0f18b7..cd24f932 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -59387,7 +59387,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ /* The rate must have changed between calls. Ignore the cached frame. */ } - /* Experimental loop unrolling for some SIMD experiments. */ + /* Experimental loop unrolling to make it easier for SIMD-ification. */ #if 1 { ma_uint32 channels = pResampler->channels; @@ -59423,9 +59423,8 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ pResampler->inTimeInt = inTimeIntTemp; pResampler->inTimeFrac = inTimeFracTemp; - /* We should now be able to SIMD-ify the rest. */ + /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */ { - /* TODO: Experiment with a stereo specialization. */ if (channels == 1) { float x[4]; float y[4]; @@ -59466,6 +59465,86 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ pFramesOutF32[1] = r[1]; pFramesOutF32[2] = r[2]; pFramesOutF32[3] = r[3]; + } else if (channels == 2) { + float x[8]; + float y[8]; + float a[8]; + float d[8]; + float n[8]; + float r[8]; + + /* Frame 0, Channel 0 */ + x[0] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 0]; + y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0]; + a[0] = inTimeFrac[0] * invSampleRateOut; + d[0] = y[0] - x[0]; + n[0] = d[0] * a[0]; + r[0] = x[0] + n[0]; + + /* Frame 0, Channel 1 */ + x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1]; + y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1]; + a[1] = inTimeFrac[0] * invSampleRateOut; + d[1] = y[1] - x[1]; + n[1] = d[1] * a[1]; + r[1] = x[1] + n[1]; + + /* Frame 1, Channel 0 */ + x[2] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 0]; + y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0]; + a[2] = inTimeFrac[1] * invSampleRateOut; + d[2] = y[2] - x[2]; + n[2] = d[2] * a[2]; + r[2] = x[2] + n[2]; + + /* Frame 1, Channel 1 */ + x[3] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 1]; + y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1]; + a[3] = inTimeFrac[1] * invSampleRateOut; + d[3] = y[3] - x[3]; + n[3] = d[3] * a[3]; + r[3] = x[3] + n[3]; + + /* Frame 2, Channel 0 */ + x[4] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 0]; + y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0]; + a[4] = inTimeFrac[2] * invSampleRateOut; + d[4] = y[4] - x[4]; + n[4] = d[4] * a[4]; + r[4] = x[4] + n[4]; + + /* Frame 2, Channel 1 */ + x[5] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 1]; + y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1]; + a[5] = inTimeFrac[2] * invSampleRateOut; + d[5] = y[5] - x[5]; + n[5] = d[5] * a[5]; + r[5] = x[5] + n[5]; + + /* Frame 3, Channel 0 */ + x[6] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 0]; + y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0]; + a[6] = inTimeFrac[3] * invSampleRateOut; + d[6] = y[6] - x[6]; + n[6] = d[6] * a[6]; + r[6] = x[6] + n[6]; + + /* Frame 3, Channel 1 */ + x[7] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 1]; + y[7] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 1]; + a[7] = inTimeFrac[3] * invSampleRateOut; + d[7] = y[7] - x[7]; + n[7] = d[7] * a[7]; + r[7] = x[7] + n[7]; + + pFramesOutF32[0] = r[0]; + pFramesOutF32[1] = r[1]; + pFramesOutF32[2] = r[2]; + pFramesOutF32[3] = r[3]; + pFramesOutF32[4] = r[4]; + pFramesOutF32[5] = r[5]; + pFramesOutF32[6] = r[6]; + pFramesOutF32[7] = r[7]; } else { for (c = 0; c < channels; c += 1) { float x[4];