From cd02ebe39c10666209bd2e3e78e567c8b3191688 Mon Sep 17 00:00:00 2001 From: David Reid Date: Sun, 8 Feb 2026 20:12:57 +1000 Subject: [PATCH] Resampler: A loop unrolling optimization experiment. My idea here is to make it easier for the compiler to SIMD-ify some of the interpolation code. I have not confirmed that it is actually being SIMD-ified by the compiler, but it is still significantly faster in both debug and release builds. There is a mono specialization here which further improves performance. I have not yet experimented with a stereo specialization, but if it works it'll be added in a future commit. This applies only to the f32 no-LPF code path. Other paths will come later once I'm done with this round of experiments. --- miniaudio.h | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/miniaudio.h b/miniaudio.h index ba071913..0c0f18b7 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -59387,6 +59387,136 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ /* The rate must have changed between calls. Ignore the cached frame. */ } + /* Experimental loop unrolling for some SIMD experiments. */ + #if 1 + { + ma_uint32 channels = pResampler->channels; + + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt[4]; + ma_uint32 inTimeFrac[4]; + int i; + + inTimeIntTemp = pResampler->inTimeInt; + inTimeFracTemp = pResampler->inTimeFrac; + + for (i = 0; i < 4; i += 1) { + inTimeInt[i] = inTimeIntTemp; + inTimeFrac[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. 
*/ + if (inTimeInt[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + pResampler->inTimeInt = inTimeIntTemp; + pResampler->inTimeFrac = inTimeFracTemp; + + /* We should now be able to SIMD-ify the rest. */ + { + /* TODO: Experiment with a stereo specialization. */ + if (channels == 1) { + float x[4]; + float y[4]; + float a[4]; + float d[4]; + float n[4]; + float r[4]; + + x[0] = pFramesInF32[inTimeInt[0] + 0]; + y[0] = pFramesInF32[inTimeInt[0] + 1]; + a[0] = inTimeFrac[0] * invSampleRateOut; + d[0] = y[0] - x[0]; + n[0] = d[0] * a[0]; + r[0] = x[0] + n[0]; + + x[1] = pFramesInF32[inTimeInt[1] + 0]; + y[1] = pFramesInF32[inTimeInt[1] + 1]; + a[1] = inTimeFrac[1] * invSampleRateOut; + d[1] = y[1] - x[1]; + n[1] = d[1] * a[1]; + r[1] = x[1] + n[1]; + + x[2] = pFramesInF32[inTimeInt[2] + 0]; + y[2] = pFramesInF32[inTimeInt[2] + 1]; + a[2] = inTimeFrac[2] * invSampleRateOut; + d[2] = y[2] - x[2]; + n[2] = d[2] * a[2]; + r[2] = x[2] + n[2]; + + x[3] = pFramesInF32[inTimeInt[3] + 0]; + y[3] = pFramesInF32[inTimeInt[3] + 1]; + a[3] = inTimeFrac[3] * invSampleRateOut; + d[3] = y[3] - x[3]; + n[3] = d[3] * a[3]; + r[3] = x[3] + n[3]; + + pFramesOutF32[0] = r[0]; + pFramesOutF32[1] = r[1]; + pFramesOutF32[2] = r[2]; + pFramesOutF32[3] = r[3]; + } else { + for (c = 0; c < channels; c += 1) { + float x[4]; + float y[4]; + float a[4]; + float d[4]; + float n[4]; + float r[4]; + + x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c]; + y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c]; + a[0] = inTimeFrac[0] * invSampleRateOut; + d[0] = y[0] - x[0]; + n[0] = d[0] * a[0]; + r[0] = x[0] + n[0]; + + x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c]; + y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c]; + a[1] = inTimeFrac[1] * invSampleRateOut; + d[1] = y[1] - x[1]; + n[1] = d[1] * a[1]; + r[1] = x[1] + n[1]; + + x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c]; + y[2] = pFramesInF32[((inTimeInt[2] + 
1) * channels) + c]; + a[2] = inTimeFrac[2] * invSampleRateOut; + d[2] = y[2] - x[2]; + n[2] = d[2] * a[2]; + r[2] = x[2] + n[2]; + + x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c]; + y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c]; + a[3] = inTimeFrac[3] * invSampleRateOut; + d[3] = y[3] - x[3]; + n[3] = d[3] * a[3]; + r[3] = x[3] + n[3]; + + pFramesOutF32[(0 * channels) + c] = r[0]; + pFramesOutF32[(1 * channels) + c] = r[1]; + pFramesOutF32[(2 * channels) + c] = r[2]; + pFramesOutF32[(3 * channels) + c] = r[3]; + } + } + + pFramesOutF32 += 4 * channels; + framesProcessedOut += 4; + } + } + } + #endif + while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) { if (pResampler->inTimeInt + 1 < frameCountIn) { float a = pResampler->inTimeFrac * invSampleRateOut;