Resampler: A loop unrolling optimization experiment.

My idea here is to make it easier for the compiler to SIMD-ify some
of the interpolation code. I have not confirmed that it is actually
being SIMD-ified by the compiler, but it is still significantly faster
in both debug and release builds.

There is a mono specialization here which further improves performance.
I have not yet experimented with a stereo specialization, but if it
works it'll be added in a future commit.

This applies only to the f32 no-LPF code path. Other paths will come
later once I'm done with this round of experiments.
This commit is contained in:
David Reid
2026-02-08 20:12:57 +10:00
parent 20180b0ae5
commit cd02ebe39c
+130
View File
@@ -59387,6 +59387,136 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
/* The rate must have changed between calls. Ignore the cached frame. */ /* The rate must have changed between calls. Ignore the cached frame. */
} }
/* Experimental loop unrolling for some SIMD experiments. */
#if 1
{
ma_uint32 channels = pResampler->channels;
while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt[4];
ma_uint32 inTimeFrac[4];
int i;
inTimeIntTemp = pResampler->inTimeInt;
inTimeFracTemp = pResampler->inTimeFrac;
for (i = 0; i < 4; i += 1) {
inTimeInt[i] = inTimeIntTemp;
inTimeFrac[i] = inTimeFracTemp;
inTimeIntTemp += pResampler->inAdvanceInt;
inTimeFracTemp += pResampler->inAdvanceFrac;
if (inTimeFracTemp >= pResampler->sampleRateOut) {
inTimeFracTemp -= pResampler->sampleRateOut;
inTimeIntTemp += 1;
}
}
/* Check that we have one extra sample at the end for doing the interpolation. */
if (inTimeInt[3] + 1 >= frameCountIn) {
break; /* Not enough input frames. */
}
/* Advance the timer. */
pResampler->inTimeInt = inTimeIntTemp;
pResampler->inTimeFrac = inTimeFracTemp;
/* We should now be able to SIMD-ify the rest. */
{
/* TODO: Experiment with a stereo specialization. */
if (channels == 1) {
float x[4];
float y[4];
float a[4];
float d[4];
float n[4];
float r[4];
x[0] = pFramesInF32[inTimeInt[0] + 0];
y[0] = pFramesInF32[inTimeInt[0] + 1];
a[0] = inTimeFrac[0] * invSampleRateOut;
d[0] = y[0] - x[0];
n[0] = d[0] * a[0];
r[0] = x[0] + n[0];
x[1] = pFramesInF32[inTimeInt[1] + 0];
y[1] = pFramesInF32[inTimeInt[1] + 1];
a[1] = inTimeFrac[1] * invSampleRateOut;
d[1] = y[1] - x[1];
n[1] = d[1] * a[1];
r[1] = x[1] + n[1];
x[2] = pFramesInF32[inTimeInt[2] + 0];
y[2] = pFramesInF32[inTimeInt[2] + 1];
a[2] = inTimeFrac[2] * invSampleRateOut;
d[2] = y[2] - x[2];
n[2] = d[2] * a[2];
r[2] = x[2] + n[2];
x[3] = pFramesInF32[inTimeInt[3] + 0];
y[3] = pFramesInF32[inTimeInt[3] + 1];
a[3] = inTimeFrac[3] * invSampleRateOut;
d[3] = y[3] - x[3];
n[3] = d[3] * a[3];
r[3] = x[3] + n[3];
pFramesOutF32[0] = r[0];
pFramesOutF32[1] = r[1];
pFramesOutF32[2] = r[2];
pFramesOutF32[3] = r[3];
} else {
for (c = 0; c < channels; c += 1) {
float x[4];
float y[4];
float a[4];
float d[4];
float n[4];
float r[4];
x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c];
y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c];
a[0] = inTimeFrac[0] * invSampleRateOut;
d[0] = y[0] - x[0];
n[0] = d[0] * a[0];
r[0] = x[0] + n[0];
x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c];
y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c];
a[1] = inTimeFrac[1] * invSampleRateOut;
d[1] = y[1] - x[1];
n[1] = d[1] * a[1];
r[1] = x[1] + n[1];
x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c];
y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c];
a[2] = inTimeFrac[2] * invSampleRateOut;
d[2] = y[2] - x[2];
n[2] = d[2] * a[2];
r[2] = x[2] + n[2];
x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c];
y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c];
a[3] = inTimeFrac[3] * invSampleRateOut;
d[3] = y[3] - x[3];
n[3] = d[3] * a[3];
r[3] = x[3] + n[3];
pFramesOutF32[(0 * channels) + c] = r[0];
pFramesOutF32[(1 * channels) + c] = r[1];
pFramesOutF32[(2 * channels) + c] = r[2];
pFramesOutF32[(3 * channels) + c] = r[3];
}
}
pFramesOutF32 += 4 * channels;
framesProcessedOut += 4;
}
}
}
#endif
while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) { while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
if (pResampler->inTimeInt + 1 < frameCountIn) { if (pResampler->inTimeInt + 1 < frameCountIn) {
float a = pResampler->inTimeFrac * invSampleRateOut; float a = pResampler->inTimeFrac * invSampleRateOut;