Resampler: Optimization for floating point stereo.

This applies only to the f32 no-LPF code path. Other code paths will be
integrated later.
This commit is contained in:
David Reid
2026-02-08 22:01:23 +10:00
parent cd02ebe39c
commit 172f8beae6
+82 -3
View File
@@ -59387,7 +59387,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
/* The rate must have changed between calls. Ignore the cached frame. */ /* The rate must have changed between calls. Ignore the cached frame. */
} }
/* Experimental loop unrolling for some SIMD experiments. */ /* Experimental loop unrolling to make it easier for SIMD-ification. */
#if 1 #if 1
{ {
ma_uint32 channels = pResampler->channels; ma_uint32 channels = pResampler->channels;
@@ -59423,9 +59423,8 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
pResampler->inTimeInt = inTimeIntTemp; pResampler->inTimeInt = inTimeIntTemp;
pResampler->inTimeFrac = inTimeFracTemp; pResampler->inTimeFrac = inTimeFracTemp;
/* We should now be able to SIMD-ify the rest. */ /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
{ {
/* TODO: Experiment with a stereo specialization. */
if (channels == 1) { if (channels == 1) {
float x[4]; float x[4];
float y[4]; float y[4];
@@ -59466,6 +59465,86 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
pFramesOutF32[1] = r[1]; pFramesOutF32[1] = r[1];
pFramesOutF32[2] = r[2]; pFramesOutF32[2] = r[2];
pFramesOutF32[3] = r[3]; pFramesOutF32[3] = r[3];
} else if (channels == 2) {
float x[8];
float y[8];
float a[8];
float d[8];
float n[8];
float r[8];
/* Frame 0, Channel 0 */
x[0] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 0];
y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0];
a[0] = inTimeFrac[0] * invSampleRateOut;
d[0] = y[0] - x[0];
n[0] = d[0] * a[0];
r[0] = x[0] + n[0];
/* Frame 0, Channel 1 */
x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1];
y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1];
a[1] = inTimeFrac[0] * invSampleRateOut;
d[1] = y[1] - x[1];
n[1] = d[1] * a[1];
r[1] = x[1] + n[1];
/* Frame 1, Channel 0 */
x[2] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 0];
y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0];
a[2] = inTimeFrac[1] * invSampleRateOut;
d[2] = y[2] - x[2];
n[2] = d[2] * a[2];
r[2] = x[2] + n[2];
/* Frame 1, Channel 1 */
x[3] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 1];
y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1];
a[3] = inTimeFrac[1] * invSampleRateOut;
d[3] = y[3] - x[3];
n[3] = d[3] * a[3];
r[3] = x[3] + n[3];
/* Frame 2, Channel 0 */
x[4] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 0];
y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0];
a[4] = inTimeFrac[2] * invSampleRateOut;
d[4] = y[4] - x[4];
n[4] = d[4] * a[4];
r[4] = x[4] + n[4];
/* Frame 2, Channel 1 */
x[5] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 1];
y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1];
a[5] = inTimeFrac[2] * invSampleRateOut;
d[5] = y[5] - x[5];
n[5] = d[5] * a[5];
r[5] = x[5] + n[5];
/* Frame 3, Channel 0 */
x[6] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 0];
y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0];
a[6] = inTimeFrac[3] * invSampleRateOut;
d[6] = y[6] - x[6];
n[6] = d[6] * a[6];
r[6] = x[6] + n[6];
/* Frame 3, Channel 1 */
x[7] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 1];
y[7] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 1];
a[7] = inTimeFrac[3] * invSampleRateOut;
d[7] = y[7] - x[7];
n[7] = d[7] * a[7];
r[7] = x[7] + n[7];
pFramesOutF32[0] = r[0];
pFramesOutF32[1] = r[1];
pFramesOutF32[2] = r[2];
pFramesOutF32[3] = r[3];
pFramesOutF32[4] = r[4];
pFramesOutF32[5] = r[5];
pFramesOutF32[6] = r[6];
pFramesOutF32[7] = r[7];
} else { } else {
for (c = 0; c < channels; c += 1) { for (c = 0; c < channels; c += 1) {
float x[4]; float x[4];