Resampler: Optimization to the LPF > 0 path.

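This moves the channel count checks outside of the unrolled 4-frame loop, so the mono, stereo and generic multi-channel cases each run their own loop instead of re-testing the channel count on every iteration. The change is applied to both the s16 and f32 upsample paths.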
This commit is contained in:
David Reid
2026-02-14 14:06:19 +10:00
parent 5ae52e1a0a
commit e2e6bb6334
+152 -22
View File
@@ -60517,11 +60517,18 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
/* Experimental loop unrolling to make it easier for SIMD-ification. */ /* Experimental loop unrolling to make it easier for SIMD-ification. */
#if 1 #if 1
{ {
if (channels == 1) {
while (framesProcessedOut + 4 <= frameCountOut) { while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp; ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp; ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt4[4]; ma_uint32 inTimeInt4[4];
ma_uint32 inTimeFrac4[4]; ma_uint32 inTimeFrac4[4];
ma_int32 x[4];
ma_int32 y[4];
ma_int32 a[4];
ma_int32 d[4];
ma_int32 n[4];
ma_int32 r[4];
int i; int i;
inTimeIntTemp = inTimeInt; inTimeIntTemp = inTimeInt;
@@ -60548,16 +60555,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
inTimeInt = inTimeIntTemp; inTimeInt = inTimeIntTemp;
inTimeFrac = inTimeFracTemp; inTimeFrac = inTimeFracTemp;
/* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
{
if (channels == 1) {
ma_int32 x[4];
ma_int32 y[4];
ma_int32 a[4];
ma_int32 d[4];
ma_int32 n[4];
ma_int32 r[4];
x[0] = pFramesInS16[inTimeInt4[0] + 0]; x[0] = pFramesInS16[inTimeInt4[0] + 0];
x[1] = pFramesInS16[inTimeInt4[1] + 0]; x[1] = pFramesInS16[inTimeInt4[1] + 0];
x[2] = pFramesInS16[inTimeInt4[2] + 0]; x[2] = pFramesInS16[inTimeInt4[2] + 0];
@@ -60594,13 +60591,47 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
pFramesOutS16[1] = (ma_int16)r[1]; pFramesOutS16[1] = (ma_int16)r[1];
pFramesOutS16[2] = (ma_int16)r[2]; pFramesOutS16[2] = (ma_int16)r[2];
pFramesOutS16[3] = (ma_int16)r[3]; pFramesOutS16[3] = (ma_int16)r[3];
pFramesOutS16 += 4;
framesProcessedOut += 4;
}
} else if (channels == 2) { } else if (channels == 2) {
while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt4[4];
ma_uint32 inTimeFrac4[4];
ma_int32 x[8]; ma_int32 x[8];
ma_int32 y[8]; ma_int32 y[8];
ma_int32 a[8]; ma_int32 a[8];
ma_int32 d[8]; ma_int32 d[8];
ma_int32 n[8]; ma_int32 n[8];
ma_int32 r[8]; ma_int32 r[8];
int i;
inTimeIntTemp = inTimeInt;
inTimeFracTemp = inTimeFrac;
for (i = 0; i < 4; i += 1) {
inTimeInt4[i] = inTimeIntTemp;
inTimeFrac4[i] = inTimeFracTemp;
inTimeIntTemp += pResampler->inAdvanceInt;
inTimeFracTemp += pResampler->inAdvanceFrac;
if (inTimeFracTemp >= pResampler->sampleRateOut) {
inTimeFracTemp -= pResampler->sampleRateOut;
inTimeIntTemp += 1;
}
}
/* Check that we have one extra sample at the end for doing the interpolation. */
if (inTimeInt4[3] + 1 >= frameCountIn) {
break; /* Not enough input frames. */
}
/* Advance the timer. */
inTimeInt = inTimeIntTemp;
inTimeFrac = inTimeFracTemp;
x[0] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 0]; x[0] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 0];
x[1] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 1]; x[1] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 1];
@@ -60666,7 +60697,42 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
pFramesOutS16[5] = (ma_int16)r[5]; pFramesOutS16[5] = (ma_int16)r[5];
pFramesOutS16[6] = (ma_int16)r[6]; pFramesOutS16[6] = (ma_int16)r[6];
pFramesOutS16[7] = (ma_int16)r[7]; pFramesOutS16[7] = (ma_int16)r[7];
pFramesOutS16 += 8;
framesProcessedOut += 4;
}
} else { } else {
while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt4[4];
ma_uint32 inTimeFrac4[4];
int i;
inTimeIntTemp = inTimeInt;
inTimeFracTemp = inTimeFrac;
for (i = 0; i < 4; i += 1) {
inTimeInt4[i] = inTimeIntTemp;
inTimeFrac4[i] = inTimeFracTemp;
inTimeIntTemp += pResampler->inAdvanceInt;
inTimeFracTemp += pResampler->inAdvanceFrac;
if (inTimeFracTemp >= pResampler->sampleRateOut) {
inTimeFracTemp -= pResampler->sampleRateOut;
inTimeIntTemp += 1;
}
}
/* Check that we have one extra sample at the end for doing the interpolation. */
if (inTimeInt4[3] + 1 >= frameCountIn) {
break; /* Not enough input frames. */
}
/* Advance the timer. */
inTimeInt = inTimeIntTemp;
inTimeFrac = inTimeFracTemp;
for (c = 0; c < channels; c += 1) { for (c = 0; c < channels; c += 1) {
ma_int32 x[4]; ma_int32 x[4];
ma_int32 y[4]; ma_int32 y[4];
@@ -60712,7 +60778,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
} }
ma_linear_resampler_filter_s16_4(pLPF, lpfCount, channels, pFramesOutS16); ma_linear_resampler_filter_s16_4(pLPF, lpfCount, channels, pFramesOutS16);
}
pFramesOutS16 += 4 * channels; pFramesOutS16 += 4 * channels;
framesProcessedOut += 4; framesProcessedOut += 4;
@@ -60961,11 +61026,18 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
/* Experimental loop unrolling to make it easier for SIMD-ification. */ /* Experimental loop unrolling to make it easier for SIMD-ification. */
#if 1 #if 1
{ {
if (channels == 1) {
while (framesProcessedOut + 4 <= frameCountOut) { while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp; ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp; ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt4[4]; ma_uint32 inTimeInt4[4];
ma_uint32 inTimeFrac4[4]; ma_uint32 inTimeFrac4[4];
float x[4];
float y[4];
float a[4];
float d[4];
float n[4];
float r[4];
int i; int i;
inTimeIntTemp = inTimeInt; inTimeIntTemp = inTimeInt;
@@ -60992,16 +61064,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
inTimeInt = inTimeIntTemp; inTimeInt = inTimeIntTemp;
inTimeFrac = inTimeFracTemp; inTimeFrac = inTimeFracTemp;
/* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
{
if (channels == 1) {
float x[4];
float y[4];
float a[4];
float d[4];
float n[4];
float r[4];
x[0] = pFramesInF32[inTimeInt4[0] + 0]; x[0] = pFramesInF32[inTimeInt4[0] + 0];
x[1] = pFramesInF32[inTimeInt4[1] + 0]; x[1] = pFramesInF32[inTimeInt4[1] + 0];
x[2] = pFramesInF32[inTimeInt4[2] + 0]; x[2] = pFramesInF32[inTimeInt4[2] + 0];
@@ -61038,13 +61100,47 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
pFramesOutF32[1] = r[1]; pFramesOutF32[1] = r[1];
pFramesOutF32[2] = r[2]; pFramesOutF32[2] = r[2];
pFramesOutF32[3] = r[3]; pFramesOutF32[3] = r[3];
pFramesOutF32 += 4;
framesProcessedOut += 4;
}
} else if (channels == 2) { } else if (channels == 2) {
while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt4[4];
ma_uint32 inTimeFrac4[4];
float x[8]; float x[8];
float y[8]; float y[8];
float a[8]; float a[8];
float d[8]; float d[8];
float n[8]; float n[8];
float r[8]; float r[8];
int i;
inTimeIntTemp = inTimeInt;
inTimeFracTemp = inTimeFrac;
for (i = 0; i < 4; i += 1) {
inTimeInt4[i] = inTimeIntTemp;
inTimeFrac4[i] = inTimeFracTemp;
inTimeIntTemp += pResampler->inAdvanceInt;
inTimeFracTemp += pResampler->inAdvanceFrac;
if (inTimeFracTemp >= pResampler->sampleRateOut) {
inTimeFracTemp -= pResampler->sampleRateOut;
inTimeIntTemp += 1;
}
}
/* Check that we have one extra sample at the end for doing the interpolation. */
if (inTimeInt4[3] + 1 >= frameCountIn) {
break; /* Not enough input frames. */
}
/* Advance the timer. */
inTimeInt = inTimeIntTemp;
inTimeFrac = inTimeFracTemp;
x[0] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 0]; x[0] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 0];
x[1] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 1]; x[1] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 1];
@@ -61110,7 +61206,42 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
pFramesOutF32[5] = r[5]; pFramesOutF32[5] = r[5];
pFramesOutF32[6] = r[6]; pFramesOutF32[6] = r[6];
pFramesOutF32[7] = r[7]; pFramesOutF32[7] = r[7];
pFramesOutF32 += 8;
framesProcessedOut += 4;
}
} else { } else {
while (framesProcessedOut + 4 <= frameCountOut) {
ma_uint32 inTimeIntTemp;
ma_uint32 inTimeFracTemp;
ma_uint32 inTimeInt4[4];
ma_uint32 inTimeFrac4[4];
int i;
inTimeIntTemp = inTimeInt;
inTimeFracTemp = inTimeFrac;
for (i = 0; i < 4; i += 1) {
inTimeInt4[i] = inTimeIntTemp;
inTimeFrac4[i] = inTimeFracTemp;
inTimeIntTemp += pResampler->inAdvanceInt;
inTimeFracTemp += pResampler->inAdvanceFrac;
if (inTimeFracTemp >= pResampler->sampleRateOut) {
inTimeFracTemp -= pResampler->sampleRateOut;
inTimeIntTemp += 1;
}
}
/* Check that we have one extra sample at the end for doing the interpolation. */
if (inTimeInt4[3] + 1 >= frameCountIn) {
break; /* Not enough input frames. */
}
/* Advance the timer. */
inTimeInt = inTimeIntTemp;
inTimeFrac = inTimeFracTemp;
for (c = 0; c < channels; c += 1) { for (c = 0; c < channels; c += 1) {
float x[4]; float x[4];
float y[4]; float y[4];
@@ -61156,7 +61287,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
} }
ma_linear_resampler_filter_f32_4(pLPF, lpfCount, channels, pFramesOutF32); ma_linear_resampler_filter_f32_4(pLPF, lpfCount, channels, pFramesOutF32);
}
pFramesOutF32 += 4 * channels; pFramesOutF32 += 4 * channels;
framesProcessedOut += 4; framesProcessedOut += 4;