mirror of
https://github.com/mackron/miniaudio.git
synced 2026-04-21 15:56:58 +02:00
Resampler: A loop unrolling optimization experiment.
My idea here is to make it easier for the compiler to SIMD-ify some of the interpolation code. I have not confirmed that it is actually being SIMD-ified by the compiler, but it is still significantly faster in both debug and release builds. There is a mono specialization here which further improves performance. I have not yet experimented with a stereo specialization, but if it works it'll be added in a future commit. This applies only to the f32 no-LPF code path. Other paths will come later once I'm done with this round of experiments.
This commit is contained in:
+130
@@ -59387,6 +59387,136 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
|
|||||||
/* The rate must have changed between calls. Ignore the cached frame. */
|
/* The rate must have changed between calls. Ignore the cached frame. */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Experimental loop unrolling for some SIMD experiments. */
|
||||||
|
#if 1
|
||||||
|
{
|
||||||
|
ma_uint32 channels = pResampler->channels;
|
||||||
|
|
||||||
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
|
ma_uint32 inTimeIntTemp;
|
||||||
|
ma_uint32 inTimeFracTemp;
|
||||||
|
ma_uint32 inTimeInt[4];
|
||||||
|
ma_uint32 inTimeFrac[4];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
inTimeIntTemp = pResampler->inTimeInt;
|
||||||
|
inTimeFracTemp = pResampler->inTimeFrac;
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i += 1) {
|
||||||
|
inTimeInt[i] = inTimeIntTemp;
|
||||||
|
inTimeFrac[i] = inTimeFracTemp;
|
||||||
|
|
||||||
|
inTimeIntTemp += pResampler->inAdvanceInt;
|
||||||
|
inTimeFracTemp += pResampler->inAdvanceFrac;
|
||||||
|
if (inTimeFracTemp >= pResampler->sampleRateOut) {
|
||||||
|
inTimeFracTemp -= pResampler->sampleRateOut;
|
||||||
|
inTimeIntTemp += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that we have one extra sample at the end for doing the interpolation. */
|
||||||
|
if (inTimeInt[3] + 1 >= frameCountIn) {
|
||||||
|
break; /* Not enough input frames. */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Advance the timer. */
|
||||||
|
pResampler->inTimeInt = inTimeIntTemp;
|
||||||
|
pResampler->inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
|
/* We should now be able to SIMD-ify the rest. */
|
||||||
|
{
|
||||||
|
/* TODO: Experiment with a stereo specialization. */
|
||||||
|
if (channels == 1) {
|
||||||
|
float x[4];
|
||||||
|
float y[4];
|
||||||
|
float a[4];
|
||||||
|
float d[4];
|
||||||
|
float n[4];
|
||||||
|
float r[4];
|
||||||
|
|
||||||
|
x[0] = pFramesInF32[inTimeInt[0] + 0];
|
||||||
|
y[0] = pFramesInF32[inTimeInt[0] + 1];
|
||||||
|
a[0] = inTimeFrac[0] * invSampleRateOut;
|
||||||
|
d[0] = y[0] - x[0];
|
||||||
|
n[0] = d[0] * a[0];
|
||||||
|
r[0] = x[0] + n[0];
|
||||||
|
|
||||||
|
x[1] = pFramesInF32[inTimeInt[1] + 0];
|
||||||
|
y[1] = pFramesInF32[inTimeInt[1] + 1];
|
||||||
|
a[1] = inTimeFrac[1] * invSampleRateOut;
|
||||||
|
d[1] = y[1] - x[1];
|
||||||
|
n[1] = d[1] * a[1];
|
||||||
|
r[1] = x[1] + n[1];
|
||||||
|
|
||||||
|
x[2] = pFramesInF32[inTimeInt[2] + 0];
|
||||||
|
y[2] = pFramesInF32[inTimeInt[2] + 1];
|
||||||
|
a[2] = inTimeFrac[2] * invSampleRateOut;
|
||||||
|
d[2] = y[2] - x[2];
|
||||||
|
n[2] = d[2] * a[2];
|
||||||
|
r[2] = x[2] + n[2];
|
||||||
|
|
||||||
|
x[3] = pFramesInF32[inTimeInt[3] + 0];
|
||||||
|
y[3] = pFramesInF32[inTimeInt[3] + 1];
|
||||||
|
a[3] = inTimeFrac[3] * invSampleRateOut;
|
||||||
|
d[3] = y[3] - x[3];
|
||||||
|
n[3] = d[3] * a[3];
|
||||||
|
r[3] = x[3] + n[3];
|
||||||
|
|
||||||
|
pFramesOutF32[0] = r[0];
|
||||||
|
pFramesOutF32[1] = r[1];
|
||||||
|
pFramesOutF32[2] = r[2];
|
||||||
|
pFramesOutF32[3] = r[3];
|
||||||
|
} else {
|
||||||
|
for (c = 0; c < channels; c += 1) {
|
||||||
|
float x[4];
|
||||||
|
float y[4];
|
||||||
|
float a[4];
|
||||||
|
float d[4];
|
||||||
|
float n[4];
|
||||||
|
float r[4];
|
||||||
|
|
||||||
|
x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c];
|
||||||
|
y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c];
|
||||||
|
a[0] = inTimeFrac[0] * invSampleRateOut;
|
||||||
|
d[0] = y[0] - x[0];
|
||||||
|
n[0] = d[0] * a[0];
|
||||||
|
r[0] = x[0] + n[0];
|
||||||
|
|
||||||
|
x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c];
|
||||||
|
y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c];
|
||||||
|
a[1] = inTimeFrac[1] * invSampleRateOut;
|
||||||
|
d[1] = y[1] - x[1];
|
||||||
|
n[1] = d[1] * a[1];
|
||||||
|
r[1] = x[1] + n[1];
|
||||||
|
|
||||||
|
x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c];
|
||||||
|
y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c];
|
||||||
|
a[2] = inTimeFrac[2] * invSampleRateOut;
|
||||||
|
d[2] = y[2] - x[2];
|
||||||
|
n[2] = d[2] * a[2];
|
||||||
|
r[2] = x[2] + n[2];
|
||||||
|
|
||||||
|
x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c];
|
||||||
|
y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c];
|
||||||
|
a[3] = inTimeFrac[3] * invSampleRateOut;
|
||||||
|
d[3] = y[3] - x[3];
|
||||||
|
n[3] = d[3] * a[3];
|
||||||
|
r[3] = x[3] + n[3];
|
||||||
|
|
||||||
|
pFramesOutF32[(0 * channels) + c] = r[0];
|
||||||
|
pFramesOutF32[(1 * channels) + c] = r[1];
|
||||||
|
pFramesOutF32[(2 * channels) + c] = r[2];
|
||||||
|
pFramesOutF32[(3 * channels) + c] = r[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pFramesOutF32 += 4 * channels;
|
||||||
|
framesProcessedOut += 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
|
while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
|
||||||
if (pResampler->inTimeInt + 1 < frameCountIn) {
|
if (pResampler->inTimeInt + 1 < frameCountIn) {
|
||||||
float a = pResampler->inTimeFrac * invSampleRateOut;
|
float a = pResampler->inTimeFrac * invSampleRateOut;
|
||||||
|
|||||||
Reference in New Issue
Block a user