mirror of
https://github.com/mackron/miniaudio.git
synced 2026-04-21 15:56:58 +02:00
Resampler: Optimization to the LPF > 0 path.
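This moves the channel count checks outside of the unrolled 4-frame loop, so that the mono, stereo and generic channel paths each run their own loop instead of branching on the channel count every iteration.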
This commit is contained in:
+152
-22
@@ -60517,11 +60517,18 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
|
|||||||
/* Experimental loop unrolling to make it easier for SIMD-ification. */
|
/* Experimental loop unrolling to make it easier for SIMD-ification. */
|
||||||
#if 1
|
#if 1
|
||||||
{
|
{
|
||||||
|
if (channels == 1) {
|
||||||
while (framesProcessedOut + 4 <= frameCountOut) {
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
ma_uint32 inTimeIntTemp;
|
ma_uint32 inTimeIntTemp;
|
||||||
ma_uint32 inTimeFracTemp;
|
ma_uint32 inTimeFracTemp;
|
||||||
ma_uint32 inTimeInt4[4];
|
ma_uint32 inTimeInt4[4];
|
||||||
ma_uint32 inTimeFrac4[4];
|
ma_uint32 inTimeFrac4[4];
|
||||||
|
ma_int32 x[4];
|
||||||
|
ma_int32 y[4];
|
||||||
|
ma_int32 a[4];
|
||||||
|
ma_int32 d[4];
|
||||||
|
ma_int32 n[4];
|
||||||
|
ma_int32 r[4];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
inTimeIntTemp = inTimeInt;
|
inTimeIntTemp = inTimeInt;
|
||||||
@@ -60548,16 +60555,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
|
|||||||
inTimeInt = inTimeIntTemp;
|
inTimeInt = inTimeIntTemp;
|
||||||
inTimeFrac = inTimeFracTemp;
|
inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
/* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
|
|
||||||
{
|
|
||||||
if (channels == 1) {
|
|
||||||
ma_int32 x[4];
|
|
||||||
ma_int32 y[4];
|
|
||||||
ma_int32 a[4];
|
|
||||||
ma_int32 d[4];
|
|
||||||
ma_int32 n[4];
|
|
||||||
ma_int32 r[4];
|
|
||||||
|
|
||||||
x[0] = pFramesInS16[inTimeInt4[0] + 0];
|
x[0] = pFramesInS16[inTimeInt4[0] + 0];
|
||||||
x[1] = pFramesInS16[inTimeInt4[1] + 0];
|
x[1] = pFramesInS16[inTimeInt4[1] + 0];
|
||||||
x[2] = pFramesInS16[inTimeInt4[2] + 0];
|
x[2] = pFramesInS16[inTimeInt4[2] + 0];
|
||||||
@@ -60594,13 +60591,47 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
|
|||||||
pFramesOutS16[1] = (ma_int16)r[1];
|
pFramesOutS16[1] = (ma_int16)r[1];
|
||||||
pFramesOutS16[2] = (ma_int16)r[2];
|
pFramesOutS16[2] = (ma_int16)r[2];
|
||||||
pFramesOutS16[3] = (ma_int16)r[3];
|
pFramesOutS16[3] = (ma_int16)r[3];
|
||||||
|
|
||||||
|
pFramesOutS16 += 4;
|
||||||
|
framesProcessedOut += 4;
|
||||||
|
}
|
||||||
} else if (channels == 2) {
|
} else if (channels == 2) {
|
||||||
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
|
ma_uint32 inTimeIntTemp;
|
||||||
|
ma_uint32 inTimeFracTemp;
|
||||||
|
ma_uint32 inTimeInt4[4];
|
||||||
|
ma_uint32 inTimeFrac4[4];
|
||||||
ma_int32 x[8];
|
ma_int32 x[8];
|
||||||
ma_int32 y[8];
|
ma_int32 y[8];
|
||||||
ma_int32 a[8];
|
ma_int32 a[8];
|
||||||
ma_int32 d[8];
|
ma_int32 d[8];
|
||||||
ma_int32 n[8];
|
ma_int32 n[8];
|
||||||
ma_int32 r[8];
|
ma_int32 r[8];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
inTimeIntTemp = inTimeInt;
|
||||||
|
inTimeFracTemp = inTimeFrac;
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i += 1) {
|
||||||
|
inTimeInt4[i] = inTimeIntTemp;
|
||||||
|
inTimeFrac4[i] = inTimeFracTemp;
|
||||||
|
|
||||||
|
inTimeIntTemp += pResampler->inAdvanceInt;
|
||||||
|
inTimeFracTemp += pResampler->inAdvanceFrac;
|
||||||
|
if (inTimeFracTemp >= pResampler->sampleRateOut) {
|
||||||
|
inTimeFracTemp -= pResampler->sampleRateOut;
|
||||||
|
inTimeIntTemp += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that we have one extra sample at the end for doing the interpolation. */
|
||||||
|
if (inTimeInt4[3] + 1 >= frameCountIn) {
|
||||||
|
break; /* Not enough input frames. */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Advance the timer. */
|
||||||
|
inTimeInt = inTimeIntTemp;
|
||||||
|
inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
x[0] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 0];
|
x[0] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 0];
|
||||||
x[1] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 1];
|
x[1] = pFramesInS16[((inTimeInt4[0] + 0) * 2) + 1];
|
||||||
@@ -60666,7 +60697,42 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
|
|||||||
pFramesOutS16[5] = (ma_int16)r[5];
|
pFramesOutS16[5] = (ma_int16)r[5];
|
||||||
pFramesOutS16[6] = (ma_int16)r[6];
|
pFramesOutS16[6] = (ma_int16)r[6];
|
||||||
pFramesOutS16[7] = (ma_int16)r[7];
|
pFramesOutS16[7] = (ma_int16)r[7];
|
||||||
|
|
||||||
|
pFramesOutS16 += 8;
|
||||||
|
framesProcessedOut += 4;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
|
ma_uint32 inTimeIntTemp;
|
||||||
|
ma_uint32 inTimeFracTemp;
|
||||||
|
ma_uint32 inTimeInt4[4];
|
||||||
|
ma_uint32 inTimeFrac4[4];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
inTimeIntTemp = inTimeInt;
|
||||||
|
inTimeFracTemp = inTimeFrac;
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i += 1) {
|
||||||
|
inTimeInt4[i] = inTimeIntTemp;
|
||||||
|
inTimeFrac4[i] = inTimeFracTemp;
|
||||||
|
|
||||||
|
inTimeIntTemp += pResampler->inAdvanceInt;
|
||||||
|
inTimeFracTemp += pResampler->inAdvanceFrac;
|
||||||
|
if (inTimeFracTemp >= pResampler->sampleRateOut) {
|
||||||
|
inTimeFracTemp -= pResampler->sampleRateOut;
|
||||||
|
inTimeIntTemp += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that we have one extra sample at the end for doing the interpolation. */
|
||||||
|
if (inTimeInt4[3] + 1 >= frameCountIn) {
|
||||||
|
break; /* Not enough input frames. */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Advance the timer. */
|
||||||
|
inTimeInt = inTimeIntTemp;
|
||||||
|
inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
for (c = 0; c < channels; c += 1) {
|
for (c = 0; c < channels; c += 1) {
|
||||||
ma_int32 x[4];
|
ma_int32 x[4];
|
||||||
ma_int32 y[4];
|
ma_int32 y[4];
|
||||||
@@ -60712,7 +60778,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
|
|||||||
}
|
}
|
||||||
|
|
||||||
ma_linear_resampler_filter_s16_4(pLPF, lpfCount, channels, pFramesOutS16);
|
ma_linear_resampler_filter_s16_4(pLPF, lpfCount, channels, pFramesOutS16);
|
||||||
}
|
|
||||||
|
|
||||||
pFramesOutS16 += 4 * channels;
|
pFramesOutS16 += 4 * channels;
|
||||||
framesProcessedOut += 4;
|
framesProcessedOut += 4;
|
||||||
@@ -60961,11 +61026,18 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
|
|||||||
/* Experimental loop unrolling to make it easier for SIMD-ification. */
|
/* Experimental loop unrolling to make it easier for SIMD-ification. */
|
||||||
#if 1
|
#if 1
|
||||||
{
|
{
|
||||||
|
if (channels == 1) {
|
||||||
while (framesProcessedOut + 4 <= frameCountOut) {
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
ma_uint32 inTimeIntTemp;
|
ma_uint32 inTimeIntTemp;
|
||||||
ma_uint32 inTimeFracTemp;
|
ma_uint32 inTimeFracTemp;
|
||||||
ma_uint32 inTimeInt4[4];
|
ma_uint32 inTimeInt4[4];
|
||||||
ma_uint32 inTimeFrac4[4];
|
ma_uint32 inTimeFrac4[4];
|
||||||
|
float x[4];
|
||||||
|
float y[4];
|
||||||
|
float a[4];
|
||||||
|
float d[4];
|
||||||
|
float n[4];
|
||||||
|
float r[4];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
inTimeIntTemp = inTimeInt;
|
inTimeIntTemp = inTimeInt;
|
||||||
@@ -60992,16 +61064,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
|
|||||||
inTimeInt = inTimeIntTemp;
|
inTimeInt = inTimeIntTemp;
|
||||||
inTimeFrac = inTimeFracTemp;
|
inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
/* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
|
|
||||||
{
|
|
||||||
if (channels == 1) {
|
|
||||||
float x[4];
|
|
||||||
float y[4];
|
|
||||||
float a[4];
|
|
||||||
float d[4];
|
|
||||||
float n[4];
|
|
||||||
float r[4];
|
|
||||||
|
|
||||||
x[0] = pFramesInF32[inTimeInt4[0] + 0];
|
x[0] = pFramesInF32[inTimeInt4[0] + 0];
|
||||||
x[1] = pFramesInF32[inTimeInt4[1] + 0];
|
x[1] = pFramesInF32[inTimeInt4[1] + 0];
|
||||||
x[2] = pFramesInF32[inTimeInt4[2] + 0];
|
x[2] = pFramesInF32[inTimeInt4[2] + 0];
|
||||||
@@ -61038,13 +61100,47 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
|
|||||||
pFramesOutF32[1] = r[1];
|
pFramesOutF32[1] = r[1];
|
||||||
pFramesOutF32[2] = r[2];
|
pFramesOutF32[2] = r[2];
|
||||||
pFramesOutF32[3] = r[3];
|
pFramesOutF32[3] = r[3];
|
||||||
|
|
||||||
|
pFramesOutF32 += 4;
|
||||||
|
framesProcessedOut += 4;
|
||||||
|
}
|
||||||
} else if (channels == 2) {
|
} else if (channels == 2) {
|
||||||
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
|
ma_uint32 inTimeIntTemp;
|
||||||
|
ma_uint32 inTimeFracTemp;
|
||||||
|
ma_uint32 inTimeInt4[4];
|
||||||
|
ma_uint32 inTimeFrac4[4];
|
||||||
float x[8];
|
float x[8];
|
||||||
float y[8];
|
float y[8];
|
||||||
float a[8];
|
float a[8];
|
||||||
float d[8];
|
float d[8];
|
||||||
float n[8];
|
float n[8];
|
||||||
float r[8];
|
float r[8];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
inTimeIntTemp = inTimeInt;
|
||||||
|
inTimeFracTemp = inTimeFrac;
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i += 1) {
|
||||||
|
inTimeInt4[i] = inTimeIntTemp;
|
||||||
|
inTimeFrac4[i] = inTimeFracTemp;
|
||||||
|
|
||||||
|
inTimeIntTemp += pResampler->inAdvanceInt;
|
||||||
|
inTimeFracTemp += pResampler->inAdvanceFrac;
|
||||||
|
if (inTimeFracTemp >= pResampler->sampleRateOut) {
|
||||||
|
inTimeFracTemp -= pResampler->sampleRateOut;
|
||||||
|
inTimeIntTemp += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that we have one extra sample at the end for doing the interpolation. */
|
||||||
|
if (inTimeInt4[3] + 1 >= frameCountIn) {
|
||||||
|
break; /* Not enough input frames. */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Advance the timer. */
|
||||||
|
inTimeInt = inTimeIntTemp;
|
||||||
|
inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
x[0] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 0];
|
x[0] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 0];
|
||||||
x[1] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 1];
|
x[1] = pFramesInF32[((inTimeInt4[0] + 0) * 2) + 1];
|
||||||
@@ -61110,7 +61206,42 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
|
|||||||
pFramesOutF32[5] = r[5];
|
pFramesOutF32[5] = r[5];
|
||||||
pFramesOutF32[6] = r[6];
|
pFramesOutF32[6] = r[6];
|
||||||
pFramesOutF32[7] = r[7];
|
pFramesOutF32[7] = r[7];
|
||||||
|
|
||||||
|
pFramesOutF32 += 8;
|
||||||
|
framesProcessedOut += 4;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
|
while (framesProcessedOut + 4 <= frameCountOut) {
|
||||||
|
ma_uint32 inTimeIntTemp;
|
||||||
|
ma_uint32 inTimeFracTemp;
|
||||||
|
ma_uint32 inTimeInt4[4];
|
||||||
|
ma_uint32 inTimeFrac4[4];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
inTimeIntTemp = inTimeInt;
|
||||||
|
inTimeFracTemp = inTimeFrac;
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i += 1) {
|
||||||
|
inTimeInt4[i] = inTimeIntTemp;
|
||||||
|
inTimeFrac4[i] = inTimeFracTemp;
|
||||||
|
|
||||||
|
inTimeIntTemp += pResampler->inAdvanceInt;
|
||||||
|
inTimeFracTemp += pResampler->inAdvanceFrac;
|
||||||
|
if (inTimeFracTemp >= pResampler->sampleRateOut) {
|
||||||
|
inTimeFracTemp -= pResampler->sampleRateOut;
|
||||||
|
inTimeIntTemp += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check that we have one extra sample at the end for doing the interpolation. */
|
||||||
|
if (inTimeInt4[3] + 1 >= frameCountIn) {
|
||||||
|
break; /* Not enough input frames. */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Advance the timer. */
|
||||||
|
inTimeInt = inTimeIntTemp;
|
||||||
|
inTimeFrac = inTimeFracTemp;
|
||||||
|
|
||||||
for (c = 0; c < channels; c += 1) {
|
for (c = 0; c < channels; c += 1) {
|
||||||
float x[4];
|
float x[4];
|
||||||
float y[4];
|
float y[4];
|
||||||
@@ -61156,7 +61287,6 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
|
|||||||
}
|
}
|
||||||
|
|
||||||
ma_linear_resampler_filter_f32_4(pLPF, lpfCount, channels, pFramesOutF32);
|
ma_linear_resampler_filter_f32_4(pLPF, lpfCount, channels, pFramesOutF32);
|
||||||
}
|
|
||||||
|
|
||||||
pFramesOutF32 += 4 * channels;
|
pFramesOutF32 += 4 * channels;
|
||||||
framesProcessedOut += 4;
|
framesProcessedOut += 4;
|
||||||
|
|||||||
Reference in New Issue
Block a user