Resampler: Optimization to the filtering stage.

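Passing the LPF count into the filter functions as a parameter, rather than
reading it from the resampler inside each one, seems to trigger a faster
optimization strategy when compiling with GCC. With this change the filtered
s16 mono path is almost 2x faster.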
This commit is contained in:
David Reid
2026-02-14 06:20:37 +10:00
parent b3340e629a
commit d4382ce478
+58 -50
View File
@@ -59989,11 +59989,10 @@ static MA_INLINE void ma_linear_resampler_filter_s16_4_stereo(ma_linear_resample
}
#endif
static MA_INLINE void ma_linear_resampler_filter_s32(ma_linear_resampler* pResampler, ma_uint32 channels, ma_int32* pFrame)
static MA_INLINE void ma_linear_resampler_filter_s32(ma_linear_resampler* pResampler, ma_uint32 channels, ma_uint32 lpfCount, ma_int32* pFrame)
{
ma_int32* pLPF;
ma_uint32 iLPF;
ma_uint32 lpfCount = pResampler->lpfOrder >> 1;
ma_uint32 iChannel;
pLPF = MA_LINEAR_RESAMPLER_GET_LPF_STATE_S32(pResampler, 0);
@@ -60025,11 +60024,10 @@ static MA_INLINE void ma_linear_resampler_filter_s32(ma_linear_resampler* pResam
}
}
static MA_INLINE void ma_linear_resampler_filter_s32_mono(ma_linear_resampler* pResampler, ma_int32* pFrame)
static MA_INLINE void ma_linear_resampler_filter_s32_mono(ma_linear_resampler* pResampler, ma_uint32 lpfCount, ma_int32* pFrame)
{
ma_int32* pLPF;
ma_uint32 iLPF;
ma_uint32 lpfCount = pResampler->lpfOrder >> 1;
ma_uint32 iChannel;
pLPF = MA_LINEAR_RESAMPLER_GET_LPF_STATE_S32(pResampler, 0);
@@ -60061,11 +60059,10 @@ static MA_INLINE void ma_linear_resampler_filter_s32_mono(ma_linear_resampler* p
}
}
static MA_INLINE void ma_linear_resampler_filter_s32_stereo(ma_linear_resampler* pResampler, ma_int32* pFrame)
static MA_INLINE void ma_linear_resampler_filter_s32_stereo(ma_linear_resampler* pResampler, ma_uint32 lpfCount, ma_int32* pFrame)
{
ma_int32* pLPF;
ma_uint32 iLPF;
ma_uint32 lpfCount = pResampler->lpfOrder >> 1;
ma_uint32 iChannel;
pLPF = MA_LINEAR_RESAMPLER_GET_LPF_STATE_S32(pResampler, 0);
@@ -60097,35 +60094,34 @@ static MA_INLINE void ma_linear_resampler_filter_s32_stereo(ma_linear_resampler*
}
}
static MA_INLINE void ma_linear_resampler_filter_s32_4(ma_linear_resampler* pResampler, ma_int32* pFrames)
static MA_INLINE void ma_linear_resampler_filter_s32_4(ma_linear_resampler* pResampler, ma_uint32 lpfCount, ma_int32* pFrames)
{
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (0 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (1 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (2 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (3 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, lpfCount, pFrames + (0 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, lpfCount, pFrames + (1 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, lpfCount, pFrames + (2 * pResampler->channels));
ma_linear_resampler_filter_s32(pResampler, pResampler->channels, lpfCount, pFrames + (3 * pResampler->channels));
}
static MA_INLINE void ma_linear_resampler_filter_s32_4_mono(ma_linear_resampler* pResampler, ma_int32* pFrames)
static MA_INLINE void ma_linear_resampler_filter_s32_4_mono(ma_linear_resampler* pResampler, ma_uint32 lpfCount, ma_int32* pFrames)
{
ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 0);
ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 1);
ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 2);
ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 3);
ma_linear_resampler_filter_s32_mono(pResampler, lpfCount, pFrames + 0);
ma_linear_resampler_filter_s32_mono(pResampler, lpfCount, pFrames + 1);
ma_linear_resampler_filter_s32_mono(pResampler, lpfCount, pFrames + 2);
ma_linear_resampler_filter_s32_mono(pResampler, lpfCount, pFrames + 3);
}
static MA_INLINE void ma_linear_resampler_filter_s32_4_stereo(ma_linear_resampler* pResampler, ma_int32* pFrames)
static MA_INLINE void ma_linear_resampler_filter_s32_4_stereo(ma_linear_resampler* pResampler, ma_uint32 lpfCount, ma_int32* pFrames)
{
ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 0);
ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 2);
ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 4);
ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 6);
ma_linear_resampler_filter_s32_stereo(pResampler, lpfCount, pFrames + 0);
ma_linear_resampler_filter_s32_stereo(pResampler, lpfCount, pFrames + 2);
ma_linear_resampler_filter_s32_stereo(pResampler, lpfCount, pFrames + 4);
ma_linear_resampler_filter_s32_stereo(pResampler, lpfCount, pFrames + 6);
}
static MA_INLINE void ma_linear_resampler_filter_f32(ma_linear_resampler* pResampler, float* pFrame)
static MA_INLINE void ma_linear_resampler_filter_f32(ma_linear_resampler* pResampler, ma_uint32 lpfCount, float* pFrame)
{
float* pLPF;
ma_uint32 iLPF;
ma_uint32 lpfCount = pResampler->lpfOrder >> 1;
ma_uint32 iChannel;
pLPF = MA_LINEAR_RESAMPLER_GET_LPF_STATE_F32(pResampler, 0);
@@ -60157,11 +60153,10 @@ static MA_INLINE void ma_linear_resampler_filter_f32(ma_linear_resampler* pResam
}
}
static MA_INLINE void ma_linear_resampler_filter_f32_mono(ma_linear_resampler* pResampler, float* pFrame)
static MA_INLINE void ma_linear_resampler_filter_f32_mono(ma_linear_resampler* pResampler, ma_uint32 lpfCount, float* pFrame)
{
float* pLPF;
ma_uint32 iLPF;
ma_uint32 lpfCount = pResampler->lpfOrder >> 1;
ma_uint32 iChannel;
pLPF = MA_LINEAR_RESAMPLER_GET_LPF_STATE_F32(pResampler, 0);
@@ -60193,11 +60188,10 @@ static MA_INLINE void ma_linear_resampler_filter_f32_mono(ma_linear_resampler* p
}
}
static MA_INLINE void ma_linear_resampler_filter_f32_stereo(ma_linear_resampler* pResampler, float* pFrame)
static MA_INLINE void ma_linear_resampler_filter_f32_stereo(ma_linear_resampler* pResampler, ma_uint32 lpfCount, float* pFrame)
{
float* pLPF;
ma_uint32 iLPF;
ma_uint32 lpfCount = pResampler->lpfOrder >> 1;
ma_uint32 iChannel;
pLPF = MA_LINEAR_RESAMPLER_GET_LPF_STATE_F32(pResampler, 0);
@@ -60229,28 +60223,28 @@ static MA_INLINE void ma_linear_resampler_filter_f32_stereo(ma_linear_resampler*
}
}
static MA_INLINE void ma_linear_resampler_filter_f32_4(ma_linear_resampler* pResampler, float* pFrames)
static MA_INLINE void ma_linear_resampler_filter_f32_4(ma_linear_resampler* pResampler, ma_uint32 lpfCount, float* pFrames)
{
ma_linear_resampler_filter_f32(pResampler, pFrames + (0 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, pFrames + (1 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, pFrames + (2 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, pFrames + (3 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, lpfCount, pFrames + (0 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, lpfCount, pFrames + (1 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, lpfCount, pFrames + (2 * pResampler->channels));
ma_linear_resampler_filter_f32(pResampler, lpfCount, pFrames + (3 * pResampler->channels));
}
static MA_INLINE void ma_linear_resampler_filter_f32_4_mono(ma_linear_resampler* pResampler, float* pFrames)
static MA_INLINE void ma_linear_resampler_filter_f32_4_mono(ma_linear_resampler* pResampler, ma_uint32 lpfCount, float* pFrames)
{
ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 0);
ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 1);
ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 2);
ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 3);
ma_linear_resampler_filter_f32_mono(pResampler, lpfCount, pFrames + 0);
ma_linear_resampler_filter_f32_mono(pResampler, lpfCount, pFrames + 1);
ma_linear_resampler_filter_f32_mono(pResampler, lpfCount, pFrames + 2);
ma_linear_resampler_filter_f32_mono(pResampler, lpfCount, pFrames + 3);
}
static MA_INLINE void ma_linear_resampler_filter_f32_4_stereo(ma_linear_resampler* pResampler, float* pFrames)
static MA_INLINE void ma_linear_resampler_filter_f32_4_stereo(ma_linear_resampler* pResampler, ma_uint32 lpfCount, float* pFrames)
{
ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 0);
ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 2);
ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 4);
ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 6);
ma_linear_resampler_filter_f32_stereo(pResampler, lpfCount, pFrames + 0);
ma_linear_resampler_filter_f32_stereo(pResampler, lpfCount, pFrames + 2);
ma_linear_resampler_filter_f32_stereo(pResampler, lpfCount, pFrames + 4);
ma_linear_resampler_filter_f32_stereo(pResampler, lpfCount, pFrames + 6);
}
@@ -60264,6 +60258,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear
ma_uint64 framesProcessedOut;
ma_uint32 c;
ma_uint32 invSampleRateOut;
ma_uint32 lpfCount;
MA_ASSERT(pResampler != NULL);
MA_ASSERT(pFrameCountIn != NULL);
@@ -60276,6 +60271,13 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear
framesProcessedIn = 0;
framesProcessedOut = 0;
invSampleRateOut = (1 << MA_LINEAR_RESAMPLER_LERP_SHIFT) / pResampler->sampleRateOut;
lpfCount = pResampler->lpfOrder >> 1;
/*
Not currently using lpfCount. If the `filter_s16()` function is updated to accept an lpfCount parameter, the
mono s16 path runs twice as slow when upsampling when compiled with GCC.
*/
(void)lpfCount;
if (pResampler->lpfOrder == 0) {
return ma_linear_resampler_process_pcm_frames_s16_no_lpf(pResampler, pFramesInS16, pFrameCountIn, pFramesOutS16, pFrameCountOut, invSampleRateOut);
@@ -60336,6 +60338,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
ma_uint64 framesProcessedOut;
ma_uint32 c;
ma_uint32 invSampleRateOut;
ma_uint32 lpfCount;
MA_ASSERT(pResampler != NULL);
MA_ASSERT(pFramesIn != NULL);
@@ -60350,6 +60353,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
framesProcessedIn = 0;
framesProcessedOut = 0;
invSampleRateOut = (1 << MA_LINEAR_RESAMPLER_LERP_SHIFT) / pResampler->sampleRateOut;
lpfCount = pResampler->lpfOrder >> 1;
/* We can run an optimized path when the low-pass filter is not being used. */
if (pResampler->lpfOrder == 0) {
@@ -60469,7 +60473,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
ma_linear_resampler_filter_s32_4_mono(pResampler, r);
ma_linear_resampler_filter_s32_4_mono(pResampler, lpfCount, r);
pFramesOutS16[0] = (ma_int16)r[0];
pFramesOutS16[1] = (ma_int16)r[1];
@@ -60537,7 +60541,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
ma_linear_resampler_filter_s32_4_stereo(pResampler, r);
ma_linear_resampler_filter_s32_4_stereo(pResampler, lpfCount, r);
pFramesOutS16[0] = (ma_int16)r[0];
pFramesOutS16[1] = (ma_int16)r[1];
@@ -60736,6 +60740,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
ma_uint64 framesProcessedOut;
ma_uint32 c;
float invSampleRateOut;
ma_uint32 lpfCount;
MA_ASSERT(pResampler != NULL);
MA_ASSERT(pFrameCountIn != NULL);
@@ -60748,6 +60753,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
framesProcessedIn = 0;
framesProcessedOut = 0;
invSampleRateOut = 1.0f / pResampler->sampleRateOut;
lpfCount = pResampler->lpfOrder >> 1;
if (pResampler->lpfOrder == 0) {
/* Fast path. No LPF needed. */
@@ -60768,7 +60774,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
pFramesInF32 += pResampler->channels;
/* Filter. */
ma_linear_resampler_filter_f32(pResampler, pResampler->x1.f32);
ma_linear_resampler_filter_f32(pResampler, lpfCount, pResampler->x1.f32);
framesProcessedIn += 1;
pResampler->inTimeInt -= 1;
@@ -60812,6 +60818,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
ma_uint64 framesProcessedOut;
ma_uint32 c;
double invSampleRateOut;
ma_uint32 lpfCount;
MA_ASSERT(pResampler != NULL);
MA_ASSERT(pFrameCountIn != NULL);
@@ -60824,6 +60831,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
framesProcessedIn = 0;
framesProcessedOut = 0;
invSampleRateOut = (1.0 / pResampler->sampleRateOut);
lpfCount = pResampler->lpfOrder >> 1;
if (pResampler->lpfOrder == 0) {
/* Fast path. No LPF needed. */
@@ -60843,7 +60851,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
for (c = 0; c < pResampler->channels; c += 1) {
pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pFramesInF32[c], a);
}
ma_linear_resampler_filter_f32(pResampler, pFramesOutF32);
ma_linear_resampler_filter_f32(pResampler, lpfCount, pFramesOutF32);
pFramesOutF32 += pResampler->channels;
@@ -60943,7 +60951,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
r[2] = x[2] + n[2];
r[3] = x[3] + n[3];
ma_linear_resampler_filter_f32_4_mono(pResampler, r);
ma_linear_resampler_filter_f32_4_mono(pResampler, lpfCount, r);
pFramesOutF32[0] = r[0];
pFramesOutF32[1] = r[1];
@@ -61011,7 +61019,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
r[6] = x[6] + n[6];
r[7] = x[7] + n[7];
ma_linear_resampler_filter_f32_4_stereo(pResampler, r);
ma_linear_resampler_filter_f32_4_stereo(pResampler, lpfCount, r);
pFramesOutF32[0] = r[0];
pFramesOutF32[1] = r[1];
@@ -61066,7 +61074,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
pFramesOutF32[(3 * channels) + c] = r[3];
}
ma_linear_resampler_filter_f32_4(pResampler, pFramesOutF32);
ma_linear_resampler_filter_f32_4(pResampler, lpfCount, pFramesOutF32);
}
pFramesOutF32 += 4 * channels;
@@ -61083,7 +61091,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
for (c = 0; c < pResampler->channels; c += 1) {
pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a);
}
ma_linear_resampler_filter_f32(pResampler, pFramesOutF32);
ma_linear_resampler_filter_f32(pResampler, lpfCount, pFramesOutF32);
pFramesOutF32 += pResampler->channels;