Resampler: Optimization to the no-LPF path.

This moves some checks outside the loop. It introduces a bit more code duplication, but it does improve speed.
2026-07-22 21:02:42 +02:00 · 2026-02-14 13:00:52 +10:00
parent 0fe2f7effd
commit 0615ce28f1
1 changed files with 434 additions and 304 deletions
@@ -59378,11 +59378,18 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
    /* Experimental loop unrolling to make it easier for SIMD-ification. */
    #if 1
    {
        if (channels == 1) {
            while (framesProcessedOut + 4 <= frameCountOut) {
                ma_uint32 inTimeIntTemp;
                ma_uint32 inTimeFracTemp;
                ma_uint32 inTimeInt[4];
                ma_uint32 inTimeFrac[4];
                ma_int32 x[4];
                ma_int32 y[4];
                ma_int32 a[4];
                ma_int32 d[4];
                ma_int32 n[4];
                ma_int32 r[4];
                int i;
                inTimeIntTemp  = pResampler->inTimeInt;
@@ -59409,16 +59416,6 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
                pResampler->inTimeInt  = inTimeIntTemp;
                pResampler->inTimeFrac = inTimeFracTemp;
            /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
            {
                if (channels == 1) {
                    ma_int32 x[4];
                    ma_int32 y[4];
                    ma_int32 a[4];
                    ma_int32 d[4];
                    ma_int32 n[4];
                    ma_int32 r[4];
                x[0] = pFramesInS16[inTimeInt[0] + 0];
                x[1] = pFramesInS16[inTimeInt[1] + 0];
                x[2] = pFramesInS16[inTimeInt[2] + 0];
@@ -59453,13 +59450,47 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
                pFramesOutS16[1] = r[1];
                pFramesOutS16[2] = r[2];
                pFramesOutS16[3] = r[3];
                pFramesOutS16 += 4;
                framesProcessedOut += 4;
            }
        } else if (channels == 2) {
            while (framesProcessedOut + 4 <= frameCountOut) {
                ma_uint32 inTimeIntTemp;
                ma_uint32 inTimeFracTemp;
                ma_uint32 inTimeInt[4];
                ma_uint32 inTimeFrac[4];
                ma_int32 x[8];
                ma_int32 y[8];
                ma_int32 a[8];
                ma_int32 d[8];
                ma_int32 n[8];
                ma_int32 r[8];
                int i;
                inTimeIntTemp  = pResampler->inTimeInt;
                inTimeFracTemp = pResampler->inTimeFrac;
                for (i = 0; i < 4; i += 1) {
                    inTimeInt[i]  = inTimeIntTemp;
                    inTimeFrac[i] = inTimeFracTemp;
                    inTimeIntTemp  += pResampler->inAdvanceInt;
                    inTimeFracTemp += pResampler->inAdvanceFrac;
                    if (inTimeFracTemp >= pResampler->sampleRateOut) {
                        inTimeFracTemp -= pResampler->sampleRateOut;
                        inTimeIntTemp  += 1;
                    }
                }
                /* Check that we have one extra sample at the end for doing the interpolation. */
                if (inTimeInt[3] + 1 >= frameCountIn) {
                    break;  /* Not enough input frames. */
                }
                /* Advance the timer. */
                pResampler->inTimeInt  = inTimeIntTemp;
                pResampler->inTimeFrac = inTimeFracTemp;
                x[0] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 0];
                x[1] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 1];
@@ -59523,7 +59554,42 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
                pFramesOutS16[5] = r[5];
                pFramesOutS16[6] = r[6];
                pFramesOutS16[7] = r[7];
                pFramesOutS16 += 8;
                framesProcessedOut += 4;
            }
        } else {
            while (framesProcessedOut + 4 <= frameCountOut) {
                ma_uint32 inTimeIntTemp;
                ma_uint32 inTimeFracTemp;
                ma_uint32 inTimeInt[4];
                ma_uint32 inTimeFrac[4];
                int i;
                inTimeIntTemp  = pResampler->inTimeInt;
                inTimeFracTemp = pResampler->inTimeFrac;
                for (i = 0; i < 4; i += 1) {
                    inTimeInt[i]  = inTimeIntTemp;
                    inTimeFrac[i] = inTimeFracTemp;
                    inTimeIntTemp  += pResampler->inAdvanceInt;
                    inTimeFracTemp += pResampler->inAdvanceFrac;
                    if (inTimeFracTemp >= pResampler->sampleRateOut) {
                        inTimeFracTemp -= pResampler->sampleRateOut;
                        inTimeIntTemp  += 1;
                    }
                }
                /* Check that we have one extra sample at the end for doing the interpolation. */
                if (inTimeInt[3] + 1 >= frameCountIn) {
                    break;  /* Not enough input frames. */
                }
                /* Advance the timer. */
                pResampler->inTimeInt  = inTimeIntTemp;
                pResampler->inTimeFrac = inTimeFracTemp;
                for (c = 0; c < channels; c += 1) {
                    ma_int32 x[4];
                    ma_int32 y[4];
@@ -59567,7 +59633,6 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
                    pFramesOutS16[(2 * channels) + c] = r[2];
                    pFramesOutS16[(3 * channels) + c] = r[3];
                }
                }
                pFramesOutS16 += 4 * channels;
                framesProcessedOut += 4;
@@ -59684,11 +59749,18 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
    /* Experimental loop unrolling to make it easier for SIMD-ification. */
    #if 1
    {
        if (channels == 1) {
            while (framesProcessedOut + 4 <= frameCountOut) {
                ma_uint32 inTimeIntTemp;
                ma_uint32 inTimeFracTemp;
                ma_uint32 inTimeInt[4];
                ma_uint32 inTimeFrac[4];
                float x[4];
                float y[4];
                float a[4];
                float d[4];
                float n[4];
                float r[4];
                int i;
                inTimeIntTemp  = pResampler->inTimeInt;
@@ -59715,16 +59787,6 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                pResampler->inTimeInt  = inTimeIntTemp;
                pResampler->inTimeFrac = inTimeFracTemp;
            /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
            {
                if (channels == 1) {
                    float x[4];
                    float y[4];
                    float a[4];
                    float d[4];
                    float n[4];
                    float r[4];
                x[0] = pFramesInF32[inTimeInt[0] + 0];
                x[1] = pFramesInF32[inTimeInt[1] + 0];
                x[2] = pFramesInF32[inTimeInt[2] + 0];
@@ -59759,13 +59821,47 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                pFramesOutF32[1] = r[1];
                pFramesOutF32[2] = r[2];
                pFramesOutF32[3] = r[3];
                pFramesOutF32 += 4;
                framesProcessedOut += 4;
            }
        } else if (channels == 2) {
            while (framesProcessedOut + 4 <= frameCountOut) {
                ma_uint32 inTimeIntTemp;
                ma_uint32 inTimeFracTemp;
                ma_uint32 inTimeInt[4];
                ma_uint32 inTimeFrac[4];
                float x[8];
                float y[8];
                float a[8];
                float d[8];
                float n[8];
                float r[8];
                int i;
                inTimeIntTemp  = pResampler->inTimeInt;
                inTimeFracTemp = pResampler->inTimeFrac;
                for (i = 0; i < 4; i += 1) {
                    inTimeInt[i]  = inTimeIntTemp;
                    inTimeFrac[i] = inTimeFracTemp;
                    inTimeIntTemp  += pResampler->inAdvanceInt;
                    inTimeFracTemp += pResampler->inAdvanceFrac;
                    if (inTimeFracTemp >= pResampler->sampleRateOut) {
                        inTimeFracTemp -= pResampler->sampleRateOut;
                        inTimeIntTemp  += 1;
                    }
                }
                /* Check that we have one extra sample at the end for doing the interpolation. */
                if (inTimeInt[3] + 1 >= frameCountIn) {
                    break;  /* Not enough input frames. */
                }
                /* Advance the timer. */
                pResampler->inTimeInt  = inTimeIntTemp;
                pResampler->inTimeFrac = inTimeFracTemp;
                x[0] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 0];
                x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1];
@@ -59829,7 +59925,42 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                pFramesOutF32[5] = r[5];
                pFramesOutF32[6] = r[6];
                pFramesOutF32[7] = r[7];
                pFramesOutF32 += 8;
                framesProcessedOut += 4;
            }
        } else {
            while (framesProcessedOut + 4 <= frameCountOut) {
                ma_uint32 inTimeIntTemp;
                ma_uint32 inTimeFracTemp;
                ma_uint32 inTimeInt[4];
                ma_uint32 inTimeFrac[4];
                int i;
                inTimeIntTemp  = pResampler->inTimeInt;
                inTimeFracTemp = pResampler->inTimeFrac;
                for (i = 0; i < 4; i += 1) {
                    inTimeInt[i]  = inTimeIntTemp;
                    inTimeFrac[i] = inTimeFracTemp;
                    inTimeIntTemp  += pResampler->inAdvanceInt;
                    inTimeFracTemp += pResampler->inAdvanceFrac;
                    if (inTimeFracTemp >= pResampler->sampleRateOut) {
                        inTimeFracTemp -= pResampler->sampleRateOut;
                        inTimeIntTemp  += 1;
                    }
                }
                /* Check that we have one extra sample at the end for doing the interpolation. */
                if (inTimeInt[3] + 1 >= frameCountIn) {
                    break;  /* Not enough input frames. */
                }
                /* Advance the timer. */
                pResampler->inTimeInt  = inTimeIntTemp;
                pResampler->inTimeFrac = inTimeFracTemp;
                for (c = 0; c < channels; c += 1) {
                    float x[4];
                    float y[4];
@@ -59873,7 +60004,6 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                    pFramesOutF32[(2 * channels) + c] = r[2];
                    pFramesOutF32[(3 * channels) + c] = r[3];
                }
                }
                pFramesOutF32 += 4 * channels;
                framesProcessedOut += 4;