diff --git a/miniaudio.h b/miniaudio.h index cd24f932..e65d0cf4 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -3001,6 +3001,19 @@ Due to the nature of how resampling works, the resampler introduces some latency retrieved in terms of both the input rate and the output rate with `ma_resampler_get_input_latency()` and `ma_resampler_get_output_latency()`. +Below are some guidelines for some common scenarios: + + +-----------------------------------------------+-------------------------------------+ + | Scenario | Usage | + +-----------------------------------------------+-------------------------------------+ + | Fastest possible, quality not an issue | Linear, LPF == 0 | + | Fast, decent quality, infrequent rate changes | Linear, LPF >= 4 | + | Decent quality, frequent rate changes | Don't use miniaudio for resampling | + | High quality | Don't use miniaudio for resampling | + +-----------------------------------------------+-------------------------------------+ + +See below for how to configure the linear resampler's low-pass filter (LPF). + 10.3.1. Resampling Algorithms ----------------------------- @@ -3019,11 +3032,13 @@ low-pass filter will be applied before downsampling. When increasing the rate it after upsampling. By default a fourth order low-pass filter will be applied. This can be configured via the `lpfOrder` configuration variable. Setting this to 0 will disable filtering. It should be set to a multiple of 2, such as 2, 4, 6, 8, etc. There are diminishing returns the higher you go. -The maximum is `MA_MAX_FILTER_ORDER` which is 8 by default (it can be configured at compile time). The low-pass filter has a cutoff frequency which defaults to half the sample rate of the lowest of the input and output sample rates (Nyquist Frequency). +When the low-pass filter is disabled (`lpfOrder` = 0), the resampler will run on an optimized code +path and should be efficient. 
+ The API for the linear resampler is the same as the main resampler API, only it's called `ma_linear_resampler`. @@ -3033,6 +3048,15 @@ for the input and output sample rates. It's OK to do make it large like 8000 -> OK to use stupid numbers like 8000 -> 89999. If you need ratios like this, use the f32 path, or use a different resampler. +The linear resampler can support dynamic rate adjustments, but there are some tradeoffs to be aware +of. When not using low-pass filtering (LPF order = 0), rate changes work fine, however when the +low-pass filter is enabled (LPF order > 0), rate changes can sometimes result in some +discontinuities due to abrupt changes to the low-pass filter parameters. If you keep rate changes +small and infrequent it can sound decent, but you should do your own testing to ensure it meets +your specification. If you need a high quality resampler with support for dynamic rate adjustment +you should avoid using miniaudio's linear resampler. + + 10.3.2. Custom Resamplers ------------------------- @@ -3072,7 +3096,8 @@ The `onGetRequiredInputFrameCount` callback is used to give miniaudio a hint as frames are required to be available to produce the given number of output frames. Likewise, the `onGetExpectedOutputFrameCount` callback is used to determine how many output frames will be produced given the specified number of input frames. miniaudio will use these as a hint, but they -are optional and can be set to NULL if you're unable to implement them. +are optional and can be set to NULL if you're unable to implement them. The returned values should +be 100% accurate. If the best you can do is an estimate, do not implement these callbacks at all. @@ -5613,6 +5638,34 @@ typedef struct } x1; /* The next input frame. */ ma_lpf lpf; + /* + We have some heap allocated data for the sample cache and the LPF state. This is an array of + either floats or int32s depending on whether or not `format` is f32 or s16. 
Below is the + structure: + + | Cached Samples | `channels` | + | LPF State | (`lpfOrder` / 2) * (4 + (`channels` * 2)) | + + The low-pass filter is achieved with a series of 2nd-order biquads. This means there is one + LPF state for each `lpfOrder`, divided by two. So if `lpfOrder` is 4, there will be 2 LPF + states in the array. The structure of each LPF state is as follows: + + | Biquad b1 | 1 | + | Biquad b2 | 1 | + | Biquad a1 | 1 | + | Biquad a2 | 1 | + | Biquad register 1 | `channels` | + | Biquad register 2 | `channels` | + + If you are familiar with biquads, you'll note that b0 and a0 are missing. This is because b0 + is set the same value as b2, so we just reuse b2, and a0 is just not used. + */ + union + { + float* f32; + ma_int32* s32; + } heap; + /* Memory management. */ void* _pHeap; ma_bool32 _ownsHeap; @@ -58904,6 +58957,8 @@ typedef struct size_t x0Offset; size_t x1Offset; size_t lpfOffset; + size_t cachedSamplesOffset; + size_t lpfStateOffset; } ma_linear_resampler_heap_layout; @@ -58925,6 +58980,10 @@ static void ma_linear_resampler_adjust_timer_for_new_rate(ma_linear_resampler* p pResampler->inTimeFrac = pResampler->inTimeFrac % newSampleRateOut; } +/* A cache of samples unrelated to the LPF comes first and needs to be skipped. 
*/ +#define MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, lpfIndex) ((pResampler)->heap.f32 + (pResampler)->channels + ((lpfIndex) * (4 + ((pResampler)->channels*2)))) +#define MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, lpfIndex) ((pResampler)->heap.s32 + (pResampler)->channels + ((lpfIndex) * (4 + ((pResampler)->channels*2)))) + static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pResampler, void* pHeap, ma_linear_resampler_heap_layout* pHeapLayout, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut, ma_bool32 isResamplerAlreadyInitialized) { ma_result result; @@ -58933,6 +58992,10 @@ static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pRes double lpfCutoffFrequency; ma_lpf_config lpfConfig; ma_uint32 oldSampleRateOut; /* Required for adjusting time advance down the bottom. */ + ma_uint32 minSampleRate; + ma_uint32 maxSampleRate; + ma_uint32 lpfCount; + ma_uint32 iLPF; if (pResampler == NULL) { return MA_INVALID_ARGS; @@ -58952,14 +59015,78 @@ static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pRes pResampler->sampleRateIn /= gcf; pResampler->sampleRateOut /= gcf; - /* Always initialize the low-pass filter, even when the order is 0. */ - if (pResampler->lpfOrder > MA_MAX_FILTER_ORDER) { - return MA_INVALID_ARGS; + /* LPF. */ + minSampleRate = ma_min(pResampler->sampleRateIn, pResampler->sampleRateOut); + maxSampleRate = ma_max(pResampler->sampleRateIn, pResampler->sampleRateOut); + + lpfSampleRate = maxSampleRate; + lpfCutoffFrequency = minSampleRate * 0.5 * pResampler->lpfNyquistFactor; + + /* + When the input and output sample rates are the same, there's an edge case with the way the filter works + where we could have a singularity due to `sin(2*pi * cutoff/rate) = sin(pi) = 0`. I'm going to apply + a small clamp in an attempt to avoid hitting this case. 
+ */ + lpfCutoffFrequency = ma_min(lpfCutoffFrequency, 0.5 * minSampleRate * (1.0 - 1e-6)); + lpfCutoffFrequency = ma_max(lpfCutoffFrequency, minSampleRate * 1e-6); + + /* We now need to update our LPF parameters. */ + lpfCount = pResampler->lpfOrder / 2; + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + /* + For our Q value, it's very tempting to just use 0.707107 but that won't actually result in a true + Butterworth filter when the order is > 2. Instead it needs to be distributed. + */ + double q = 1 / (2*ma_cosd((1 + iLPF*2) * (MA_PI_D/(pResampler->lpfOrder*2)))); /* <-- This is just distributing 0.707107 over each of our cascading filters. */ + double w = 2 * MA_PI_D * lpfCutoffFrequency / lpfSampleRate; + double s = ma_sind(w); + double c = ma_cosd(w); + double a = s / (2*q); + double b1 = 1 - c; + double b2 = (1 - c) / 2; + double a0 = 1 + a; /* Only used for normalizing below. */ + double a1 = -2 * c; + double a2 = 1 - a; + + /* Biquad parameters need to be normalized. */ + b1 /= a0; + b2 /= a0; + a1 /= a0; + a2 /= a0; + + if (pResampler->format == ma_format_f32) { + float* pLPF = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, iLPF); + + /* We can now initialize our biquad parameters. */ + pLPF[0] = b1; + pLPF[1] = b2; + pLPF[2] = a1; + pLPF[3] = a2; + + /* + For safety, make sure the registers are cleared if this is being called because the resampler + is being initialized fresh. If it's already been initialized, we must not clear out the LPF + state or else we'll get glitching. We want to have smooth transitions between rate changes. 
+ */ + if (!isResamplerAlreadyInitialized) { + MA_ZERO_MEMORY(pLPF + 4, sizeof(float) * pResampler->channels * 2); + } + } else { + ma_int32* pLPF = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, iLPF); + + pLPF[0] = ma_biquad_float_to_fp(b1); + pLPF[1] = ma_biquad_float_to_fp(b2); + pLPF[2] = ma_biquad_float_to_fp(a1); + pLPF[3] = ma_biquad_float_to_fp(a2); + + if (!isResamplerAlreadyInitialized) { + MA_ZERO_MEMORY(pLPF + 4, sizeof(ma_int32) * pResampler->channels * 2); + } + } } - lpfSampleRate = (ma_uint32)(ma_max(pResampler->sampleRateIn, pResampler->sampleRateOut)); - lpfCutoffFrequency = ( double)(ma_min(pResampler->sampleRateIn, pResampler->sampleRateOut) * 0.5 * pResampler->lpfNyquistFactor); + /* Old LPF. Will be removed later. */ lpfConfig = ma_lpf_config_init(pResampler->format, pResampler->channels, lpfSampleRate, lpfCutoffFrequency, pResampler->lpfOrder); /* @@ -59029,8 +59156,11 @@ static ma_result ma_linear_resampler_get_heap_layout(const ma_linear_resampler_c pHeapLayout->sizeInBytes += sizeof(ma_int16) * pConfig->channels; } + pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes); + + /* LPF */ - pHeapLayout->lpfOffset = ma_align_64(pHeapLayout->sizeInBytes); + pHeapLayout->lpfOffset = pHeapLayout->sizeInBytes; { ma_result result; size_t lpfHeapSizeInBytes; @@ -59044,6 +59174,20 @@ static ma_result ma_linear_resampler_get_heap_layout(const ma_linear_resampler_c pHeapLayout->sizeInBytes += lpfHeapSizeInBytes; } + + /* Cached samples. These are always stored as either f32 or s32, so either way it's 4 bytes per sample, even when the format is s16. */ + pHeapLayout->cachedSamplesOffset = pHeapLayout->sizeInBytes; + { + pHeapLayout->sizeInBytes += sizeof(ma_int32) * pConfig->channels; + } + + /* LPF state. */ + pHeapLayout->lpfStateOffset = pHeapLayout->sizeInBytes; + { + pHeapLayout->sizeInBytes += sizeof(ma_int32) * ((lpfOrder / 2) * (4 + (pConfig->channels * 2))); + } + + /* Make sure allocation size is aligned. 
*/ pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes); @@ -59099,6 +59243,11 @@ MA_API ma_result ma_linear_resampler_init_preallocated(const ma_linear_resampler pResampler->lpfOrder += 1; /* Round up to even. */ } + /* It does not make sense for the Nyquist factor to go beyond 1. */ + if (pResampler->lpfNyquistFactor > 1) { + pResampler->lpfNyquistFactor = 1; /* Clamp to the upper bound. A value of 0 would push the cutoff down to its minimum clamp. */ + } + pResampler->_pHeap = pHeap; MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes); @@ -59110,6 +59259,8 @@ MA_API ma_result ma_linear_resampler_init_preallocated(const ma_linear_resampler pResampler->x1.s16 = (ma_int16*)ma_offset_ptr(pHeap, heapLayout.x1Offset); } + pResampler->heap.s32 = (ma_int32*)ma_offset_ptr(pHeap, heapLayout.cachedSamplesOffset); + /* Setting the rate will set up the filter and time advances for us. */ result = ma_linear_resampler_set_rate_internal(pResampler, pHeap, &heapLayout, pConfig->sampleRateIn, pConfig->sampleRateOut, /* isResamplerAlreadyInitialized = */ MA_FALSE); if (result != MA_SUCCESS) { @@ -59171,7 +59322,7 @@ MA_API void ma_linear_resampler_uninit(ma_linear_resampler* pResampler, const ma } } -#define MA_LINEAR_RESAMPLER_LERP_SHIFT 12 +#define MA_LINEAR_RESAMPLER_LERP_SHIFT MA_BIQUAD_FIXED_POINT_SHIFT static MA_INLINE ma_int16 ma_linear_resampler_mix_s16(ma_int16 x, ma_int16 y, ma_uint32 a) { @@ -59202,16 +59353,7 @@ static MA_INLINE void ma_linear_resampler_interpolate_frame_s16(ma_linear_resamp MA_ASSUME(channels > 0); for (c = 0; c < channels; c += 1) { - ma_int16 x, y; - ma_int32 d; - ma_int32 n; - - x = pResampler->x0.s16[c]; - y = pResampler->x1.s16[c]; - d = y - x; - n = d * a; - - pFrameOut[c] = (ma_int16)(x + (n >> MA_LINEAR_RESAMPLER_LERP_SHIFT)); + pFrameOut[c] = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a); } } @@ -59224,7 +59366,7 @@ static MA_INLINE void ma_linear_resampler_interpolate_frame_f32(ma_linear_resamp MA_ASSERT(pResampler != NULL); MA_ASSERT(pFrameOut != NULL); - a = (float)pResampler->inTimeFrac * 
invSampleRateOut; + a = pResampler->inTimeFrac * invSampleRateOut; MA_ASSUME(channels > 0); for (c = 0; c < channels; c += 1) { @@ -59284,6 +59426,209 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_ /* The rate must have changed between calls. Ignore the cached frame. */ } + /* Experimental loop unrolling to make it easier for SIMD-ification. */ + #if 1 + { + ma_uint32 channels = pResampler->channels; + + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt[4]; + ma_uint32 inTimeFrac[4]; + int i; + + inTimeIntTemp = pResampler->inTimeInt; + inTimeFracTemp = pResampler->inTimeFrac; + + for (i = 0; i < 4; i += 1) { + inTimeInt[i] = inTimeIntTemp; + inTimeFrac[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + pResampler->inTimeInt = inTimeIntTemp; + pResampler->inTimeFrac = inTimeFracTemp; + + /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. 
*/ + { + if (channels == 1) { + ma_int32 x[4]; + ma_int32 y[4]; + ma_int32 a[4]; + ma_int32 d[4]; + ma_int32 n[4]; + ma_int32 r[4]; + + x[0] = pFramesInS16[inTimeInt[0] + 0]; + x[1] = pFramesInS16[inTimeInt[1] + 0]; + x[2] = pFramesInS16[inTimeInt[2] + 0]; + x[3] = pFramesInS16[inTimeInt[3] + 0]; + + y[0] = pFramesInS16[inTimeInt[0] + 1]; + y[1] = pFramesInS16[inTimeInt[1] + 1]; + y[2] = pFramesInS16[inTimeInt[2] + 1]; + y[3] = pFramesInS16[inTimeInt[3] + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; + a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + pFramesOutS16[0] = r[0]; + pFramesOutS16[1] = r[1]; + pFramesOutS16[2] = r[2]; + pFramesOutS16[3] = r[3]; + } else if (channels == 2) { + ma_int32 x[8]; + ma_int32 y[8]; + ma_int32 a[8]; + ma_int32 d[8]; + ma_int32 n[8]; + ma_int32 r[8]; + + x[0] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 0]; + x[1] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 1]; + x[2] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 0]; + x[3] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 1]; + x[4] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 0]; + x[5] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 1]; + x[6] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 0]; + x[7] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 1]; + + y[0] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 0]; + y[1] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 1]; + y[2] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 0]; + y[3] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 1]; + y[4] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 0]; + y[5] = 
pFramesInS16[((inTimeInt[2] + 1) * 2) + 1]; + y[6] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 0]; + y[7] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[0] * invSampleRateOut; + a[2] = inTimeFrac[1] * invSampleRateOut; + a[3] = inTimeFrac[1] * invSampleRateOut; + a[4] = inTimeFrac[2] * invSampleRateOut; + a[5] = inTimeFrac[2] * invSampleRateOut; + a[6] = inTimeFrac[3] * invSampleRateOut; + a[7] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + d[4] = y[4] - x[4]; + d[5] = y[5] - x[5]; + d[6] = y[6] - x[6]; + d[7] = y[7] - x[7]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + n[4] = d[4] * a[4]; + n[5] = d[5] * a[5]; + n[6] = d[6] * a[6]; + n[7] = d[7] * a[7]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[4] = x[4] + (n[4] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[5] = x[5] + (n[5] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + pFramesOutS16[0] = r[0]; + pFramesOutS16[1] = r[1]; + pFramesOutS16[2] = r[2]; + pFramesOutS16[3] = r[3]; + pFramesOutS16[4] = r[4]; + pFramesOutS16[5] = r[5]; + pFramesOutS16[6] = r[6]; + pFramesOutS16[7] = r[7]; + } else { + for (c = 0; c < channels; c += 1) { + ma_int32 x[4]; + ma_int32 y[4]; + ma_int32 a[4]; + ma_int32 d[4]; + ma_int32 n[4]; + ma_int32 r[4]; + + x[0] = pFramesInS16[((inTimeInt[0] + 0) * channels) + c]; + x[1] = pFramesInS16[((inTimeInt[1] + 0) * channels) + c]; + x[2] = pFramesInS16[((inTimeInt[2] + 0) * channels) + c]; + x[3] = pFramesInS16[((inTimeInt[3] + 0) * channels) + c]; + + y[0] = pFramesInS16[((inTimeInt[0] + 1) * channels) + c]; + y[1] = 
pFramesInS16[((inTimeInt[1] + 1) * channels) + c]; + y[2] = pFramesInS16[((inTimeInt[2] + 1) * channels) + c]; + y[3] = pFramesInS16[((inTimeInt[3] + 1) * channels) + c]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; + a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + pFramesOutS16[(0 * channels) + c] = r[0]; + pFramesOutS16[(1 * channels) + c] = r[1]; + pFramesOutS16[(2 * channels) + c] = r[2]; + pFramesOutS16[(3 * channels) + c] = r[3]; + } + } + + pFramesOutS16 += 4 * channels; + framesProcessedOut += 4; + } + } + } + #endif + while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) { if (pResampler->inTimeInt + 1 < frameCountIn) { ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut; @@ -59336,7 +59681,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_ return MA_SUCCESS; } -static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_linear_resampler* pResampler, const float* pFramesInF32, ma_uint64* pFrameCountIn, float* pFramesOutF32, ma_uint64* pFrameCountOut, float invSampleRateOut) +static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_linear_resampler* pResampler, const float* pFramesInF32, ma_uint64* pFrameCountIn, float* pFramesOutF32, ma_uint64* pFrameCountOut, double invSampleRateOut) { ma_uint64 frameCountIn; ma_uint64 frameCountOut; @@ -59360,7 +59705,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ 
MA_ASSERT(pResampler->cachedFrameCount <= 1); /* There is at most one cached frame. */ while (pResampler->cachedFrameCount > 0 && frameCountIn > 0 && framesProcessedOut < frameCountOut) { - float a = pResampler->inTimeFrac * invSampleRateOut; + float a = (float)(pResampler->inTimeFrac * invSampleRateOut); for (c = 0; c < pResampler->channels; c += 1) { pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pFramesInF32[c], a); @@ -59434,31 +59779,33 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ float r[4]; x[0] = pFramesInF32[inTimeInt[0] + 0]; - y[0] = pFramesInF32[inTimeInt[0] + 1]; - a[0] = inTimeFrac[0] * invSampleRateOut; - d[0] = y[0] - x[0]; - n[0] = d[0] * a[0]; - r[0] = x[0] + n[0]; - x[1] = pFramesInF32[inTimeInt[1] + 0]; - y[1] = pFramesInF32[inTimeInt[1] + 1]; - a[1] = inTimeFrac[1] * invSampleRateOut; - d[1] = y[1] - x[1]; - n[1] = d[1] * a[1]; - r[1] = x[1] + n[1]; - x[2] = pFramesInF32[inTimeInt[2] + 0]; - y[2] = pFramesInF32[inTimeInt[2] + 1]; - a[2] = inTimeFrac[2] * invSampleRateOut; - d[2] = y[2] - x[2]; - n[2] = d[2] * a[2]; - r[2] = x[2] + n[2]; - x[3] = pFramesInF32[inTimeInt[3] + 0]; + + y[0] = pFramesInF32[inTimeInt[0] + 1]; + y[1] = pFramesInF32[inTimeInt[1] + 1]; + y[2] = pFramesInF32[inTimeInt[2] + 1]; y[3] = pFramesInF32[inTimeInt[3] + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; n[3] = d[3] * a[3]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; r[3] = x[3] + n[3]; pFramesOutF32[0] = r[0]; @@ -59472,69 +59819,59 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ float d[8]; float n[8]; float r[8]; - - /* Frame 0, Channel 0 */ + x[0] = pFramesInF32[((inTimeInt[0] 
+ 0) * 2) + 0]; - y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0]; - a[0] = inTimeFrac[0] * invSampleRateOut; - d[0] = y[0] - x[0]; - n[0] = d[0] * a[0]; - r[0] = x[0] + n[0]; - - /* Frame 0, Channel 1 */ x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1]; - y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1]; - a[1] = inTimeFrac[0] * invSampleRateOut; - d[1] = y[1] - x[1]; - n[1] = d[1] * a[1]; - r[1] = x[1] + n[1]; - - /* Frame 1, Channel 0 */ x[2] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 0]; - y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0]; - a[2] = inTimeFrac[1] * invSampleRateOut; - d[2] = y[2] - x[2]; - n[2] = d[2] * a[2]; - r[2] = x[2] + n[2]; - - /* Frame 1, Channel 1 */ x[3] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 1]; - y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1]; - a[3] = inTimeFrac[1] * invSampleRateOut; - d[3] = y[3] - x[3]; - n[3] = d[3] * a[3]; - r[3] = x[3] + n[3]; - - /* Frame 2, Channel 0 */ x[4] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 0]; - y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0]; - a[4] = inTimeFrac[2] * invSampleRateOut; - d[4] = y[4] - x[4]; - n[4] = d[4] * a[4]; - r[4] = x[4] + n[4]; - - /* Frame 2, Channel 1 */ x[5] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 1]; - y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1]; - a[5] = inTimeFrac[2] * invSampleRateOut; - d[5] = y[5] - x[5]; - n[5] = d[5] * a[5]; - r[5] = x[5] + n[5]; - - /* Frame 3, Channel 0 */ x[6] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 0]; - y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0]; - a[6] = inTimeFrac[3] * invSampleRateOut; - d[6] = y[6] - x[6]; - n[6] = d[6] * a[6]; - r[6] = x[6] + n[6]; - - /* Frame 3, Channel 1 */ x[7] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 1]; + + y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0]; + y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1]; + y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0]; + y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1]; + y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0]; + y[5] = 
pFramesInF32[((inTimeInt[2] + 1) * 2) + 1]; + y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0]; y[7] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[0] * invSampleRateOut; + a[2] = inTimeFrac[1] * invSampleRateOut; + a[3] = inTimeFrac[1] * invSampleRateOut; + a[4] = inTimeFrac[2] * invSampleRateOut; + a[5] = inTimeFrac[2] * invSampleRateOut; + a[6] = inTimeFrac[3] * invSampleRateOut; a[7] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + d[4] = y[4] - x[4]; + d[5] = y[5] - x[5]; + d[6] = y[6] - x[6]; d[7] = y[7] - x[7]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + n[4] = d[4] * a[4]; + n[5] = d[5] * a[5]; + n[6] = d[6] * a[6]; n[7] = d[7] * a[7]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; + r[3] = x[3] + n[3]; + r[4] = x[4] + n[4]; + r[5] = x[5] + n[5]; + r[6] = x[6] + n[6]; r[7] = x[7] + n[7]; pFramesOutF32[0] = r[0]; @@ -59553,33 +59890,35 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ float d[4]; float n[4]; float r[4]; - + x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c]; - y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c]; - a[0] = inTimeFrac[0] * invSampleRateOut; - d[0] = y[0] - x[0]; - n[0] = d[0] * a[0]; - r[0] = x[0] + n[0]; - x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c]; - y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c]; - a[1] = inTimeFrac[1] * invSampleRateOut; - d[1] = y[1] - x[1]; - n[1] = d[1] * a[1]; - r[1] = x[1] + n[1]; - x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c]; - y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c]; - a[2] = inTimeFrac[2] * invSampleRateOut; - d[2] = y[2] - x[2]; - n[2] = d[2] * a[2]; - r[2] = x[2] + n[2]; - x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c]; + + y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c]; 
+ y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c]; + y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c]; y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; n[3] = d[3] * a[3]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; r[3] = x[3] + n[3]; pFramesOutF32[(0 * channels) + c] = r[0]; @@ -59598,10 +59937,10 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) { if (pResampler->inTimeInt + 1 < frameCountIn) { - float a = pResampler->inTimeFrac * invSampleRateOut; + float a = (float)(pResampler->inTimeFrac * invSampleRateOut); for (c = 0; c < pResampler->channels; c += 1) { - pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[(pResampler->inTimeInt * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a); + pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a); } pFramesOutF32 += pResampler->channels; @@ -59620,7 +59959,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ the frame to ensure we make some forward progress. 
*/ for (c = 0; c < pResampler->channels; c += 1) { - pResampler->x0.f32[c] = pFramesInF32[(pResampler->inTimeInt * pResampler->channels) + c]; + pResampler->x0.f32[c] = pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c]; } pResampler->cachedFrameCount = 1; @@ -59648,6 +59987,326 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_ return MA_SUCCESS; } + +static MA_INLINE void ma_linear_resampler_filter_s16(ma_linear_resampler* pResampler, ma_int16* pFrame) +{ + ma_int32* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + ma_int32 b1 = pLPF[0]; + ma_int32 b2 = pLPF[1]; + ma_int32 a1 = pLPF[2]; + ma_int32 a2 = pLPF[3]; + ma_int32* pR0 = pLPF + 4; + ma_int32* pR1 = pLPF + 4 + pResampler->channels; + + for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { + ma_int32 r0 = pR0[iChannel]; + ma_int32 r1 = pR1[iChannel]; + ma_int32 x = pFrame[iChannel]; + ma_int32 y; + + y = (b2*x + r0) >> MA_BIQUAD_FIXED_POINT_SHIFT; + r0 = (b1*x - a1*y + r1); + r1 = (b2*x - a2*y); + + pFrame[iChannel] = (ma_int16)ma_clamp(y, -32768, 32767); + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. 
*/ + pLPF += 4 + (pResampler->channels * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_s16_4(ma_linear_resampler* pResampler, ma_int16* pFrames) +{ + ma_linear_resampler_filter_s16(pResampler, pFrames + (0 * pResampler->channels)); + ma_linear_resampler_filter_s16(pResampler, pFrames + (1 * pResampler->channels)); + ma_linear_resampler_filter_s16(pResampler, pFrames + (2 * pResampler->channels)); + ma_linear_resampler_filter_s16(pResampler, pFrames + (3 * pResampler->channels)); +} + +#if 1 +static MA_INLINE void ma_linear_resampler_filter_s16_4_stereo(ma_linear_resampler* pResampler, ma_int16* pFrames) +{ + ma_linear_resampler_filter_s16(pResampler, pFrames + 0); + ma_linear_resampler_filter_s16(pResampler, pFrames + 2); + ma_linear_resampler_filter_s16(pResampler, pFrames + 4); + ma_linear_resampler_filter_s16(pResampler, pFrames + 6); +} +#endif + +static MA_INLINE void ma_linear_resampler_filter_s32(ma_linear_resampler* pResampler, ma_uint32 channels, ma_int32* pFrame) +{ + ma_int32* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + ma_int32 b1 = pLPF[0]; + ma_int32 b2 = pLPF[1]; + ma_int32 a1 = pLPF[2]; + ma_int32 a2 = pLPF[3]; + ma_int32* pR0 = pLPF + 4; + ma_int32* pR1 = pLPF + 4 + channels; + + for (iChannel = 0; iChannel < channels; iChannel += 1) { + ma_int32 r0 = pR0[iChannel]; + ma_int32 r1 = pR1[iChannel]; + ma_int32 x = pFrame[iChannel]; + ma_int32 y; + + y = (b2*x + r0) >> MA_BIQUAD_FIXED_POINT_SHIFT; + r0 = (b1*x - a1*y + r1); + r1 = (b2*x - a2*y); + + pFrame[iChannel] = ma_clamp(y, -32768, 32767); + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. 
*/ + pLPF += 4 + (channels * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_s32_mono(ma_linear_resampler* pResampler, ma_int32* pFrame) +{ + ma_int32* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + ma_int32 b1 = pLPF[0]; + ma_int32 b2 = pLPF[1]; + ma_int32 a1 = pLPF[2]; + ma_int32 a2 = pLPF[3]; + ma_int32* pR0 = pLPF + 4; + ma_int32* pR1 = pLPF + 4 + 1; + + for (iChannel = 0; iChannel < 1; iChannel += 1) { + ma_int32 r0 = pR0[iChannel]; + ma_int32 r1 = pR1[iChannel]; + ma_int32 x = pFrame[iChannel]; + ma_int32 y; + + y = (b2*x + r0) >> MA_BIQUAD_FIXED_POINT_SHIFT; + r0 = (b1*x - a1*y + r1); + r1 = (b2*x - a2*y); + + pFrame[iChannel] = ma_clamp(y, -32768, 32767); + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. */ + pLPF += 4 + (1 * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_s32_stereo(ma_linear_resampler* pResampler, ma_int32* pFrame) +{ + ma_int32* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + ma_int32 b1 = pLPF[0]; + ma_int32 b2 = pLPF[1]; + ma_int32 a1 = pLPF[2]; + ma_int32 a2 = pLPF[3]; + ma_int32* pR0 = pLPF + 4; + ma_int32* pR1 = pLPF + 4 + 2; + + for (iChannel = 0; iChannel < 2; iChannel += 1) { + ma_int32 r0 = pR0[iChannel]; + ma_int32 r1 = pR1[iChannel]; + ma_int32 x = pFrame[iChannel]; + ma_int32 y; + + y = (b2*x + r0) >> MA_BIQUAD_FIXED_POINT_SHIFT; + r0 = (b1*x - a1*y + r1); + r1 = (b2*x - a2*y); + + pFrame[iChannel] = ma_clamp(y, -32768, 32767); + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. 
*/ + pLPF += 4 + (2 * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_s32_4(ma_linear_resampler* pResampler, ma_int32* pFrames) +{ + ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (0 * pResampler->channels)); + ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (1 * pResampler->channels)); + ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (2 * pResampler->channels)); + ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (3 * pResampler->channels)); +} + +static MA_INLINE void ma_linear_resampler_filter_s32_4_mono(ma_linear_resampler* pResampler, ma_int32* pFrames) +{ + ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 0); + ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 1); + ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 2); + ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 3); +} + +static MA_INLINE void ma_linear_resampler_filter_s32_4_stereo(ma_linear_resampler* pResampler, ma_int32* pFrames) +{ + ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 0); + ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 2); + ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 4); + ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 6); +} + +static MA_INLINE void ma_linear_resampler_filter_f32(ma_linear_resampler* pResampler, float* pFrame) +{ + float* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. 
*/ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + float b1 = pLPF[0]; + float b2 = pLPF[1]; + float a1 = pLPF[2]; + float a2 = pLPF[3]; + float* pR0 = pLPF + 4; + float* pR1 = pLPF + 4 + pResampler->channels; + + for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { + float r0 = pR0[iChannel]; + float r1 = pR1[iChannel]; + float x = pFrame[iChannel]; + float y; + + y = b2*x + r0; + r0 = b1*x - a1*y + r1; + r1 = b2*x - a2*y; + + pFrame[iChannel] = y; + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. */ + pLPF += 4 + (pResampler->channels * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_f32_mono(ma_linear_resampler* pResampler, float* pFrame) +{ + float* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + float b1 = pLPF[0]; + float b2 = pLPF[1]; + float a1 = pLPF[2]; + float a2 = pLPF[3]; + float* pR0 = pLPF + 4; + float* pR1 = pLPF + 4 + 1; + + for (iChannel = 0; iChannel < 1; iChannel += 1) { + float r0 = pR0[iChannel]; + float r1 = pR1[iChannel]; + float x = pFrame[iChannel]; + float y; + + y = b2*x + r0; + r0 = b1*x - a1*y + r1; + r1 = b2*x - a2*y; + + pFrame[iChannel] = y; + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. */ + pLPF += 4 + (1 * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_f32_stereo(ma_linear_resampler* pResampler, float* pFrame) +{ + float* pLPF; + ma_uint32 iLPF; + ma_uint32 lpfCount = pResampler->lpfOrder >> 1; /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. 
*/ + ma_uint32 iChannel; + + pLPF = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, 0); + for (iLPF = 0; iLPF < lpfCount; iLPF += 1) { + float b1 = pLPF[0]; + float b2 = pLPF[1]; + float a1 = pLPF[2]; + float a2 = pLPF[3]; + float* pR0 = pLPF + 4; + float* pR1 = pLPF + 4 + 2; + + for (iChannel = 0; iChannel < 2; iChannel += 1) { + float r0 = pR0[iChannel]; + float r1 = pR1[iChannel]; + float x = pFrame[iChannel]; + float y; + + y = b2*x + r0; + r0 = b1*x - a1*y + r1; + r1 = b2*x - a2*y; + + pFrame[iChannel] = y; + pR0[iChannel] = r0; + pR1[iChannel] = r1; + } + + /* Go do the next LPF state. */ + pLPF += 4 + (2 * 2); + } +} + +static MA_INLINE void ma_linear_resampler_filter_f32_4(ma_linear_resampler* pResampler, float* pFrames) +{ + ma_linear_resampler_filter_f32(pResampler, pFrames + (0 * pResampler->channels)); + ma_linear_resampler_filter_f32(pResampler, pFrames + (1 * pResampler->channels)); + ma_linear_resampler_filter_f32(pResampler, pFrames + (2 * pResampler->channels)); + ma_linear_resampler_filter_f32(pResampler, pFrames + (3 * pResampler->channels)); +} + +static MA_INLINE void ma_linear_resampler_filter_f32_4_mono(ma_linear_resampler* pResampler, float* pFrames) +{ + ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 0); + ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 1); + ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 2); + ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 3); +} + +static MA_INLINE void ma_linear_resampler_filter_f32_4_stereo(ma_linear_resampler* pResampler, float* pFrames) +{ + ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 0); + ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 2); + ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 4); + ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 6); +} + + static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* 
pFramesOut, ma_uint64* pFrameCountOut) { const ma_int16* pFramesInS16; @@ -59656,6 +60315,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear ma_uint64 frameCountOut; ma_uint64 framesProcessedIn; ma_uint64 framesProcessedOut; + ma_uint32 c; ma_uint32 invSampleRateOut; MA_ASSERT(pResampler != NULL); @@ -59674,27 +60334,18 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear return ma_linear_resampler_process_pcm_frames_s16_no_lpf(pResampler, pFramesInS16, pFrameCountIn, pFramesOutS16, pFrameCountOut, invSampleRateOut); } else { while (framesProcessedOut < frameCountOut) { + ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut; + /* Before interpolating we need to load the buffers. When doing this we need to ensure we run every input sample through the filter. */ while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) { - ma_uint32 iChannel; - - if (pFramesInS16 != NULL) { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel]; - pResampler->x1.s16[iChannel] = pFramesInS16[iChannel]; - } - pFramesInS16 += pResampler->channels; - } else { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel]; - pResampler->x1.s16[iChannel] = 0; - } + for (c = 0; c < pResampler->channels; c += 1) { + pResampler->x0.s16[c] = pResampler->x1.s16[c]; + pResampler->x1.s16[c] = pFramesInS16[c]; } + pFramesInS16 += pResampler->channels; - /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */ - if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) { - ma_lpf_process_pcm_frame_s16(&pResampler->lpf, pResampler->x1.s16, pResampler->x1.s16); - } + /* Filter. 
*/ + ma_linear_resampler_filter_s16(pResampler, pResampler->x1.s16); framesProcessedIn += 1; pResampler->inTimeInt -= 1; @@ -59705,12 +60356,10 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear } /* Getting here means the frames have been loaded and filtered and we can generate the next output frame. */ - if (pFramesOutS16 != NULL) { - MA_ASSERT(pResampler->inTimeInt == 0); - ma_linear_resampler_interpolate_frame_s16(pResampler, invSampleRateOut, pFramesOutS16); - - pFramesOutS16 += pResampler->channels; + for (c = 0; c < pResampler->channels; c += 1) { + pFramesOutS16[c] = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a); } + pFramesOutS16 += pResampler->channels; framesProcessedOut += 1; @@ -59738,6 +60387,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r ma_uint64 frameCountOut; ma_uint64 framesProcessedIn; ma_uint64 framesProcessedOut; + ma_uint32 c; ma_uint32 invSampleRateOut; MA_ASSERT(pResampler != NULL); @@ -59760,23 +60410,324 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r return ma_linear_resampler_process_pcm_frames_s16_no_lpf(pResampler, pFramesInS16, pFrameCountIn, pFramesOutS16, pFrameCountOut, invSampleRateOut); } else { /* Slow path. Need LPF. */ + + #if 1 + /* If there's a cached frame we need to process it. */ + if (pResampler->inTimeInt == 0) { + MA_ASSERT(pResampler->cachedFrameCount <= 1); /* There is at most one cached frame. */ + + while (pResampler->cachedFrameCount > 0 && frameCountIn > 0 && framesProcessedOut < frameCountOut) { + ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut; + + for (c = 0; c < pResampler->channels; c += 1) { + pFramesOutS16[c] = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pFramesInS16[c], a); + } + ma_linear_resampler_filter_s16(pResampler, pFramesOutS16); + + pFramesOutS16 += pResampler->channels; + + framesProcessedOut += 1; + + /* Advance time forward. 
*/ + pResampler->inTimeInt += pResampler->inAdvanceInt; + pResampler->inTimeFrac += pResampler->inAdvanceFrac; + if (pResampler->inTimeFrac >= pResampler->sampleRateOut) { + pResampler->inTimeFrac -= pResampler->sampleRateOut; + pResampler->inTimeInt += 1; + } + + /* Subtract one from the time to account for the cached frame, but only if the entire frame was processed. */ + if (pResampler->inTimeInt > 0) { + pResampler->inTimeInt -= 1; + pResampler->cachedFrameCount = 0; + } + } + } else { + /* The rate must have changed between calls. Ignore the cached frame. */ + } + + /* Experimental loop unrolling to make it easier for SIMD-ification. */ + #if 1 + { + ma_uint32 channels = pResampler->channels; + + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt[4]; + ma_uint32 inTimeFrac[4]; + int i; + + inTimeIntTemp = pResampler->inTimeInt; + inTimeFracTemp = pResampler->inTimeFrac; + + for (i = 0; i < 4; i += 1) { + inTimeInt[i] = inTimeIntTemp; + inTimeFrac[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. */ + if (inTimeInt[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + pResampler->inTimeInt = inTimeIntTemp; + pResampler->inTimeFrac = inTimeFracTemp; + + /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. 
*/ + { + if (channels == 1) { + ma_int32 x[4]; + ma_int32 y[4]; + ma_int32 a[4]; + ma_int32 d[4]; + ma_int32 n[4]; + ma_int32 r[4]; + + x[0] = pFramesInS16[inTimeInt[0] + 0]; + x[1] = pFramesInS16[inTimeInt[1] + 0]; + x[2] = pFramesInS16[inTimeInt[2] + 0]; + x[3] = pFramesInS16[inTimeInt[3] + 0]; + + y[0] = pFramesInS16[inTimeInt[0] + 1]; + y[1] = pFramesInS16[inTimeInt[1] + 1]; + y[2] = pFramesInS16[inTimeInt[2] + 1]; + y[3] = pFramesInS16[inTimeInt[3] + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; + a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + ma_linear_resampler_filter_s32_4_mono(pResampler, r); + + pFramesOutS16[0] = (ma_int16)r[0]; + pFramesOutS16[1] = (ma_int16)r[1]; + pFramesOutS16[2] = (ma_int16)r[2]; + pFramesOutS16[3] = (ma_int16)r[3]; + } else if (channels == 2) { + ma_int32 x[8]; + ma_int32 y[8]; + ma_int32 a[8]; + ma_int32 d[8]; + ma_int32 n[8]; + ma_int32 r[8]; + + x[0] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 0]; + x[1] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 1]; + x[2] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 0]; + x[3] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 1]; + x[4] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 0]; + x[5] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 1]; + x[6] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 0]; + x[7] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 1]; + + y[0] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 0]; + y[1] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 1]; + y[2] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 0]; + y[3] = 
pFramesInS16[((inTimeInt[1] + 1) * 2) + 1]; + y[4] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 0]; + y[5] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 1]; + y[6] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 0]; + y[7] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[0] * invSampleRateOut; + a[2] = inTimeFrac[1] * invSampleRateOut; + a[3] = inTimeFrac[1] * invSampleRateOut; + a[4] = inTimeFrac[2] * invSampleRateOut; + a[5] = inTimeFrac[2] * invSampleRateOut; + a[6] = inTimeFrac[3] * invSampleRateOut; + a[7] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + d[4] = y[4] - x[4]; + d[5] = y[5] - x[5]; + d[6] = y[6] - x[6]; + d[7] = y[7] - x[7]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + n[4] = d[4] * a[4]; + n[5] = d[5] * a[5]; + n[6] = d[6] * a[6]; + n[7] = d[7] * a[7]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[4] = x[4] + (n[4] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[5] = x[5] + (n[5] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + ma_linear_resampler_filter_s32_4_stereo(pResampler, r); + + pFramesOutS16[0] = (ma_int16)r[0]; + pFramesOutS16[1] = (ma_int16)r[1]; + pFramesOutS16[2] = (ma_int16)r[2]; + pFramesOutS16[3] = (ma_int16)r[3]; + pFramesOutS16[4] = (ma_int16)r[4]; + pFramesOutS16[5] = (ma_int16)r[5]; + pFramesOutS16[6] = (ma_int16)r[6]; + pFramesOutS16[7] = (ma_int16)r[7]; + } else { + for (c = 0; c < channels; c += 1) { + ma_int32 x[4]; + ma_int32 y[4]; + ma_int32 a[4]; + ma_int32 d[4]; + ma_int32 n[4]; + ma_int32 r[4]; + + x[0] = pFramesInS16[((inTimeInt[0] + 0) * channels) + c]; + 
x[1] = pFramesInS16[((inTimeInt[1] + 0) * channels) + c]; + x[2] = pFramesInS16[((inTimeInt[2] + 0) * channels) + c]; + x[3] = pFramesInS16[((inTimeInt[3] + 0) * channels) + c]; + + y[0] = pFramesInS16[((inTimeInt[0] + 1) * channels) + c]; + y[1] = pFramesInS16[((inTimeInt[1] + 1) * channels) + c]; + y[2] = pFramesInS16[((inTimeInt[2] + 1) * channels) + c]; + y[3] = pFramesInS16[((inTimeInt[3] + 1) * channels) + c]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; + a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT); + + pFramesOutS16[(0 * channels) + c] = (ma_int16)r[0]; + pFramesOutS16[(1 * channels) + c] = (ma_int16)r[1]; + pFramesOutS16[(2 * channels) + c] = (ma_int16)r[2]; + pFramesOutS16[(3 * channels) + c] = (ma_int16)r[3]; + } + + ma_linear_resampler_filter_s16_4(pResampler, pFramesOutS16); + } + + pFramesOutS16 += 4 * channels; + framesProcessedOut += 4; + } + } + } + #endif + + while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) { + if (pResampler->inTimeInt + 1 < frameCountIn) { + ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut; + + for (c = 0; c < pResampler->channels; c += 1) { + pFramesOutS16[c] = ma_linear_resampler_mix_s16(pFramesInS16[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInS16[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a); + } + ma_linear_resampler_filter_s16(pResampler, pFramesOutS16); + + pFramesOutS16 += pResampler->channels; + + framesProcessedOut += 1; + + /* Advance time forward. 
*/ + pResampler->inTimeInt += pResampler->inAdvanceInt; + pResampler->inTimeFrac += pResampler->inAdvanceFrac; + if (pResampler->inTimeFrac >= pResampler->sampleRateOut) { + pResampler->inTimeFrac -= pResampler->sampleRateOut; + pResampler->inTimeInt += 1; + } + } else { + /* + There are not enough input frames to interpolate. We'll need to stop here. But it's important that we cache + the frame to ensure we make some forward progress. + */ + for (c = 0; c < pResampler->channels; c += 1) { + pResampler->x0.s16[c] = pFramesInS16[((pResampler->inTimeInt + 0) * pResampler->channels) + c]; + } + + pResampler->cachedFrameCount = 1; + pResampler->inTimeInt += 1; + + break; + } + } + + /* The number of frames we processed is simply the difference between our current time and previous time, clamped. */ + framesProcessedIn = pResampler->inTimeInt; + if (framesProcessedIn > frameCountIn) { /* Should never overshoot when upsampling. Downsampling could overshoot. */ + framesProcessedIn = frameCountIn; + } + + if (pResampler->inTimeInt >= framesProcessedIn) { + pResampler->inTimeInt -= framesProcessedIn; + } else { + pResampler->inTimeInt = 0; + } + + /* + No matter what, we want to cache the last input frame. The reason is that if the sample rate changes from upsampling to downsampling, the + downsampling process will be expecting an input frame. + */ + if (framesProcessedIn > 0) { + for (c = 0; c < pResampler->channels; c += 1) { + pResampler->x1.s16[c] = pFramesInS16[((framesProcessedIn - 1) * pResampler->channels) + c]; + } + } + + *pFrameCountIn = framesProcessedIn; + *pFrameCountOut = framesProcessedOut; + + return MA_SUCCESS; + #else while (framesProcessedOut < frameCountOut) { /* Before interpolating we need to load the buffers. 
*/ while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) { ma_uint32 iChannel; - if (pFramesInS16 != NULL) { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel]; - pResampler->x1.s16[iChannel] = pFramesInS16[iChannel]; - } - pFramesInS16 += pResampler->channels; - } else { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel]; - pResampler->x1.s16[iChannel] = 0; - } + for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { + pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel]; + pResampler->x1.s16[iChannel] = pFramesInS16[iChannel]; } + pFramesInS16 += pResampler->channels; framesProcessedIn += 1; pResampler->inTimeInt -= 1; @@ -59791,10 +60742,8 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r MA_ASSERT(pResampler->inTimeInt == 0); ma_linear_resampler_interpolate_frame_s16(pResampler, invSampleRateOut, pFramesOutS16); - /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */ - if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) { - ma_lpf_process_pcm_frame_s16(&pResampler->lpf, pFramesOutS16, pFramesOutS16); - } + /* Filter. 
*/ + ma_linear_resampler_filter_s16(pResampler, pFramesOutS16); pFramesOutS16 += pResampler->channels; } @@ -59814,6 +60763,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r *pFrameCountOut = framesProcessedOut; return MA_SUCCESS; + #endif } } @@ -59837,6 +60787,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear ma_uint64 frameCountOut; ma_uint64 framesProcessedIn; ma_uint64 framesProcessedOut; + ma_uint32 c; float invSampleRateOut; MA_ASSERT(pResampler != NULL); @@ -59856,28 +60807,21 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear return ma_linear_resampler_process_pcm_frames_f32_no_lpf(pResampler, pFramesInF32, pFrameCountIn, pFramesOutF32, pFrameCountOut, invSampleRateOut); } else { /* Slow path. Need LPF. */ + /*printf("DOWN\n");*/ + while (framesProcessedOut < frameCountOut) { + float a = (float)(pResampler->inTimeFrac * invSampleRateOut); + /* Before interpolating we need to load the buffers. When doing this we need to ensure we run every input sample through the filter. */ while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) { - ma_uint32 iChannel; - - if (pFramesInF32 != NULL) { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel]; - pResampler->x1.f32[iChannel] = pFramesInF32[iChannel]; - } - pFramesInF32 += pResampler->channels; - } else { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel]; - pResampler->x1.f32[iChannel] = 0; - } + for (c = 0; c < pResampler->channels; c += 1) { + pResampler->x0.f32[c] = pResampler->x1.f32[c]; + pResampler->x1.f32[c] = pFramesInF32[c]; } + pFramesInF32 += pResampler->channels; - /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. 
*/ - if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) { - ma_lpf_process_pcm_frame_f32(&pResampler->lpf, pResampler->x1.f32, pResampler->x1.f32); - } + /* Filter. */ + ma_linear_resampler_filter_f32(pResampler, pResampler->x1.f32); framesProcessedIn += 1; pResampler->inTimeInt -= 1; @@ -59888,12 +60832,10 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear } /* Getting here means the frames have been loaded and filtered and we can generate the next output frame. */ - if (pFramesOutF32 != NULL) { - MA_ASSERT(pResampler->inTimeInt == 0); - ma_linear_resampler_interpolate_frame_f32(pResampler, invSampleRateOut, pFramesOutF32); - - pFramesOutF32 += pResampler->channels; + for (c = 0; c < pResampler->channels; c += 1) { + pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a); } + pFramesOutF32 += pResampler->channels; framesProcessedOut += 1; @@ -59921,7 +60863,8 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r ma_uint64 frameCountOut; ma_uint64 framesProcessedIn; ma_uint64 framesProcessedOut; - float invSampleRateOut; + ma_uint32 c; + double invSampleRateOut; MA_ASSERT(pResampler != NULL); MA_ASSERT(pFrameCountIn != NULL); @@ -59933,30 +60876,332 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r frameCountOut = *pFrameCountOut; framesProcessedIn = 0; framesProcessedOut = 0; - invSampleRateOut = 1.0f / pResampler->sampleRateOut; + invSampleRateOut = (1.0 / pResampler->sampleRateOut); if (pResampler->lpfOrder == 0) { /* Fast path. No LPF needed. */ return ma_linear_resampler_process_pcm_frames_f32_no_lpf(pResampler, pFramesInF32, pFrameCountIn, pFramesOutF32, pFrameCountOut, invSampleRateOut); } else { /* Slow path. Need LPF. */ + /*printf("UP %u\n", pResampler->inTimeInt);*/ + + #if 1 + /* If there's a cached frame we need to process it. 
*/ + if (pResampler->inTimeInt == 0) { + MA_ASSERT(pResampler->cachedFrameCount <= 1); /* There is at most one cached frame. */ + + while (pResampler->cachedFrameCount > 0 && frameCountIn > 0 && framesProcessedOut < frameCountOut) { + float a = (float)(pResampler->inTimeFrac * invSampleRateOut); + + for (c = 0; c < pResampler->channels; c += 1) { + pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pFramesInF32[c], a); + } + ma_linear_resampler_filter_f32(pResampler, pFramesOutF32); + + pFramesOutF32 += pResampler->channels; + + framesProcessedOut += 1; + + /* Advance time forward. */ + pResampler->inTimeInt += pResampler->inAdvanceInt; + pResampler->inTimeFrac += pResampler->inAdvanceFrac; + if (pResampler->inTimeFrac >= pResampler->sampleRateOut) { + pResampler->inTimeFrac -= pResampler->sampleRateOut; + pResampler->inTimeInt += 1; + } + + /* Subtract one from the time to account for the cached frame, but only if the entire frame was processed. */ + if (pResampler->inTimeInt > 0) { + pResampler->inTimeInt -= 1; + pResampler->cachedFrameCount = 0; + } + } + } else { + /* The rate must have changed between calls. Ignore the cached frame. */ + } + + /* Experimental loop unrolling to make it easier for SIMD-ification. */ + #if 1 + { + ma_uint32 channels = pResampler->channels; + + while (framesProcessedOut + 4 <= frameCountOut) { + ma_uint32 inTimeIntTemp; + ma_uint32 inTimeFracTemp; + ma_uint32 inTimeInt[4]; + ma_uint32 inTimeFrac[4]; + int i; + + inTimeIntTemp = pResampler->inTimeInt; + inTimeFracTemp = pResampler->inTimeFrac; + + for (i = 0; i < 4; i += 1) { + inTimeInt[i] = inTimeIntTemp; + inTimeFrac[i] = inTimeFracTemp; + + inTimeIntTemp += pResampler->inAdvanceInt; + inTimeFracTemp += pResampler->inAdvanceFrac; + if (inTimeFracTemp >= pResampler->sampleRateOut) { + inTimeFracTemp -= pResampler->sampleRateOut; + inTimeIntTemp += 1; + } + } + + /* Check that we have one extra sample at the end for doing the interpolation. 
*/ + if (inTimeInt[3] + 1 >= frameCountIn) { + break; /* Not enough input frames. */ + } + + /* Advance the timer. */ + pResampler->inTimeInt = inTimeIntTemp; + pResampler->inTimeFrac = inTimeFracTemp; + + /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */ + { + if (channels == 1) { + float x[4]; + float y[4]; + float a[4]; + float d[4]; + float n[4]; + float r[4]; + + x[0] = pFramesInF32[inTimeInt[0] + 0]; + x[1] = pFramesInF32[inTimeInt[1] + 0]; + x[2] = pFramesInF32[inTimeInt[2] + 0]; + x[3] = pFramesInF32[inTimeInt[3] + 0]; + + y[0] = pFramesInF32[inTimeInt[0] + 1]; + y[1] = pFramesInF32[inTimeInt[1] + 1]; + y[2] = pFramesInF32[inTimeInt[2] + 1]; + y[3] = pFramesInF32[inTimeInt[3] + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; + a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; + r[3] = x[3] + n[3]; + + ma_linear_resampler_filter_f32_4_mono(pResampler, r); + + pFramesOutF32[0] = r[0]; + pFramesOutF32[1] = r[1]; + pFramesOutF32[2] = r[2]; + pFramesOutF32[3] = r[3]; + } else if (channels == 2) { + float x[8]; + float y[8]; + float a[8]; + float d[8]; + float n[8]; + float r[8]; + + x[0] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 0]; + x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1]; + x[2] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 0]; + x[3] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 1]; + x[4] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 0]; + x[5] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 1]; + x[6] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 0]; + x[7] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 1]; + + y[0] = pFramesInF32[((inTimeInt[0] + 1) 
* 2) + 0]; + y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1]; + y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0]; + y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1]; + y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0]; + y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1]; + y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0]; + y[7] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 1]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[0] * invSampleRateOut; + a[2] = inTimeFrac[1] * invSampleRateOut; + a[3] = inTimeFrac[1] * invSampleRateOut; + a[4] = inTimeFrac[2] * invSampleRateOut; + a[5] = inTimeFrac[2] * invSampleRateOut; + a[6] = inTimeFrac[3] * invSampleRateOut; + a[7] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + d[4] = y[4] - x[4]; + d[5] = y[5] - x[5]; + d[6] = y[6] - x[6]; + d[7] = y[7] - x[7]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + n[4] = d[4] * a[4]; + n[5] = d[5] * a[5]; + n[6] = d[6] * a[6]; + n[7] = d[7] * a[7]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; + r[3] = x[3] + n[3]; + r[4] = x[4] + n[4]; + r[5] = x[5] + n[5]; + r[6] = x[6] + n[6]; + r[7] = x[7] + n[7]; + + ma_linear_resampler_filter_f32_4_stereo(pResampler, r); + + pFramesOutF32[0] = r[0]; + pFramesOutF32[1] = r[1]; + pFramesOutF32[2] = r[2]; + pFramesOutF32[3] = r[3]; + pFramesOutF32[4] = r[4]; + pFramesOutF32[5] = r[5]; + pFramesOutF32[6] = r[6]; + pFramesOutF32[7] = r[7]; + } else { + for (c = 0; c < channels; c += 1) { + float x[4]; + float y[4]; + float a[4]; + float d[4]; + float n[4]; + float r[4]; + + x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c]; + x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c]; + x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c]; + x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c]; + + y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c]; + y[1] = 
pFramesInF32[((inTimeInt[1] + 1) * channels) + c]; + y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c]; + y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c]; + + a[0] = inTimeFrac[0] * invSampleRateOut; + a[1] = inTimeFrac[1] * invSampleRateOut; + a[2] = inTimeFrac[2] * invSampleRateOut; + a[3] = inTimeFrac[3] * invSampleRateOut; + + d[0] = y[0] - x[0]; + d[1] = y[1] - x[1]; + d[2] = y[2] - x[2]; + d[3] = y[3] - x[3]; + + n[0] = d[0] * a[0]; + n[1] = d[1] * a[1]; + n[2] = d[2] * a[2]; + n[3] = d[3] * a[3]; + + r[0] = x[0] + n[0]; + r[1] = x[1] + n[1]; + r[2] = x[2] + n[2]; + r[3] = x[3] + n[3]; + + pFramesOutF32[(0 * channels) + c] = r[0]; + pFramesOutF32[(1 * channels) + c] = r[1]; + pFramesOutF32[(2 * channels) + c] = r[2]; + pFramesOutF32[(3 * channels) + c] = r[3]; + } + + ma_linear_resampler_filter_f32_4(pResampler, pFramesOutF32); + } + + pFramesOutF32 += 4 * channels; + framesProcessedOut += 4; + } + } + } + #endif + + while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) { + if (pResampler->inTimeInt + 1 < frameCountIn) { + float a = (float)(pResampler->inTimeFrac * invSampleRateOut); + + for (c = 0; c < pResampler->channels; c += 1) { + pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a); + } + ma_linear_resampler_filter_f32(pResampler, pFramesOutF32); + + pFramesOutF32 += pResampler->channels; + + framesProcessedOut += 1; + + /* Advance time forward. */ + pResampler->inTimeInt += pResampler->inAdvanceInt; + pResampler->inTimeFrac += pResampler->inAdvanceFrac; + if (pResampler->inTimeFrac >= pResampler->sampleRateOut) { + pResampler->inTimeFrac -= pResampler->sampleRateOut; + pResampler->inTimeInt += 1; + } + } else { + /* + There are not enough input frames to interpolate. We'll need to stop here. But it's important that we cache + the frame to ensure we make some forward progress. 
+ */ + for (c = 0; c < pResampler->channels; c += 1) { + pResampler->x0.f32[c] = pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c]; + } + + pResampler->cachedFrameCount = 1; + pResampler->inTimeInt += 1; + + break; + } + } + + /* The number of frames we processed is simply the difference between our current time and previous time, clamped. */ + framesProcessedIn = pResampler->inTimeInt; + if (framesProcessedIn > frameCountIn) { /* Should never overshoot when upsampling. Downsampling could overshoot. */ + framesProcessedIn = frameCountIn; + } + + if (pResampler->inTimeInt >= framesProcessedIn) { + pResampler->inTimeInt -= framesProcessedIn; + } else { + pResampler->inTimeInt = 0; + } + + /* + No matter what, we want to cache the last input frame. The reason is that if the sample rate changes from upsampling to downsampling, the + downsampling process will be expecting an input frame. + */ + if (framesProcessedIn > 0) { + for (c = 0; c < pResampler->channels; c += 1) { + pResampler->x1.f32[c] = pFramesInF32[((framesProcessedIn - 1) * pResampler->channels) + c]; + } + } + + *pFrameCountIn = framesProcessedIn; + *pFrameCountOut = framesProcessedOut; + + return MA_SUCCESS; + #else while (framesProcessedOut < frameCountOut) { /* Before interpolating we need to load the buffers. 
*/ while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) { ma_uint32 iChannel; - if (pFramesInF32 != NULL) { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel]; - pResampler->x1.f32[iChannel] = pFramesInF32[iChannel]; - } - pFramesInF32 += pResampler->channels; - } else { - for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { - pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel]; - pResampler->x1.f32[iChannel] = 0; - } + for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) { + pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel]; + pResampler->x1.f32[iChannel] = pFramesInF32[iChannel]; } + pFramesInF32 += pResampler->channels; framesProcessedIn += 1; pResampler->inTimeInt -= 1; @@ -59971,10 +61216,8 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r MA_ASSERT(pResampler->inTimeInt == 0); ma_linear_resampler_interpolate_frame_f32(pResampler, invSampleRateOut, pFramesOutF32); - /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */ - if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) { - ma_lpf_process_pcm_frame_f32(&pResampler->lpf, pFramesOutF32, pFramesOutF32); - } + /* Filter. */ + ma_linear_resampler_filter_f32(pResampler, pFramesOutF32); pFramesOutF32 += pResampler->channels; } @@ -59994,6 +61237,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r *pFrameCountOut = framesProcessedOut; return MA_SUCCESS; + #endif } } @@ -60045,7 +61289,7 @@ MA_API ma_result ma_linear_resampler_set_rate_ratio(ma_linear_resampler* pResamp return MA_INVALID_ARGS; } - d = 1000000; + d = 1000; n = (ma_uint32)(ratioInOut * d); if (n == 0) {