diff --git a/miniaudio.h b/miniaudio.h
index cd24f932..e65d0cf4 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -3001,6 +3001,19 @@ Due to the nature of how resampling works, the resampler introduces some latency
 retrieved in terms of both the input rate and the output rate with
 `ma_resampler_get_input_latency()` and `ma_resampler_get_output_latency()`.
 
+Below are some guidelines for some common scenarios:
+
+    +-----------------------------------------------+-------------------------------------+
+    | Scenario                                      | Usage                               |
+    +-----------------------------------------------+-------------------------------------+
+    | Fastest possible, quality not an issue        | Linear, LPF == 0                    |
+    | Fast, decent quality, infrequent rate changes | Linear, LPF >= 4                    |
+    | Decent quality, frequent rate changes         | Don't use miniaudio for resampling  |
+    | High quality                                  | Don't use miniaudio for resampling  |
+    +-----------------------------------------------+-------------------------------------+
+
+See below for how to configure the linear resampler's low-pass filter (LPF).
+
 
 10.3.1. Resampling Algorithms
 -----------------------------
@@ -3019,11 +3032,13 @@ low-pass filter will be applied before downsampling. When increasing the rate it
 after upsampling. By default a fourth order low-pass filter will be applied. This can be configured
 via the `lpfOrder` configuration variable. Setting this to 0 will disable filtering. It should be
 set to a multiple of 2, such as 2, 4, 6, 8, etc. There are diminishing returns the higher you go.
-The maximum is `MA_MAX_FILTER_ORDER` which is 8 by default (it can be configured at compile time).
 
 The low-pass filter has a cutoff frequency which defaults to half the sample rate of the lowest of
 the input and output sample rates (Nyquist Frequency).
 
+When the low-pass filter is disabled (`lpfOrder` = 0), the resampler will run on an optimized code
+path and should be efficient.
+
 The API for the linear resampler is the same as the main resampler API, only it's called
 `ma_linear_resampler`.
 
@@ -3033,6 +3048,15 @@ for the input and output sample rates. It's OK to do make it large like 8000 ->
 OK to use stupid numbers like 8000 -> 89999. If you need ratios like this, use the f32 path, or use
 a different resampler.
 
+The linear resampler can support dynamic rate adjustments, but there are some tradeoffs to be aware
+of. When not using low-pass filtering (LPF order = 0), rate changes work fine, however when the
+low-pass filter is enabled (LPF order > 0), rate changes can sometimes result in some
+discontinuities due to abrupt changes to the low-pass filter parameters. If you keep rate changes
+small and infrequent it can sound decent, but you should do your own testing to ensure it meets
+your specification. If you need a high quality resampler with support for dynamic rate adjustment
+you should avoid using miniaudio's linear resampler.
+
+
 
 10.3.2. Custom Resamplers
 -------------------------
@@ -3072,7 +3096,8 @@ The `onGetRequiredInputFrameCount` callback is used to give miniaudio a hint as
 frames are required to be available to produce the given number of output frames. Likewise, the
 `onGetExpectedOutputFrameCount` callback is used to determine how many output frames will be
 produced given the specified number of input frames. miniaudio will use these as a hint, but they
-are optional and can be set to NULL if you're unable to implement them.
+are optional and can be set to NULL if you're unable to implement them. The returned values should
+be 100% accurate. If the best you can do is an estimate, do not implement these callbacks at all.
 
 
 
@@ -5613,6 +5638,34 @@ typedef struct
     } x1; /* The next input frame. */
     ma_lpf lpf;
 
+    /*
+    We have some heap allocated data for the sample cache and the LPF state. This is an array of
+    either floats or int32s depending on whether `format` is f32 or s16. Below is the
+    structure:
+
+        | Cached Samples | `channels`                                |
+        | LPF State      | (`lpfOrder` / 2) * (4 + (`channels` * 2)) |
+
+    The low-pass filter is achieved with a series of 2nd-order biquads. This means there is one
+    LPF state for each `lpfOrder`, divided by two. So if `lpfOrder` is 4, there will be 2 LPF
+    states in the array. The structure of each LPF state is as follows:
+
+        | Biquad b1         | 1          |
+        | Biquad b2         | 1          |
+        | Biquad a1         | 1          |
+        | Biquad a2         | 1          |
+        | Biquad register 1 | `channels` |
+        | Biquad register 2 | `channels` |
+
+    If you are familiar with biquads, you'll note that b0 and a0 are missing. This is because b0
+    is set to the same value as b2, so we just reuse b2, and a0 is just not used.
+    */
+    union
+    {
+        float*    f32;
+        ma_int32* s32;
+    } heap;
+
     /* Memory management. */
     void* _pHeap;
     ma_bool32 _ownsHeap;
@@ -58904,6 +58957,8 @@ typedef struct
     size_t x0Offset;
     size_t x1Offset;
     size_t lpfOffset;
+    size_t cachedSamplesOffset;
+    size_t lpfStateOffset;
 } ma_linear_resampler_heap_layout;
 
 
@@ -58925,6 +58980,10 @@ static void ma_linear_resampler_adjust_timer_for_new_rate(ma_linear_resampler* p
     pResampler->inTimeFrac = pResampler->inTimeFrac % newSampleRateOut;
 }
 
+/* Yields a pointer to LPF state `lpfIndex`. The `channels` cached samples at the start of the heap are unrelated to the LPF and are skipped; each LPF state is 4 coefficients plus 2 registers per channel. Arguments are evaluated more than once. */
+#define MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, lpfIndex) ((pResampler)->heap.f32 + (pResampler)->channels + ((lpfIndex) * (4 + ((pResampler)->channels*2))))
+#define MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, lpfIndex) ((pResampler)->heap.s32 + (pResampler)->channels + ((lpfIndex) * (4 + ((pResampler)->channels*2))))
+
 static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pResampler, void* pHeap, ma_linear_resampler_heap_layout* pHeapLayout, ma_uint32 sampleRateIn, ma_uint32 sampleRateOut, ma_bool32 isResamplerAlreadyInitialized)
 {
     ma_result result;
@@ -58933,6 +58992,10 @@ static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pRes
     double lpfCutoffFrequency;
     ma_lpf_config lpfConfig;
     ma_uint32 oldSampleRateOut; /* Required for adjusting time advance down the bottom. */
+    ma_uint32 minSampleRate;
+    ma_uint32 maxSampleRate;
+    ma_uint32 lpfCount;
+    ma_uint32 iLPF;
 
     if (pResampler == NULL) {
         return MA_INVALID_ARGS;
@@ -58952,14 +59015,78 @@ static ma_result ma_linear_resampler_set_rate_internal(ma_linear_resampler* pRes
     pResampler->sampleRateIn  /= gcf;
     pResampler->sampleRateOut /= gcf;
 
-    /* Always initialize the low-pass filter, even when the order is 0. */
-    if (pResampler->lpfOrder > MA_MAX_FILTER_ORDER) {
-        return MA_INVALID_ARGS;
+    /* LPF. */
+    minSampleRate = ma_min(pResampler->sampleRateIn, pResampler->sampleRateOut);
+    maxSampleRate = ma_max(pResampler->sampleRateIn, pResampler->sampleRateOut);
+
+    lpfSampleRate      = maxSampleRate;
+    lpfCutoffFrequency = minSampleRate * 0.5 * pResampler->lpfNyquistFactor;
+
+    /*
+    When the input and output sample rates are the same, there's an edge case with the way the filter works
+    where we could have a singularity due to `sin(2*pi * cutoff/rate) = sin(pi) = 0`. I'm going to apply
+    a small clamp in an attempt to avoid hitting this case.
+    */
+    lpfCutoffFrequency = ma_min(lpfCutoffFrequency, 0.5 * minSampleRate * (1.0 - 1e-6));
+    lpfCutoffFrequency = ma_max(lpfCutoffFrequency, minSampleRate * 1e-6);
+
+    /* We now need to update our LPF parameters. */
+    lpfCount = pResampler->lpfOrder / 2;
+    for (iLPF = 0; iLPF < lpfCount; iLPF += 1) {
+        /*
+        For our Q value, it's very tempting to just use 0.707107 but that won't actually result in a true
+        Butterworth filter when the order is > 2. Instead it needs to be distributed.
+        */
+        double q = 1 / (2*ma_cosd((1 + iLPF*2) * (MA_PI_D/(pResampler->lpfOrder*2))));  /* <-- This is just distributing 0.707107 over each of our cascading filters. */
+        double w = 2 * MA_PI_D * lpfCutoffFrequency / lpfSampleRate;
+        double s = ma_sind(w);
+        double c = ma_cosd(w);
+        double a = s / (2*q);
+        double b1 =  1 - c;
+        double b2 = (1 - c) / 2;
+        double a0 =  1 + a; /* Only used for normalizing below. */
+        double a1 = -2 * c;
+        double a2 =  1 - a;
+
+        /* Biquad parameters need to be normalized. */
+        b1 /= a0;
+        b2 /= a0;
+        a1 /= a0;
+        a2 /= a0;
+
+        if (pResampler->format == ma_format_f32) {
+            float* pLPF = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, iLPF);
+
+            /* We can now initialize our biquad parameters. */
+            pLPF[0] = b1;
+            pLPF[1] = b2;
+            pLPF[2] = a1;
+            pLPF[3] = a2;
+
+            /*
+            For safety, make sure the registers are cleared if this is being called because the resampler
+            is being initialized fresh. If it's already been initialized, we must not clear out the LPF
+            state or else we'll get glitching. We want to have smooth transitions between rate changes.
+            */
+            if (!isResamplerAlreadyInitialized) {
+                MA_ZERO_MEMORY(pLPF + 4, sizeof(float) * pResampler->channels * 2);
+            }
+        } else {
+            ma_int32* pLPF = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, iLPF);
+
+            pLPF[0] = ma_biquad_float_to_fp(b1);
+            pLPF[1] = ma_biquad_float_to_fp(b2);
+            pLPF[2] = ma_biquad_float_to_fp(a1);
+            pLPF[3] = ma_biquad_float_to_fp(a2);
+
+            if (!isResamplerAlreadyInitialized) {
+                MA_ZERO_MEMORY(pLPF + 4, sizeof(ma_int32) * pResampler->channels * 2);
+            }
+        }
     }
 
-    lpfSampleRate      = (ma_uint32)(ma_max(pResampler->sampleRateIn, pResampler->sampleRateOut));
-    lpfCutoffFrequency = (   double)(ma_min(pResampler->sampleRateIn, pResampler->sampleRateOut) * 0.5 * pResampler->lpfNyquistFactor);
 
+    /* Old LPF. Will be removed later. */
     lpfConfig = ma_lpf_config_init(pResampler->format, pResampler->channels, lpfSampleRate, lpfCutoffFrequency, pResampler->lpfOrder);
 
     /*
@@ -59029,8 +59156,11 @@ static ma_result ma_linear_resampler_get_heap_layout(const ma_linear_resampler_c
         pHeapLayout->sizeInBytes += sizeof(ma_int16) * pConfig->channels;
     }
 
+    pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
+
+
     /* LPF */
-    pHeapLayout->lpfOffset = ma_align_64(pHeapLayout->sizeInBytes);
+    pHeapLayout->lpfOffset = pHeapLayout->sizeInBytes;
     {
         ma_result result;
         size_t lpfHeapSizeInBytes;
@@ -59044,6 +59174,20 @@ static ma_result ma_linear_resampler_get_heap_layout(const ma_linear_resampler_c
         pHeapLayout->sizeInBytes += lpfHeapSizeInBytes;
     }
 
+
+    /* Cached samples. These are always stored as either f32 or s32, so either way it's 4 bytes per sample, even when the format is s16. */
+    pHeapLayout->cachedSamplesOffset = pHeapLayout->sizeInBytes;
+    {
+        pHeapLayout->sizeInBytes += sizeof(ma_int32) * pConfig->channels;
+    }
+
+    /* LPF state. */
+    pHeapLayout->lpfStateOffset = pHeapLayout->sizeInBytes;
+    {
+        pHeapLayout->sizeInBytes += sizeof(ma_int32) * ((lpfOrder / 2) * (4 + (pConfig->channels * 2)));
+    }
+
+
     /* Make sure allocation size is aligned. */
     pHeapLayout->sizeInBytes = ma_align_64(pHeapLayout->sizeInBytes);
 
@@ -59099,6 +59243,11 @@ MA_API ma_result ma_linear_resampler_init_preallocated(const ma_linear_resampler
         pResampler->lpfOrder += 1;  /* Round up to even. */
     }
 
+    /* It does not make sense for the Nyquist factor to go beyond 1, so clamp it to 1 (cutoff at half the lower of the two sample rates). */
+    if (pResampler->lpfNyquistFactor > 1) {
+        pResampler->lpfNyquistFactor = 1;
+    }
+
     pResampler->_pHeap = pHeap;
     MA_ZERO_MEMORY(pHeap, heapLayout.sizeInBytes);
 
@@ -59110,6 +59259,8 @@ MA_API ma_result ma_linear_resampler_init_preallocated(const ma_linear_resampler
         pResampler->x1.s16 = (ma_int16*)ma_offset_ptr(pHeap, heapLayout.x1Offset);
     }
 
+    pResampler->heap.s32 = (ma_int32*)ma_offset_ptr(pHeap, heapLayout.cachedSamplesOffset);
+
     /* Setting the rate will set up the filter and time advances for us. */
     result = ma_linear_resampler_set_rate_internal(pResampler, pHeap, &heapLayout, pConfig->sampleRateIn, pConfig->sampleRateOut, /* isResamplerAlreadyInitialized = */ MA_FALSE);
     if (result != MA_SUCCESS) {
@@ -59171,7 +59322,7 @@ MA_API void ma_linear_resampler_uninit(ma_linear_resampler* pResampler, const ma
     }
 }
 
-#define MA_LINEAR_RESAMPLER_LERP_SHIFT 12
+#define MA_LINEAR_RESAMPLER_LERP_SHIFT MA_BIQUAD_FIXED_POINT_SHIFT
 
 static MA_INLINE ma_int16 ma_linear_resampler_mix_s16(ma_int16 x, ma_int16 y, ma_uint32 a)
 {
@@ -59202,16 +59353,7 @@ static MA_INLINE void ma_linear_resampler_interpolate_frame_s16(ma_linear_resamp
 
     MA_ASSUME(channels > 0);
     for (c = 0; c < channels; c += 1) {
-        ma_int16 x, y;
-        ma_int32 d;
-        ma_int32 n;
-
-        x = pResampler->x0.s16[c];
-        y = pResampler->x1.s16[c];
-        d = y - x;
-        n = d * a;
-
-        pFrameOut[c] = (ma_int16)(x + (n >> MA_LINEAR_RESAMPLER_LERP_SHIFT));
+        pFrameOut[c] = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a);
     }
 }
 
@@ -59224,7 +59366,7 @@ static MA_INLINE void ma_linear_resampler_interpolate_frame_f32(ma_linear_resamp
     MA_ASSERT(pResampler != NULL);
     MA_ASSERT(pFrameOut  != NULL);
 
-    a = (float)pResampler->inTimeFrac * invSampleRateOut;
+    a = pResampler->inTimeFrac * invSampleRateOut;
 
     MA_ASSUME(channels > 0);
     for (c = 0; c < channels; c += 1) {
@@ -59284,6 +59426,209 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
         /* The rate must have changed between calls. Ignore the cached frame. */
     }
 
+    /* Experimental loop unrolling to make it easier for SIMD-ification. */
+    #if 1
+    {
+        ma_uint32 channels = pResampler->channels;
+
+        while (framesProcessedOut + 4 <= frameCountOut) {
+            ma_uint32 inTimeIntTemp;
+            ma_uint32 inTimeFracTemp;
+            ma_uint32 inTimeInt[4];
+            ma_uint32 inTimeFrac[4];
+            int i;
+
+            inTimeIntTemp  = pResampler->inTimeInt;
+            inTimeFracTemp = pResampler->inTimeFrac;
+
+            for (i = 0; i < 4; i += 1) {
+                inTimeInt[i]  = inTimeIntTemp;
+                inTimeFrac[i] = inTimeFracTemp;
+
+                inTimeIntTemp  += pResampler->inAdvanceInt;
+                inTimeFracTemp += pResampler->inAdvanceFrac;
+                if (inTimeFracTemp >= pResampler->sampleRateOut) {
+                    inTimeFracTemp -= pResampler->sampleRateOut;
+                    inTimeIntTemp  += 1;
+                }
+            }
+
+            /* Check that we have one extra sample at the end for doing the interpolation. */
+            if (inTimeInt[3] + 1 >= frameCountIn) {
+                break;  /* Not enough input frames. */
+            }
+
+            /* Advance the timer. */
+            pResampler->inTimeInt  = inTimeIntTemp;
+            pResampler->inTimeFrac = inTimeFracTemp;
+
+            /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
+            {
+                if (channels == 1) {
+                    ma_int32 x[4];
+                    ma_int32 y[4];
+                    ma_int32 a[4];
+                    ma_int32 d[4];
+                    ma_int32 n[4];
+                    ma_int32 r[4];
+                    
+                    x[0] = pFramesInS16[inTimeInt[0] + 0];
+                    x[1] = pFramesInS16[inTimeInt[1] + 0];
+                    x[2] = pFramesInS16[inTimeInt[2] + 0];
+                    x[3] = pFramesInS16[inTimeInt[3] + 0];
+
+                    y[0] = pFramesInS16[inTimeInt[0] + 1];
+                    y[1] = pFramesInS16[inTimeInt[1] + 1];
+                    y[2] = pFramesInS16[inTimeInt[2] + 1];
+                    y[3] = pFramesInS16[inTimeInt[3] + 1];
+
+                    a[0] = inTimeFrac[0] * invSampleRateOut;
+                    a[1] = inTimeFrac[1] * invSampleRateOut;
+                    a[2] = inTimeFrac[2] * invSampleRateOut;
+                    a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                    d[0] = y[0] - x[0];
+                    d[1] = y[1] - x[1];
+                    d[2] = y[2] - x[2];
+                    d[3] = y[3] - x[3];
+
+                    n[0] = d[0] * a[0];
+                    n[1] = d[1] * a[1];
+                    n[2] = d[2] * a[2];
+                    n[3] = d[3] * a[3];
+
+                    r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+
+                    pFramesOutS16[0] = r[0];
+                    pFramesOutS16[1] = r[1];
+                    pFramesOutS16[2] = r[2];
+                    pFramesOutS16[3] = r[3];
+                } else if (channels == 2) {
+                    ma_int32 x[8];
+                    ma_int32 y[8];
+                    ma_int32 a[8];
+                    ma_int32 d[8];
+                    ma_int32 n[8];
+                    ma_int32 r[8];
+
+                    x[0] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 0];
+                    x[1] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 1];
+                    x[2] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 0];
+                    x[3] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 1];
+                    x[4] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 0];
+                    x[5] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 1];
+                    x[6] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 0];
+                    x[7] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 1];
+
+                    y[0] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 0];
+                    y[1] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 1];
+                    y[2] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 0];
+                    y[3] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 1];
+                    y[4] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 0];
+                    y[5] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 1];
+                    y[6] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 0];
+                    y[7] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 1];
+
+                    a[0] = inTimeFrac[0] * invSampleRateOut;
+                    a[1] = inTimeFrac[0] * invSampleRateOut;
+                    a[2] = inTimeFrac[1] * invSampleRateOut;
+                    a[3] = inTimeFrac[1] * invSampleRateOut;
+                    a[4] = inTimeFrac[2] * invSampleRateOut;
+                    a[5] = inTimeFrac[2] * invSampleRateOut;
+                    a[6] = inTimeFrac[3] * invSampleRateOut;
+                    a[7] = inTimeFrac[3] * invSampleRateOut;
+
+                    d[0] = y[0] - x[0];
+                    d[1] = y[1] - x[1];
+                    d[2] = y[2] - x[2];
+                    d[3] = y[3] - x[3];
+                    d[4] = y[4] - x[4];
+                    d[5] = y[5] - x[5];
+                    d[6] = y[6] - x[6];
+                    d[7] = y[7] - x[7];
+
+                    n[0] = d[0] * a[0];
+                    n[1] = d[1] * a[1];
+                    n[2] = d[2] * a[2];
+                    n[3] = d[3] * a[3];
+                    n[4] = d[4] * a[4];
+                    n[5] = d[5] * a[5];
+                    n[6] = d[6] * a[6];
+                    n[7] = d[7] * a[7];
+
+                    r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[4] = x[4] + (n[4] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[5] = x[5] + (n[5] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                    r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+
+                    pFramesOutS16[0] = r[0];
+                    pFramesOutS16[1] = r[1];
+                    pFramesOutS16[2] = r[2];
+                    pFramesOutS16[3] = r[3];
+                    pFramesOutS16[4] = r[4];
+                    pFramesOutS16[5] = r[5];
+                    pFramesOutS16[6] = r[6];
+                    pFramesOutS16[7] = r[7];
+                } else {
+                    for (c = 0; c < channels; c += 1) {
+                        ma_int32 x[4];
+                        ma_int32 y[4];
+                        ma_int32 a[4];
+                        ma_int32 d[4];
+                        ma_int32 n[4];
+                        ma_int32 r[4];
+
+                        x[0] = pFramesInS16[((inTimeInt[0] + 0) * channels) + c];
+                        x[1] = pFramesInS16[((inTimeInt[1] + 0) * channels) + c];
+                        x[2] = pFramesInS16[((inTimeInt[2] + 0) * channels) + c];
+                        x[3] = pFramesInS16[((inTimeInt[3] + 0) * channels) + c];
+
+                        y[0] = pFramesInS16[((inTimeInt[0] + 1) * channels) + c];
+                        y[1] = pFramesInS16[((inTimeInt[1] + 1) * channels) + c];
+                        y[2] = pFramesInS16[((inTimeInt[2] + 1) * channels) + c];
+                        y[3] = pFramesInS16[((inTimeInt[3] + 1) * channels) + c];
+
+                        a[0] = inTimeFrac[0] * invSampleRateOut;
+                        a[1] = inTimeFrac[1] * invSampleRateOut;
+                        a[2] = inTimeFrac[2] * invSampleRateOut;
+                        a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                        d[0] = y[0] - x[0];
+                        d[1] = y[1] - x[1];
+                        d[2] = y[2] - x[2];
+                        d[3] = y[3] - x[3];
+
+                        n[0] = d[0] * a[0];
+                        n[1] = d[1] * a[1];
+                        n[2] = d[2] * a[2];
+                        n[3] = d[3] * a[3];
+
+                        r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+
+                        pFramesOutS16[(0 * channels) + c] = r[0];
+                        pFramesOutS16[(1 * channels) + c] = r[1];
+                        pFramesOutS16[(2 * channels) + c] = r[2];
+                        pFramesOutS16[(3 * channels) + c] = r[3];
+                    }
+                }
+
+                pFramesOutS16 += 4 * channels;
+                framesProcessedOut += 4;
+            }
+        }
+    }
+    #endif
+
     while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
         if (pResampler->inTimeInt + 1 < frameCountIn) {
             ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut;
@@ -59336,7 +59681,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_s16_no_lpf(ma_
     return MA_SUCCESS;
 }
 
-static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_linear_resampler* pResampler, const float* pFramesInF32, ma_uint64* pFrameCountIn, float* pFramesOutF32, ma_uint64* pFrameCountOut, float invSampleRateOut)
+static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_linear_resampler* pResampler, const float* pFramesInF32, ma_uint64* pFrameCountIn, float* pFramesOutF32, ma_uint64* pFrameCountOut, double invSampleRateOut)
 {
     ma_uint64 frameCountIn;
     ma_uint64 frameCountOut;
@@ -59360,7 +59705,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
         MA_ASSERT(pResampler->cachedFrameCount <= 1);   /* There is at most one cached frame. */
 
         while (pResampler->cachedFrameCount > 0 && frameCountIn > 0 && framesProcessedOut < frameCountOut) {
-            float a = pResampler->inTimeFrac * invSampleRateOut;
+            float a = (float)(pResampler->inTimeFrac * invSampleRateOut);   /* Product is computed in double; narrow explicitly to the float interpolation factor. */
 
             for (c = 0; c < pResampler->channels; c += 1) {
                 pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pFramesInF32[c], a);
@@ -59434,31 +59779,33 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                     float r[4];
                     
                     x[0] = pFramesInF32[inTimeInt[0] + 0];
-                    y[0] = pFramesInF32[inTimeInt[0] + 1];
-                    a[0] = inTimeFrac[0] * invSampleRateOut;
-                    d[0] = y[0] - x[0];
-                    n[0] = d[0] * a[0];
-                    r[0] = x[0] + n[0];
-
                     x[1] = pFramesInF32[inTimeInt[1] + 0];
-                    y[1] = pFramesInF32[inTimeInt[1] + 1];
-                    a[1] = inTimeFrac[1] * invSampleRateOut;
-                    d[1] = y[1] - x[1];
-                    n[1] = d[1] * a[1];
-                    r[1] = x[1] + n[1];
-
                     x[2] = pFramesInF32[inTimeInt[2] + 0];
-                    y[2] = pFramesInF32[inTimeInt[2] + 1];
-                    a[2] = inTimeFrac[2] * invSampleRateOut;
-                    d[2] = y[2] - x[2];
-                    n[2] = d[2] * a[2];
-                    r[2] = x[2] + n[2];
-
                     x[3] = pFramesInF32[inTimeInt[3] + 0];
+
+                    y[0] = pFramesInF32[inTimeInt[0] + 1];
+                    y[1] = pFramesInF32[inTimeInt[1] + 1];
+                    y[2] = pFramesInF32[inTimeInt[2] + 1];
                     y[3] = pFramesInF32[inTimeInt[3] + 1];
+
+                    a[0] = inTimeFrac[0] * invSampleRateOut;
+                    a[1] = inTimeFrac[1] * invSampleRateOut;
+                    a[2] = inTimeFrac[2] * invSampleRateOut;
                     a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                    d[0] = y[0] - x[0];
+                    d[1] = y[1] - x[1];
+                    d[2] = y[2] - x[2];
                     d[3] = y[3] - x[3];
+
+                    n[0] = d[0] * a[0];
+                    n[1] = d[1] * a[1];
+                    n[2] = d[2] * a[2];
                     n[3] = d[3] * a[3];
+
+                    r[0] = x[0] + n[0];
+                    r[1] = x[1] + n[1];
+                    r[2] = x[2] + n[2];
                     r[3] = x[3] + n[3];
 
                     pFramesOutF32[0] = r[0];
@@ -59472,69 +59819,59 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                     float d[8];
                     float n[8];
                     float r[8];
-                    
-                    /* Frame 0, Channel 0 */
+
                     x[0] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 0];
-                    y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0];
-                    a[0] = inTimeFrac[0] * invSampleRateOut;
-                    d[0] = y[0] - x[0];
-                    n[0] = d[0] * a[0];
-                    r[0] = x[0] + n[0];
-
-                    /* Frame 0, Channel 1 */
                     x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1];
-                    y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1];
-                    a[1] = inTimeFrac[0] * invSampleRateOut;
-                    d[1] = y[1] - x[1];
-                    n[1] = d[1] * a[1];
-                    r[1] = x[1] + n[1];
-
-                    /* Frame 1, Channel 0 */
                     x[2] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 0];
-                    y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0];
-                    a[2] = inTimeFrac[1] * invSampleRateOut;
-                    d[2] = y[2] - x[2];
-                    n[2] = d[2] * a[2];
-                    r[2] = x[2] + n[2];
-
-                    /* Frame 1, Channel 1 */
                     x[3] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 1];
-                    y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1];
-                    a[3] = inTimeFrac[1] * invSampleRateOut;
-                    d[3] = y[3] - x[3];
-                    n[3] = d[3] * a[3];
-                    r[3] = x[3] + n[3];
-
-                    /* Frame 2, Channel 0 */
                     x[4] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 0];
-                    y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0];
-                    a[4] = inTimeFrac[2] * invSampleRateOut;
-                    d[4] = y[4] - x[4];
-                    n[4] = d[4] * a[4];
-                    r[4] = x[4] + n[4];
-
-                    /* Frame 2, Channel 1 */
                     x[5] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 1];
-                    y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1];
-                    a[5] = inTimeFrac[2] * invSampleRateOut;
-                    d[5] = y[5] - x[5];
-                    n[5] = d[5] * a[5];
-                    r[5] = x[5] + n[5];
-
-                    /* Frame 3, Channel 0 */
                     x[6] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 0];
-                    y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0];
-                    a[6] = inTimeFrac[3] * invSampleRateOut;
-                    d[6] = y[6] - x[6];
-                    n[6] = d[6] * a[6];
-                    r[6] = x[6] + n[6];
-
-                    /* Frame 3, Channel 1 */
                     x[7] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 1];
+
+                    y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0];
+                    y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1];
+                    y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0];
+                    y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1];
+                    y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0];
+                    y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1];
+                    y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0];
                     y[7] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 1];
+
+                    a[0] = inTimeFrac[0] * invSampleRateOut;
+                    a[1] = inTimeFrac[0] * invSampleRateOut;
+                    a[2] = inTimeFrac[1] * invSampleRateOut;
+                    a[3] = inTimeFrac[1] * invSampleRateOut;
+                    a[4] = inTimeFrac[2] * invSampleRateOut;
+                    a[5] = inTimeFrac[2] * invSampleRateOut;
+                    a[6] = inTimeFrac[3] * invSampleRateOut;
                     a[7] = inTimeFrac[3] * invSampleRateOut;
+
+                    d[0] = y[0] - x[0];
+                    d[1] = y[1] - x[1];
+                    d[2] = y[2] - x[2];
+                    d[3] = y[3] - x[3];
+                    d[4] = y[4] - x[4];
+                    d[5] = y[5] - x[5];
+                    d[6] = y[6] - x[6];
                     d[7] = y[7] - x[7];
+
+                    n[0] = d[0] * a[0];
+                    n[1] = d[1] * a[1];
+                    n[2] = d[2] * a[2];
+                    n[3] = d[3] * a[3];
+                    n[4] = d[4] * a[4];
+                    n[5] = d[5] * a[5];
+                    n[6] = d[6] * a[6];
                     n[7] = d[7] * a[7];
+
+                    r[0] = x[0] + n[0];
+                    r[1] = x[1] + n[1];
+                    r[2] = x[2] + n[2];
+                    r[3] = x[3] + n[3];
+                    r[4] = x[4] + n[4];
+                    r[5] = x[5] + n[5];
+                    r[6] = x[6] + n[6];
                     r[7] = x[7] + n[7];
 
                     pFramesOutF32[0] = r[0];
@@ -59553,33 +59890,35 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
                         float d[4];
                         float n[4];
                         float r[4];
-                        
+
                         x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c];
-                        y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c];
-                        a[0] = inTimeFrac[0] * invSampleRateOut;
-                        d[0] = y[0] - x[0];
-                        n[0] = d[0] * a[0];
-                        r[0] = x[0] + n[0];
-
                         x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c];
-                        y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c];
-                        a[1] = inTimeFrac[1] * invSampleRateOut;
-                        d[1] = y[1] - x[1];
-                        n[1] = d[1] * a[1];
-                        r[1] = x[1] + n[1];
-
                         x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c];
-                        y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c];
-                        a[2] = inTimeFrac[2] * invSampleRateOut;
-                        d[2] = y[2] - x[2];
-                        n[2] = d[2] * a[2];
-                        r[2] = x[2] + n[2];
-
                         x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c];
+
+                        y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c];
+                        y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c];
+                        y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c];
                         y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c];
+
+                        a[0] = inTimeFrac[0] * invSampleRateOut;
+                        a[1] = inTimeFrac[1] * invSampleRateOut;
+                        a[2] = inTimeFrac[2] * invSampleRateOut;
                         a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                        d[0] = y[0] - x[0];
+                        d[1] = y[1] - x[1];
+                        d[2] = y[2] - x[2];
                         d[3] = y[3] - x[3];
+
+                        n[0] = d[0] * a[0];
+                        n[1] = d[1] * a[1];
+                        n[2] = d[2] * a[2];
                         n[3] = d[3] * a[3];
+
+                        r[0] = x[0] + n[0];
+                        r[1] = x[1] + n[1];
+                        r[2] = x[2] + n[2];
                         r[3] = x[3] + n[3];
 
                         pFramesOutF32[(0 * channels) + c] = r[0];
@@ -59598,10 +59937,10 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
 
     while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
         if (pResampler->inTimeInt + 1 < frameCountIn) {
-            float a = pResampler->inTimeFrac * invSampleRateOut;
+            float a = (float)(pResampler->inTimeFrac * invSampleRateOut);
 
             for (c = 0; c < pResampler->channels; c += 1) {
-                pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[(pResampler->inTimeInt * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a);
+                pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a);
             }
             pFramesOutF32 += pResampler->channels;
 
@@ -59620,7 +59959,7 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
             the frame to ensure we make some forward progress.
             */
             for (c = 0; c < pResampler->channels; c += 1) {
-                pResampler->x0.f32[c] = pFramesInF32[(pResampler->inTimeInt * pResampler->channels) + c];
+                pResampler->x0.f32[c] = pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c];
             }
 
             pResampler->cachedFrameCount = 1;
@@ -59648,6 +59987,326 @@ static MA_INLINE ma_result ma_linear_resampler_process_pcm_frames_f32_no_lpf(ma_
     return MA_SUCCESS;
 }
 
+
+/* Low-pass filters one interleaved s16 frame in place by running it through each filter section in sequence. */
+static MA_INLINE void ma_linear_resampler_filter_s16(ma_linear_resampler* pResampler, ma_int16* pFrame)
+{
+    const ma_uint32 channelCount = pResampler->channels;
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    ma_int32* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then channelCount values of state 0, then channelCount values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channelCount * 2)) {
+        const ma_int32 b1 = pSection[0];
+        const ma_int32 b2 = pSection[1];
+        const ma_int32 a1 = pSection[2];
+        const ma_int32 a2 = pSection[3];
+        ma_int32* pState0 = pSection + 4;
+        ma_int32* pState1 = pState0 + channelCount;
+
+        for (c = 0; c < channelCount; c += 1) {
+            const ma_int32 sampleIn = pFrame[c];
+            ma_int32 state0 = pState0[c];
+            ma_int32 state1 = pState1[c];
+            ma_int32 sampleOut;
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = (b2*sampleIn                + state0) >> MA_BIQUAD_FIXED_POINT_SHIFT;
+            state0    = (b1*sampleIn - a1*sampleOut + state1);
+            state1    = (b2*sampleIn - a2*sampleOut);
+
+            pFrame[c]  = (ma_int16)ma_clamp(sampleOut, -32768, 32767);  /* Saturate to the s16 range. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+static MA_INLINE void ma_linear_resampler_filter_s16_4(ma_linear_resampler* pResampler, ma_int16* pFrames)
+{
+    ma_linear_resampler_filter_s16(pResampler, pFrames + (0 * pResampler->channels));   /* Filters 4 consecutive interleaved s16 frames in place, one call per frame. */
+    ma_linear_resampler_filter_s16(pResampler, pFrames + (1 * pResampler->channels));   /* Each frame starts pResampler->channels samples after the previous one. */
+    ma_linear_resampler_filter_s16(pResampler, pFrames + (2 * pResampler->channels));   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_s16(pResampler, pFrames + (3 * pResampler->channels));
+}
+
+#if 1   /* Always enabled. NOTE(review): presumably kept as a switch so this helper can be compiled out - confirm before changing. */
+static MA_INLINE void ma_linear_resampler_filter_s16_4_stereo(ma_linear_resampler* pResampler, ma_int16* pFrames)
+{
+    ma_linear_resampler_filter_s16(pResampler, pFrames + 0);   /* Filters 4 consecutive s16 frames in place, one call per frame. */
+    ma_linear_resampler_filter_s16(pResampler, pFrames + 2);   /* The frame stride is hard-coded to 2 samples (stereo) rather than read from pResampler->channels. */
+    ma_linear_resampler_filter_s16(pResampler, pFrames + 4);   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_s16(pResampler, pFrames + 6);
+}
+#endif  /* #if 1 */
+
+/* Low-pass filters one interleaved frame of `channels` s32 samples in place by running it through each filter section in sequence. */
+static MA_INLINE void ma_linear_resampler_filter_s32(ma_linear_resampler* pResampler, ma_uint32 channels, ma_int32* pFrame)
+{
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    ma_int32* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then `channels` values of state 0, then `channels` values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channels * 2)) {
+        const ma_int32 b1 = pSection[0];
+        const ma_int32 b2 = pSection[1];
+        const ma_int32 a1 = pSection[2];
+        const ma_int32 a2 = pSection[3];
+        ma_int32* pState0 = pSection + 4;
+        ma_int32* pState1 = pState0 + channels;
+
+        for (c = 0; c < channels; c += 1) {
+            const ma_int32 sampleIn = pFrame[c];
+            ma_int32 state0 = pState0[c];
+            ma_int32 state1 = pState1[c];
+            ma_int32 sampleOut;
+
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = (b2*sampleIn                + state0) >> MA_BIQUAD_FIXED_POINT_SHIFT;
+            state0    = (b1*sampleIn - a1*sampleOut + state1);
+            state1    = (b2*sampleIn - a2*sampleOut);
+
+            pFrame[c]  = ma_clamp(sampleOut, -32768, 32767);  /* Clamped to the s16 range even though the sample is stored as s32. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+/* Mono specialization: low-pass filters a single s32 sample in place by running it through each filter section in sequence. */
+static MA_INLINE void ma_linear_resampler_filter_s32_mono(ma_linear_resampler* pResampler, ma_int32* pFrame)
+{
+    const ma_uint32 channelCount = 1;  /* Fixed at compile time; pResampler->channels is not consulted. */
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    ma_int32* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then channelCount values of state 0, then channelCount values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channelCount * 2)) {
+        const ma_int32 b1 = pSection[0];
+        const ma_int32 b2 = pSection[1];
+        const ma_int32 a1 = pSection[2];
+        const ma_int32 a2 = pSection[3];
+        ma_int32* pState0 = pSection + 4;
+        ma_int32* pState1 = pState0 + channelCount;
+
+        for (c = 0; c < channelCount; c += 1) {
+            const ma_int32 sampleIn = pFrame[c];
+            ma_int32 state0 = pState0[c];
+            ma_int32 state1 = pState1[c];
+            ma_int32 sampleOut;
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = (b2*sampleIn                + state0) >> MA_BIQUAD_FIXED_POINT_SHIFT;
+            state0    = (b1*sampleIn - a1*sampleOut + state1);
+            state1    = (b2*sampleIn - a2*sampleOut);
+
+            pFrame[c]  = ma_clamp(sampleOut, -32768, 32767);  /* Clamped to the s16 range even though the sample is stored as s32. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+/* Stereo specialization: low-pass filters one interleaved 2-sample s32 frame in place, one filter section after another. */
+static MA_INLINE void ma_linear_resampler_filter_s32_stereo(ma_linear_resampler* pResampler, ma_int32* pFrame)
+{
+    const ma_uint32 channelCount = 2;  /* Fixed at compile time; pResampler->channels is not consulted. */
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    ma_int32* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then channelCount values of state 0, then channelCount values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_S32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channelCount * 2)) {
+        const ma_int32 b1 = pSection[0];
+        const ma_int32 b2 = pSection[1];
+        const ma_int32 a1 = pSection[2];
+        const ma_int32 a2 = pSection[3];
+        ma_int32* pState0 = pSection + 4;
+        ma_int32* pState1 = pState0 + channelCount;
+
+        for (c = 0; c < channelCount; c += 1) {
+            const ma_int32 sampleIn = pFrame[c];
+            ma_int32 state0 = pState0[c];
+            ma_int32 state1 = pState1[c];
+            ma_int32 sampleOut;
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = (b2*sampleIn                + state0) >> MA_BIQUAD_FIXED_POINT_SHIFT;
+            state0    = (b1*sampleIn - a1*sampleOut + state1);
+            state1    = (b2*sampleIn - a2*sampleOut);
+
+            pFrame[c]  = ma_clamp(sampleOut, -32768, 32767);  /* Clamped to the s16 range even though the sample is stored as s32. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+static MA_INLINE void ma_linear_resampler_filter_s32_4(ma_linear_resampler* pResampler, ma_int32* pFrames)
+{
+    ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (0 * pResampler->channels));   /* Filters 4 consecutive interleaved s32 frames in place, one call per frame. */
+    ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (1 * pResampler->channels));   /* pResampler->channels is both the channel count passed down and the frame stride. */
+    ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (2 * pResampler->channels));   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_s32(pResampler, pResampler->channels, pFrames + (3 * pResampler->channels));
+}
+
+static MA_INLINE void ma_linear_resampler_filter_s32_4_mono(ma_linear_resampler* pResampler, ma_int32* pFrames)
+{
+    ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 0);   /* Filters 4 consecutive mono s32 frames in place, one call per frame. */
+    ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 1);   /* The frame stride is hard-coded to 1 sample (mono). */
+    ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 2);   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_s32_mono(pResampler, pFrames + 3);
+}
+
+static MA_INLINE void ma_linear_resampler_filter_s32_4_stereo(ma_linear_resampler* pResampler, ma_int32* pFrames)
+{
+    ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 0);   /* Filters 4 consecutive stereo s32 frames in place, one call per frame. */
+    ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 2);   /* The frame stride is hard-coded to 2 samples (stereo). */
+    ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 4);   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_s32_stereo(pResampler, pFrames + 6);
+}
+
+/* Low-pass filters one interleaved f32 frame in place by running it through each filter section in sequence. */
+static MA_INLINE void ma_linear_resampler_filter_f32(ma_linear_resampler* pResampler, float* pFrame)
+{
+    const ma_uint32 channelCount = pResampler->channels;
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    float* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then channelCount values of state 0, then channelCount values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channelCount * 2)) {
+        const float b1 = pSection[0];
+        const float b2 = pSection[1];
+        const float a1 = pSection[2];
+        const float a2 = pSection[3];
+        float* pState0 = pSection + 4;
+        float* pState1 = pState0 + channelCount;
+
+        for (c = 0; c < channelCount; c += 1) {
+            const float sampleIn = pFrame[c];
+            float state0 = pState0[c];
+            float state1 = pState1[c];
+            float sampleOut;
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = b2*sampleIn                + state0;
+            state0    = b1*sampleIn - a1*sampleOut + state1;
+            state1    = b2*sampleIn - a2*sampleOut;
+
+            pFrame[c]  = sampleOut;  /* No clamping is applied on the f32 path. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+/* Mono specialization: low-pass filters a single f32 sample in place by running it through each filter section in sequence. */
+static MA_INLINE void ma_linear_resampler_filter_f32_mono(ma_linear_resampler* pResampler, float* pFrame)
+{
+    const ma_uint32 channelCount = 1;  /* Fixed at compile time; pResampler->channels is not consulted. */
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    float* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then channelCount values of state 0, then channelCount values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channelCount * 2)) {
+        const float b1 = pSection[0];
+        const float b2 = pSection[1];
+        const float a1 = pSection[2];
+        const float a2 = pSection[3];
+        float* pState0 = pSection + 4;
+        float* pState1 = pState0 + channelCount;
+
+        for (c = 0; c < channelCount; c += 1) {
+            const float sampleIn = pFrame[c];
+            float state0 = pState0[c];
+            float state1 = pState1[c];
+            float sampleOut;
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = b2*sampleIn                + state0;
+            state0    = b1*sampleIn - a1*sampleOut + state1;
+            state1    = b2*sampleIn - a2*sampleOut;
+
+            pFrame[c]  = sampleOut;  /* No clamping is applied on the f32 path. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+/* Stereo specialization: low-pass filters one interleaved 2-sample f32 frame in place, one filter section after another. */
+static MA_INLINE void ma_linear_resampler_filter_f32_stereo(ma_linear_resampler* pResampler, float* pFrame)
+{
+    const ma_uint32 channelCount = 2;  /* Fixed at compile time; pResampler->channels is not consulted. */
+    const ma_uint32 sectionCount = pResampler->lpfOrder >> 1;  /* TODO: It's actually more useful to store lpfCount instead of lpfOrder. */
+    float* pSection;
+    ma_uint32 iSection;
+    ma_uint32 c;
+
+    /* Section layout: b1, b2, a1, a2, then channelCount values of state 0, then channelCount values of state 1. */
+    pSection = MA_RESAMPLER_GET_LPF_HEAP_F32(pResampler, 0);
+    for (iSection = 0; iSection < sectionCount; iSection += 1, pSection += 4 + (channelCount * 2)) {
+        const float b1 = pSection[0];
+        const float b2 = pSection[1];
+        const float a1 = pSection[2];
+        const float a2 = pSection[3];
+        float* pState0 = pSection + 4;
+        float* pState1 = pState0 + channelCount;
+
+        for (c = 0; c < channelCount; c += 1) {
+            const float sampleIn = pFrame[c];
+            float state0 = pState0[c];
+            float state1 = pState1[c];
+            float sampleOut;
+            /* The output comes first because both state updates depend on it. b2 also scales the direct input term (no separate b0 is stored). */
+            sampleOut = b2*sampleIn                + state0;
+            state0    = b1*sampleIn - a1*sampleOut + state1;
+            state1    = b2*sampleIn - a2*sampleOut;
+
+            pFrame[c]  = sampleOut;  /* No clamping is applied on the f32 path. */
+            pState0[c] = state0;
+            pState1[c] = state1;
+        }
+    }
+}
+
+static MA_INLINE void ma_linear_resampler_filter_f32_4(ma_linear_resampler* pResampler, float* pFrames)
+{
+    ma_linear_resampler_filter_f32(pResampler, pFrames + (0 * pResampler->channels));   /* Filters 4 consecutive interleaved f32 frames in place, one call per frame. */
+    ma_linear_resampler_filter_f32(pResampler, pFrames + (1 * pResampler->channels));   /* Each frame starts pResampler->channels samples after the previous one. */
+    ma_linear_resampler_filter_f32(pResampler, pFrames + (2 * pResampler->channels));   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_f32(pResampler, pFrames + (3 * pResampler->channels));
+}
+
+static MA_INLINE void ma_linear_resampler_filter_f32_4_mono(ma_linear_resampler* pResampler, float* pFrames)
+{
+    ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 0);   /* Filters 4 consecutive mono f32 frames in place, one call per frame. */
+    ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 1);   /* The frame stride is hard-coded to 1 sample (mono). */
+    ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 2);   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_f32_mono(pResampler, pFrames + 3);
+}
+
+static MA_INLINE void ma_linear_resampler_filter_f32_4_stereo(ma_linear_resampler* pResampler, float* pFrames)
+{
+    ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 0);   /* Filters 4 consecutive stereo f32 frames in place, one call per frame. */
+    ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 2);   /* The frame stride is hard-coded to 2 samples (stereo). */
+    ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 4);   /* Keep the calls in frame order: each one continues from the filter state stored by the previous one. */
+    ma_linear_resampler_filter_f32_stereo(pResampler, pFrames + 6);
+}
+
+
 static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear_resampler* pResampler, const void* pFramesIn, ma_uint64* pFrameCountIn, void* pFramesOut, ma_uint64* pFrameCountOut)
 {
     const ma_int16* pFramesInS16;
@@ -59656,6 +60315,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear
     ma_uint64 frameCountOut;
     ma_uint64 framesProcessedIn;
     ma_uint64 framesProcessedOut;
+    ma_uint32 c;
     ma_uint32 invSampleRateOut;
 
     MA_ASSERT(pResampler     != NULL);
@@ -59674,27 +60334,18 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear
         return ma_linear_resampler_process_pcm_frames_s16_no_lpf(pResampler, pFramesInS16, pFrameCountIn, pFramesOutS16, pFrameCountOut, invSampleRateOut);
     } else {
         while (framesProcessedOut < frameCountOut) {
+            ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut;
+
             /* Before interpolating we need to load the buffers. When doing this we need to ensure we run every input sample through the filter. */
             while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
-                ma_uint32 iChannel;
-
-                if (pFramesInS16 != NULL) {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                        pResampler->x1.s16[iChannel] = pFramesInS16[iChannel];
-                    }
-                    pFramesInS16 += pResampler->channels;
-                } else {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                        pResampler->x1.s16[iChannel] = 0;
-                    }
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pResampler->x0.s16[c] = pResampler->x1.s16[c];
+                    pResampler->x1.s16[c] = pFramesInS16[c];
                 }
+                pFramesInS16 += pResampler->channels;
 
-                /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-                if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) {
-                    ma_lpf_process_pcm_frame_s16(&pResampler->lpf, pResampler->x1.s16, pResampler->x1.s16);
-                }
+                /* Filter. */
+                ma_linear_resampler_filter_s16(pResampler, pResampler->x1.s16);
 
                 framesProcessedIn     += 1;
                 pResampler->inTimeInt -= 1;
@@ -59705,12 +60356,10 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_downsample(ma_linear
             }
 
             /* Getting here means the frames have been loaded and filtered and we can generate the next output frame. */
-            if (pFramesOutS16 != NULL) {
-                MA_ASSERT(pResampler->inTimeInt == 0);
-                ma_linear_resampler_interpolate_frame_s16(pResampler, invSampleRateOut, pFramesOutS16);
-
-                pFramesOutS16 += pResampler->channels;
+            for (c = 0; c < pResampler->channels; c += 1) {
+                pFramesOutS16[c] = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a);
             }
+            pFramesOutS16 += pResampler->channels;
 
             framesProcessedOut += 1;
 
@@ -59738,6 +60387,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
     ma_uint64 frameCountOut;
     ma_uint64 framesProcessedIn;
     ma_uint64 framesProcessedOut;
+    ma_uint32 c;
     ma_uint32 invSampleRateOut;
 
     MA_ASSERT(pResampler     != NULL);
@@ -59760,23 +60410,324 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
         return ma_linear_resampler_process_pcm_frames_s16_no_lpf(pResampler, pFramesInS16, pFrameCountIn, pFramesOutS16, pFrameCountOut, invSampleRateOut);
     } else {
         /* Slow path. Need LPF. */
+
+        #if 1
+        /* If there's a cached frame we need to process it. */
+        if (pResampler->inTimeInt == 0) {
+            MA_ASSERT(pResampler->cachedFrameCount <= 1);   /* There is at most one cached frame. */
+
+            while (pResampler->cachedFrameCount > 0 && frameCountIn > 0 && framesProcessedOut < frameCountOut) {
+                ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut;
+
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pFramesOutS16[c] = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pFramesInS16[c], a);
+                }
+                ma_linear_resampler_filter_s16(pResampler, pFramesOutS16);
+
+                pFramesOutS16 += pResampler->channels;
+
+                framesProcessedOut += 1;
+
+                /* Advance time forward. */
+                pResampler->inTimeInt  += pResampler->inAdvanceInt;
+                pResampler->inTimeFrac += pResampler->inAdvanceFrac;
+                if (pResampler->inTimeFrac >= pResampler->sampleRateOut) {
+                    pResampler->inTimeFrac -= pResampler->sampleRateOut;
+                    pResampler->inTimeInt  += 1;
+                }
+
+                /* Subtract one from the time to account for the cached frame, but only if the entire frame was processed. */
+                if (pResampler->inTimeInt  > 0) {
+                    pResampler->inTimeInt -= 1;
+                    pResampler->cachedFrameCount = 0;
+                }
+            }
+        } else {
+            /* The rate must have changed between calls. Ignore the cached frame. */
+        }
+
+        /* Experimental loop unrolling to make it easier for SIMD-ification. */
+        #if 1
+        {
+            ma_uint32 channels = pResampler->channels;
+
+            while (framesProcessedOut + 4 <= frameCountOut) {
+                ma_uint32 inTimeIntTemp;
+                ma_uint32 inTimeFracTemp;
+                ma_uint32 inTimeInt[4];
+                ma_uint32 inTimeFrac[4];
+                int i;
+
+                inTimeIntTemp  = pResampler->inTimeInt;
+                inTimeFracTemp = pResampler->inTimeFrac;
+
+                for (i = 0; i < 4; i += 1) {
+                    inTimeInt[i]  = inTimeIntTemp;
+                    inTimeFrac[i] = inTimeFracTemp;
+
+                    inTimeIntTemp  += pResampler->inAdvanceInt;
+                    inTimeFracTemp += pResampler->inAdvanceFrac;
+                    if (inTimeFracTemp >= pResampler->sampleRateOut) {
+                        inTimeFracTemp -= pResampler->sampleRateOut;
+                        inTimeIntTemp  += 1;
+                    }
+                }
+
+                /* Check that we have one extra sample at the end for doing the interpolation. */
+                if (inTimeInt[3] + 1 >= frameCountIn) {
+                    break;  /* Not enough input frames. */
+                }
+
+                /* Advance the timer. */
+                pResampler->inTimeInt  = inTimeIntTemp;
+                pResampler->inTimeFrac = inTimeFracTemp;
+
+                /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
+                {
+                    if (channels == 1) {
+                        ma_int32 x[4];
+                        ma_int32 y[4];
+                        ma_int32 a[4];
+                        ma_int32 d[4];
+                        ma_int32 n[4];
+                        ma_int32 r[4];
+                        
+                        x[0] = pFramesInS16[inTimeInt[0] + 0];
+                        x[1] = pFramesInS16[inTimeInt[1] + 0];
+                        x[2] = pFramesInS16[inTimeInt[2] + 0];
+                        x[3] = pFramesInS16[inTimeInt[3] + 0];
+
+                        y[0] = pFramesInS16[inTimeInt[0] + 1];
+                        y[1] = pFramesInS16[inTimeInt[1] + 1];
+                        y[2] = pFramesInS16[inTimeInt[2] + 1];
+                        y[3] = pFramesInS16[inTimeInt[3] + 1];
+
+                        a[0] = inTimeFrac[0] * invSampleRateOut;
+                        a[1] = inTimeFrac[1] * invSampleRateOut;
+                        a[2] = inTimeFrac[2] * invSampleRateOut;
+                        a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                        d[0] = y[0] - x[0];
+                        d[1] = y[1] - x[1];
+                        d[2] = y[2] - x[2];
+                        d[3] = y[3] - x[3];
+
+                        n[0] = d[0] * a[0];
+                        n[1] = d[1] * a[1];
+                        n[2] = d[2] * a[2];
+                        n[3] = d[3] * a[3];
+
+                        r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+
+                        ma_linear_resampler_filter_s32_4_mono(pResampler, r);
+
+                        pFramesOutS16[0] = (ma_int16)r[0];
+                        pFramesOutS16[1] = (ma_int16)r[1];
+                        pFramesOutS16[2] = (ma_int16)r[2];
+                        pFramesOutS16[3] = (ma_int16)r[3];
+                    } else if (channels == 2) {
+                        ma_int32 x[8];
+                        ma_int32 y[8];
+                        ma_int32 a[8];
+                        ma_int32 d[8];
+                        ma_int32 n[8];
+                        ma_int32 r[8];
+                        
+                        x[0] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 0];
+                        x[1] = pFramesInS16[((inTimeInt[0] + 0) * 2) + 1];
+                        x[2] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 0];
+                        x[3] = pFramesInS16[((inTimeInt[1] + 0) * 2) + 1];
+                        x[4] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 0];
+                        x[5] = pFramesInS16[((inTimeInt[2] + 0) * 2) + 1];
+                        x[6] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 0];
+                        x[7] = pFramesInS16[((inTimeInt[3] + 0) * 2) + 1];
+
+                        y[0] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 0];
+                        y[1] = pFramesInS16[((inTimeInt[0] + 1) * 2) + 1];
+                        y[2] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 0];
+                        y[3] = pFramesInS16[((inTimeInt[1] + 1) * 2) + 1];
+                        y[4] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 0];
+                        y[5] = pFramesInS16[((inTimeInt[2] + 1) * 2) + 1];
+                        y[6] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 0];
+                        y[7] = pFramesInS16[((inTimeInt[3] + 1) * 2) + 1];
+
+                        a[0] = inTimeFrac[0] * invSampleRateOut;
+                        a[1] = inTimeFrac[0] * invSampleRateOut;
+                        a[2] = inTimeFrac[1] * invSampleRateOut;
+                        a[3] = inTimeFrac[1] * invSampleRateOut;
+                        a[4] = inTimeFrac[2] * invSampleRateOut;
+                        a[5] = inTimeFrac[2] * invSampleRateOut;
+                        a[6] = inTimeFrac[3] * invSampleRateOut;
+                        a[7] = inTimeFrac[3] * invSampleRateOut;
+
+                        d[0] = y[0] - x[0];
+                        d[1] = y[1] - x[1];
+                        d[2] = y[2] - x[2];
+                        d[3] = y[3] - x[3];
+                        d[4] = y[4] - x[4];
+                        d[5] = y[5] - x[5];
+                        d[6] = y[6] - x[6];
+                        d[7] = y[7] - x[7];
+
+                        n[0] = d[0] * a[0];
+                        n[1] = d[1] * a[1];
+                        n[2] = d[2] * a[2];
+                        n[3] = d[3] * a[3];
+                        n[4] = d[4] * a[4];
+                        n[5] = d[5] * a[5];
+                        n[6] = d[6] * a[6];
+                        n[7] = d[7] * a[7];
+
+                        r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[4] = x[4] + (n[4] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[5] = x[5] + (n[5] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[6] = x[6] + (n[6] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                        r[7] = x[7] + (n[7] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+
+                        ma_linear_resampler_filter_s32_4_stereo(pResampler, r);
+
+                        pFramesOutS16[0] = (ma_int16)r[0];
+                        pFramesOutS16[1] = (ma_int16)r[1];
+                        pFramesOutS16[2] = (ma_int16)r[2];
+                        pFramesOutS16[3] = (ma_int16)r[3];
+                        pFramesOutS16[4] = (ma_int16)r[4];
+                        pFramesOutS16[5] = (ma_int16)r[5];
+                        pFramesOutS16[6] = (ma_int16)r[6];
+                        pFramesOutS16[7] = (ma_int16)r[7];
+                    } else {
+                        for (c = 0; c < channels; c += 1) {
+                            ma_int32 x[4];
+                            ma_int32 y[4];
+                            ma_int32 a[4];
+                            ma_int32 d[4];
+                            ma_int32 n[4];
+                            ma_int32 r[4];
+                            
+                            x[0] = pFramesInS16[((inTimeInt[0] + 0) * channels) + c];
+                            x[1] = pFramesInS16[((inTimeInt[1] + 0) * channels) + c];
+                            x[2] = pFramesInS16[((inTimeInt[2] + 0) * channels) + c];
+                            x[3] = pFramesInS16[((inTimeInt[3] + 0) * channels) + c];
+                            
+                            y[0] = pFramesInS16[((inTimeInt[0] + 1) * channels) + c];
+                            y[1] = pFramesInS16[((inTimeInt[1] + 1) * channels) + c];
+                            y[2] = pFramesInS16[((inTimeInt[2] + 1) * channels) + c];
+                            y[3] = pFramesInS16[((inTimeInt[3] + 1) * channels) + c];
+
+                            a[0] = inTimeFrac[0] * invSampleRateOut;
+                            a[1] = inTimeFrac[1] * invSampleRateOut;
+                            a[2] = inTimeFrac[2] * invSampleRateOut;
+                            a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                            d[0] = y[0] - x[0];
+                            d[1] = y[1] - x[1];
+                            d[2] = y[2] - x[2];
+                            d[3] = y[3] - x[3];
+
+                            n[0] = d[0] * a[0];
+                            n[1] = d[1] * a[1];
+                            n[2] = d[2] * a[2];
+                            n[3] = d[3] * a[3];
+
+                            r[0] = x[0] + (n[0] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                            r[1] = x[1] + (n[1] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                            r[2] = x[2] + (n[2] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+                            r[3] = x[3] + (n[3] >> MA_LINEAR_RESAMPLER_LERP_SHIFT);
+
+                            pFramesOutS16[(0 * channels) + c] = (ma_int16)r[0];
+                            pFramesOutS16[(1 * channels) + c] = (ma_int16)r[1];
+                            pFramesOutS16[(2 * channels) + c] = (ma_int16)r[2];
+                            pFramesOutS16[(3 * channels) + c] = (ma_int16)r[3];
+                        }
+
+                        ma_linear_resampler_filter_s16_4(pResampler, pFramesOutS16);
+                    }
+
+                    pFramesOutS16 += 4 * channels;
+                    framesProcessedOut += 4;
+                }
+            }
+        }
+        #endif
+
+        while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
+            if (pResampler->inTimeInt + 1 < frameCountIn) {
+                ma_uint32 a = pResampler->inTimeFrac * invSampleRateOut;
+
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pFramesOutS16[c] = ma_linear_resampler_mix_s16(pFramesInS16[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInS16[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a);
+                }
+                ma_linear_resampler_filter_s16(pResampler, pFramesOutS16);
+
+                pFramesOutS16 += pResampler->channels;
+
+                framesProcessedOut += 1;
+
+                /* Advance time forward. */
+                pResampler->inTimeInt  += pResampler->inAdvanceInt;
+                pResampler->inTimeFrac += pResampler->inAdvanceFrac;
+                if (pResampler->inTimeFrac >= pResampler->sampleRateOut) {
+                    pResampler->inTimeFrac -= pResampler->sampleRateOut;
+                    pResampler->inTimeInt  += 1;
+                }
+            } else {
+                /*
+                There are not enough input frames to interpolate. We'll need to stop here. But it's important that we cache
+                the frame to ensure we make some forward progress.
+                */
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pResampler->x0.s16[c] = pFramesInS16[((pResampler->inTimeInt + 0) * pResampler->channels) + c];
+                }
+
+                pResampler->cachedFrameCount = 1;
+                pResampler->inTimeInt += 1;
+
+                break;
+            }
+        }
+
+        /* The number of frames we processed is simply the difference between our current time and previous time, clamped. */
+        framesProcessedIn = pResampler->inTimeInt;
+        if (framesProcessedIn > frameCountIn) { /* Should never overshoot when upsampling. Downsampling could overshoot. */
+            framesProcessedIn = frameCountIn;
+        }
+
+        if (pResampler->inTimeInt >= framesProcessedIn) {
+            pResampler->inTimeInt -= framesProcessedIn;
+        } else {
+            pResampler->inTimeInt = 0;
+        }
+
+        /*
+        No matter what, we want to cache the last input frame. The reason is that if the sample rate changes from upsampling to downsampling, the
+        downsampling process will be expecting an input frame.
+        */
+        if (framesProcessedIn > 0) {
+            for (c = 0; c < pResampler->channels; c += 1) {
+                pResampler->x1.s16[c] = pFramesInS16[((framesProcessedIn - 1) * pResampler->channels) + c];
+            }
+        }
+
+        *pFrameCountIn  = framesProcessedIn;
+        *pFrameCountOut = framesProcessedOut;
+
+        return MA_SUCCESS;
+        #else
         while (framesProcessedOut < frameCountOut) {
             /* Before interpolating we need to load the buffers. */
             while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
                 ma_uint32 iChannel;
 
-                if (pFramesInS16 != NULL) {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                        pResampler->x1.s16[iChannel] = pFramesInS16[iChannel];
-                    }
-                    pFramesInS16 += pResampler->channels;
-                } else {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
-                        pResampler->x1.s16[iChannel] = 0;
-                    }
+                for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
+                    pResampler->x0.s16[iChannel] = pResampler->x1.s16[iChannel];
+                    pResampler->x1.s16[iChannel] = pFramesInS16[iChannel];
                 }
+                pFramesInS16 += pResampler->channels;
 
                 framesProcessedIn     += 1;
                 pResampler->inTimeInt -= 1;
@@ -59791,10 +60742,8 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
                 MA_ASSERT(pResampler->inTimeInt == 0);
                 ma_linear_resampler_interpolate_frame_s16(pResampler, invSampleRateOut, pFramesOutS16);
 
-                /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-                if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) {
-                    ma_lpf_process_pcm_frame_s16(&pResampler->lpf, pFramesOutS16, pFramesOutS16);
-                }
+                /* Filter. */
+                ma_linear_resampler_filter_s16(pResampler, pFramesOutS16);
 
                 pFramesOutS16 += pResampler->channels;
             }
@@ -59814,6 +60763,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_s16_upsample(ma_linear_r
         *pFrameCountOut = framesProcessedOut;
 
         return MA_SUCCESS;
+        #endif
     }
 }
 
@@ -59837,6 +60787,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
     ma_uint64 frameCountOut;
     ma_uint64 framesProcessedIn;
     ma_uint64 framesProcessedOut;
+    ma_uint32 c;
     float invSampleRateOut;
 
     MA_ASSERT(pResampler     != NULL);
@@ -59856,28 +60807,21 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
         return ma_linear_resampler_process_pcm_frames_f32_no_lpf(pResampler, pFramesInF32, pFrameCountIn, pFramesOutF32, pFrameCountOut, invSampleRateOut);
     } else {
         /* Slow path. Need LPF. */
+        /*printf("DOWN\n");*/
+
         while (framesProcessedOut < frameCountOut) {
+            float a = (float)(pResampler->inTimeFrac * invSampleRateOut);
+
             /* Before interpolating we need to load the buffers. When doing this we need to ensure we run every input sample through the filter. */
             while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
-                ma_uint32 iChannel;
-
-                if (pFramesInF32 != NULL) {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                        pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
-                    }
-                    pFramesInF32 += pResampler->channels;
-                } else {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                        pResampler->x1.f32[iChannel] = 0;
-                    }
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pResampler->x0.f32[c] = pResampler->x1.f32[c];
+                    pResampler->x1.f32[c] = pFramesInF32[c];
                 }
+                pFramesInF32 += pResampler->channels;
 
-                /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-                if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) {
-                    ma_lpf_process_pcm_frame_f32(&pResampler->lpf, pResampler->x1.f32, pResampler->x1.f32);
-                }
+                /* Filter. */
+                ma_linear_resampler_filter_f32(pResampler, pResampler->x1.f32);
 
                 framesProcessedIn     += 1;
                 pResampler->inTimeInt -= 1;
@@ -59888,12 +60832,10 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_downsample(ma_linear
             }
 
             /* Getting here means the frames have been loaded and filtered and we can generate the next output frame. */
-            if (pFramesOutF32 != NULL) {
-                MA_ASSERT(pResampler->inTimeInt == 0);
-                ma_linear_resampler_interpolate_frame_f32(pResampler, invSampleRateOut, pFramesOutF32);
-
-                pFramesOutF32 += pResampler->channels;
+            for (c = 0; c < pResampler->channels; c += 1) {
+                pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
             }
+            pFramesOutF32 += pResampler->channels;
 
             framesProcessedOut += 1;
 
@@ -59921,7 +60863,8 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
     ma_uint64 frameCountOut;
     ma_uint64 framesProcessedIn;
     ma_uint64 framesProcessedOut;
-    float invSampleRateOut;
+    ma_uint32 c;
+    double invSampleRateOut;
 
     MA_ASSERT(pResampler     != NULL);
     MA_ASSERT(pFrameCountIn  != NULL);
@@ -59933,30 +60876,332 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
     frameCountOut      = *pFrameCountOut;
     framesProcessedIn  = 0;
     framesProcessedOut = 0;
-    invSampleRateOut   = 1.0f / pResampler->sampleRateOut;
+    invSampleRateOut   = (1.0 / pResampler->sampleRateOut);
 
     if (pResampler->lpfOrder == 0) {
         /* Fast path. No LPF needed. */
         return ma_linear_resampler_process_pcm_frames_f32_no_lpf(pResampler, pFramesInF32, pFrameCountIn, pFramesOutF32, pFrameCountOut, invSampleRateOut);
     } else {
         /* Slow path. Need LPF. */
+        /*printf("UP %u\n", pResampler->inTimeInt);*/
+
+        #if 1
+        /* If there's a cached frame we need to process it. */
+        if (pResampler->inTimeInt == 0) {
+            MA_ASSERT(pResampler->cachedFrameCount <= 1);   /* There is at most one cached frame. */
+
+            while (pResampler->cachedFrameCount > 0 && frameCountIn > 0 && framesProcessedOut < frameCountOut) {
+                float a = (float)(pResampler->inTimeFrac * invSampleRateOut);
+
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pFramesOutF32[c] = ma_mix_f32_fast(pResampler->x0.f32[c], pFramesInF32[c], a);
+                }
+                ma_linear_resampler_filter_f32(pResampler, pFramesOutF32);
+
+                pFramesOutF32 += pResampler->channels;
+
+                framesProcessedOut += 1;
+
+                /* Advance time forward. */
+                pResampler->inTimeInt  += pResampler->inAdvanceInt;
+                pResampler->inTimeFrac += pResampler->inAdvanceFrac;
+                if (pResampler->inTimeFrac >= pResampler->sampleRateOut) {
+                    pResampler->inTimeFrac -= pResampler->sampleRateOut;
+                    pResampler->inTimeInt  += 1;
+                }
+
+                /* Subtract one from the time to account for the cached frame, but only if the entire frame was processed. */
+                if (pResampler->inTimeInt  > 0) {
+                    pResampler->inTimeInt -= 1;
+                    pResampler->cachedFrameCount = 0;
+                }
+            }
+        } else {
+            /* The rate must have changed between calls. Ignore the cached frame. */
+        }
+
+        /* Experimental loop unrolling to make it easier for SIMD-ification. */
+        #if 1
+        {
+            ma_uint32 channels = pResampler->channels;
+
+            while (framesProcessedOut + 4 <= frameCountOut) {
+                ma_uint32 inTimeIntTemp;
+                ma_uint32 inTimeFracTemp;
+                ma_uint32 inTimeInt[4];
+                ma_uint32 inTimeFrac[4];
+                int i;
+
+                inTimeIntTemp  = pResampler->inTimeInt;
+                inTimeFracTemp = pResampler->inTimeFrac;
+
+                for (i = 0; i < 4; i += 1) {
+                    inTimeInt[i]  = inTimeIntTemp;
+                    inTimeFrac[i] = inTimeFracTemp;
+
+                    inTimeIntTemp  += pResampler->inAdvanceInt;
+                    inTimeFracTemp += pResampler->inAdvanceFrac;
+                    if (inTimeFracTemp >= pResampler->sampleRateOut) {
+                        inTimeFracTemp -= pResampler->sampleRateOut;
+                        inTimeIntTemp  += 1;
+                    }
+                }
+
+                /* Check that we have one extra sample at the end for doing the interpolation. */
+                if (inTimeInt[3] + 1 >= frameCountIn) {
+                    break;  /* Not enough input frames. */
+                }
+
+                /* Advance the timer. */
+                pResampler->inTimeInt  = inTimeIntTemp;
+                pResampler->inTimeFrac = inTimeFracTemp;
+
+                /* We should now be able to SIMD-ify the rest. For now I am trusting the compiler to vectorize this, but I'll experiment with some manual stuff later. */
+                {
+                    if (channels == 1) {
+                        float x[4];
+                        float y[4];
+                        float a[4];
+                        float d[4];
+                        float n[4];
+                        float r[4];
+                        
+                        x[0] = pFramesInF32[inTimeInt[0] + 0];
+                        x[1] = pFramesInF32[inTimeInt[1] + 0];
+                        x[2] = pFramesInF32[inTimeInt[2] + 0];
+                        x[3] = pFramesInF32[inTimeInt[3] + 0];
+
+                        y[0] = pFramesInF32[inTimeInt[0] + 1];
+                        y[1] = pFramesInF32[inTimeInt[1] + 1];
+                        y[2] = pFramesInF32[inTimeInt[2] + 1];
+                        y[3] = pFramesInF32[inTimeInt[3] + 1];
+
+                        a[0] = inTimeFrac[0] * invSampleRateOut;
+                        a[1] = inTimeFrac[1] * invSampleRateOut;
+                        a[2] = inTimeFrac[2] * invSampleRateOut;
+                        a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                        d[0] = y[0] - x[0];
+                        d[1] = y[1] - x[1];
+                        d[2] = y[2] - x[2];
+                        d[3] = y[3] - x[3];
+
+                        n[0] = d[0] * a[0];
+                        n[1] = d[1] * a[1];
+                        n[2] = d[2] * a[2];
+                        n[3] = d[3] * a[3];
+
+                        r[0] = x[0] + n[0];
+                        r[1] = x[1] + n[1];
+                        r[2] = x[2] + n[2];
+                        r[3] = x[3] + n[3];
+
+                        ma_linear_resampler_filter_f32_4_mono(pResampler, r);
+
+                        pFramesOutF32[0] = r[0];
+                        pFramesOutF32[1] = r[1];
+                        pFramesOutF32[2] = r[2];
+                        pFramesOutF32[3] = r[3];
+                    } else if (channels == 2) {
+                        float x[8];
+                        float y[8];
+                        float a[8];
+                        float d[8];
+                        float n[8];
+                        float r[8];
+                        
+                        x[0] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 0];
+                        x[1] = pFramesInF32[((inTimeInt[0] + 0) * 2) + 1];
+                        x[2] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 0];
+                        x[3] = pFramesInF32[((inTimeInt[1] + 0) * 2) + 1];
+                        x[4] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 0];
+                        x[5] = pFramesInF32[((inTimeInt[2] + 0) * 2) + 1];
+                        x[6] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 0];
+                        x[7] = pFramesInF32[((inTimeInt[3] + 0) * 2) + 1];
+
+                        y[0] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 0];
+                        y[1] = pFramesInF32[((inTimeInt[0] + 1) * 2) + 1];
+                        y[2] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 0];
+                        y[3] = pFramesInF32[((inTimeInt[1] + 1) * 2) + 1];
+                        y[4] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 0];
+                        y[5] = pFramesInF32[((inTimeInt[2] + 1) * 2) + 1];
+                        y[6] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 0];
+                        y[7] = pFramesInF32[((inTimeInt[3] + 1) * 2) + 1];
+
+                        a[0] = inTimeFrac[0] * invSampleRateOut;
+                        a[1] = inTimeFrac[0] * invSampleRateOut;
+                        a[2] = inTimeFrac[1] * invSampleRateOut;
+                        a[3] = inTimeFrac[1] * invSampleRateOut;
+                        a[4] = inTimeFrac[2] * invSampleRateOut;
+                        a[5] = inTimeFrac[2] * invSampleRateOut;
+                        a[6] = inTimeFrac[3] * invSampleRateOut;
+                        a[7] = inTimeFrac[3] * invSampleRateOut;
+
+                        d[0] = y[0] - x[0];
+                        d[1] = y[1] - x[1];
+                        d[2] = y[2] - x[2];
+                        d[3] = y[3] - x[3];
+                        d[4] = y[4] - x[4];
+                        d[5] = y[5] - x[5];
+                        d[6] = y[6] - x[6];
+                        d[7] = y[7] - x[7];
+
+                        n[0] = d[0] * a[0];
+                        n[1] = d[1] * a[1];
+                        n[2] = d[2] * a[2];
+                        n[3] = d[3] * a[3];
+                        n[4] = d[4] * a[4];
+                        n[5] = d[5] * a[5];
+                        n[6] = d[6] * a[6];
+                        n[7] = d[7] * a[7];
+
+                        r[0] = x[0] + n[0];
+                        r[1] = x[1] + n[1];
+                        r[2] = x[2] + n[2];
+                        r[3] = x[3] + n[3];
+                        r[4] = x[4] + n[4];
+                        r[5] = x[5] + n[5];
+                        r[6] = x[6] + n[6];
+                        r[7] = x[7] + n[7];
+
+                        ma_linear_resampler_filter_f32_4_stereo(pResampler, r);
+
+                        pFramesOutF32[0] = r[0];
+                        pFramesOutF32[1] = r[1];
+                        pFramesOutF32[2] = r[2];
+                        pFramesOutF32[3] = r[3];
+                        pFramesOutF32[4] = r[4];
+                        pFramesOutF32[5] = r[5];
+                        pFramesOutF32[6] = r[6];
+                        pFramesOutF32[7] = r[7];
+                    } else {
+                        for (c = 0; c < channels; c += 1) {
+                            float x[4];
+                            float y[4];
+                            float a[4];
+                            float d[4];
+                            float n[4];
+                            float r[4];
+                            
+                            x[0] = pFramesInF32[((inTimeInt[0] + 0) * channels) + c];
+                            x[1] = pFramesInF32[((inTimeInt[1] + 0) * channels) + c];
+                            x[2] = pFramesInF32[((inTimeInt[2] + 0) * channels) + c];
+                            x[3] = pFramesInF32[((inTimeInt[3] + 0) * channels) + c];
+
+                            y[0] = pFramesInF32[((inTimeInt[0] + 1) * channels) + c];
+                            y[1] = pFramesInF32[((inTimeInt[1] + 1) * channels) + c];
+                            y[2] = pFramesInF32[((inTimeInt[2] + 1) * channels) + c];
+                            y[3] = pFramesInF32[((inTimeInt[3] + 1) * channels) + c];
+
+                            a[0] = inTimeFrac[0] * invSampleRateOut;
+                            a[1] = inTimeFrac[1] * invSampleRateOut;
+                            a[2] = inTimeFrac[2] * invSampleRateOut;
+                            a[3] = inTimeFrac[3] * invSampleRateOut;
+
+                            d[0] = y[0] - x[0];
+                            d[1] = y[1] - x[1];
+                            d[2] = y[2] - x[2];
+                            d[3] = y[3] - x[3];
+
+                            n[0] = d[0] * a[0];
+                            n[1] = d[1] * a[1];
+                            n[2] = d[2] * a[2];
+                            n[3] = d[3] * a[3];
+
+                            r[0] = x[0] + n[0];
+                            r[1] = x[1] + n[1];
+                            r[2] = x[2] + n[2];
+                            r[3] = x[3] + n[3];
+
+                            pFramesOutF32[(0 * channels) + c] = r[0];
+                            pFramesOutF32[(1 * channels) + c] = r[1];
+                            pFramesOutF32[(2 * channels) + c] = r[2];
+                            pFramesOutF32[(3 * channels) + c] = r[3];
+                        }
+
+                        ma_linear_resampler_filter_f32_4(pResampler, pFramesOutF32);
+                    }
+
+                    pFramesOutF32 += 4 * channels;
+                    framesProcessedOut += 4;
+                }
+            }
+        }
+        #endif
+
+        while (framesProcessedOut < frameCountOut && pResampler->inTimeInt < frameCountIn) {
+            if (pResampler->inTimeInt + 1 < frameCountIn) {
+                float a = (float)(pResampler->inTimeFrac * invSampleRateOut);
+
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pFramesOutF32[c] = ma_mix_f32_fast(pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c], pFramesInF32[((pResampler->inTimeInt + 1) * pResampler->channels) + c], a);
+                }
+                ma_linear_resampler_filter_f32(pResampler, pFramesOutF32);
+
+                pFramesOutF32 += pResampler->channels;
+
+                framesProcessedOut += 1;
+
+                /* Advance time forward. */
+                pResampler->inTimeInt  += pResampler->inAdvanceInt;
+                pResampler->inTimeFrac += pResampler->inAdvanceFrac;
+                if (pResampler->inTimeFrac >= pResampler->sampleRateOut) {
+                    pResampler->inTimeFrac -= pResampler->sampleRateOut;
+                    pResampler->inTimeInt  += 1;
+                }
+            } else {
+                /*
+                There are not enough input frames to interpolate. We'll need to stop here. But it's important that we cache
+                the frame to ensure we make some forward progress.
+                */
+                for (c = 0; c < pResampler->channels; c += 1) {
+                    pResampler->x0.f32[c] = pFramesInF32[((pResampler->inTimeInt + 0) * pResampler->channels) + c];
+                }
+
+                pResampler->cachedFrameCount = 1;
+                pResampler->inTimeInt += 1;
+
+                break;
+            }
+        }
+
+        /* The number of frames we processed is simply the difference between our current time and previous time, clamped. */
+        framesProcessedIn = pResampler->inTimeInt;
+        if (framesProcessedIn > frameCountIn) { /* Should never overshoot when upsampling. Downsampling could overshoot. */
+            framesProcessedIn = frameCountIn;
+        }
+
+        if (pResampler->inTimeInt >= framesProcessedIn) {
+            pResampler->inTimeInt -= framesProcessedIn;
+        } else {
+            pResampler->inTimeInt = 0;
+        }
+
+        /*
+        No matter what, we want to cache the last input frame. The reason is that if the sample rate changes from upsampling to downsampling, the
+        downsampling process will be expecting an input frame.
+        */
+        if (framesProcessedIn > 0) {
+            for (c = 0; c < pResampler->channels; c += 1) {
+                pResampler->x1.f32[c] = pFramesInF32[((framesProcessedIn - 1) * pResampler->channels) + c];
+            }
+        }
+
+        *pFrameCountIn  = framesProcessedIn;
+        *pFrameCountOut = framesProcessedOut;
+
+        return MA_SUCCESS;
+        #else
         while (framesProcessedOut < frameCountOut) {
             /* Before interpolating we need to load the buffers. */
             while (pResampler->inTimeInt > 0 && frameCountIn > framesProcessedIn) {
                 ma_uint32 iChannel;
 
-                if (pFramesInF32 != NULL) {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                        pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
-                    }
-                    pFramesInF32 += pResampler->channels;
-                } else {
-                    for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
-                        pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
-                        pResampler->x1.f32[iChannel] = 0;
-                    }
+                for (iChannel = 0; iChannel < pResampler->channels; iChannel += 1) {
+                    pResampler->x0.f32[iChannel] = pResampler->x1.f32[iChannel];
+                    pResampler->x1.f32[iChannel] = pFramesInF32[iChannel];
                 }
+                pFramesInF32 += pResampler->channels;
 
                 framesProcessedIn     += 1;
                 pResampler->inTimeInt -= 1;
@@ -59971,10 +61216,8 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
                 MA_ASSERT(pResampler->inTimeInt == 0);
                 ma_linear_resampler_interpolate_frame_f32(pResampler, invSampleRateOut, pFramesOutF32);
 
-                /* Filter. Do not apply filtering if sample rates are the same or else you'll get dangerous glitching. */
-                if (pResampler->lpfOrder > 0 && (pResampler->inAdvanceInt != 1 || pResampler->inAdvanceFrac != 0)) {
-                    ma_lpf_process_pcm_frame_f32(&pResampler->lpf, pFramesOutF32, pFramesOutF32);
-                }
+                /* Filter. */
+                ma_linear_resampler_filter_f32(pResampler, pFramesOutF32);
 
                 pFramesOutF32 += pResampler->channels;
             }
@@ -59994,6 +61237,7 @@ static ma_result ma_linear_resampler_process_pcm_frames_f32_upsample(ma_linear_r
         *pFrameCountOut = framesProcessedOut;
 
         return MA_SUCCESS;
+        #endif
     }
 }
 
@@ -60045,7 +61289,7 @@ MA_API ma_result ma_linear_resampler_set_rate_ratio(ma_linear_resampler* pResamp
         return MA_INVALID_ARGS;
     }
 
-    d = 1000000;
+    d = 1000;
     n = (ma_uint32)(ratioInOut * d);
 
     if (n == 0) {