From c9e2258daedef338e10fdd3c88287de533b4d3cd Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Sat, 13 Mar 2021 17:46:22 -0800
Subject: [PATCH 1/6] introduce MA_COMPILER_HAS_BUILTIN macro

Both GCC and Clang support __has_builtin, so let's make the check more
general instead of keeping it inside the Clang-only branch.

I didn't touch the dr_wav/dr_flac parts using this, since I figure the
amalgamated miniaudio.h header isn't the primary source for those.

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/miniaudio.h b/miniaudio.h
index 5166f93f..e22f4b1f 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -6734,22 +6734,25 @@ static MA_INLINE ma_bool32 ma_has_neon(void)
     #endif
 #endif
 
+#if defined(__has_builtin)  /* compiler offers the builtin feature test (per this series: GCC and Clang) */
+    #define MA_COMPILER_HAS_BUILTIN(x) __has_builtin(x)  /* usable directly in #if: nonzero when builtin x is reported */
+#else  /* no feature test available: report every builtin as absent */
+    #define MA_COMPILER_HAS_BUILTIN(x) 0
+#endif
 
 #if defined(_MSC_VER) && _MSC_VER >= 1400
     #define MA_HAS_BYTESWAP16_INTRINSIC
     #define MA_HAS_BYTESWAP32_INTRINSIC
     #define MA_HAS_BYTESWAP64_INTRINSIC
 #elif defined(__clang__)
-    #if defined(__has_builtin)
-        #if __has_builtin(__builtin_bswap16)
-            #define MA_HAS_BYTESWAP16_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap32)
-            #define MA_HAS_BYTESWAP32_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap64)
-            #define MA_HAS_BYTESWAP64_INTRINSIC
-        #endif
+    #if MA_COMPILER_HAS_BUILTIN(__builtin_bswap16)
+        #define MA_HAS_BYTESWAP16_INTRINSIC
+    #endif
+    #if MA_COMPILER_HAS_BUILTIN(__builtin_bswap32)
+        #define MA_HAS_BYTESWAP32_INTRINSIC
+    #endif
+    #if MA_COMPILER_HAS_BUILTIN(__builtin_bswap64)
+        #define MA_HAS_BYTESWAP64_INTRINSIC
     #endif
 #elif defined(__GNUC__)
     #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))

From 5472d8418065bfd3a6e1ce3cd0afa6b60c11eb6e Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Sat, 13 Mar 2021 17:48:41 -0800
Subject: [PATCH 2/6] introduce MA_ASSUME macro

This macro can be used to give the compiler's optimization passes static
assumptions about code behavior which you *know* to be true.

Use of these can be risky -- if you assume incorrectly, the compiler may
emit code that will not work in circumstances you didn't anticipate.

On the other hand, use of this macro in places where the optimizer is
missing an assumption that would have been safe to make can cause it to
emit more compact/optimal code.

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/miniaudio.h b/miniaudio.h
index e22f4b1f..6d648d61 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -6740,6 +6740,18 @@ static MA_INLINE ma_bool32 ma_has_neon(void)
     #define MA_COMPILER_HAS_BUILTIN(x) 0
 #endif
 
+#ifndef MA_ASSUME  /* optimizer hint: caller promises x holds; a false x may yield broken code. Predefine to override. */
+    #if MA_COMPILER_HAS_BUILTIN(__builtin_assume)
+        #define MA_ASSUME(x) __builtin_assume(x)
+    #elif MA_COMPILER_HAS_BUILTIN(__builtin_unreachable)
+        #define MA_ASSUME(x) do { if (!(x)) __builtin_unreachable(); } while (0)
+    #elif defined(_MSC_VER)
+        #define MA_ASSUME(x) __assume(x)
+    #else
+        #define MA_ASSUME(x) do { (void)sizeof(x); } while (0)  /* no-op statement: x is type-checked but never evaluated */
+    #endif
+#endif
+
 #if defined(_MSC_VER) && _MSC_VER >= 1400
     #define MA_HAS_BYTESWAP16_INTRINSIC
     #define MA_HAS_BYTESWAP32_INTRINSIC

From 2dcce6d53bfc6708a57b36ab272dcd36526ee9ec Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Sat, 13 Mar 2021 19:09:55 -0800
Subject: [PATCH 3/6] introduce MA_RESTRICT macro

This allows us to use the __restrict keyword in places where we know
that pointers do not alias.

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/miniaudio.h b/miniaudio.h
index 6d648d61..728da361 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -6752,6 +6752,14 @@ static MA_INLINE ma_bool32 ma_has_neon(void)
     #endif
 #endif
 
+#ifndef MA_RESTRICT  /* qualifier for pointers known not to alias; predefine to override */
+    #if defined(__clang__) || defined(__GNUC__) || defined(_MSC_VER)
+        #define MA_RESTRICT __restrict  /* keyword spelling used for these three compilers */
+    #else
+        #define MA_RESTRICT  /* any other compiler: expands to nothing, so no aliasing promise is made */
+    #endif
+#endif
+
 #if defined(_MSC_VER) && _MSC_VER >= 1400
     #define MA_HAS_BYTESWAP16_INTRINSIC
     #define MA_HAS_BYTESWAP32_INTRINSIC

From c88bb8ccd2574bab5036b2d529106c00fef66fae Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Mon, 15 Mar 2021 04:08:34 -0700
Subject: [PATCH 4/6] extract channel count constants from loops

These values are constant, but Clang has some trouble noticing that,
especially if the loop body is complex enough. This prevents it from
noticing places where vectorization is possible (and desirable).

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h                 | 99 ++++++++++++++++++++-----------------
 research/miniaudio_engine.h | 10 ++--
 2 files changed, 61 insertions(+), 48 deletions(-)

diff --git a/miniaudio.h b/miniaudio.h
index 728da361..03ebc713 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -36317,13 +36317,14 @@ MA_API ma_result ma_biquad_reinit(const ma_biquad_config* pConfig, ma_biquad* pB
 static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(ma_biquad* pBQ, float* pY, const float* pX)
 {
     ma_uint32 c;
+    const ma_uint32 channels = pBQ->channels;
     const float b0 = pBQ->b0.f32;
     const float b1 = pBQ->b1.f32;
     const float b2 = pBQ->b2.f32;
     const float a1 = pBQ->a1.f32;
     const float a2 = pBQ->a2.f32;
 
-    for (c = 0; c < pBQ->channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         float r1 = pBQ->r1[c].f32;
         float r2 = pBQ->r2[c].f32;
         float x  = pX[c];
@@ -36347,13 +36348,14 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32(ma_biquad* pBQ, float* pY,
 static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(ma_biquad* pBQ, ma_int16* pY, const ma_int16* pX)
 {
     ma_uint32 c;
+    const ma_uint32 channels = pBQ->channels;
     const ma_int32 b0 = pBQ->b0.s32;
     const ma_int32 b1 = pBQ->b1.s32;
     const ma_int32 b2 = pBQ->b2.s32;
     const ma_int32 a1 = pBQ->a1.s32;
     const ma_int32 a2 = pBQ->a2.s32;
 
-    for (c = 0; c < pBQ->channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pBQ->r1[c].s32;
         ma_int32 r2 = pBQ->r2[c].s32;
         ma_int32 x  = pX[c];
@@ -36517,10 +36519,11 @@ MA_API ma_result ma_lpf1_reinit(const ma_lpf1_config* pConfig, ma_lpf1* pLPF)
 static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, const float* pX)
 {
     ma_uint32 c;
+    const ma_uint32 channels = pLPF->channels;
     const float a = pLPF->a.f32;
     const float b = 1 - a;
 
-    for (c = 0; c < pLPF->channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         float r1 = pLPF->r1[c].f32;
         float x  = pX[c];
         float y;
@@ -36535,10 +36538,11 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, co
 static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY, const ma_int16* pX)
 {
     ma_uint32 c;
+    const ma_uint32 channels = pLPF->channels;
     const ma_int32 a = pLPF->a.s32;
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
-    for (c = 0; c < pLPF->channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pLPF->r1[c].s32;
         ma_int32 x  = pX[c];
         ma_int32 y;
@@ -37024,10 +37028,11 @@ MA_API ma_result ma_hpf1_reinit(const ma_hpf1_config* pConfig, ma_hpf1* pHPF)
 static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, const float* pX)
 {
     ma_uint32 c;
+    const ma_uint32 channels = pHPF->channels;
     const float a = 1 - pHPF->a.f32;
     const float b = 1 - a;
 
-    for (c = 0; c < pHPF->channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         float r1 = pHPF->r1[c].f32;
         float x  = pX[c];
         float y;
@@ -37042,10 +37047,11 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
 static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY, const ma_int16* pX)
 {
     ma_uint32 c;
+    const ma_uint32 channels = pHPF->channels;
     const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
-    for (c = 0; c < pHPF->channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pHPF->r1[c].s32;
         ma_int32 x  = pX[c];
         ma_int32 y;
@@ -38382,6 +38388,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
 {
     ma_uint32 c;
     ma_uint32 a;
+    const ma_uint32 channels = pResampler->config.channels;
     const ma_uint32 shift = 12;
 
     MA_ASSERT(pResampler != NULL);
@@ -38389,7 +38396,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
 
     a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
 
-    for (c = 0; c < pResampler->config.channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
         pFrameOut[c] = s;
     }
@@ -38400,13 +38407,14 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
 {
     ma_uint32 c;
     float a;
+    const ma_uint32 channels = pResampler->config.channels;
 
     MA_ASSERT(pResampler != NULL);
     MA_ASSERT(pFrameOut  != NULL);
 
     a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
 
-    for (c = 0; c < pResampler->config.channels; c += 1) {
+    for (c = 0; c < channels; c += 1) {
         float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
         pFrameOut[c] = s;
     }
@@ -48714,20 +48722,21 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
 {
     ma_uint64 iFrame;
     ma_uint32 iChannel;
+    const ma_uint32 channels = pNoise->config.channels;
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 float s = ma_noise_f32_white(pNoise);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*pNoise->config.channels + iChannel] = s;
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutF32[iFrame*channels + iChannel] = s;
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*pNoise->config.channels + iChannel] = ma_noise_f32_white(pNoise);
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutF32[iFrame*channels + iChannel] = ma_noise_f32_white(pNoise);
                 }
             }
         }
@@ -48736,31 +48745,31 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 ma_int16 s = ma_noise_s16_white(pNoise);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*pNoise->config.channels + iChannel] = s;
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutS16[iFrame*channels + iChannel] = s;
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*pNoise->config.channels + iChannel] = ma_noise_s16_white(pNoise);
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutS16[iFrame*channels + iChannel] = ma_noise_s16_white(pNoise);
                 }
             }
         }
     } else {
-        ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
-        ma_uint32 bpf = bps * pNoise->config.channels;
+        const ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
+        const ma_uint32 bpf = bps * channels;
 
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 float s = ma_noise_f32_white(pNoise);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     float s = ma_noise_f32_white(pNoise);
                     ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
                 }
@@ -48831,20 +48840,21 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
 {
     ma_uint64 iFrame;
     ma_uint32 iChannel;
+    const ma_uint32 channels = pNoise->config.channels;
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 float s = ma_noise_f32_pink(pNoise, 0);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*pNoise->config.channels + iChannel] = s;
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutF32[iFrame*channels + iChannel] = s;
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*pNoise->config.channels + iChannel] = ma_noise_f32_pink(pNoise, iChannel);
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutF32[iFrame*channels + iChannel] = ma_noise_f32_pink(pNoise, iChannel);
                 }
             }
         }
@@ -48853,31 +48863,31 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 ma_int16 s = ma_noise_s16_pink(pNoise, 0);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*pNoise->config.channels + iChannel] = s;
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutS16[iFrame*channels + iChannel] = s;
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*pNoise->config.channels + iChannel] = ma_noise_s16_pink(pNoise, iChannel);
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutS16[iFrame*channels + iChannel] = ma_noise_s16_pink(pNoise, iChannel);
                 }
             }
         }
     } else {
-        ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
-        ma_uint32 bpf = bps * pNoise->config.channels;
+        const ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
+        const ma_uint32 bpf = bps * channels;
 
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 float s = ma_noise_f32_pink(pNoise, 0);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     float s = ma_noise_f32_pink(pNoise, iChannel);
                     ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
                 }
@@ -48911,20 +48921,21 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
 {
     ma_uint64 iFrame;
     ma_uint32 iChannel;
+    const ma_uint32 channels = pNoise->config.channels;
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 float s = ma_noise_f32_brownian(pNoise, 0);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*pNoise->config.channels + iChannel] = s;
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutF32[iFrame*channels + iChannel] = s;
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutF32[iFrame*pNoise->config.channels + iChannel] = ma_noise_f32_brownian(pNoise, iChannel);
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutF32[iFrame*channels + iChannel] = ma_noise_f32_brownian(pNoise, iChannel);
                 }
             }
         }
@@ -48933,31 +48944,31 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 ma_int16 s = ma_noise_s16_brownian(pNoise, 0);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*pNoise->config.channels + iChannel] = s;
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutS16[iFrame*channels + iChannel] = s;
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
-                    pFramesOutS16[iFrame*pNoise->config.channels + iChannel] = ma_noise_s16_brownian(pNoise, iChannel);
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
+                    pFramesOutS16[iFrame*channels + iChannel] = ma_noise_s16_brownian(pNoise, iChannel);
                 }
             }
         }
     } else {
-        ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
-        ma_uint32 bpf = bps * pNoise->config.channels;
+        const ma_uint32 bps = ma_get_bytes_per_sample(pNoise->config.format);
+        const ma_uint32 bpf = bps * channels;
 
         if (pNoise->config.duplicateChannels) {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
                 float s = ma_noise_f32_brownian(pNoise, 0);
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
                 }
             }
         } else {
             for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
-                for (iChannel = 0; iChannel < pNoise->config.channels; iChannel += 1) {
+                for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     float s = ma_noise_f32_brownian(pNoise, iChannel);
                     ma_pcm_convert(ma_offset_ptr(pFramesOut, iFrame*bpf + iChannel*bps), pNoise->config.format, &s, ma_format_f32, 1, ma_dither_mode_none);
                 }
diff --git a/research/miniaudio_engine.h b/research/miniaudio_engine.h
index e1d4d5dc..10f65600 100644
--- a/research/miniaudio_engine.h
+++ b/research/miniaudio_engine.h
@@ -9320,6 +9320,8 @@ MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer,
         float gain = 1;
         ma_uint32 iChannel;
         float channelGainsOut[MA_MAX_CHANNELS];
+        const ma_uint32 channelsOut = pSpatializer->config.channelsOut;
+        const ma_uint32 channelsIn = pSpatializer->config.channelsIn;
 
         /*
         We'll need the listener velocity for doppler pitch calculations. The speed of sound is
@@ -9520,12 +9522,12 @@ MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer,
         be +1 on the X axis. A dot product is performed against the direction vector of the channel and the normalized
         position of the sound.
         */
-        for (iChannel = 0; iChannel < pSpatializer->config.channelsOut; iChannel += 1) {
+        for (iChannel = 0; iChannel < channelsOut; iChannel += 1) {
             channelGainsOut[iChannel] = gain;
         }
 
         /* Convert to our output channel count. */
-        ma_convert_pcm_frames_channels_f32((float*)pFramesOut, pSpatializer->config.channelsOut, pChannelMapOut, (const float*)pFramesIn, pSpatializer->config.channelsIn, pChannelMapIn, frameCount);
+        ma_convert_pcm_frames_channels_f32((float*)pFramesOut, channelsOut, pChannelMapOut, (const float*)pFramesIn, channelsIn, pChannelMapIn, frameCount);
 
         /*
         Calculate our per-channel gains. We do this based on the normalized relative position of the sound and it's
@@ -9538,7 +9540,7 @@ MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer,
             unitPos.y *= distanceInv;
             unitPos.z *= distanceInv;
 
-            for (iChannel = 0; iChannel < pSpatializer->config.channelsOut; iChannel += 1) {
+            for (iChannel = 0; iChannel < channelsOut; iChannel += 1) {
                 float d = ma_vec3f_dot(unitPos, g_maChannelDirections[pChannelMapOut[iChannel]]);
 
                 /*
@@ -9606,7 +9608,7 @@ MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer,
         }
 
         /* Now we need to apply the volume to each channel. */
-        ma_apply_volume_factor_per_channel_f32((float*)pFramesOut, frameCount, pSpatializer->config.channelsOut, channelGainsOut);
+        ma_apply_volume_factor_per_channel_f32((float*)pFramesOut, frameCount, channelsOut, channelGainsOut);
 
         /*
         Before leaving we'll want to update our doppler pitch so that the caller can apply some

From c1451b30a431ccdcae1ca032c68bd7112a1d67d0 Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Sat, 13 Mar 2021 19:17:24 -0800
Subject: [PATCH 5/6] apply MA_RESTRICT to linear resampler interpolation
 functions

This tells the compiler that pFrameOut does not alias any other pointer
used within the function, which lets Clang see that the loop is vectorizable.

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/miniaudio.h b/miniaudio.h
index 03ebc713..fb2303bb 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -38384,7 +38384,7 @@ static MA_INLINE ma_int16 ma_linear_resampler_mix_s16(ma_int16 x, ma_int16 y, ma
     return (ma_int16)(r >> shift);
 }
 
-static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResampler, ma_int16* pFrameOut)
+static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResampler, ma_int16* MA_RESTRICT pFrameOut)
 {
     ma_uint32 c;
     ma_uint32 a;
@@ -38403,7 +38403,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
 }
 
 
-static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResampler, float* pFrameOut)
+static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResampler, float* MA_RESTRICT pFrameOut)
 {
     ma_uint32 c;
     float a;

From 8a1858eba693c7d23db7a24720144b0c894c138d Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Mon, 15 Mar 2021 04:04:26 -0700
Subject: [PATCH 6/6] use MA_ASSUME for channel counts before loops

The range of the channel count isn't obvious to any compiler: as far as it
can tell, the loop could run for one iteration or 4 billion iterations.
Adding MA_ASSUME in these places helps the compiler understand the range of
possible values, and know how heavily to vectorize (or not vectorize) these
loops.

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
---
 miniaudio.h                 | 11 +++++++++++
 research/miniaudio_engine.h |  3 +++
 2 files changed, 14 insertions(+)

diff --git a/miniaudio.h b/miniaudio.h
index fb2303bb..fc0f2ca0 100644
--- a/miniaudio.h
+++ b/miniaudio.h
@@ -36324,6 +36324,7 @@ static MA_INLINE void ma_biquad_process_pcm_frame_f32__direct_form_2_transposed(
     const float a1 = pBQ->a1.f32;
     const float a2 = pBQ->a2.f32;
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float r1 = pBQ->r1[c].f32;
         float r2 = pBQ->r2[c].f32;
@@ -36355,6 +36356,7 @@ static MA_INLINE void ma_biquad_process_pcm_frame_s16__direct_form_2_transposed(
     const ma_int32 a1 = pBQ->a1.s32;
     const ma_int32 a2 = pBQ->a2.s32;
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pBQ->r1[c].s32;
         ma_int32 r2 = pBQ->r2[c].s32;
@@ -36523,6 +36525,7 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_f32(ma_lpf1* pLPF, float* pY, co
     const float a = pLPF->a.f32;
     const float b = 1 - a;
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float r1 = pLPF->r1[c].f32;
         float x  = pX[c];
@@ -36542,6 +36545,7 @@ static MA_INLINE void ma_lpf1_process_pcm_frame_s16(ma_lpf1* pLPF, ma_int16* pY,
     const ma_int32 a = pLPF->a.s32;
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pLPF->r1[c].s32;
         ma_int32 x  = pX[c];
@@ -37032,6 +37036,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_f32(ma_hpf1* pHPF, float* pY, co
     const float a = 1 - pHPF->a.f32;
     const float b = 1 - a;
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float r1 = pHPF->r1[c].f32;
         float x  = pX[c];
@@ -37051,6 +37056,7 @@ static MA_INLINE void ma_hpf1_process_pcm_frame_s16(ma_hpf1* pHPF, ma_int16* pY,
     const ma_int32 a = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - pHPF->a.s32);
     const ma_int32 b = ((1 << MA_BIQUAD_FIXED_POINT_SHIFT) - a);
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int32 r1 = pHPF->r1[c].s32;
         ma_int32 x  = pX[c];
@@ -38396,6 +38402,7 @@ static void ma_linear_resampler_interpolate_frame_s16(ma_linear_resampler* pResa
 
     a = (pResampler->inTimeFrac << shift) / pResampler->config.sampleRateOut;
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         ma_int16 s = ma_linear_resampler_mix_s16(pResampler->x0.s16[c], pResampler->x1.s16[c], a, shift);
         pFrameOut[c] = s;
@@ -38414,6 +38421,7 @@ static void ma_linear_resampler_interpolate_frame_f32(ma_linear_resampler* pResa
 
     a = (float)pResampler->inTimeFrac / pResampler->config.sampleRateOut;
 
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
     for (c = 0; c < channels; c += 1) {
         float s = ma_mix_f32_fast(pResampler->x0.f32[c], pResampler->x1.f32[c], a);
         pFrameOut[c] = s;
@@ -48723,6 +48731,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__white(ma_noise* pNoise, voi
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
@@ -48841,6 +48850,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__pink(ma_noise* pNoise, void
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
@@ -48922,6 +48932,7 @@ static MA_INLINE ma_uint64 ma_noise_read_pcm_frames__brownian(ma_noise* pNoise,
     ma_uint64 iFrame;
     ma_uint32 iChannel;
     const ma_uint32 channels = pNoise->config.channels;
+    MA_ASSUME(channels >= MA_MIN_CHANNELS && channels <= MA_MAX_CHANNELS);
 
     if (pNoise->config.format == ma_format_f32) {
         float* pFramesOutF32 = (float*)pFramesOut;
diff --git a/research/miniaudio_engine.h b/research/miniaudio_engine.h
index 10f65600..101361df 100644
--- a/research/miniaudio_engine.h
+++ b/research/miniaudio_engine.h
@@ -9323,6 +9323,9 @@ MA_API ma_result ma_spatializer_process_pcm_frames(ma_spatializer* pSpatializer,
         const ma_uint32 channelsOut = pSpatializer->config.channelsOut;
         const ma_uint32 channelsIn = pSpatializer->config.channelsIn;
 
+        MA_ASSUME(channelsOut >= MA_MIN_CHANNELS && channelsOut <= MA_MAX_CHANNELS);
+        MA_ASSUME(channelsIn >= MA_MIN_CHANNELS && channelsIn <= MA_MAX_CHANNELS);
+
         /*
         We'll need the listener velocity for doppler pitch calculations. The speed of sound is
         defined by the listener, so we'll grab that here too.