From edb64e60172a9cbb39302aff1ca15cc90e01da4a Mon Sep 17 00:00:00 2001 From: David Reid Date: Sun, 25 Jan 2026 14:05:30 +1000 Subject: [PATCH] Add a new SPSC ring buffer. This improves on the old ring buffer by having a much simpler implementation and a much simpler API that does not require the caller to do reading and writing in a loop. Future commits will be removing the old ring buffer. Public issue https://github.com/mackron/miniaudio/issues/671 --- camal/miniaudio.camal | 36 ++++-- miniaudio.h | 274 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 303 insertions(+), 7 deletions(-) diff --git a/camal/miniaudio.camal b/camal/miniaudio.camal index 95cc9c17..d0320974 100644 --- a/camal/miniaudio.camal +++ b/camal/miniaudio.camal @@ -1,10 +1,12 @@ -miniaudio_h := <../miniaudio.h>; -miniaudio_c := <../miniaudio.c>; -dr_wav_h :: <../../dr_libs/dr_wav.h>; -dr_flac_h :: <../../dr_libs/dr_flac.h>; -dr_mp3_h :: <../../dr_libs/dr_mp3.h>; -c89atomic_h :: <../../c89atomic/c89atomic.h>; -c89atomic_c :: <../../c89atomic/c89atomic.c>; +miniaudio_h := <../miniaudio.h>; +miniaudio_c := <../miniaudio.c>; +dr_wav_h :: <../../dr_libs/dr_wav.h>; +dr_flac_h :: <../../dr_libs/dr_flac.h>; +dr_mp3_h :: <../../dr_libs/dr_mp3.h>; +c89atomic_h :: <../../c89atomic/c89atomic.h>; +c89atomic_c :: <../../c89atomic/c89atomic.c>; +c89atomic_ring_buffer_h :: <../../c89atomic/extras/c89atomic_ring_buffer.h>; +c89atomic_ring_buffer_c :: <../../c89atomic/extras/c89atomic_ring_buffer.c>; minify :: function(src:string) string { @@ -276,6 +278,26 @@ convert_c89atomic_h :: function(src:string) string miniaudio_h("/\* c89atomic.h begin \*/\R":"\R/\* c89atomic.h end \*/") = convert_c89atomic_h(@(c89atomic_h["#ifndef c89atomic_h\R":"\R#endif /\* c89atomic_h \*/"])); +// Ring Buffer +rename_c89atomic_ring_buffer_namespace :: function(src:string) string +{ + return rename_c89atomic_namespace(src) + ["\bma_atomic_ring_buffer"] <= "ma_ring_buffer" + ["\bMA_ATOMIC"] <= "MA" + ["\bMA_RING_BUFFER_API"] 
<= "MA_API" + ["\bMA_RING_BUFFER_ASSERT"] <= "MA_ASSERT" + ["\bMA_RING_BUFFER_COPY_MEMORY"] <= "MA_COPY_MEMORY" + ["\bMA_RING_BUFFER_OFFSET_PTR"] <= "ma_offset_ptr" + + // Alignment hack. + ["void\* pBuffer; "] <= "void* pBuffer;" + ; +} + +miniaudio_h("/\* BEG ma_ring_buffer.h \*/\R":"\R/\* END ma_ring_buffer.h \*/") = rename_c89atomic_ring_buffer_namespace(@(c89atomic_ring_buffer_h("/\* BEG c89atomic_ring_buffer.h \*/\R":"\R/\* END c89atomic_ring_buffer.h \*/"))); +miniaudio_h("/\* BEG ma_ring_buffer.c \*/\R":"\R/\* END ma_ring_buffer.c \*/") = rename_c89atomic_ring_buffer_namespace(@(c89atomic_ring_buffer_c("/\* BEG c89atomic_ring_buffer.c \*/\R":"\R/\* END c89atomic_ring_buffer.c \*/"))); + + // Cleanup. If we don't normalize line endings we'll fail to compile on old versions of GCC. cleanup :: function(src:string) string { diff --git a/miniaudio.h b/miniaudio.h index f7268a45..540d8330 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -6248,6 +6248,28 @@ MA_API ma_result ma_audio_queue_get_length_in_pcm_frames(ma_audio_queue* pAudioQ Ring Buffer ************************************************************************************************************************************************************/ +/* BEG ma_ring_buffer.h */ +#define MA_RING_BUFFER_FLAG_MIRRORED (1 << 0) /* When set the buffer is mirrored at capacity * stride. */ + +typedef struct ma_ring_buffer +{ + ma_uint32 head; /* Atomic. Most significant bit is a loop flag. When the head is equal to the tail (including the flag) it means the buffer is empty. If the only difference is the flag, it means the buffer is full. */ + ma_uint32 tail; /* Atomic. Most significant bit is a loop flag. */ + ma_uint32 capacity; /* Capacity of the buffer, in elements. */ + ma_uint32 stride; /* Size of an element in bytes. */ + ma_uint32 flags; + void* pBuffer; /* Must be twice the size of capacity * stride. 
*/ +} ma_ring_buffer; + +MA_API void ma_ring_buffer_init(ma_uint32 capacity, ma_uint32 stride, ma_uint32 flags, void* pBuffer, ma_ring_buffer* pRingBuffer); /* Buffer must be `2 * capacity * stride`. That is twice the capacity. You can use a mirrored buffer, in which case specify the MA_RING_BUFFER_FLAG_MIRRORED flag. */ +MA_API size_t ma_ring_buffer_map_produce(ma_ring_buffer* pRingBuffer, size_t count, void** ppMappedBuffer); /* Returns the number of elements actually mapped. */ +MA_API void ma_ring_buffer_unmap_produce(ma_ring_buffer* pRingBuffer, size_t count); +MA_API size_t ma_ring_buffer_map_consume(ma_ring_buffer* pRingBuffer, size_t count, void** ppMappedBuffer); /* Returns the number of elements actually mapped. */ +MA_API void ma_ring_buffer_unmap_consume(ma_ring_buffer* pRingBuffer, size_t count); +MA_API ma_uint32 ma_ring_buffer_length(const ma_ring_buffer* pRingBuffer); /* Returns the number of elements currently in the ring buffer. If something is in the middle of producing or consuming data on the ring buffer then the returned value may already be out of date. 
*/ +MA_API ma_uint32 ma_ring_buffer_capacity(const ma_ring_buffer* pRingBuffer); +/* END ma_ring_buffer.h */ + typedef struct { void* pBuffer; @@ -63389,6 +63411,258 @@ MA_API ma_uint64 ma_convert_frames_ex(void* pOut, ma_uint64 frameCountOut, const Ring Buffer **************************************************************************************************************************************************************/ +/* BEG ma_ring_buffer.c */ +MA_API void ma_ring_buffer_init(ma_uint32 capacity, ma_uint32 stride, ma_uint32 flags, void* pBuffer, ma_ring_buffer* pRingBuffer) +{ + if (pRingBuffer == NULL) { + return; + } + + ma_atomic_store_explicit_32(&pRingBuffer->head, 0, ma_atomic_memory_order_relaxed); + ma_atomic_store_explicit_32(&pRingBuffer->tail, 0, ma_atomic_memory_order_relaxed); + pRingBuffer->capacity = 0; + pRingBuffer->stride = 0; + pRingBuffer->flags = 0; + pRingBuffer->pBuffer = NULL; + + if (pBuffer == NULL || stride == 0 || capacity == 0) { + MA_ASSERT(!"Ring buffer initialized with invalid values. It must have a valid buffer, stride and capacity."); + return; + } + + pRingBuffer->capacity = capacity; + pRingBuffer->stride = stride; + pRingBuffer->flags = flags; + pRingBuffer->pBuffer = pBuffer; +} + +static MA_INLINE ma_uint32 ma_ring_buffer_calculate_length(ma_uint32 head, ma_uint32 tail, ma_uint32 capacity) +{ + /* + Our ring buffer encodes a "loop flag" in the head and tail cursors. The flag flip-flops as the cursor + loops around and is basically just used to disambiguate the case when the head and tail are equal (are + they the same because the buffer is empty, or because it's full). + + The head should always be ahead of the tail, and by no more than `capacity`. This algorithm asserts + this rule. Logically it follows that the length is simply the head minus the tail. However, since the + cursors loop, there are times when the tail will be greater than the head which will mess up the math. 
+ To make the math work we need to adjust the head to ensure it's always ahead of the tail. + + When the two cursors are on the same loop there's nothing complicated - just do the subtraction like + normal. When they're on different loops we need only offset the head by the capacity before doing the + subtraction. This can all be done without a branch. + */ + #if 0 + { + ma_uint32 loopFlag = (head & 0x80000000) ^ (tail & 0x80000000); + + if (loopFlag) { + return ((head & 0x7FFFFFFF) + capacity) - (tail & 0x7FFFFFFF); + } else { + return head - tail; + } + } + #else + { + return ((head & 0x7FFFFFFF) + (capacity * (((head & 0x80000000) ^ (tail & 0x80000000)) >> 31))) - (tail & 0x7FFFFFFF); + } + #endif +} + +static MA_INLINE ma_uint32 ma_ring_buffer_calculate_remaining(ma_uint32 head, ma_uint32 tail, ma_uint32 capacity) +{ + return capacity - ma_ring_buffer_calculate_length(head, tail, capacity); +} + +MA_API size_t ma_ring_buffer_map_produce(ma_ring_buffer* pRingBuffer, size_t count, void** ppMappedBuffer) +{ + ma_uint32 head; + ma_uint32 tail; + ma_uint32 remaining; + + if (ppMappedBuffer == NULL) { + return 0; + } + + *ppMappedBuffer = NULL; + + if (pRingBuffer == NULL) { + return 0; + } + + /* + For the head, only the producer will be making modifications to it so we can just use relaxed. For the tail, + it is indeed modified by the consumer, but since the producer does not actually have a data dependency on + anything done by the consumer the tail is essentially just a cursor for us to determine how much data we can + produce. It can therefore be relaxed as well. + */ + head = ma_atomic_load_explicit_32(&pRingBuffer->head, ma_atomic_memory_order_relaxed); + tail = ma_atomic_load_explicit_32(&pRingBuffer->tail, ma_atomic_memory_order_relaxed); + + /* Now we need to clamp the count to ensure it never goes beyond our capacity. 
*/ + remaining = ma_ring_buffer_calculate_remaining(head, tail, pRingBuffer->capacity); + if (count > remaining) { + count = remaining; + } + + /* Our pointer will always just be where our head is pointing. */ + *ppMappedBuffer = ma_offset_ptr(pRingBuffer->pBuffer, (head & 0x7FFFFFFF) * pRingBuffer->stride); + + return count; +} + +MA_API void ma_ring_buffer_unmap_produce(ma_ring_buffer* pRingBuffer, size_t count) +{ + ma_uint32 head; + ma_uint32 tail; + + if (pRingBuffer == NULL) { + return; + } + + head = ma_atomic_load_explicit_32(&pRingBuffer->head, ma_atomic_memory_order_relaxed); + tail = ma_atomic_load_explicit_32(&pRingBuffer->tail, ma_atomic_memory_order_relaxed); + + MA_ASSERT(count <= ma_ring_buffer_calculate_remaining(head, tail, pRingBuffer->capacity)); + + /* The tail is not modified. We load it for the benefit of the assert above. */ + (void)tail; + + /* If the buffer is not mirrored we need to copy any overflow to the start of the ring buffer. */ + if ((pRingBuffer->flags & MA_RING_BUFFER_FLAG_MIRRORED) == 0) { + ma_uint32 newHead = (head & 0x7FFFFFFF) + count; + if (newHead > pRingBuffer->capacity) { + newHead -= pRingBuffer->capacity; + MA_COPY_MEMORY(pRingBuffer->pBuffer, ma_offset_ptr(pRingBuffer->pBuffer, pRingBuffer->capacity * pRingBuffer->stride), newHead * pRingBuffer->stride); + } + } + + /* Advance the head. */ + head += count; + + /* Check if the head has looped and adjust if so. */ + if ((head & 0x7FFFFFFF) >= pRingBuffer->capacity) { + head -= pRingBuffer->capacity; /* Get the index back into range. */ + head ^= 0x80000000; /* Flip the loop flag. */ + } + + /* + The consumer will be wanting to read from the buffer that we just wrote so we'll need to use + release semantics here to ensure the consumer does not see the adjustment to the advanced head + until after the data has been written. 
+ */ + ma_atomic_store_explicit_32(&pRingBuffer->head, head, ma_atomic_memory_order_release); +} + +MA_API size_t ma_ring_buffer_map_consume(ma_ring_buffer* pRingBuffer, size_t count, void** ppMappedBuffer) +{ + ma_uint32 head; + ma_uint32 tail; + ma_uint32 length; + + if (ppMappedBuffer == NULL) { + return 0; + } + + *ppMappedBuffer = NULL; + + if (pRingBuffer == NULL) { + return 0; + } + + /* + We're about to read data that was written by the producer. We'll need to use acquire semantics + here for the head. + */ + head = ma_atomic_load_explicit_32(&pRingBuffer->head, ma_atomic_memory_order_acquire); + tail = ma_atomic_load_explicit_32(&pRingBuffer->tail, ma_atomic_memory_order_relaxed); + + /* Make sure we don't try to consume more than what we have available for consumption. */ + length = ma_ring_buffer_calculate_length(head, tail, pRingBuffer->capacity); + if (count > length) { + count = length; + } + + /* Our pointer will always just be where our tail is pointing. */ + *ppMappedBuffer = ma_offset_ptr(pRingBuffer->pBuffer, (tail & 0x7FFFFFFF) * pRingBuffer->stride); + + /* + If the buffer is not mirrored we may need to copy some data from the start of the buffer to the overflow + part so the caller has a contiguous block to work with. + */ + if ((pRingBuffer->flags & MA_RING_BUFFER_FLAG_MIRRORED) == 0) { + ma_uint32 newTail = (tail & 0x7FFFFFFF) + count; + if (newTail > pRingBuffer->capacity) { + newTail -= pRingBuffer->capacity; + MA_COPY_MEMORY(ma_offset_ptr(pRingBuffer->pBuffer, pRingBuffer->capacity * pRingBuffer->stride), pRingBuffer->pBuffer, newTail * pRingBuffer->stride); + } + } + + return count; +} + +MA_API void ma_ring_buffer_unmap_consume(ma_ring_buffer* pRingBuffer, size_t count) +{ + ma_uint32 head; + ma_uint32 tail; + + if (pRingBuffer == NULL) { + return; + } + + /* + When we first mapped the buffer with `map_consume()` we used acquire semantics for the head. 
Since we won't + be touching any data that was produced between our map and unmap, we should be able to use relaxed here. + */ + head = ma_atomic_load_explicit_32(&pRingBuffer->head, ma_atomic_memory_order_relaxed); + tail = ma_atomic_load_explicit_32(&pRingBuffer->tail, ma_atomic_memory_order_relaxed); + + MA_ASSERT(count <= ma_ring_buffer_calculate_length(head, tail, pRingBuffer->capacity)); + + /* The head is not modified. We load it for the benefit of the assert above. */ + (void)head; + + /* Advance the tail. */ + tail += count; + + /* Check if the tail has looped and adjust if so. */ + if ((tail & 0x7FFFFFFF) >= pRingBuffer->capacity) { + tail -= pRingBuffer->capacity; /* Get the index back into range. */ + tail ^= 0x80000000; /* Flip the loop flag. */ + } + + /* The producer doesn't care what we do with the data in the buffer so we should be able to use relaxed semantics here for the tail. */ + ma_atomic_store_explicit_32(&pRingBuffer->tail, tail, ma_atomic_memory_order_relaxed); +} + +MA_API ma_uint32 ma_ring_buffer_length(const ma_ring_buffer* pRingBuffer) +{ + ma_uint32 head; + ma_uint32 tail; + + if (pRingBuffer == NULL) { + return 0; + } + + head = ma_atomic_load_explicit_32(&pRingBuffer->head, ma_atomic_memory_order_relaxed); + tail = ma_atomic_load_explicit_32(&pRingBuffer->tail, ma_atomic_memory_order_relaxed); + + return ma_ring_buffer_calculate_length(head, tail, pRingBuffer->capacity); +} + +MA_API ma_uint32 ma_ring_buffer_capacity(const ma_ring_buffer* pRingBuffer) +{ + if (pRingBuffer == NULL) { + return 0; + } + + return pRingBuffer->capacity; +} +/* END ma_ring_buffer.c */ + + + + static MA_INLINE ma_uint32 ma_rb__extract_offset_in_bytes(ma_uint32 encodedOffset) { return encodedOffset & 0x7FFFFFFF;