diff --git a/extras/dr_flac.h b/extras/dr_flac.h
index fa727aca..cb4c29cb 100644
--- a/extras/dr_flac.h
+++ b/extras/dr_flac.h
@@ -1,5 +1,5 @@
 // FLAC audio decoder. Public domain. See "unlicense" statement at the end of this file.
-// dr_flac - v0.10.0 - 2018-09-11
+// dr_flac - v0.11.0 - 2018-12-xx
 //
 // David Reid - mackron@gmail.com
 
@@ -18,7 +18,7 @@
 //     }
 //
 //     drflac_int32* pSamples = malloc(pFlac->totalSampleCount * sizeof(drflac_int32));
-//     drflac_uint64 numberOfInterleavedSamplesActuallyRead = drflac_read_s32(pFlac, pFlac->totalSampleCount, pSamples);
+//     drflac_uint64 numberOfPCMFramesActuallyRead = drflac_read_pcm_frames_s32(pFlac, pFlac->totalPCMFrameCount, pSamples);
 //
 // The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of
 // channels and the bits per sample, should be directly accessible - just make sure you don't change their values. Samples are
@@ -29,7 +29,7 @@
 // the decoder will give you as many samples as it can, up to the amount requested. Later on when you need the next batch of
 // samples, just call it again. Example:
 //
-//     while (drflac_read_s32(pFlac, chunkSize, pChunkSamples) > 0) {
+//     while (drflac_read_pcm_frames_s32(pFlac, chunkSize, pChunkSamples) > 0) {
 //         do_something();
 //     }
 //
@@ -138,6 +138,16 @@ typedef drflac_uint32    drflac_bool32;
 #define DRFLAC_TRUE      1
 #define DRFLAC_FALSE     0
 
+#if defined(_MSC_VER) && _MSC_VER >= 1700 // Visual Studio 2012
+#define DRFLAC_DEPRECATED   __declspec(deprecated)
+#elif (defined(__GNUC__) && __GNUC__ >= 4)
+#define DRFLAC_DEPRECATED   __attribute__((deprecated))
+#elif defined(__clang__) // No __has_feature() here: it fails to parse on compilers lacking it, even behind defined(__clang__) &&.
+#define DRFLAC_DEPRECATED   __attribute__((deprecated))
+#else
+#define DRFLAC_DEPRECATED
+#endif
+
 // As data is read from the client it is placed into an internal buffer for fast access. This controls the
 // size of that buffer. Larger values means more speed, but also more memory. In my testing there is diminishing
 // returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
@@ -464,6 +474,7 @@ typedef struct
     // with each channel having a total of 4096, this value will be set to 2*4096 = 8192. Can be 0 in which case it's still a
     // valid stream, but just means the total sample count is unknown. Likely the case with streams like internet radio.
     drflac_uint64 totalSampleCount;
+    drflac_uint64 totalPCMFrameCount;   // <-- Equal to totalSampleCount / channels.
 
 
     // The container type. This is set based on whether or not the decoder was opened from a native or Ogg stream.
@@ -581,58 +592,34 @@ void drflac_close(drflac* pFlac);
 
 // Reads sample data from the given FLAC decoder, output as interleaved signed 32-bit PCM.
 //
-// pFlac         [in]            The decoder.
-// samplesToRead [in]            The number of samples to read.
-// pBufferOut    [out, optional] A pointer to the buffer that will receive the decoded samples.
+// pFlac        [in]            The decoder.
+// framesToRead [in]            The number of PCM frames to read.
+// pBufferOut   [out, optional] A pointer to the buffer that will receive the decoded samples.
 //
-// Returns the number of samples actually read.
+// Returns the number of PCM frames actually read.
 //
-// pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of samples
+// pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames
 // seeked.
-drflac_uint64 drflac_read_s32(drflac* pFlac, drflac_uint64 samplesToRead, drflac_int32* pBufferOut);
+drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut);
 
-// Same as drflac_read_s32(), except outputs samples as 16-bit integer PCM rather than 32-bit.
-//
-// pFlac         [in]            The decoder.
-// samplesToRead [in]            The number of samples to read.
-// pBufferOut    [out, optional] A pointer to the buffer that will receive the decoded samples.
-//
-// Returns the number of samples actually read.
-//
-// pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of samples
-// seeked.
+// Same as drflac_read_pcm_frames_s32(), except outputs samples as 16-bit integer PCM rather than 32-bit.
 //
 // Note that this is lossy for streams where the bits per sample is larger than 16.
-drflac_uint64 drflac_read_s16(drflac* pFlac, drflac_uint64 samplesToRead, drflac_int16* pBufferOut);
+drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut);
 
-// Same as drflac_read_s32(), except outputs samples as 32-bit floating-point PCM.
-//
-// pFlac         [in]            The decoder.
-// samplesToRead [in]            The number of samples to read.
-// pBufferOut    [out, optional] A pointer to the buffer that will receive the decoded samples.
-//
-// Returns the number of samples actually read.
-//
-// pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of samples
-// seeked.
+// Same as drflac_read_pcm_frames_s32(), except outputs samples as 32-bit floating-point PCM.
 //
 // Note that this should be considered lossy due to the nature of floating point numbers not being able to exactly
 // represent every possible number.
-drflac_uint64 drflac_read_f32(drflac* pFlac, drflac_uint64 samplesToRead, float* pBufferOut);
+drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut);
 
-// Seeks to the sample at the given index.
+// Seeks to the PCM frame at the given index.
 //
-// pFlac       [in] The decoder.
-// sampleIndex [in] The index of the sample to seek to. See notes below.
+// pFlac         [in] The decoder.
+// pcmFrameIndex [in] The index of the PCM frame to seek to.
 //
 // Returns DRFLAC_TRUE if successful; DRFLAC_FALSE otherwise.
-//
-// The sample index is based on interleaving. In a stereo stream, for example, the sample at index 0 is the first sample
-// in the left channel; the sample at index 1 is the first sample on the right channel, and so on.
-//
-// When seeking, you will likely want to ensure it's rounded to a multiple of the channel count. You can do this with
-// something like drflac_seek_to_sample(pFlac, (mySampleIndex + (mySampleIndex % pFlac->channels)))
-drflac_bool32 drflac_seek_to_sample(drflac* pFlac, drflac_uint64 sampleIndex);
+drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex);
 
 
 
@@ -680,33 +667,33 @@ drflac* drflac_open_memory_with_metadata(const void* data, size_t dataSize, drfl
 // read samples into a dynamically sized buffer on the heap until no samples are left.
 //
 // Do not call this function on a broadcast type of stream (like internet radio streams and whatnot).
-drflac_int32* drflac_open_and_decode_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
-// Same as drflac_open_and_decode_s32(), except returns signed 16-bit integer samples.
-drflac_int16* drflac_open_and_decode_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples.
+drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
-// Same as drflac_open_and_decode_s32(), except returns 32-bit floating-point samples.
-float* drflac_open_and_decode_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples.
+float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
 #ifndef DR_FLAC_NO_STDIO
-// Same as drflac_open_and_decode_s32() except opens the decoder from a file.
-drflac_int32* drflac_open_and_decode_file_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a file.
+drflac_int32* drflac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
-// Same as drflac_open_and_decode_file_s32(), except returns signed 16-bit integer samples.
-drflac_int16* drflac_open_and_decode_file_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_file_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples.
+drflac_int16* drflac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
-// Same as drflac_open_and_decode_file_f32(), except returns 32-bit floating-point samples.
-float* drflac_open_and_decode_file_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_file_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples.
+float* drflac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 #endif
 
-// Same as drflac_open_and_decode_s32() except opens the decoder from a block of memory.
-drflac_int32* drflac_open_and_decode_memory_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a block of memory.
+drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
-// Same as drflac_open_and_decode_memory_s32(), except returns signed 16-bit integer samples.
-drflac_int16* drflac_open_and_decode_memory_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples.
+drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
-// Same as drflac_open_and_decode_memory_s32(), except returns 32-bit floating-point samples.
-float* drflac_open_and_decode_memory_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);
+// Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples.
+float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount);
 
 // Frees memory that was allocated internally by dr_flac.
 void drflac_free(void* p);
@@ -764,6 +751,22 @@ void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter,
 drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack);
 
 
+//// Deprecated APIs (sample based). Prefer the PCM frame based replacement named on each line. ////
+DRFLAC_DEPRECATED drflac_uint64 drflac_read_s32(drflac* pFlac, drflac_uint64 samplesToRead, drflac_int32* pBufferOut);    // Use drflac_read_pcm_frames_s32() instead.
+DRFLAC_DEPRECATED drflac_uint64 drflac_read_s16(drflac* pFlac, drflac_uint64 samplesToRead, drflac_int16* pBufferOut);    // Use drflac_read_pcm_frames_s16() instead.
+DRFLAC_DEPRECATED drflac_uint64 drflac_read_f32(drflac* pFlac, drflac_uint64 samplesToRead, float* pBufferOut);           // Use drflac_read_pcm_frames_f32() instead.
+DRFLAC_DEPRECATED drflac_bool32 drflac_seek_to_sample(drflac* pFlac, drflac_uint64 sampleIndex);                          // Use drflac_seek_to_pcm_frame() instead.
+DRFLAC_DEPRECATED drflac_int32* drflac_open_and_decode_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount); // Use drflac_open_and_read_pcm_frames_s32().
+DRFLAC_DEPRECATED drflac_int16* drflac_open_and_decode_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount); // Use drflac_open_and_read_pcm_frames_s16().
+DRFLAC_DEPRECATED float* drflac_open_and_decode_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);        // Use drflac_open_and_read_pcm_frames_f32().
+#ifndef DR_FLAC_NO_STDIO   // File based variants need stdio, same as the drflac_open_file_and_read_pcm_frames_*() declarations.
+DRFLAC_DEPRECATED drflac_int32* drflac_open_and_decode_file_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);                                         // Use drflac_open_file_and_read_pcm_frames_s32().
+DRFLAC_DEPRECATED drflac_int16* drflac_open_and_decode_file_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);                                         // Use drflac_open_file_and_read_pcm_frames_s16().
+DRFLAC_DEPRECATED float* drflac_open_and_decode_file_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);                                                // Use drflac_open_file_and_read_pcm_frames_f32().
+#endif
+DRFLAC_DEPRECATED drflac_int32* drflac_open_and_decode_memory_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);                          // Use drflac_open_memory_and_read_pcm_frames_s32().
+DRFLAC_DEPRECATED drflac_int16* drflac_open_and_decode_memory_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);                          // Use drflac_open_memory_and_read_pcm_frames_s16().
+DRFLAC_DEPRECATED float* drflac_open_and_decode_memory_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount);                                 // Use drflac_open_memory_and_read_pcm_frames_f32().
 
 #ifdef __cplusplus
 }
@@ -790,6 +791,16 @@ drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter,
 #include <stdlib.h>
 #include <string.h>
 
+// Inlining qualifier for internal helpers. Selects the compiler's forced-inline
+// spelling where one is known and falls back to plain `inline` otherwise.
+#if defined(_MSC_VER)
+#define DRFLAC_INLINE __forceinline
+#elif defined(__GNUC__)
+#define DRFLAC_INLINE inline __attribute__((always_inline))
+#else
+#define DRFLAC_INLINE inline
+#endif
+
 // CPU architecture.
 #if defined(__x86_64__) || defined(_M_X64)
     #define DRFLAC_X64
@@ -799,6 +810,62 @@ drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter,
     #define DRFLAC_ARM
 #endif
 
+// Intrinsics Support
+#if !defined(DR_FLAC_NO_SIMD)
+    #if defined(DRFLAC_X64) || defined(DRFLAC_X86)
+        #if defined(_MSC_VER) && !defined(__clang__)
+            // MSVC.
+            #if !defined(DRFLAC_NO_SSE2)   // Assume all MSVC compilers support SSE2 intrinsics.
+                #define DRFLAC_SUPPORT_SSE2
+            #endif
+            #if _MSC_VER >= 1600 && !defined(DRFLAC_NO_SSE41)   // 2010
+                #define DRFLAC_SUPPORT_SSE41
+            #endif
+        #else
+            // Assume GNUC-style.
+            #if defined(__SSE2__) && !defined(DRFLAC_NO_SSE2)
+                #define DRFLAC_SUPPORT_SSE2
+            #endif
+            #if defined(__SSE4_1__) && !defined(DRFLAC_NO_SSE41)
+                #define DRFLAC_SUPPORT_SSE41
+            #endif
+        #endif
+
+        // If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include.
+        #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
+            #if !defined(DRFLAC_SUPPORT_SSE2) && !defined(DRFLAC_NO_SSE2) && __has_include(<emmintrin.h>)
+                #define DRFLAC_SUPPORT_SSE2
+            #endif
+            #if !defined(DRFLAC_SUPPORT_SSE41) && !defined(DRFLAC_NO_SSE41) && __has_include(<smmintrin.h>)
+                #define DRFLAC_SUPPORT_SSE41
+            #endif
+        #endif
+
+        #if defined(DRFLAC_SUPPORT_SSE41)
+            #include <smmintrin.h>
+        #elif defined(DRFLAC_SUPPORT_SSE2)
+            #include <emmintrin.h>
+        #endif
+    #endif
+
+    #if defined(DRFLAC_ARM)
+        #if !defined(DRFLAC_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
+            #define DRFLAC_SUPPORT_NEON
+        #endif
+
+        // Fall back to looking for the #include file.
+        #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
+            #if !defined(DRFLAC_SUPPORT_NEON) && !defined(DRFLAC_NO_NEON) && __has_include(<arm_neon.h>)
+                #define DRFLAC_SUPPORT_NEON
+            #endif
+        #endif
+
+        #if defined(DRFLAC_SUPPORT_NEON)
+            #include <arm_neon.h>
+        #endif
+    #endif
+#endif
+
 // Compile-time CPU feature support.
 #if !defined(DR_FLAC_NO_SIMD) && (defined(DRFLAC_X86) || defined(DRFLAC_X64))
     #if defined(_MSC_VER) && !defined(__clang__)
@@ -841,6 +908,56 @@ drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter,
     #define DRFLAC_NO_CPUID
 #endif
 
+// Returns DRFLAC_TRUE when the SSE2 code paths may be used, DRFLAC_FALSE otherwise. Requires compiler support for the
+// intrinsics (DRFLAC_SUPPORT_SSE2) plus either a compile-time guarantee from the target or a positive CPUID query.
+static DRFLAC_INLINE drflac_bool32 drflac_has_sse2()
+{
+#if defined(DRFLAC_SUPPORT_SSE2)
+    #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE2)
+        #if defined(DRFLAC_X64)
+            return DRFLAC_TRUE;    // 64-bit targets always support SSE2.
+        #elif (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
+            return DRFLAC_TRUE;    // If the compiler is allowed to freely generate SSE2 code we can assume support.
+        #elif defined(DRFLAC_NO_CPUID)
+            return DRFLAC_FALSE;   // No way to query the CPU at runtime.
+        #else
+            int info[4];
+            drflac__cpuid(info, 1);                // Same helper drflac__init_cpu_caps() uses (was misspelled drflac_cpuid).
+            return (info[3] & (1 << 26)) != 0;     // SSE2 flag; assumes info[3] receives EDX - confirm against drflac__cpuid().
+        #endif
+    #else
+        return DRFLAC_FALSE;       // SSE2 is only supported on x86 and x64 architectures.
+    #endif
+#else
+    return DRFLAC_FALSE;           // No compiler support.
+#endif
+}
+
+// Returns DRFLAC_TRUE when the SSE4.1 code paths may be used, DRFLAC_FALSE otherwise.
+// Unlike SSE2, SSE4.1 is not part of the x86-64 baseline and is not implied by _M_IX86_FP == 2 (which only means SSE2),
+// so unless the compiler is already targeting SSE4.1 (or AVX, which includes it) support is queried with CPUID.
+static DRFLAC_INLINE drflac_bool32 drflac_has_sse41()
+{
+#if defined(DRFLAC_SUPPORT_SSE41)
+    #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE41)
+        #if defined(__SSE4_1__) || defined(__AVX__)
+            return DRFLAC_TRUE;    // If the compiler is allowed to freely generate SSE4.1 code we can assume support.
+        #elif defined(DRFLAC_NO_CPUID)
+            return DRFLAC_FALSE;   // No way to query the CPU at runtime.
+        #else
+            // Runtime check. drflac__cpuid() is the helper drflac__init_cpu_caps() uses (was misspelled drflac_cpuid).
+            int info[4];
+            drflac__cpuid(info, 1);
+            return (info[2] & (1 << 19)) != 0;     // SSE4.1 flag; assumes info[2] receives ECX - confirm against drflac__cpuid().
+        #endif
+    #else
+        return DRFLAC_FALSE;       // SSE41 is only supported on x86 and x64 architectures.
+    #endif
+#else
+    return DRFLAC_FALSE;           // No compiler support.
+#endif
+}
+
 
 #if defined(_MSC_VER) && _MSC_VER >= 1500 && (defined(DRFLAC_X86) || defined(DRFLAC_X64))
     #define DRFLAC_HAS_LZCNT_INTRINSIC
@@ -900,16 +1017,6 @@ drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter,
 
 #define DRFLAC_MAX_SIMD_VECTOR_SIZE                     64  // 64 for AVX-512 in the future.
 
-#ifdef _MSC_VER
-#define DRFLAC_INLINE __forceinline
-#else
-#ifdef __GNUC__
-#define DRFLAC_INLINE inline __attribute__((always_inline))
-#else
-#define DRFLAC_INLINE inline
-#endif
-#endif
-
 typedef drflac_int32 drflac_result;
 #define DRFLAC_SUCCESS                                  0
 #define DRFLAC_ERROR                                    -1  // A generic error.
@@ -931,6 +1038,10 @@ typedef drflac_int32 drflac_result;
 #define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
 #define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
 
+// The number of leading samples reserved for each sub-frame. This is required because the SSE pipeline will occasionally
+// reference excess prior samples.
+#define DRFLAC_LEADING_SAMPLES                          32
+
 
 #define drflac_align(x, a)                              ((((x) + (a) - 1) / (a)) * (a))
 #define drflac_assert                                   DRFLAC_ASSERT
@@ -941,18 +1052,21 @@ typedef drflac_int32 drflac_result;
 // CPU caps.
 static drflac_bool32 drflac__gIsLZCNTSupported = DRFLAC_FALSE;
 #ifndef DRFLAC_NO_CPUID
-static drflac_bool32 drflac__gIsSSE42Supported = DRFLAC_FALSE;
+static drflac_bool32 drflac__gIsSSE2Supported  = DRFLAC_FALSE;    // Assigned by drflac__init_cpu_caps().
+static drflac_bool32 drflac__gIsSSE41Supported = DRFLAC_FALSE;    // Assigned by drflac__init_cpu_caps().
 static void drflac__init_cpu_caps()
 {
     int info[4] = {0};
 
     // LZCNT
     drflac__cpuid(info, 0x80000001);
-    drflac__gIsLZCNTSupported = (info[2] & (1 <<  5)) != 0;
+    drflac__gIsLZCNTSupported = (info[2] & (1 << 5)) != 0;    // Bit 5 of info[2] for CPUID function 0x80000001.
 
-    // SSE4.2
-    drflac__cpuid(info, 1);
-    drflac__gIsSSE42Supported = (info[2] & (1 << 19)) != 0;
+    // SSE2. Cached result of drflac_has_sse2().
+    drflac__gIsSSE2Supported = drflac_has_sse2();
+
+    // SSE4.1. Cached result of drflac_has_sse41(). The flag removed above was named SSE4.2 but used the same bit 19 test.
+    drflac__gIsSSE41Supported = drflac_has_sse41();
 }
 #endif
 
@@ -1761,6 +1875,10 @@ static drflac_bool32 drflac__find_and_seek_to_next_sync_code(drflac_bs* bs)
 
 static DRFLAC_INLINE drflac_uint32 drflac__clz_software(drflac_cache_t x)
 {
+    if (x == 0) {
+        return sizeof(x)*8;
+    }
+
     static drflac_uint32 clz_table_4[] = {
         0,
         4,
@@ -1808,6 +1926,9 @@ static DRFLAC_INLINE drflac_uint32 drflac__clz_lzcnt(drflac_cache_t x)
     #endif
 #else
     #if defined(__GNUC__) || defined(__clang__)
+        if (x == 0) {
+            return sizeof(x)*8;
+        }
         #ifdef DRFLAC_64BIT
             return (drflac_uint32)__builtin_clzll((unsigned long long)x);
         #else
@@ -1826,6 +1947,10 @@ static DRFLAC_INLINE drflac_uint32 drflac__clz_lzcnt(drflac_cache_t x)
 
 static DRFLAC_INLINE drflac_uint32 drflac__clz_msvc(drflac_cache_t x)
 {
+    if (x == 0) {
+        return sizeof(x)*8;
+    }
+
     drflac_uint32 n;
 #ifdef DRFLAC_64BIT
     _BitScanReverse64((unsigned long*)&n, x);
@@ -1838,7 +1963,6 @@ static DRFLAC_INLINE drflac_uint32 drflac__clz_msvc(drflac_cache_t x)
 
 static DRFLAC_INLINE drflac_uint32 drflac__clz(drflac_cache_t x)
 {
-    // This function assumes at least one bit is set. Checking for 0 needs to be done at a higher level, outside this function.
 #ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
     if (drflac__is_lzcnt_supported()) {
         return drflac__clz_lzcnt(x);
@@ -2204,6 +2328,435 @@ static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_64(drflac_uint32
     return (drflac_int32)(prediction >> shift);
 }
 
+static DRFLAC_INLINE void drflac__calculate_prediction_64_x4(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, const drflac_uint32 riceParamParts[4], drflac_int32* pDecodedSamples)
+{
+    drflac_assert(order <= 32);
+
+    drflac_int64 prediction0 = 0;
+    drflac_int64 prediction1 = 0;
+    drflac_int64 prediction2 = 0;
+    drflac_int64 prediction3 = 0;
+
+    switch (order)
+    {
+    case 32:
+        prediction0 += coefficients[31] * (drflac_int64)pDecodedSamples[-32];
+        prediction1 += coefficients[31] * (drflac_int64)pDecodedSamples[-31];
+        prediction2 += coefficients[31] * (drflac_int64)pDecodedSamples[-30];
+        prediction3 += coefficients[31] * (drflac_int64)pDecodedSamples[-29];
+    case 31:
+        prediction0 += coefficients[30] * (drflac_int64)pDecodedSamples[-31];
+        prediction1 += coefficients[30] * (drflac_int64)pDecodedSamples[-30];
+        prediction2 += coefficients[30] * (drflac_int64)pDecodedSamples[-29];
+        prediction3 += coefficients[30] * (drflac_int64)pDecodedSamples[-28];
+    case 30:
+        prediction0 += coefficients[29] * (drflac_int64)pDecodedSamples[-30];
+        prediction1 += coefficients[29] * (drflac_int64)pDecodedSamples[-29];
+        prediction2 += coefficients[29] * (drflac_int64)pDecodedSamples[-28];
+        prediction3 += coefficients[29] * (drflac_int64)pDecodedSamples[-27];
+    case 29:
+        prediction0 += coefficients[28] * (drflac_int64)pDecodedSamples[-29];
+        prediction1 += coefficients[28] * (drflac_int64)pDecodedSamples[-28];
+        prediction2 += coefficients[28] * (drflac_int64)pDecodedSamples[-27];
+        prediction3 += coefficients[28] * (drflac_int64)pDecodedSamples[-26];
+    case 28:
+        prediction0 += coefficients[27] * (drflac_int64)pDecodedSamples[-28];
+        prediction1 += coefficients[27] * (drflac_int64)pDecodedSamples[-27];
+        prediction2 += coefficients[27] * (drflac_int64)pDecodedSamples[-26];
+        prediction3 += coefficients[27] * (drflac_int64)pDecodedSamples[-25];
+    case 27:
+        prediction0 += coefficients[26] * (drflac_int64)pDecodedSamples[-27];
+        prediction1 += coefficients[26] * (drflac_int64)pDecodedSamples[-26];
+        prediction2 += coefficients[26] * (drflac_int64)pDecodedSamples[-25];
+        prediction3 += coefficients[26] * (drflac_int64)pDecodedSamples[-24];
+    case 26:
+        prediction0 += coefficients[25] * (drflac_int64)pDecodedSamples[-26];
+        prediction1 += coefficients[25] * (drflac_int64)pDecodedSamples[-25];
+        prediction2 += coefficients[25] * (drflac_int64)pDecodedSamples[-24];
+        prediction3 += coefficients[25] * (drflac_int64)pDecodedSamples[-23];
+    case 25:
+        prediction0 += coefficients[24] * (drflac_int64)pDecodedSamples[-25];
+        prediction1 += coefficients[24] * (drflac_int64)pDecodedSamples[-24];
+        prediction2 += coefficients[24] * (drflac_int64)pDecodedSamples[-23];
+        prediction3 += coefficients[24] * (drflac_int64)pDecodedSamples[-22];
+    case 24:
+        prediction0 += coefficients[23] * (drflac_int64)pDecodedSamples[-24];
+        prediction1 += coefficients[23] * (drflac_int64)pDecodedSamples[-23];
+        prediction2 += coefficients[23] * (drflac_int64)pDecodedSamples[-22];
+        prediction3 += coefficients[23] * (drflac_int64)pDecodedSamples[-21];
+    case 23:
+        prediction0 += coefficients[22] * (drflac_int64)pDecodedSamples[-23];
+        prediction1 += coefficients[22] * (drflac_int64)pDecodedSamples[-22];
+        prediction2 += coefficients[22] * (drflac_int64)pDecodedSamples[-21];
+        prediction3 += coefficients[22] * (drflac_int64)pDecodedSamples[-20];
+    case 22:
+        prediction0 += coefficients[21] * (drflac_int64)pDecodedSamples[-22];
+        prediction1 += coefficients[21] * (drflac_int64)pDecodedSamples[-21];
+        prediction2 += coefficients[21] * (drflac_int64)pDecodedSamples[-20];
+        prediction3 += coefficients[21] * (drflac_int64)pDecodedSamples[-19];
+    case 21:
+        prediction0 += coefficients[20] * (drflac_int64)pDecodedSamples[-21];
+        prediction1 += coefficients[20] * (drflac_int64)pDecodedSamples[-20];
+        prediction2 += coefficients[20] * (drflac_int64)pDecodedSamples[-19];
+        prediction3 += coefficients[20] * (drflac_int64)pDecodedSamples[-18];
+    case 20:
+        prediction0 += coefficients[19] * (drflac_int64)pDecodedSamples[-20];
+        prediction1 += coefficients[19] * (drflac_int64)pDecodedSamples[-19];
+        prediction2 += coefficients[19] * (drflac_int64)pDecodedSamples[-18];
+        prediction3 += coefficients[19] * (drflac_int64)pDecodedSamples[-17];
+    case 19:
+        prediction0 += coefficients[18] * (drflac_int64)pDecodedSamples[-19];
+        prediction1 += coefficients[18] * (drflac_int64)pDecodedSamples[-18];
+        prediction2 += coefficients[18] * (drflac_int64)pDecodedSamples[-17];
+        prediction3 += coefficients[18] * (drflac_int64)pDecodedSamples[-16];
+    case 18:
+        prediction0 += coefficients[17] * (drflac_int64)pDecodedSamples[-18];
+        prediction1 += coefficients[17] * (drflac_int64)pDecodedSamples[-17];
+        prediction2 += coefficients[17] * (drflac_int64)pDecodedSamples[-16];
+        prediction3 += coefficients[17] * (drflac_int64)pDecodedSamples[-15];
+    case 17:
+        prediction0 += coefficients[16] * (drflac_int64)pDecodedSamples[-17];
+        prediction1 += coefficients[16] * (drflac_int64)pDecodedSamples[-16];
+        prediction2 += coefficients[16] * (drflac_int64)pDecodedSamples[-15];
+        prediction3 += coefficients[16] * (drflac_int64)pDecodedSamples[-14];
+
+    case 16:
+        prediction0 += coefficients[15] * (drflac_int64)pDecodedSamples[-16];
+        prediction1 += coefficients[15] * (drflac_int64)pDecodedSamples[-15];
+        prediction2 += coefficients[15] * (drflac_int64)pDecodedSamples[-14];
+        prediction3 += coefficients[15] * (drflac_int64)pDecodedSamples[-13];
+    case 15:
+        prediction0 += coefficients[14] * (drflac_int64)pDecodedSamples[-15];
+        prediction1 += coefficients[14] * (drflac_int64)pDecodedSamples[-14];
+        prediction2 += coefficients[14] * (drflac_int64)pDecodedSamples[-13];
+        prediction3 += coefficients[14] * (drflac_int64)pDecodedSamples[-12];
+    case 14:
+        prediction0 += coefficients[13] * (drflac_int64)pDecodedSamples[-14];
+        prediction1 += coefficients[13] * (drflac_int64)pDecodedSamples[-13];
+        prediction2 += coefficients[13] * (drflac_int64)pDecodedSamples[-12];
+        prediction3 += coefficients[13] * (drflac_int64)pDecodedSamples[-11];
+    case 13:
+        prediction0 += coefficients[12] * (drflac_int64)pDecodedSamples[-13];
+        prediction1 += coefficients[12] * (drflac_int64)pDecodedSamples[-12];
+        prediction2 += coefficients[12] * (drflac_int64)pDecodedSamples[-11];
+        prediction3 += coefficients[12] * (drflac_int64)pDecodedSamples[-10];
+    case 12:
+        prediction0 += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
+        prediction1 += coefficients[11] * (drflac_int64)pDecodedSamples[-11];
+        prediction2 += coefficients[11] * (drflac_int64)pDecodedSamples[-10];
+        prediction3 += coefficients[11] * (drflac_int64)pDecodedSamples[- 9];
+    case 11:
+        prediction0 += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
+        prediction1 += coefficients[10] * (drflac_int64)pDecodedSamples[-10];
+        prediction2 += coefficients[10] * (drflac_int64)pDecodedSamples[- 9];
+        prediction3 += coefficients[10] * (drflac_int64)pDecodedSamples[- 8];
+    case 10:
+        prediction0 += coefficients[9] * (drflac_int64)pDecodedSamples[-10];
+        prediction1 += coefficients[9] * (drflac_int64)pDecodedSamples[- 9];
+        prediction2 += coefficients[9] * (drflac_int64)pDecodedSamples[- 8];
+        prediction3 += coefficients[9] * (drflac_int64)pDecodedSamples[- 7];
+    case  9:
+        prediction0 += coefficients[8] * (drflac_int64)pDecodedSamples[- 9];
+        prediction1 += coefficients[8] * (drflac_int64)pDecodedSamples[- 8];
+        prediction2 += coefficients[8] * (drflac_int64)pDecodedSamples[- 7];
+        prediction3 += coefficients[8] * (drflac_int64)pDecodedSamples[- 6];
+    case  8:
+        prediction0 += coefficients[7] * (drflac_int64)pDecodedSamples[- 8];
+        prediction1 += coefficients[7] * (drflac_int64)pDecodedSamples[- 7];
+        prediction2 += coefficients[7] * (drflac_int64)pDecodedSamples[- 6];
+        prediction3 += coefficients[7] * (drflac_int64)pDecodedSamples[- 5];
+    case  7:
+        prediction0 += coefficients[6] * (drflac_int64)pDecodedSamples[- 7];
+        prediction1 += coefficients[6] * (drflac_int64)pDecodedSamples[- 6];
+        prediction2 += coefficients[6] * (drflac_int64)pDecodedSamples[- 5];
+        prediction3 += coefficients[6] * (drflac_int64)pDecodedSamples[- 4];
+    case  6:
+        prediction0 += coefficients[5] * (drflac_int64)pDecodedSamples[- 6];
+        prediction1 += coefficients[5] * (drflac_int64)pDecodedSamples[- 5];
+        prediction2 += coefficients[5] * (drflac_int64)pDecodedSamples[- 4];
+        prediction3 += coefficients[5] * (drflac_int64)pDecodedSamples[- 3];
+    case  5:
+        prediction0 += coefficients[4] * (drflac_int64)pDecodedSamples[- 5];
+        prediction1 += coefficients[4] * (drflac_int64)pDecodedSamples[- 4];
+        prediction2 += coefficients[4] * (drflac_int64)pDecodedSamples[- 3];
+        prediction3 += coefficients[4] * (drflac_int64)pDecodedSamples[- 2];
+    case  4:
+        prediction0 += coefficients[3] * (drflac_int64)pDecodedSamples[- 4];
+        prediction1 += coefficients[3] * (drflac_int64)pDecodedSamples[- 3];
+        prediction2 += coefficients[3] * (drflac_int64)pDecodedSamples[- 2];
+        prediction3 += coefficients[3] * (drflac_int64)pDecodedSamples[- 1];
+        order = 3;
+    }
+
+    switch (order)
+    {
+    case 3: prediction0 += coefficients[ 2] * (drflac_int64)pDecodedSamples[- 3];
+    case 2: prediction0 += coefficients[ 1] * (drflac_int64)pDecodedSamples[- 2];
+    case 1: prediction0 += coefficients[ 0] * (drflac_int64)pDecodedSamples[- 1];
+    }
+    pDecodedSamples[0] = riceParamParts[0] + (drflac_int32)(prediction0 >> shift);
+
+    switch (order)
+    {
+    case 3: prediction1 += coefficients[ 2] * (drflac_int64)pDecodedSamples[- 2];
+    case 2: prediction1 += coefficients[ 1] * (drflac_int64)pDecodedSamples[- 1];
+    case 1: prediction1 += coefficients[ 0] * (drflac_int64)pDecodedSamples[  0];
+    }
+    pDecodedSamples[1] = riceParamParts[1] + (drflac_int32)(prediction1 >> shift);
+
+    switch (order)
+    {
+    case 3: prediction2 += coefficients[ 2] * (drflac_int64)pDecodedSamples[- 1];
+    case 2: prediction2 += coefficients[ 1] * (drflac_int64)pDecodedSamples[  0];
+    case 1: prediction2 += coefficients[ 0] * (drflac_int64)pDecodedSamples[  1];
+    }
+    pDecodedSamples[2] = riceParamParts[2] + (drflac_int32)(prediction2 >> shift);
+
+    switch (order)
+    {
+    case 3: prediction3 += coefficients[ 2] * (drflac_int64)pDecodedSamples[  0];
+    case 2: prediction3 += coefficients[ 1] * (drflac_int64)pDecodedSamples[  1];
+    case 1: prediction3 += coefficients[ 0] * (drflac_int64)pDecodedSamples[  2];
+    }
+    pDecodedSamples[3] = riceParamParts[3] + (drflac_int32)(prediction3 >> shift);
+}
+
+#if defined(DRFLAC_SUPPORT_SSE41)
+static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_64__sse41(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
+{
+    drflac_assert(order <= 32);
+    // Sums coefficient * sample products two taps at a time in the two 64-bit lanes of "prediction" (_mm_mul_epi32 multiplies the low 32 bits of each 64-bit lane as signed values), then returns the lane total shifted right by "shift" and truncated to 32 bits.
+    __m128i prediction = _mm_setzero_si128();
+    // Case labels are paired and fall through on purpose. NOTE(review): for an odd "order" the first statement executed also reads coefficients[order] and pDecodedSamples[-(order + 1)]; presumably the caller guarantees that coefficient is zero and that sample is readable - confirm.
+    switch (order)
+    {
+    case 32:
+    case 31: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[31], 0, coefficients[30]), _mm_set_epi32(0, pDecodedSamples[-32], 0, pDecodedSamples[-31])));
+    case 30:
+    case 29: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[29], 0, coefficients[28]), _mm_set_epi32(0, pDecodedSamples[-30], 0, pDecodedSamples[-29])));
+    case 28:
+    case 27: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[27], 0, coefficients[26]), _mm_set_epi32(0, pDecodedSamples[-28], 0, pDecodedSamples[-27])));
+    case 26:
+    case 25: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[25], 0, coefficients[24]), _mm_set_epi32(0, pDecodedSamples[-26], 0, pDecodedSamples[-25])));
+    case 24:
+    case 23: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[23], 0, coefficients[22]), _mm_set_epi32(0, pDecodedSamples[-24], 0, pDecodedSamples[-23])));
+    case 22:
+    case 21: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[21], 0, coefficients[20]), _mm_set_epi32(0, pDecodedSamples[-22], 0, pDecodedSamples[-21])));
+    case 20:
+    case 19: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[19], 0, coefficients[18]), _mm_set_epi32(0, pDecodedSamples[-20], 0, pDecodedSamples[-19])));
+    case 18:
+    case 17: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[17], 0, coefficients[16]), _mm_set_epi32(0, pDecodedSamples[-18], 0, pDecodedSamples[-17])));
+    case 16:
+    case 15: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[15], 0, coefficients[14]), _mm_set_epi32(0, pDecodedSamples[-16], 0, pDecodedSamples[-15])));
+    case 14:
+    case 13: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[13], 0, coefficients[12]), _mm_set_epi32(0, pDecodedSamples[-14], 0, pDecodedSamples[-13])));
+    case 12:
+    case 11: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[11], 0, coefficients[10]), _mm_set_epi32(0, pDecodedSamples[-12], 0, pDecodedSamples[-11])));
+    case 10:
+    case  9: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 9], 0, coefficients[ 8]), _mm_set_epi32(0, pDecodedSamples[-10], 0, pDecodedSamples[- 9])));
+    case  8:
+    case  7: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 7], 0, coefficients[ 6]), _mm_set_epi32(0, pDecodedSamples[- 8], 0, pDecodedSamples[- 7])));
+    case  6:
+    case  5: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 5], 0, coefficients[ 4]), _mm_set_epi32(0, pDecodedSamples[- 6], 0, pDecodedSamples[- 5])));
+    case  4:
+    case  3: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 3], 0, coefficients[ 2]), _mm_set_epi32(0, pDecodedSamples[- 4], 0, pDecodedSamples[- 3])));
+    case  2:
+    case  1: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 1], 0, coefficients[ 0]), _mm_set_epi32(0, pDecodedSamples[- 2], 0, pDecodedSamples[- 1])));
+    }
+    // The lanes are read back through a drflac_uint64 pointer, so the addition and the shift are done on unsigned 64-bit values before the cast to drflac_int32.
+    return (drflac_int32)((
+        ((drflac_uint64*)&prediction)[0] +
+        ((drflac_uint64*)&prediction)[1]) >> shift);
+}
+
+static DRFLAC_INLINE void drflac__calculate_prediction_64_x2__sse41(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, const drflac_uint32 riceParamParts[4], drflac_int32* pDecodedSamples)
+{
+    drflac_assert(order <= 32);
+    // Decodes two consecutive samples: 64-bit lane 0 accumulates the prediction for pDecodedSamples[0] and lane 1 the prediction for pDecodedSamples[1].
+    __m128i prediction = _mm_setzero_si128();
+    drflac_int64 predictions[2] = {0, 0};
+    // The SIMD switch covers coefficients[1] and up. The coefficients[0] tap is applied in scalar code further down because, for the second sample, it multiplies pDecodedSamples[0], which is only written after the first prediction is finished.
+    switch (order)
+    {
+    case 32: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[31], 0, coefficients[31]), _mm_set_epi32(0, pDecodedSamples[-31], 0, pDecodedSamples[-32])));
+    case 31: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[30], 0, coefficients[30]), _mm_set_epi32(0, pDecodedSamples[-30], 0, pDecodedSamples[-31])));
+    case 30: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[29], 0, coefficients[29]), _mm_set_epi32(0, pDecodedSamples[-29], 0, pDecodedSamples[-30])));
+    case 29: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[28], 0, coefficients[28]), _mm_set_epi32(0, pDecodedSamples[-28], 0, pDecodedSamples[-29])));
+    case 28: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[27], 0, coefficients[27]), _mm_set_epi32(0, pDecodedSamples[-27], 0, pDecodedSamples[-28])));
+    case 27: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[26], 0, coefficients[26]), _mm_set_epi32(0, pDecodedSamples[-26], 0, pDecodedSamples[-27])));
+    case 26: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[25], 0, coefficients[25]), _mm_set_epi32(0, pDecodedSamples[-25], 0, pDecodedSamples[-26])));
+    case 25: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[24], 0, coefficients[24]), _mm_set_epi32(0, pDecodedSamples[-24], 0, pDecodedSamples[-25])));
+    case 24: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[23], 0, coefficients[23]), _mm_set_epi32(0, pDecodedSamples[-23], 0, pDecodedSamples[-24])));
+    case 23: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[22], 0, coefficients[22]), _mm_set_epi32(0, pDecodedSamples[-22], 0, pDecodedSamples[-23])));
+    case 22: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[21], 0, coefficients[21]), _mm_set_epi32(0, pDecodedSamples[-21], 0, pDecodedSamples[-22])));
+    case 21: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[20], 0, coefficients[20]), _mm_set_epi32(0, pDecodedSamples[-20], 0, pDecodedSamples[-21])));
+    case 20: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[19], 0, coefficients[19]), _mm_set_epi32(0, pDecodedSamples[-19], 0, pDecodedSamples[-20])));
+    case 19: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[18], 0, coefficients[18]), _mm_set_epi32(0, pDecodedSamples[-18], 0, pDecodedSamples[-19])));
+    case 18: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[17], 0, coefficients[17]), _mm_set_epi32(0, pDecodedSamples[-17], 0, pDecodedSamples[-18])));
+    case 17: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[16], 0, coefficients[16]), _mm_set_epi32(0, pDecodedSamples[-16], 0, pDecodedSamples[-17])));
+    case 16: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[15], 0, coefficients[15]), _mm_set_epi32(0, pDecodedSamples[-15], 0, pDecodedSamples[-16])));
+    case 15: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[14], 0, coefficients[14]), _mm_set_epi32(0, pDecodedSamples[-14], 0, pDecodedSamples[-15])));
+    case 14: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[13], 0, coefficients[13]), _mm_set_epi32(0, pDecodedSamples[-13], 0, pDecodedSamples[-14])));
+    case 13: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[12], 0, coefficients[12]), _mm_set_epi32(0, pDecodedSamples[-12], 0, pDecodedSamples[-13])));
+    case 12: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[11], 0, coefficients[11]), _mm_set_epi32(0, pDecodedSamples[-11], 0, pDecodedSamples[-12])));
+    case 11: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[10], 0, coefficients[10]), _mm_set_epi32(0, pDecodedSamples[-10], 0, pDecodedSamples[-11])));
+    case 10: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 9], 0, coefficients[ 9]), _mm_set_epi32(0, pDecodedSamples[- 9], 0, pDecodedSamples[-10])));
+    case  9: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 8], 0, coefficients[ 8]), _mm_set_epi32(0, pDecodedSamples[- 8], 0, pDecodedSamples[- 9])));
+    case  8: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 7], 0, coefficients[ 7]), _mm_set_epi32(0, pDecodedSamples[- 7], 0, pDecodedSamples[- 8])));
+    case  7: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 6], 0, coefficients[ 6]), _mm_set_epi32(0, pDecodedSamples[- 6], 0, pDecodedSamples[- 7])));
+    case  6: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 5], 0, coefficients[ 5]), _mm_set_epi32(0, pDecodedSamples[- 5], 0, pDecodedSamples[- 6])));
+    case  5: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 4], 0, coefficients[ 4]), _mm_set_epi32(0, pDecodedSamples[- 4], 0, pDecodedSamples[- 5])));
+    case  4: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 3], 0, coefficients[ 3]), _mm_set_epi32(0, pDecodedSamples[- 3], 0, pDecodedSamples[- 4])));
+    case  3: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 2], 0, coefficients[ 2]), _mm_set_epi32(0, pDecodedSamples[- 2], 0, pDecodedSamples[- 3])));
+    case  2: prediction = _mm_add_epi64(prediction, _mm_mul_epi32(_mm_set_epi32(0, coefficients[ 1], 0, coefficients[ 1]), _mm_set_epi32(0, pDecodedSamples[- 1], 0, pDecodedSamples[- 2])));
+        order = 1;   // Reached by fall-through from every order >= 2, so the scalar switches below still apply the coefficients[0] tap.
+    }
+    // Copy the two lane sums into scalars.
+    _mm_storeu_si128((__m128i*)predictions, prediction);
+    // First sample: add the coefficients[0] tap (skipped when order is 0), shift, and add riceParamParts[0].
+    switch (order)
+    {
+    case 1: predictions[0] += coefficients[ 0] * (drflac_int64)pDecodedSamples[- 1];
+    }
+    pDecodedSamples[0] = riceParamParts[0] + (drflac_int32)(predictions[0] >> shift);
+    // Second sample: same as above, but the coefficients[0] tap uses the value just stored in pDecodedSamples[0].
+    switch (order)
+    {
+    case 1: predictions[1] += coefficients[ 0] * (drflac_int64)pDecodedSamples[  0];
+    }
+    pDecodedSamples[1] = riceParamParts[1] + (drflac_int32)(predictions[1] >> shift);
+}
+
+
+static DRFLAC_INLINE __m128i drflac__mm_not_si128(__m128i a)
+{
+    return _mm_xor_si128(a, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));   // Bitwise NOT of a: comparing zero with zero yields an all-ones mask, and XOR with all-ones flips every bit.
+}
+
+static DRFLAC_INLINE __m128i drflac__mm_slide1_epi32(__m128i a, __m128i b)
+{
+    // Elements are named high-to-low: a = a3a2a1a0, b = b3b2b1b0 (index 0 is the lowest 32-bit element).
+    // Moves the low three elements of a up by one position and places the top element of b in the lowest position.
+    // Result = a2a1a0b3
+    __m128i b3a3b2a2 = _mm_unpackhi_epi32(a, b);
+    __m128i a2b3a2b3 = _mm_shuffle_epi32(b3a3b2a2, _MM_SHUFFLE(0, 3, 0, 3));
+    __m128i a1a2a0b3 = _mm_unpacklo_epi32(a2b3a2b3, a);
+    __m128i a2a1a0b3 = _mm_shuffle_epi32(a1a2a0b3, _MM_SHUFFLE(2, 3, 1, 0));
+    return a2a1a0b3;
+}
+
+static DRFLAC_INLINE __m128i drflac__mm_slide2_epi32(__m128i a, __m128i b)
+{
+    // Result = a1a0b3b2 (named high-to-low): the low two elements of a end up in the upper half and the high two elements of b in the lower half.
+    __m128i b1b0b3b2 = _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2));
+    __m128i a1b3a0b2 = _mm_unpacklo_epi32(b1b0b3b2, a);
+    __m128i a1a0b3b2 = _mm_shuffle_epi32(a1b3a0b2, _MM_SHUFFLE(3, 1, 2, 0));
+    return a1a0b3b2;
+}
+
+static DRFLAC_INLINE __m128i drflac__mm_slide3_epi32(__m128i a, __m128i b)
+{
+    // Result = a0b3b2b1 (named high-to-low): the lowest element of a ends up in the top position and the high three elements of b move down by one position.
+    __m128i b1a1b0a0 = _mm_unpacklo_epi32(a, b);
+    __m128i a0b1a0b1 = _mm_shuffle_epi32(b1a1b0a0, _MM_SHUFFLE(0, 3, 0, 3));
+    __m128i b3a0b2b1 = _mm_unpackhi_epi32(a0b1a0b1, b);
+    __m128i a0b3b2b1 = _mm_shuffle_epi32(b3a0b2b1, _MM_SHUFFLE(2, 3, 1, 0));
+    return a0b3b2b1;
+}
+
+static DRFLAC_INLINE void drflac__calculate_prediction_32_x4__sse41(drflac_uint32 order, drflac_int32 shift, const __m128i* coefficients128, const __m128i riceParamParts128, drflac_int32* pDecodedSamples)
+{
+    drflac_assert(order <= 32);
+    // Decodes four consecutive samples into pDecodedSamples[0..3] with 32-bit arithmetic. Element k of "prediction" accumulates the prediction for pDecodedSamples[k].
+    // I don't think this is as efficient as it could be. More work needs to be done on this.
+    if (order > 0) {
+        __m128i s_09_10_11_12 = _mm_loadu_si128((const __m128i*)(pDecodedSamples - 12));
+        __m128i s_05_06_07_08 = _mm_loadu_si128((const __m128i*)(pDecodedSamples -  8));
+        __m128i s_01_02_03_04 = _mm_loadu_si128((const __m128i*)(pDecodedSamples -  4));
+        // NOTE(review): the three loads above read pDecodedSamples[-12..-1] for every order > 0, even small ones; presumably the caller guarantees that much readable history - confirm.
+        __m128i prediction = _mm_setzero_si128();
+
+        // The idea with this switch is to do a single jump based on the value of "order". In my test library, "order" is never larger than 12, so
+        // I have decided to do a less optimal solution in the order > 12 case.
+        switch (order)
+        {
+        case 32: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[31], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 32))));
+        case 31: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[30], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 31))));
+        case 30: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[29], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 30))));
+        case 29: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[28], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 29))));
+        case 28: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[27], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 28))));
+        case 27: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[26], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 27))));
+        case 26: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[25], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 26))));
+        case 25: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[24], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 25))));
+        case 24: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[23], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 24))));
+        case 23: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[22], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 23))));
+        case 22: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[21], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 22))));
+        case 21: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[20], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 21))));
+        case 20: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[19], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 20))));
+        case 19: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[18], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 19))));
+        case 18: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[17], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 18))));
+        case 17: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[16], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 17))));
+        case 16: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[15], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 16))));
+        case 15: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[14], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 15))));
+        case 14: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[13], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 14))));
+        case 13: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[12], _mm_loadu_si128((const __m128i*)(pDecodedSamples - 13))));
+        // From here down the three vectors loaded at the top are reused (shifted with the slide helpers) instead of issuing further loads.
+        case 12: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[11], s_09_10_11_12));
+        case 11: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[10], drflac__mm_slide3_epi32(s_05_06_07_08, s_09_10_11_12)));
+        case 10: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 9], drflac__mm_slide2_epi32(s_05_06_07_08, s_09_10_11_12)));
+        case  9: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 8], drflac__mm_slide1_epi32(s_05_06_07_08, s_09_10_11_12)));
+        case  8: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 7], s_05_06_07_08));
+        case  7: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 6], drflac__mm_slide3_epi32(s_01_02_03_04, s_05_06_07_08)));
+        case  6: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 5], drflac__mm_slide2_epi32(s_01_02_03_04, s_05_06_07_08)));
+        case  5: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 4], drflac__mm_slide1_epi32(s_01_02_03_04, s_05_06_07_08)));
+        case  4: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 3], s_01_02_03_04)); order = 3;    // <-- Don't forget to set order to 3 here!
+        case  3: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 2], drflac__mm_slide3_epi32(_mm_setzero_si128(), s_01_02_03_04)));
+        case  2: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 1], drflac__mm_slide2_epi32(_mm_setzero_si128(), s_01_02_03_04)));
+        case  1: prediction = _mm_add_epi32(prediction, _mm_mullo_epi32(coefficients128[ 0], drflac__mm_slide1_epi32(_mm_setzero_si128(), s_01_02_03_04)));
+        }
+        // In the last three cases the sample vector is padded with zeros wherever a sample of this group (not decoded yet) would be needed; those terms are added in the scalar code below.
+        drflac_int32 predictions[4];
+        _mm_storeu_si128((__m128i*)predictions, prediction);
+
+        drflac_uint32 riceParamParts[4];
+        _mm_storeu_si128((__m128i*)riceParamParts, riceParamParts128);
+
+        predictions[0] = riceParamParts[0] + (predictions[0] >> shift);
+        // predictions[0] now holds the final first sample; add its contribution to the pending sums of the later samples.
+        switch (order)
+        {
+        case 3: predictions[3] += ((const drflac_int32*)&coefficients128[ 2])[0] * predictions[  0];
+        case 2: predictions[2] += ((const drflac_int32*)&coefficients128[ 1])[0] * predictions[  0];
+        case 1: predictions[1] += ((const drflac_int32*)&coefficients128[ 0])[0] * predictions[  0];
+        }
+        predictions[1] = riceParamParts[1] + (predictions[1] >> shift);
+        // Same for the second sample, which feeds the third and fourth.
+        switch (order)
+        {
+        case 3:
+        case 2: predictions[3] += ((const drflac_int32*)&coefficients128[ 1])[0] * predictions[  1];
+        case 1: predictions[2] += ((const drflac_int32*)&coefficients128[ 0])[0] * predictions[  1];
+        }
+        predictions[2] = riceParamParts[2] + (predictions[2] >> shift);
+        // The third sample feeds only the fourth.
+        switch (order)
+        {
+        case 3:
+        case 2:
+        case 1: predictions[3] += ((const drflac_int32*)&coefficients128[ 0])[0] * predictions[  2];
+        }
+        predictions[3] = riceParamParts[3] + (predictions[3] >> shift);
+
+        pDecodedSamples[0] = predictions[0];
+        pDecodedSamples[1] = predictions[1];
+        pDecodedSamples[2] = predictions[2];
+        pDecodedSamples[3] = predictions[3];
+    } else {
+        _mm_storeu_si128((__m128i*)pDecodedSamples, riceParamParts128);
+    }
+}
+#endif
+
 #if 0
 // Reference implementation for reading and decoding samples with residual. This is intentionally left unoptimized for the
 // sake of readability and should only be used as a reference.
@@ -2288,6 +2841,7 @@ static drflac_bool32 drflac__read_rice_parts__reference(drflac_bs* bs, drflac_ui
 }
 #endif
 
+#if 0
 static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
 {
     drflac_assert(riceParam > 0);   // <-- riceParam should never be 0. drflac__read_rice_parts__param_equals_zero() should be used instead for this case.
@@ -2344,74 +2898,335 @@ static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts(drflac_bs* bs, drflac
         bs->cache <<= bitCountLo;
     }
 
-    *pZeroCounterOut = zeroCounter;
-    *pRiceParamPartOut = riceParamPart;
+    pZeroCounterOut[0] = zeroCounter;
+    pRiceParamPartOut[0] = riceParamPart;
+
     return DRFLAC_TRUE;
 }
+#endif
 
-static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts__param_equals_zero(drflac_bs* bs, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
+static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts_x1(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
 {
-    drflac_cache_t riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(0);
+    drflac_uint32  riceParamPlus1 = riceParam + 1;
+    //drflac_cache_t riceParamPlus1Mask  = DRFLAC_CACHE_L1_SELECTION_MASK(riceParamPlus1);
+    drflac_uint32  riceParamPlus1Shift = DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPlus1);
+    drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
 
-    drflac_uint32 zeroCounter = 0;
-    while (bs->cache == 0) {
-        zeroCounter += (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs);
-        if (!drflac__reload_cache(bs)) {
-            return DRFLAC_FALSE;
-        }
-    }
+    // The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
+    // no idea how this will work in practice...
+    drflac_cache_t bs_cache = bs->cache;
+    drflac_uint32  bs_consumedBits = bs->consumedBits;
 
-    drflac_uint32 setBitOffsetPlus1 = drflac__clz(bs->cache);
-    zeroCounter += setBitOffsetPlus1;
-    setBitOffsetPlus1 += 1;
+    // The first thing to do is find the first set bit. Most likely a bit will be set in the current cache line.
+    drflac_uint32  lzcount = drflac__clz(bs_cache);
+    if (lzcount < sizeof(bs_cache)*8) {
+        pZeroCounterOut[0] = lzcount;
 
+        // It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
+        // this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
+        // outside of this function at a higher level.
+    extract_rice_param_part:
+        bs_cache       <<= lzcount;
+        bs_consumedBits += lzcount;
 
-    drflac_uint32 riceParamPart;
-    drflac_uint32 riceLength = setBitOffsetPlus1;
-    if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
-        riceParamPart = (drflac_uint32)((bs->cache & (riceParamMask >> setBitOffsetPlus1)) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceLength));
-
-        bs->consumedBits += riceLength;
-        bs->cache <<= riceLength;
-    } else {
-        // It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them.
-        drflac_uint32 bitCountLo = riceLength + bs->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS(bs);
-
-        if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
-#ifndef DR_FLAC_NO_CRC
-            drflac__update_crc16(bs);
-#endif
-            bs->cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
-            bs->consumedBits = 0;
-#ifndef DR_FLAC_NO_CRC
-            bs->crc16Cache = bs->cache;
-#endif
+        if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
+            // Getting here means the rice parameter part is wholly contained within the current cache line.
+            pRiceParamPartOut[0] = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
+            bs_cache       <<= riceParamPlus1;
+            bs_consumedBits += riceParamPlus1;
         } else {
-            // Slow path. We need to fetch more data from the client.
-            if (!drflac__reload_cache(bs)) {
-                return DRFLAC_FALSE;
+            // Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
+            // line, reload the cache, and then combine it with the head of the next cache line.
+
+            // Grab the high part of the rice parameter part.
+            drflac_uint32 riceParamPartHi = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
+
+            // Before reloading the cache we need to grab the size in bits of the low part.
+            drflac_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
+            drflac_assert(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
+                
+            // Now reload the cache.
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = riceParamPartLoBitCount;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
+            }
+
+            // We should now have enough information to construct the rice parameter part.
+            drflac_uint32 riceParamPartLo = (drflac_uint32)(bs_cache >> (DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPartLoBitCount)));
+            pRiceParamPartOut[0] = riceParamPartHi | riceParamPartLo;
+
+            bs_cache <<= riceParamPartLoBitCount;
+        }
+    } else {
+        // Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
+        // to drflac__clz() and we need to reload the cache.
+        drflac_uint32 zeroCounter = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BITS(bs) - bs_consumedBits);
+        for (;;) {
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = 0;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits;
+            }
+
+            lzcount = drflac__clz(bs_cache);
+            zeroCounter += lzcount;
+
+            if (lzcount < sizeof(bs_cache)*8) {
+                break;
             }
         }
 
-        riceParamPart = (drflac_uint32)(DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, bitCountLo));
-
-        bs->consumedBits += bitCountLo;
-        bs->cache <<= bitCountLo;
+        pZeroCounterOut[0] = zeroCounter;
+        goto extract_rice_param_part;
     }
 
-    *pZeroCounterOut = zeroCounter;
-    *pRiceParamPartOut = riceParamPart;
+    // Make sure the cache is restored at the end of it all.
+    bs->cache = bs_cache;
+    bs->consumedBits = bs_consumedBits;
+
+    return DRFLAC_TRUE;
+}
+
+static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts_x4(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
+{
+    drflac_uint32  riceParamPlus1 = riceParam + 1;
+    //drflac_cache_t riceParamPlus1Mask  = DRFLAC_CACHE_L1_SELECTION_MASK(riceParamPlus1);
+    drflac_uint32  riceParamPlus1Shift = DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPlus1);
+    drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
+
+    // The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
+    // no idea how this will work in practice...
+    drflac_cache_t bs_cache = bs->cache;
+    drflac_uint32  bs_consumedBits = bs->consumedBits;
+
+    // What this is doing is trying to efficiently extract 4 rice parts at a time, the idea being that we can exploit certain properties
+    // to our advantage to make things more efficient.
+    for (int i = 0; i < 4; ++i) {
+        // The first thing to do is find the first set bit. Most likely a bit will be set in the current cache line.
+        drflac_uint32  lzcount = drflac__clz(bs_cache);
+        if (lzcount < sizeof(bs_cache)*8) {
+            pZeroCounterOut[i] = lzcount;
+
+            // It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
+            // this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
+            // outside of this function at a higher level.
+        extract_rice_param_part:
+            bs_cache       <<= lzcount;
+            bs_consumedBits += lzcount;
+
+            if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
+                // Getting here means the rice parameter part is wholly contained within the current cache line.
+                pRiceParamPartOut[i] = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
+                bs_cache       <<= riceParamPlus1;
+                bs_consumedBits += riceParamPlus1;
+            } else {
+                // Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
+                // line, reload the cache, and then combine it with the head of the next cache line.
+
+                // Grab the high part of the rice parameter part.
+                drflac_uint32 riceParamPartHi = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
+
+                // Before reloading the cache we need to grab the size in bits of the low part.
+                drflac_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
+                
+                // Now reload the cache.
+                if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+                #ifndef DR_FLAC_NO_CRC
+                    drflac__update_crc16(bs);
+                #endif
+                    bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                    bs_consumedBits = riceParamPartLoBitCount;
+                #ifndef DR_FLAC_NO_CRC
+                    bs->crc16Cache = bs_cache;
+                #endif
+                } else {
+                    // Slow path. We need to fetch more data from the client.
+                    if (!drflac__reload_cache(bs)) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    bs_cache = bs->cache;
+                    bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
+                }
+
+                // We should now have enough information to construct the rice parameter part.
+                drflac_uint32 riceParamPartLo = (drflac_uint32)(bs_cache >> (DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPartLoBitCount)));
+                pRiceParamPartOut[i] = riceParamPartHi | riceParamPartLo;
+
+                bs_cache <<= riceParamPartLoBitCount;
+            }
+        } else {
+            // Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
+            // to drflac__clz() and we need to reload the cache.
+            drflac_uint32 zeroCounter = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BITS(bs) - bs_consumedBits);
+            for (;;) {
+                if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+                #ifndef DR_FLAC_NO_CRC
+                    drflac__update_crc16(bs);
+                #endif
+                    bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                    bs_consumedBits = 0;
+                #ifndef DR_FLAC_NO_CRC
+                    bs->crc16Cache = bs_cache;
+                #endif
+                } else {
+                    // Slow path. We need to fetch more data from the client.
+                    if (!drflac__reload_cache(bs)) {
+                        return DRFLAC_FALSE;
+                    }
+
+                    bs_cache = bs->cache;
+                    bs_consumedBits = bs->consumedBits;
+                }
+
+                lzcount = drflac__clz(bs_cache);
+                zeroCounter += lzcount;
+
+                if (lzcount < sizeof(bs_cache)*8) {
+                    break;
+                }
+            }
+
+            pZeroCounterOut[i] = zeroCounter;
+            goto extract_rice_param_part;
+        }
+    }
+
+    // Make sure the cache is restored at the end of it all.
+    bs->cache = bs_cache;
+    bs->consumedBits = bs_consumedBits;
+
+    return DRFLAC_TRUE;
+}
+
+static DRFLAC_INLINE drflac_bool32 drflac__seek_rice_parts(drflac_bs* bs, drflac_uint8 riceParam)
+{
+    drflac_uint32  riceParamPlus1 = riceParam + 1;
+    drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
+
+    // Work on local copies of the cache and consumed-bit count so the compiler has a chance to keep them in registers. They are
+    // only written back to bs at the very end; the early DRFLAC_FALSE returns below skip that write-back.
+    drflac_cache_t bs_cache = bs->cache;
+    drflac_uint32  bs_consumedBits = bs->consumedBits;
+
+    // Count the leading zero bits to find the first set bit, which ends the unary part. Most likely it is in the current cache line.
+    drflac_uint32  lzcount = drflac__clz(bs_cache);
+    if (lzcount < sizeof(bs_cache)*8) {
+        // It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. Nothing is
+        // extracted here; the terminating set bit of the unary part and the riceParam bits that follow it are simply skipped
+        // together as one run of riceParam+1 bits.
+    extract_rice_param_part:
+        bs_cache       <<= lzcount;
+        bs_consumedBits += lzcount;
+
+        if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
+            // Getting here means the rice parameter part is wholly contained within the current cache line.
+            bs_cache       <<= riceParamPlus1;
+            bs_consumedBits += riceParamPlus1;
+        } else {
+            // Getting here means the rice parameter part straddles the cache line. We need to reload the cache and then skip the
+            // bits of the rice parameter part that spilled over into the head of the next cache line.
+
+            // Before reloading the cache we need to grab the size in bits of the low part.
+            drflac_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
+            drflac_assert(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
+                
+            // Now reload the cache.
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = riceParamPartLoBitCount;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
+            }
+
+            bs_cache <<= riceParamPartLoBitCount;
+        }
+    } else {
+        // Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
+        // to drflac__clz() and we need to reload the cache.
+        for (;;) {
+            if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
+            #ifndef DR_FLAC_NO_CRC
+                drflac__update_crc16(bs);
+            #endif
+                bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
+                bs_consumedBits = 0;
+            #ifndef DR_FLAC_NO_CRC
+                bs->crc16Cache = bs_cache;
+            #endif
+            } else {
+                // Slow path. We need to fetch more data from the client.
+                if (!drflac__reload_cache(bs)) {
+                    return DRFLAC_FALSE;
+                }
+
+                bs_cache = bs->cache;
+                bs_consumedBits = bs->consumedBits;
+            }
+
+            lzcount = drflac__clz(bs_cache);
+            if (lzcount < sizeof(bs_cache)*8) {
+                break;
+            }
+        }
+
+        goto extract_rice_param_part;
+    }
+
+    // Make sure the cache is restored at the end of it all.
+    bs->cache = bs_cache;
+    bs->consumedBits = bs_consumedBits;
+
     return DRFLAC_TRUE;
 }
 
 
-static drflac_bool32 drflac__decode_samples_with_residual__rice__simple(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
 {
     drflac_assert(bs != NULL);
     drflac_assert(count > 0);
     drflac_assert(pSamplesOut != NULL);
 
-    static drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
+    drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
 
     drflac_uint32 zeroCountPart0;
     drflac_uint32 zeroCountPart1;
@@ -2421,57 +3236,92 @@ static drflac_bool32 drflac__decode_samples_with_residual__rice__simple(drflac_b
     drflac_uint32 riceParamPart1;
     drflac_uint32 riceParamPart2;
     drflac_uint32 riceParamPart3;
-    drflac_uint32 i4 = 0;
-    drflac_uint32 count4 = count >> 2;
-    while (i4 < count4) {
-        // Rice extraction.
-        if (!drflac__read_rice_parts(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
-            !drflac__read_rice_parts(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
-            !drflac__read_rice_parts(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
-            !drflac__read_rice_parts(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
-            return DRFLAC_FALSE;
-        }
+    drflac_uint32 riceParamMask = ~((~0UL) << riceParam);
+    const drflac_int32* pSamplesOutEnd = pSamplesOut + ((count >> 2) << 2);
 
-        riceParamPart0 |= (zeroCountPart0 << riceParam);
-        riceParamPart1 |= (zeroCountPart1 << riceParam);
-        riceParamPart2 |= (zeroCountPart2 << riceParam);
-        riceParamPart3 |= (zeroCountPart3 << riceParam);
+    if (bitsPerSample >= 24) {
+        while (pSamplesOut < pSamplesOutEnd) {
+            // Rice extraction. It's faster to do this one at a time against local variables than it is to use the x4 version
+            // against an array. Not sure why, but perhaps it's making more efficient use of registers?
+            if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
+                return DRFLAC_FALSE;
+            }
 
-        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
-        riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
-        riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
-        riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
+            riceParamPart0 &= riceParamMask;
+            riceParamPart1 &= riceParamMask;
+            riceParamPart2 &= riceParamMask;
+            riceParamPart3 &= riceParamMask;
+
+            riceParamPart0 |= (zeroCountPart0 << riceParam);
+            riceParamPart1 |= (zeroCountPart1 << riceParam);
+            riceParamPart2 |= (zeroCountPart2 << riceParam);
+            riceParamPart3 |= (zeroCountPart3 << riceParam);
+
+            riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+            riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
+            riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
+            riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
 
-        if (bitsPerSample > 16) {
             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
             pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 1);
             pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 2);
             pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 3);
-        } else {
+
+            pSamplesOut += 4;
+        }
+    } else {
+        while (pSamplesOut < pSamplesOutEnd) {
+            if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
+                return DRFLAC_FALSE;
+            }
+
+            riceParamPart0 &= riceParamMask;
+            riceParamPart1 &= riceParamMask;
+            riceParamPart2 &= riceParamMask;
+            riceParamPart3 &= riceParamMask;
+
+            riceParamPart0 |= (zeroCountPart0 << riceParam);
+            riceParamPart1 |= (zeroCountPart1 << riceParam);
+            riceParamPart2 |= (zeroCountPart2 << riceParam);
+            riceParamPart3 |= (zeroCountPart3 << riceParam);
+
+            riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+            riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
+            riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
+            riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
+
             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
             pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 1);
             pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 2);
             pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 3);
-        }
 
-        i4 += 1;
-        pSamplesOut += 4;
+            pSamplesOut += 4;
+        }
     }
 
-    drflac_uint32 i = i4 << 2;
+    
+
+    drflac_uint32 i = ((count >> 2) << 2);
     while (i < count) {
         // Rice extraction.
-        if (!drflac__read_rice_parts(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
             return DRFLAC_FALSE;
         }
 
         // Rice reconstruction.
+        riceParamPart0 &= riceParamMask;
         riceParamPart0 |= (zeroCountPart0 << riceParam);
         riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
         //riceParamPart0  = (riceParamPart0 >> 1) ^ (~(riceParamPart0 & 0x01) + 1);
 
         // Sample reconstruction.
-        if (bitsPerSample > 16) {
+        if (bitsPerSample >= 24) {
             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
         } else {
             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
@@ -2480,11 +3330,12 @@ static drflac_bool32 drflac__decode_samples_with_residual__rice__simple(drflac_b
         i += 1;
         pSamplesOut += 1;
     }
-
+    
     return DRFLAC_TRUE;
 }
 
-static drflac_bool32 drflac__decode_samples_with_residual__rice__param_equals_zero(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
+#if defined(DRFLAC_SUPPORT_SSE41)
+static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
 {
     drflac_assert(bs != NULL);
     drflac_assert(count > 0);
@@ -2492,67 +3343,126 @@ static drflac_bool32 drflac__decode_samples_with_residual__rice__param_equals_ze
 
     static drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
 
-    drflac_uint32 zeroCountPart0;
-    drflac_uint32 zeroCountPart1;
-    drflac_uint32 zeroCountPart2;
-    drflac_uint32 zeroCountPart3;
-    drflac_uint32 riceParamPart0;
-    drflac_uint32 riceParamPart1;
-    drflac_uint32 riceParamPart2;
-    drflac_uint32 riceParamPart3;
-    drflac_uint32 i4 = 0;
-    drflac_uint32 count4 = count >> 2;
-    while (i4 < count4) {
-        // Rice extraction.
-        if (!drflac__read_rice_parts__param_equals_zero(bs, &zeroCountPart0, &riceParamPart0) ||
-            !drflac__read_rice_parts__param_equals_zero(bs, &zeroCountPart1, &riceParamPart1) ||
-            !drflac__read_rice_parts__param_equals_zero(bs, &zeroCountPart2, &riceParamPart2) ||
-            !drflac__read_rice_parts__param_equals_zero(bs, &zeroCountPart3, &riceParamPart3)) {
-            return DRFLAC_FALSE;
+    //drflac_uint32 zeroCountParts[4];
+    //drflac_uint32 riceParamParts[4];
+
+    drflac_uint32 zeroCountParts0;
+    drflac_uint32 zeroCountParts1;
+    drflac_uint32 zeroCountParts2;
+    drflac_uint32 zeroCountParts3;
+    drflac_uint32 riceParamParts0;
+    drflac_uint32 riceParamParts1;
+    drflac_uint32 riceParamParts2;
+    drflac_uint32 riceParamParts3;
+
+    drflac_uint32 riceParamMask = ~((~0UL) << riceParam);
+    __m128i riceParamMask128 = _mm_set1_epi32(riceParamMask);
+    __m128i one = _mm_set1_epi32(0x01);
+
+    const drflac_int32* pSamplesOutEnd = pSamplesOut + ((count >> 2) << 2);
+
+    if (bitsPerSample >= 24) {
+        while (pSamplesOut < pSamplesOutEnd) {
+            // Rice extraction.
+            if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
+                return DRFLAC_FALSE;
+            }
+
+            __m128i zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
+            __m128i riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
+
+            riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
+            riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
+            riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_mullo_epi32(_mm_and_si128(riceParamPart128, one), _mm_set1_epi32(0xFFFFFFFF))); // <-- Only supported from SSE4.1
+            //riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, one)), one));  // <-- SSE2 compatible
+
+            drflac_uint32 riceParamParts[4];
+            _mm_storeu_si128((__m128i*)riceParamParts, riceParamPart128);
+
+        #if defined(DRFLAC_64BIT)
+            // The scalar implementation seems to be faster on 64-bit in my testing.
+            drflac__calculate_prediction_64_x4(order, shift, coefficients, riceParamParts, pSamplesOut);
+        #else
+            pSamplesOut[0] = riceParamParts[0] + drflac__calculate_prediction_64__sse41(order, shift, coefficients, pSamplesOut + 0);
+            pSamplesOut[1] = riceParamParts[1] + drflac__calculate_prediction_64__sse41(order, shift, coefficients, pSamplesOut + 1);
+            pSamplesOut[2] = riceParamParts[2] + drflac__calculate_prediction_64__sse41(order, shift, coefficients, pSamplesOut + 2);
+            pSamplesOut[3] = riceParamParts[3] + drflac__calculate_prediction_64__sse41(order, shift, coefficients, pSamplesOut + 3);
+        #endif
+
+            pSamplesOut += 4;
+        }
+    } else {
+        drflac_int32 coefficientsUnaligned[32*4 + 4] = {0};
+        drflac_int32* coefficients128 = (drflac_int32*)(((size_t)coefficientsUnaligned + 15) & ~15);
+        for (drflac_uint32 i = 0; i < order; ++i) {
+            coefficients128[i*4+0] = coefficients[i];
+            coefficients128[i*4+1] = coefficients[i];
+            coefficients128[i*4+2] = coefficients[i];
+            coefficients128[i*4+3] = coefficients[i];
         }
 
-        riceParamPart0 |= zeroCountPart0;
-        riceParamPart1 |= zeroCountPart1;
-        riceParamPart2 |= zeroCountPart2;
-        riceParamPart3 |= zeroCountPart3;
+        while (pSamplesOut < pSamplesOutEnd) {
+            // Rice extraction.
+#if 1
+            if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
+                !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
+                return DRFLAC_FALSE;
+            }
 
-        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
-        riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
-        riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
-        riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
+            __m128i zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
+            __m128i riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
+#else
+            if (!drflac__read_rice_parts_x4(bs, riceParam, zeroCountParts, riceParamParts)) {
+                return DRFLAC_FALSE;
+            }
 
-        if (bitsPerSample > 16) {
-            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
-            pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 1);
-            pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 2);
-            pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 3);
-        } else {
-            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
-            pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 1);
-            pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 2);
-            pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 3);
+            __m128i zeroCountPart128 = _mm_set_epi32(zeroCountParts[3], zeroCountParts[2], zeroCountParts[1], zeroCountParts[0]);
+            __m128i riceParamPart128 = _mm_set_epi32(riceParamParts[3], riceParamParts[2], riceParamParts[1], riceParamParts[0]);
+#endif
+
+            riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
+            riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
+            riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_mullo_epi32(_mm_and_si128(riceParamPart128, one), _mm_set1_epi32(0xFFFFFFFF)));
+
+#if 1
+            drflac__calculate_prediction_32_x4__sse41(order, shift, (const __m128i*)coefficients128, riceParamPart128, pSamplesOut);
+#else
+            drflac_int32 riceParamParts[4];
+            _mm_storeu_si128((__m128i*)riceParamParts, riceParamPart128);
+
+            pSamplesOut[0] = riceParamParts[0] + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
+            pSamplesOut[1] = riceParamParts[1] + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 1);
+            pSamplesOut[2] = riceParamParts[2] + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 2);
+            pSamplesOut[3] = riceParamParts[3] + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 3);
+#endif
+
+            pSamplesOut += 4;
         }
-
-        i4 += 1;
-        pSamplesOut += 4;
     }
 
-    drflac_uint32 i = i4 << 2;
+
+    drflac_uint32 i = ((count >> 2) << 2);
     while (i < count) {
         // Rice extraction.
-        if (!drflac__read_rice_parts__param_equals_zero(bs, &zeroCountPart0, &riceParamPart0)) {
+        if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
             return DRFLAC_FALSE;
         }
 
         // Rice reconstruction.
-        riceParamPart0 |= zeroCountPart0;
-        riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
+        riceParamParts0 &= riceParamMask;
+        riceParamParts0 |= (zeroCountParts0 << riceParam);
+        riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
 
         // Sample reconstruction.
-        if (bitsPerSample > 16) {
-            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
+        if (bitsPerSample >= 24) {
+            pSamplesOut[0] = riceParamParts0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
         } else {
-            pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
+            pSamplesOut[0] = riceParamParts0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
         }
 
         i += 1;
@@ -2561,18 +3471,23 @@ static drflac_bool32 drflac__decode_samples_with_residual__rice__param_equals_ze
 
     return DRFLAC_TRUE;
 }
+#endif
 
 static drflac_bool32 drflac__decode_samples_with_residual__rice(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
 {
-#if 0
-    return drflac__decode_samples_with_residual__rice__reference(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
-#else
-    if (riceParam != 0) {
-        return drflac__decode_samples_with_residual__rice__simple(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
-    } else {
-        return drflac__decode_samples_with_residual__rice__param_equals_zero(bs, bitsPerSample, count, order, shift, coefficients, pSamplesOut);
-    }
+#if defined(DRFLAC_SUPPORT_SSE41)
+    if (drflac__gIsSSE41Supported) {
+        return drflac__decode_samples_with_residual__rice__sse41(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
+    } else
 #endif
+    {
+        // Scalar fallback: used when DRFLAC_SUPPORT_SSE41 is not defined or drflac__gIsSSE41Supported is false.
+    #if 0
+        return drflac__decode_samples_with_residual__rice__reference(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
+    #else
+        return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
+    #endif
+    }
 }
 
 // Reads and seeks past a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes.
@@ -2581,20 +3496,9 @@ static drflac_bool32 drflac__read_and_seek_residual__rice(drflac_bs* bs, drflac_
     drflac_assert(bs != NULL);
     drflac_assert(count > 0);
 
-    drflac_uint32 zeroCountPart;
-    drflac_uint32 riceParamPart;
-
-    if (riceParam != 0) {
-        for (drflac_uint32 i = 0; i < count; ++i) {
-            if (!drflac__read_rice_parts(bs, riceParam, &zeroCountPart, &riceParamPart)) {
-                return DRFLAC_FALSE;
-            }
-        }
-    } else {
-        for (drflac_uint32 i = 0; i < count; ++i) {
-            if (!drflac__read_rice_parts__param_equals_zero(bs, &zeroCountPart, &riceParamPart)) {
-                return DRFLAC_FALSE;
-            }
+    for (drflac_uint32 i = 0; i < count; ++i) {
+        if (!drflac__seek_rice_parts(bs, riceParam)) {
+            return DRFLAC_FALSE;
         }
     }
 
@@ -2833,7 +3737,7 @@ static drflac_bool32 drflac__decode_samples__verbatim(drflac_bs* bs, drflac_uint
 
 static drflac_bool32 drflac__decode_samples__fixed(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 bitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
 {
-    drflac_int32 lpcCoefficientsTable[5][4] = {
+    static drflac_int32 lpcCoefficientsTable[5][4] = {
         {0,  0, 0,  0},
         {1,  0, 0,  0},
         {2, -1, 0,  0},
@@ -2890,6 +3794,7 @@ static drflac_bool32 drflac__decode_samples__lpc(drflac_bs* bs, drflac_uint32 bl
 
 
     drflac_int32 coefficients[32];
+    drflac_zero_memory(coefficients, sizeof(coefficients));
     for (i = 0; i < lpcOrder; ++i) {
         if (!drflac__read_int32(bs, lpcPrecision, coefficients + i)) {
             return DRFLAC_FALSE;
@@ -2904,7 +3809,7 @@ static drflac_bool32 drflac__decode_samples__lpc(drflac_bs* bs, drflac_uint32 bl
 }
 
 
-static drflac_bool32 drflac__read_next_frame_header(drflac_bs* bs, drflac_uint8 streaminfoBitsPerSample, drflac_frame_header* header)
+static drflac_bool32 drflac__read_next_flac_frame_header(drflac_bs* bs, drflac_uint8 streaminfoBitsPerSample, drflac_frame_header* header)
 {
     drflac_assert(bs != NULL);
     drflac_assert(header != NULL);
@@ -3275,7 +4180,7 @@ static DRFLAC_INLINE drflac_uint8 drflac__get_channel_count_from_channel_assignm
     return lookup[channelAssignment];
 }
 
-static drflac_result drflac__decode_frame(drflac* pFlac)
+static drflac_result drflac__decode_flac_frame(drflac* pFlac)
 {
     // This function should be called while the stream is sitting on the first byte after the frame header.
     drflac_zero_memory(pFlac->currentFrame.subframes, sizeof(pFlac->currentFrame.subframes));
@@ -3292,7 +4197,7 @@ static drflac_result drflac__decode_frame(drflac* pFlac)
     }
 
     for (int i = 0; i < channelCount; ++i) {
-        if (!drflac__decode_subframe(&pFlac->bs, &pFlac->currentFrame, i, pFlac->pDecodedSamples + (pFlac->currentFrame.header.blockSize * i))) {
+        if (!drflac__decode_subframe(&pFlac->bs, &pFlac->currentFrame, i, pFlac->pDecodedSamples + ((pFlac->currentFrame.header.blockSize+DRFLAC_LEADING_SAMPLES) * i) + DRFLAC_LEADING_SAMPLES)) {
             return DRFLAC_ERROR;
         }
     }
@@ -3324,7 +4229,7 @@ static drflac_result drflac__decode_frame(drflac* pFlac)
     return DRFLAC_SUCCESS;
 }
 
-static drflac_result drflac__seek_frame(drflac* pFlac)
+static drflac_result drflac__seek_flac_frame(drflac* pFlac)
 {
     int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.header.channelAssignment);
     for (int i = 0; i < channelCount; ++i) {
@@ -3356,16 +4261,16 @@ static drflac_result drflac__seek_frame(drflac* pFlac)
     return DRFLAC_SUCCESS;
 }
 
-static drflac_bool32 drflac__read_and_decode_next_frame(drflac* pFlac)
+static drflac_bool32 drflac__read_and_decode_next_flac_frame(drflac* pFlac)
 {
     drflac_assert(pFlac != NULL);
 
     for (;;) {
-        if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
             return DRFLAC_FALSE;
         }
 
-        drflac_result result = drflac__decode_frame(pFlac);
+        drflac_result result = drflac__decode_flac_frame(pFlac);
         if (result != DRFLAC_SUCCESS) {
             if (result == DRFLAC_CRC_MISMATCH) {
                 continue;   // CRC mismatch. Skip to the next frame.
@@ -3411,11 +4316,41 @@ static drflac_bool32 drflac__seek_to_first_frame(drflac* pFlac)
     return result;
 }
 
-static DRFLAC_INLINE drflac_result drflac__seek_to_next_frame(drflac* pFlac)
+static DRFLAC_INLINE drflac_result drflac__seek_to_next_flac_frame(drflac* pFlac)
 {
     // This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section.
     drflac_assert(pFlac != NULL);
-    return drflac__seek_frame(pFlac);
+    return drflac__seek_flac_frame(pFlac);
+}
+
+drflac_uint64 drflac__seek_forward_by_samples(drflac* pFlac, drflac_uint64 samplesToRead)
+{
+    // Skips up to samplesToRead samples, moving on to further FLAC frames as each one runs dry. Returns the count actually skipped.
+    drflac_uint64 samplesSkipped = 0;
+    while (samplesToRead > 0) {
+        if (pFlac->currentFrame.samplesRemaining == 0) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                break;  // No further frame could be read and decoded, so stop short of the request.
+            }
+            continue;
+        }
+        // Consume whichever is smaller: what is still wanted, or what is left in the current frame.
+        drflac_uint64 step = samplesToRead;
+        if (step >= pFlac->currentFrame.samplesRemaining) {
+            step = pFlac->currentFrame.samplesRemaining;
+        }
+        pFlac->currentFrame.samplesRemaining -= (drflac_uint32)step;   // <-- Safe cast. step never exceeds currentFrame.samplesRemaining.
+        samplesToRead  -= step;
+        samplesSkipped += step;
+    }
+
+    pFlac->currentSample += samplesSkipped;
+    return samplesSkipped;
+}
+
+drflac_uint64 drflac__seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 pcmFramesToSeek)
+{   // Seeks forward by up to pcmFramesToSeek PCM frames and returns the number of PCM frames actually skipped.
+    return drflac__seek_forward_by_samples(pFlac, pcmFramesToSeek*pFlac->channels) / pFlac->channels;   // The helper counts samples across all channels; convert its result back to frames, as drflac_read_pcm_frames_s32() does.
 }
 
 static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_uint64 sampleIndex)
@@ -3432,7 +4367,7 @@ static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_u
 
         // The frame header for the first frame may not yet have been read. We need to do that if necessary.
         if (pFlac->currentSample == 0 && pFlac->currentFrame.samplesRemaining == 0) {
-            if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+            if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
                 return DRFLAC_FALSE;
             }
         } else {
@@ -3448,7 +4383,7 @@ static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_u
         }
 
         // Decode the first frame in preparation for sample-exact seeking below.
-        if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
             return DRFLAC_FALSE;
         }
     }
@@ -3467,10 +4402,10 @@ static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_u
             drflac_uint64 samplesToDecode = sampleIndex - runningSampleCount;
 
             if (!isMidFrame) {
-                drflac_result result = drflac__decode_frame(pFlac);
+                drflac_result result = drflac__decode_flac_frame(pFlac);
                 if (result == DRFLAC_SUCCESS) {
                     // The frame is valid. We just need to skip over some samples to ensure it's sample-exact.
-                    return drflac_read_s32(pFlac, samplesToDecode, NULL) == samplesToDecode;  // <-- If this fails, something bad has happened (it should never fail).
+                    return drflac__seek_forward_by_samples(pFlac, samplesToDecode) == samplesToDecode;  // <-- If this fails, something bad has happened (it should never fail).
                 } else {
                     if (result == DRFLAC_CRC_MISMATCH) {
                         goto next_iteration;   // CRC mismatch. Pretend this frame never existed.
@@ -3480,13 +4415,13 @@ static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_u
                 }
             } else {
                 // We started seeking mid-frame which means we need to skip the frame decoding part.
-                return drflac_read_s32(pFlac, samplesToDecode, NULL) == samplesToDecode;
+                return drflac__seek_forward_by_samples(pFlac, samplesToDecode) == samplesToDecode;
             }
         } else {
             // It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
             // frame never existed and leave the running sample count untouched.
             if (!isMidFrame) {
-                drflac_result result = drflac__seek_to_next_frame(pFlac);
+                drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
                 if (result == DRFLAC_SUCCESS) {
                     runningSampleCount += sampleCountInThisFrame;
                 } else {
@@ -3498,7 +4433,7 @@ static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_u
                 }
             } else {
                 // We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
-                // drflac__seek_to_next_frame() which only works if the decoder is sitting on the byte just after the frame header.
+                // drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
                 runningSampleCount += pFlac->currentFrame.samplesRemaining;
                 pFlac->currentFrame.samplesRemaining = 0;
                 isMidFrame = DRFLAC_FALSE;
@@ -3507,7 +4442,7 @@ static drflac_bool32 drflac__seek_to_sample__brute_force(drflac* pFlac, drflac_u
 
     next_iteration:
         // Grab the next frame in preparation for the next iteration.
-        if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
             return DRFLAC_FALSE;
         }
     }
@@ -3544,7 +4479,7 @@ static drflac_bool32 drflac__seek_to_sample__seek_table(drflac* pFlac, drflac_ui
 
         // The frame header for the first frame may not yet have been read. We need to do that if necessary.
         if (pFlac->currentSample == 0 && pFlac->currentFrame.samplesRemaining == 0) {
-            if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+            if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
                 return DRFLAC_FALSE;
             }
         } else {
@@ -3559,7 +4494,7 @@ static drflac_bool32 drflac__seek_to_sample__seek_table(drflac* pFlac, drflac_ui
         }
 
         // Grab the frame the seekpoint is sitting on in preparation for the sample-exact seeking below.
-        if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
             return DRFLAC_FALSE;
         }
     }
@@ -3576,10 +4511,10 @@ static drflac_bool32 drflac__seek_to_sample__seek_table(drflac* pFlac, drflac_ui
             drflac_uint64 samplesToDecode = sampleIndex - runningSampleCount;
 
             if (!isMidFrame) {
-                drflac_result result = drflac__decode_frame(pFlac);
+                drflac_result result = drflac__decode_flac_frame(pFlac);
                 if (result == DRFLAC_SUCCESS) {
                     // The frame is valid. We just need to skip over some samples to ensure it's sample-exact.
-                    return drflac_read_s32(pFlac, samplesToDecode, NULL) == samplesToDecode;  // <-- If this fails, something bad has happened (it should never fail).
+                    return drflac__seek_forward_by_samples(pFlac, samplesToDecode) == samplesToDecode;  // <-- If this fails, something bad has happened (it should never fail).
                 } else {
                     if (result == DRFLAC_CRC_MISMATCH) {
                         goto next_iteration;   // CRC mismatch. Pretend this frame never existed.
@@ -3589,13 +4524,13 @@ static drflac_bool32 drflac__seek_to_sample__seek_table(drflac* pFlac, drflac_ui
                 }
             } else {
                 // We started seeking mid-frame which means we need to skip the frame decoding part.
-                return drflac_read_s32(pFlac, samplesToDecode, NULL) == samplesToDecode;
+                return drflac__seek_forward_by_samples(pFlac, samplesToDecode) == samplesToDecode;
             }
         } else {
             // It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
             // frame never existed and leave the running sample count untouched.
             if (!isMidFrame) {
-                drflac_result result = drflac__seek_to_next_frame(pFlac);
+                drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
                 if (result == DRFLAC_SUCCESS) {
                     runningSampleCount += sampleCountInThisFrame;
                 } else {
@@ -3607,7 +4542,7 @@ static drflac_bool32 drflac__seek_to_sample__seek_table(drflac* pFlac, drflac_ui
                 }
             } else {
                 // We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
-                // drflac__seek_to_next_frame() which only works if the decoder is sitting on the byte just after the frame header.
+                // drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
                 runningSampleCount += pFlac->currentFrame.samplesRemaining;
                 pFlac->currentFrame.samplesRemaining = 0;
                 isMidFrame = DRFLAC_FALSE;
@@ -3616,7 +4551,7 @@ static drflac_bool32 drflac__seek_to_sample__seek_table(drflac* pFlac, drflac_ui
 
     next_iteration:
         // Grab the next frame in preparation for the next iteration.
-        if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
             return DRFLAC_FALSE;
         }
     }
@@ -4086,7 +5021,7 @@ drflac_bool32 drflac__init_private__native(drflac_init_info* pInit, drflac_read_
             pInit->hasStreamInfoBlock = DRFLAC_FALSE;
             pInit->hasMetadataBlocks  = DRFLAC_FALSE;
 
-            if (!drflac__read_next_frame_header(&pInit->bs, 0, &pInit->firstFrameHeader)) {
+            if (!drflac__read_next_flac_frame_header(&pInit->bs, 0, &pInit->firstFrameHeader)) {
                 return DRFLAC_FALSE;    // Couldn't find a frame.
             }
 
@@ -4112,7 +5047,7 @@ drflac_bool32 drflac__init_private__native(drflac_init_info* pInit, drflac_read_
         pInit->bitsPerSample      = streaminfo.bitsPerSample;
         pInit->totalSampleCount   = streaminfo.totalSampleCount;
         pInit->maxBlockSize       = streaminfo.maxBlockSize;    // Don't care about the min block size - only the max (used for determining the size of the memory allocation).
-        pInit->hasMetadataBlocks = !isLastBlock;
+        pInit->hasMetadataBlocks  = !isLastBlock;
 
         if (onMeta) {
             drflac_metadata metadata;
@@ -4695,14 +5630,14 @@ drflac_bool32 drflac_ogg__seek_to_sample(drflac* pFlac, drflac_uint64 sampleInde
         // bitstream. This is important to consider because it means we cannot read data from the drflac_bs object using the
         // standard drflac__*() APIs because that will read in extra data for its own internal caching which in turn breaks
         // the positioning of the read pointer of the physical Ogg bitstream. Therefore, anything that would normally be read
-        // using the native FLAC decoding APIs, such as drflac__read_next_frame_header(), need to be re-implemented so as to
+        // using the native FLAC decoding APIs, such as drflac__read_next_flac_frame_header(), need to be re-implemented so as to
         // avoid the use of the drflac_bs object.
         //
         // Considering these issues, I have decided to use the slower native FLAC decoding method for the following reasons:
         //   1) Seeking is already partially accelerated using Ogg's paging system in the code block above.
         //   2) Seeking in an Ogg encapsulated FLAC stream is probably quite uncommon.
         //   3) Simplicity.
-        if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
             return DRFLAC_FALSE;
         }
 
@@ -4714,14 +5649,14 @@ drflac_bool32 drflac_ogg__seek_to_sample(drflac* pFlac, drflac_uint64 sampleInde
         if (sampleIndex < (runningSampleCount + sampleCountInThisFrame)) {
             // The sample should be in this frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
             // it never existed and keep iterating.
-            drflac_result result = drflac__decode_frame(pFlac);
+            drflac_result result = drflac__decode_flac_frame(pFlac);
             if (result == DRFLAC_SUCCESS) {
                 // The frame is valid. We just need to skip over some samples to ensure it's sample-exact.
                 drflac_uint64 samplesToDecode = (size_t)(sampleIndex - runningSampleCount);    // <-- Safe cast because the maximum number of samples in a frame is 65535.
                 if (samplesToDecode == 0) {
                     return DRFLAC_TRUE;
                 }
-                return drflac_read_s32(pFlac, samplesToDecode, NULL) != 0;  // <-- If this fails, something bad has happened (it should never fail).
+                return drflac__seek_forward_by_samples(pFlac, samplesToDecode) == samplesToDecode;  // <-- If this fails, something bad has happened (it should never fail).
             } else {
                 if (result == DRFLAC_CRC_MISMATCH) {
                     continue;   // CRC mismatch. Pretend this frame never existed.
@@ -4732,7 +5667,7 @@ drflac_bool32 drflac_ogg__seek_to_sample(drflac* pFlac, drflac_uint64 sampleInde
         } else {
             // It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
             // frame never existed and leave the running sample count untouched.
-            drflac_result result = drflac__seek_to_next_frame(pFlac);
+            drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
             if (result == DRFLAC_SUCCESS) {
                 runningSampleCount += sampleCountInThisFrame;
             } else {
@@ -4986,15 +5921,16 @@ void drflac__init_from_info(drflac* pFlac, drflac_init_info* pInit)
     drflac_assert(pInit != NULL);
 
     drflac_zero_memory(pFlac, sizeof(*pFlac));
-    pFlac->bs               = pInit->bs;
-    pFlac->onMeta           = pInit->onMeta;
-    pFlac->pUserDataMD      = pInit->pUserDataMD;
-    pFlac->maxBlockSize     = pInit->maxBlockSize;
-    pFlac->sampleRate       = pInit->sampleRate;
-    pFlac->channels         = (drflac_uint8)pInit->channels;
-    pFlac->bitsPerSample    = (drflac_uint8)pInit->bitsPerSample;
-    pFlac->totalSampleCount = pInit->totalSampleCount;
-    pFlac->container        = pInit->container;
+    pFlac->bs                 = pInit->bs;
+    pFlac->onMeta             = pInit->onMeta;
+    pFlac->pUserDataMD        = pInit->pUserDataMD;
+    pFlac->maxBlockSize       = pInit->maxBlockSize;
+    pFlac->sampleRate         = pInit->sampleRate;
+    pFlac->channels           = (drflac_uint8)pInit->channels;
+    pFlac->bitsPerSample      = (drflac_uint8)pInit->bitsPerSample;
+    pFlac->totalSampleCount   = pInit->totalSampleCount;
+    pFlac->totalPCMFrameCount = pInit->totalSampleCount / pFlac->channels;
+    pFlac->container          = pInit->container;
 }
 
 drflac* drflac_open_with_metadata_private(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD)
@@ -5021,10 +5957,10 @@ drflac* drflac_open_with_metadata_private(drflac_read_proc onRead, drflac_seek_p
     // The allocation size for decoded frames depends on the number of 32-bit integers that fit inside the largest SIMD vector
     // we are supporting.
     drflac_uint32 wholeSIMDVectorCountPerChannel;
-    if ((init.maxBlockSize % (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) == 0) {
-        wholeSIMDVectorCountPerChannel = (init.maxBlockSize / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32)));
+    if (((init.maxBlockSize+DRFLAC_LEADING_SAMPLES) % (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) == 0) {
+        wholeSIMDVectorCountPerChannel = ((init.maxBlockSize+DRFLAC_LEADING_SAMPLES) / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32)));
     } else {
-        wholeSIMDVectorCountPerChannel = (init.maxBlockSize / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) + 1;
+        wholeSIMDVectorCountPerChannel = ((init.maxBlockSize+DRFLAC_LEADING_SAMPLES) / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) + 1;
     }
 
     drflac_uint32 decodedSamplesAllocationSize = wholeSIMDVectorCountPerChannel * DRFLAC_MAX_SIMD_VECTOR_SIZE * init.channels;
@@ -5151,12 +6087,12 @@ drflac* drflac_open_with_metadata_private(drflac_read_proc onRead, drflac_seek_p
         pFlac->currentFrame.header = init.firstFrameHeader;
         do
         {
-            drflac_result result = drflac__decode_frame(pFlac);
+            drflac_result result = drflac__decode_flac_frame(pFlac);
             if (result == DRFLAC_SUCCESS) {
                 break;
             } else {
                 if (result == DRFLAC_CRC_MISMATCH) {
-                    if (!drflac__read_next_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
+                    if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFrame.header)) {
                         DRFLAC_FREE(pFlac);
                         return NULL;
                     }
@@ -5475,31 +6411,6 @@ drflac_uint64 drflac__read_s32__misaligned(drflac* pFlac, drflac_uint64 samplesT
     return samplesRead;
 }
 
-drflac_uint64 drflac__seek_forward_by_samples(drflac* pFlac, drflac_uint64 samplesToRead)
-{
-    drflac_uint64 samplesRead = 0;
-    while (samplesToRead > 0) {
-        if (pFlac->currentFrame.samplesRemaining == 0) {
-            if (!drflac__read_and_decode_next_frame(pFlac)) {
-                break;  // Couldn't read the next frame, so just break from the loop and return.
-            }
-        } else {
-            if (pFlac->currentFrame.samplesRemaining > samplesToRead) {
-                samplesRead   += samplesToRead;
-                pFlac->currentFrame.samplesRemaining -= (drflac_uint32)samplesToRead;   // <-- Safe cast. Will always be < currentFrame.samplesRemaining < 65536.
-                samplesToRead  = 0;
-            } else {
-                samplesRead   += pFlac->currentFrame.samplesRemaining;
-                samplesToRead -= pFlac->currentFrame.samplesRemaining;
-                pFlac->currentFrame.samplesRemaining = 0;
-            }
-        }
-    }
-
-    pFlac->currentSample += samplesRead;
-    return samplesRead;
-}
-
 drflac_uint64 drflac_read_s32(drflac* pFlac, drflac_uint64 samplesToRead, drflac_int32* bufferOut)
 {
     // Note that <bufferOut> is allowed to be null, in which case this will act like a seek.
@@ -5516,7 +6427,7 @@ drflac_uint64 drflac_read_s32(drflac* pFlac, drflac_uint64 samplesToRead, drflac
     while (samplesToRead > 0) {
         // If we've run out of samples in this frame, go to the next.
         if (pFlac->currentFrame.samplesRemaining == 0) {
-            if (!drflac__read_and_decode_next_frame(pFlac)) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
                 break;  // Couldn't read the next frame, so just break from the loop and return.
             }
         } else {
@@ -5649,11 +6560,37 @@ drflac_uint64 drflac_read_s32(drflac* pFlac, drflac_uint64 samplesToRead, drflac
     return samplesRead;
 }
 
+drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut)
+{
+#if defined(_MSC_VER) && !defined(__clang__)    // Deprecation diagnostics are switched off around the drflac_read_s32() call only.
+    #pragma warning(push)
+    #pragma warning(disable:4996)   // C4996: was declared deprecated
+#elif defined(__GNUC__) || defined(__clang__)
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+    return drflac_read_s32(pFlac, framesToRead*pFlac->channels, pBufferOut) / pFlac->channels;  // PCM frames in, PCM frames out: the request is scaled up to samples and the result scaled back down by the channel count.
+#if defined(_MSC_VER) && !defined(__clang__)    // Restore the diagnostic state saved above.
+    #pragma warning(pop)
+#elif defined(__GNUC__) || defined(__clang__)
+    #pragma GCC diagnostic pop
+#endif
+}
+
+
 drflac_uint64 drflac_read_s16(drflac* pFlac, drflac_uint64 samplesToRead, drflac_int16* pBufferOut)
 {
     // This reads samples in 2 passes and can probably be optimized.
     drflac_uint64 totalSamplesRead = 0;
 
+#if defined(_MSC_VER) && !defined(__clang__)
+    #pragma warning(push)
+    #pragma warning(disable:4996)   // was declared deprecated
+#elif defined(__GNUC__) || defined(__clang__)
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     while (samplesToRead > 0) {
         drflac_int32 samples32[4096];
         drflac_uint64 samplesJustRead = drflac_read_s32(pFlac, (samplesToRead > 4096) ? 4096 : samplesToRead, samples32);
@@ -5671,14 +6608,57 @@ drflac_uint64 drflac_read_s16(drflac* pFlac, drflac_uint64 samplesToRead, drflac
         pBufferOut       += samplesJustRead;
     }
 
+#if defined(_MSC_VER) && !defined(__clang__)
+    #pragma warning(pop)
+#elif defined(__GNUC__) || defined(__clang__)
+    #pragma GCC diagnostic pop
+#endif
+
     return totalSamplesRead;
 }
 
+drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut)
+{
+    // Decodes through a fixed 32-bit staging buffer one batch at a time, narrowing each batch to 16 bits on the way out.
+    drflac_int32 staging[4096];
+    drflac_uint64 framesDelivered = 0;
+
+    while (framesToRead != 0) {
+        // A batch never holds more frames than fit in the staging buffer.
+        drflac_uint64 batchFrames = 4096/pFlac->channels;
+        if (batchFrames > framesToRead) {
+            batchFrames = framesToRead;
+        }
+        drflac_uint64 framesDecoded = drflac_read_pcm_frames_s32(pFlac, batchFrames, staging);
+        if (framesDecoded == 0) {
+            break;  // Reached the end.
+        }
+        // s32 -> s16: keep the upper 16 bits of every interleaved sample.
+        drflac_uint64 samplesDecoded = framesDecoded * pFlac->channels;
+        for (drflac_uint64 iSample = 0; iSample < samplesDecoded; ++iSample) {
+            *pBufferOut++ = (drflac_int16)(staging[iSample] >> 16);
+        }
+        framesDelivered += framesDecoded;
+        framesToRead    -= framesDecoded;
+    }
+
+    return framesDelivered;
+}
+
+
 drflac_uint64 drflac_read_f32(drflac* pFlac, drflac_uint64 samplesToRead, float* pBufferOut)
 {
     // This reads samples in 2 passes and can probably be optimized.
     drflac_uint64 totalSamplesRead = 0;
 
+#if defined(_MSC_VER) && !defined(__clang__)
+    #pragma warning(push)
+    #pragma warning(disable:4996)   // was declared deprecated
+#elif defined(__GNUC__) || defined(__clang__)
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     while (samplesToRead > 0) {
         drflac_int32 samples32[4096];
         drflac_uint64 samplesJustRead = drflac_read_s32(pFlac, (samplesToRead > 4096) ? 4096 : samplesToRead, samples32);
@@ -5696,9 +6676,645 @@ drflac_uint64 drflac_read_f32(drflac* pFlac, drflac_uint64 samplesToRead, float*
         pBufferOut       += samplesJustRead;
     }
 
+#if defined(_MSC_VER) && !defined(__clang__)
+    #pragma warning(pop)
+#elif defined(__GNUC__) || defined(__clang__)
+    #pragma GCC diagnostic pop
+#endif
+
     return totalSamplesRead;
 }
 
+#if 0   // Plain one-frame-per-iteration version of the left/side -> interleaved float conversion. Compiled out.
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        int left  = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample);  // Channel 0 is left, shifted up by the unused + wasted bit counts.
+        int side  = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample);  // Channel 1 is side, shifted the same way.
+        int right = left - side;    // The right channel is reconstructed from left and side.
+
+        pOutputSamples[i*2+0] = (float)(left / 2147483648.0);   // Scale by 1/2^31; output is interleaved left, right.
+        pOutputSamples[i*2+1] = (float)(right / 2147483648.0);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Left/side stereo -> interleaved float. Channel 0 carries left and channel 1 carries side, so right = left - side.
+    // Every input sample is shifted up by the unused + wasted bit counts and the result is scaled by 1/2^31.
+    // Frames are processed four per iteration, followed by a tail loop for whatever is left over.
+    const drflac_int32 leftShift = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+    const drflac_int32 sideShift = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+    const drflac_uint64 unrolledEnd = frameCount & ~(drflac_uint64)3;
+    float scale = 1 / 2147483648.0;
+    drflac_uint64 iFrame = 0;
+
+    // Main loop: four frames per iteration.
+    for (; iFrame < unrolledEnd; iFrame += 4) {
+        drflac_int32 l0 = pInputSamples0[iFrame+0] << leftShift;
+        drflac_int32 l1 = pInputSamples0[iFrame+1] << leftShift;
+        drflac_int32 l2 = pInputSamples0[iFrame+2] << leftShift;
+        drflac_int32 l3 = pInputSamples0[iFrame+3] << leftShift;
+
+        drflac_int32 r0 = l0 - (pInputSamples1[iFrame+0] << sideShift);
+        drflac_int32 r1 = l1 - (pInputSamples1[iFrame+1] << sideShift);
+        drflac_int32 r2 = l2 - (pInputSamples1[iFrame+2] << sideShift);
+        drflac_int32 r3 = l3 - (pInputSamples1[iFrame+3] << sideShift);
+
+        pOutputSamples[iFrame*2+0] = l0 * scale;
+        pOutputSamples[iFrame*2+1] = r0 * scale;
+        pOutputSamples[iFrame*2+2] = l1 * scale;
+        pOutputSamples[iFrame*2+3] = r1 * scale;
+        pOutputSamples[iFrame*2+4] = l2 * scale;
+        pOutputSamples[iFrame*2+5] = r2 * scale;
+        pOutputSamples[iFrame*2+6] = l3 * scale;
+        pOutputSamples[iFrame*2+7] = r3 * scale;
+    }
+
+    // Tail: the final 0-3 frames.
+    for (; iFrame < frameCount; ++iFrame) {
+        int left  = pInputSamples0[iFrame] << leftShift;
+        int side  = pInputSamples1[iFrame] << sideShift;
+        int right = left - side;
+
+        pOutputSamples[iFrame*2+0] = (float)(left  * scale);
+        pOutputSamples[iFrame*2+1] = (float)(right * scale);
+    }
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)    // SSE2 variant of the left/side -> interleaved float conversion.
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_assert(pFlac->bitsPerSample <= 24);  // The shifts below subtract 8, which presumably requires at least 8 unused bits per sample.
+
+    drflac_uint64 frameCount4 = frameCount >> 2;    // Number of whole groups of four frames.
+
+    __m128 factor = _mm_set1_ps(1.0f / 8388608.0f); // 1/2^23 in every lane.
+    int shift0 = (unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample) - 8;  // 8 bits short of the shift the scalar version applies.
+    int shift1 = (unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample) - 8;
+
+    for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+        __m128i inputSample0 = _mm_loadu_si128((const __m128i*)pInputSamples0 + i);    // Four consecutive channel-0 (left) samples, unaligned load.
+        __m128i inputSample1 = _mm_loadu_si128((const __m128i*)pInputSamples1 + i);    // Four consecutive channel-1 (side) samples.
+
+        __m128i left  = _mm_slli_epi32(inputSample0, shift0);
+        __m128i side  = _mm_slli_epi32(inputSample1, shift1);
+        __m128i right = _mm_sub_epi32(left, side);  // right = left - side, per lane.
+        __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(left),  factor);    // Integer -> float, then scale.
+        __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(right), factor);
+
+        pOutputSamples[i*8+0] = ((float*)&leftf)[0];    // Lanes are written out interleaved: left, right, left, right, ...
+        pOutputSamples[i*8+1] = ((float*)&rightf)[0];
+        pOutputSamples[i*8+2] = ((float*)&leftf)[1];
+        pOutputSamples[i*8+3] = ((float*)&rightf)[1];
+        pOutputSamples[i*8+4] = ((float*)&leftf)[2];
+        pOutputSamples[i*8+5] = ((float*)&rightf)[2];
+        pOutputSamples[i*8+6] = ((float*)&leftf)[3];
+        pOutputSamples[i*8+7] = ((float*)&rightf)[3];
+    }
+
+    for (drflac_uint64 i = (frameCount4 << 2); i < frameCount; ++i) {   // Scalar tail for the final 0-3 frames, using the same shifts and scale.
+        int left  = pInputSamples0[i] << shift0;
+        int side  = pInputSamples1[i] << shift1;
+        int right = left - side;
+
+        pOutputSamples[i*2+0] = (float)(left  / 8388608.0f);
+        pOutputSamples[i*2+1] = (float)(right / 8388608.0f);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+#if defined(DRFLAC_SUPPORT_SSE2)    // Dispatcher: forwards all arguments to the SSE2 or the scalar left/side -> float implementation.
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {   // The SSE2 version asserts bitsPerSample <= 24, so wider streams go to the scalar path.
+        drflac_read_pcm_frames_f32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+    } else
+#endif
+    {
+        // Scalar fallback. When SSE2 support is compiled out this block runs unconditionally.
+#if 0   // The reference implementation is compiled out; the unrolled scalar version below is the one used.
+        drflac_read_pcm_frames_f32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+        drflac_read_pcm_frames_f32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+    }
+}
+
+
+#if 0   // Plain one-frame-per-iteration version of the side/right -> interleaved float conversion. Compiled out.
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    for (drflac_uint64 i = 0; i < frameCount; ++i) {
+        int side  = pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample);  // Channel 0 is side, shifted up by the unused + wasted bit counts.
+        int right = pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample);  // Channel 1 is right, shifted the same way.
+        int left  = right + side;   // The left channel is reconstructed from right and side.
+
+        pOutputSamples[i*2+0] = (float)(left / 2147483648.0);   // Scale by 1/2^31; output is interleaved left, right.
+        pOutputSamples[i*2+1] = (float)(right / 2147483648.0);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Side/right stereo -> interleaved float. Channel 0 carries side and channel 1 carries right, so left = right + side.
+    // Every input sample is shifted up by the unused + wasted bit counts and the result is scaled by 1/2^31.
+    // Frames are processed four per iteration, followed by a tail loop for whatever is left over.
+    const drflac_int32 sideShift  = unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+    const drflac_int32 rightShift = unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+    const drflac_uint64 unrolledEnd = frameCount & ~(drflac_uint64)3;
+    float scale = 1 / 2147483648.0;
+    drflac_uint64 iFrame = 0;
+
+    // Main loop: four frames per iteration.
+    for (; iFrame < unrolledEnd; iFrame += 4) {
+        drflac_int32 r0 = pInputSamples1[iFrame+0] << rightShift;
+        drflac_int32 r1 = pInputSamples1[iFrame+1] << rightShift;
+        drflac_int32 r2 = pInputSamples1[iFrame+2] << rightShift;
+        drflac_int32 r3 = pInputSamples1[iFrame+3] << rightShift;
+
+        drflac_int32 l0 = r0 + (pInputSamples0[iFrame+0] << sideShift);
+        drflac_int32 l1 = r1 + (pInputSamples0[iFrame+1] << sideShift);
+        drflac_int32 l2 = r2 + (pInputSamples0[iFrame+2] << sideShift);
+        drflac_int32 l3 = r3 + (pInputSamples0[iFrame+3] << sideShift);
+
+        pOutputSamples[iFrame*2+0] = l0 * scale;
+        pOutputSamples[iFrame*2+1] = r0 * scale;
+        pOutputSamples[iFrame*2+2] = l1 * scale;
+        pOutputSamples[iFrame*2+3] = r1 * scale;
+        pOutputSamples[iFrame*2+4] = l2 * scale;
+        pOutputSamples[iFrame*2+5] = r2 * scale;
+        pOutputSamples[iFrame*2+6] = l3 * scale;
+        pOutputSamples[iFrame*2+7] = r3 * scale;
+    }
+
+    // Tail: the final 0-3 frames.
+    for (; iFrame < frameCount; ++iFrame) {
+        int side  = pInputSamples0[iFrame] << sideShift;
+        int right = pInputSamples1[iFrame] << rightShift;
+        int left  = right + side;
+
+        pOutputSamples[iFrame*2+0] = (float)(left  * scale);
+        pOutputSamples[iFrame*2+1] = (float)(right * scale);
+    }
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_assert(pFlac->bitsPerSample <= 24);
+
+    // SSE2 right/side decode, four frames per iteration. Input 0 holds the side signal and input 1 the
+    // right signal; left is reconstructed as right + side. Samples are shifted 8 bits less than the
+    // full 32-bit scale and then multiplied by 1/8388608 to produce the float output, which is written
+    // interleaved as left, right, left, right, ...
+    const drflac_uint64 groupCount = frameCount >> 2;
+    const __m128 scale = _mm_set1_ps(1.0f / 8388608.0f);
+    const int sideShift  = (unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample) - 8;
+    const int rightShift = (unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample) - 8;
+
+    for (drflac_uint64 iGroup = 0; iGroup < groupCount; iGroup += 1) {
+        const __m128i rawSide  = _mm_loadu_si128((const __m128i*)pInputSamples0 + iGroup);
+        const __m128i rawRight = _mm_loadu_si128((const __m128i*)pInputSamples1 + iGroup);
+
+        const __m128i sideI  = _mm_slli_epi32(rawSide,  sideShift);
+        const __m128i rightI = _mm_slli_epi32(rawRight, rightShift);
+        const __m128i leftI  = _mm_add_epi32(rightI, sideI);
+        const __m128 leftF  = _mm_mul_ps(_mm_cvtepi32_ps(leftI),  scale);
+        const __m128 rightF = _mm_mul_ps(_mm_cvtepi32_ps(rightI), scale);
+
+        // Interleave the two vectors into L0 R0 L1 R1 and L2 R2 L3 R3, then write the eight floats
+        // belonging to this group of four frames.
+        _mm_storeu_ps(pOutputSamples + iGroup*8 + 0, _mm_unpacklo_ps(leftF, rightF));
+        _mm_storeu_ps(pOutputSamples + iGroup*8 + 4, _mm_unpackhi_ps(leftF, rightF));
+    }
+
+    // Whatever is left over (at most three frames) is decoded one frame at a time.
+    for (drflac_uint64 iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
+        int sideSample  = pInputSamples0[iFrame] << sideShift;
+        int rightSample = pInputSamples1[iFrame] << rightShift;
+        int leftSample  = rightSample + sideSample;
+
+        pOutputSamples[iFrame*2+0] = (float)(leftSample  / 8388608.0f);
+        pOutputSamples[iFrame*2+1] = (float)(rightSample / 8388608.0f);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Dispatches right/side decoding: the SSE2 routine is used when it is compiled in, the drflac__gIsSSE2Supported
+    // flag is set and the stream has at most 24 bits per sample; otherwise the scalar routine is used.
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+        return;
+    }
+#endif
+#if 0
+    drflac_read_pcm_frames_f32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+    drflac_read_pcm_frames_f32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+}
+
+
+#if 0
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Reference mid/side decode (compiled out): input 0 is the mid signal, input 1 is the side signal.
+    for (drflac_uint64 iFrame = 0; iFrame < frameCount; iFrame += 1) {
+        int midSample  = pInputSamples0[iFrame] << pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+        int sideSample = pInputSamples1[iFrame] << pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+        // Double mid and take its lowest bit from the lowest bit of side.
+        midSample = (((drflac_uint32)midSample) << 1) | (sideSample & 0x01);
+        pOutputSamples[2*iFrame + 0] = (float)((((midSample + sideSample) >> 1) << (unusedBitsPerSample)) / 2147483648.0);
+        pOutputSamples[2*iFrame + 1] = (float)((((midSample - sideSample) >> 1) << (unusedBitsPerSample)) / 2147483648.0);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Mid/side -> interleaved left/right floats. Left shifts are done on unsigned values because shifting a negative signed int left is undefined in C.
+    drflac_uint64 frameCount4 = frameCount >> 2;    // Whole groups of 4 frames for the unrolled loops.
+    float factor = 1 / 2147483648.0;
+    drflac_uint32 wasted0 = pFlac->currentFrame.subframes[0].wastedBitsPerSample;   // Loop-invariant, so read once.
+    drflac_uint32 wasted1 = pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+    if (unusedBitsPerSample > 0) {
+        drflac_uint32 shift = (drflac_uint32)(unusedBitsPerSample - 1);   // The ">> 1" of the reconstruction is folded into the final left shift.
+        for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+            drflac_uint32 mid0  = (drflac_uint32)pInputSamples0[i*4+0] << wasted0;
+            drflac_uint32 mid1  = (drflac_uint32)pInputSamples0[i*4+1] << wasted0;
+            drflac_uint32 mid2  = (drflac_uint32)pInputSamples0[i*4+2] << wasted0;
+            drflac_uint32 mid3  = (drflac_uint32)pInputSamples0[i*4+3] << wasted0;
+
+            drflac_uint32 side0 = (drflac_uint32)pInputSamples1[i*4+0] << wasted1;
+            drflac_uint32 side1 = (drflac_uint32)pInputSamples1[i*4+1] << wasted1;
+            drflac_uint32 side2 = (drflac_uint32)pInputSamples1[i*4+2] << wasted1;
+            drflac_uint32 side3 = (drflac_uint32)pInputSamples1[i*4+3] << wasted1;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            drflac_int32 temp0L = (drflac_int32)((mid0 + side0) << shift);
+            drflac_int32 temp1L = (drflac_int32)((mid1 + side1) << shift);
+            drflac_int32 temp2L = (drflac_int32)((mid2 + side2) << shift);
+            drflac_int32 temp3L = (drflac_int32)((mid3 + side3) << shift);
+
+            drflac_int32 temp0R = (drflac_int32)((mid0 - side0) << shift);
+            drflac_int32 temp1R = (drflac_int32)((mid1 - side1) << shift);
+            drflac_int32 temp2R = (drflac_int32)((mid2 - side2) << shift);
+            drflac_int32 temp3R = (drflac_int32)((mid3 - side3) << shift);
+
+            pOutputSamples[i*8+0] = (float)(temp0L * factor);
+            pOutputSamples[i*8+1] = (float)(temp0R * factor);
+            pOutputSamples[i*8+2] = (float)(temp1L * factor);
+            pOutputSamples[i*8+3] = (float)(temp1R * factor);
+            pOutputSamples[i*8+4] = (float)(temp2L * factor);
+            pOutputSamples[i*8+5] = (float)(temp2R * factor);
+            pOutputSamples[i*8+6] = (float)(temp3L * factor);
+            pOutputSamples[i*8+7] = (float)(temp3R * factor);
+        }
+    } else {
+        for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+            drflac_uint32 mid0  = (drflac_uint32)pInputSamples0[i*4+0] << wasted0;
+            drflac_uint32 mid1  = (drflac_uint32)pInputSamples0[i*4+1] << wasted0;
+            drflac_uint32 mid2  = (drflac_uint32)pInputSamples0[i*4+2] << wasted0;
+            drflac_uint32 mid3  = (drflac_uint32)pInputSamples0[i*4+3] << wasted0;
+
+            drflac_uint32 side0 = (drflac_uint32)pInputSamples1[i*4+0] << wasted1;
+            drflac_uint32 side1 = (drflac_uint32)pInputSamples1[i*4+1] << wasted1;
+            drflac_uint32 side2 = (drflac_uint32)pInputSamples1[i*4+2] << wasted1;
+            drflac_uint32 side3 = (drflac_uint32)pInputSamples1[i*4+3] << wasted1;
+
+            mid0 = (mid0 << 1) | (side0 & 0x01);
+            mid1 = (mid1 << 1) | (side1 & 0x01);
+            mid2 = (mid2 << 1) | (side2 & 0x01);
+            mid3 = (mid3 << 1) | (side3 & 0x01);
+
+            drflac_int32 temp0L = (drflac_int32)(mid0 + side0) >> 1;
+            drflac_int32 temp1L = (drflac_int32)(mid1 + side1) >> 1;
+            drflac_int32 temp2L = (drflac_int32)(mid2 + side2) >> 1;
+            drflac_int32 temp3L = (drflac_int32)(mid3 + side3) >> 1;
+
+            drflac_int32 temp0R = (drflac_int32)(mid0 - side0) >> 1;
+            drflac_int32 temp1R = (drflac_int32)(mid1 - side1) >> 1;
+            drflac_int32 temp2R = (drflac_int32)(mid2 - side2) >> 1;
+            drflac_int32 temp3R = (drflac_int32)(mid3 - side3) >> 1;
+
+            pOutputSamples[i*8+0] = (float)(temp0L * factor);
+            pOutputSamples[i*8+1] = (float)(temp0R * factor);
+            pOutputSamples[i*8+2] = (float)(temp1L * factor);
+            pOutputSamples[i*8+3] = (float)(temp1R * factor);
+            pOutputSamples[i*8+4] = (float)(temp2L * factor);
+            pOutputSamples[i*8+5] = (float)(temp2R * factor);
+            pOutputSamples[i*8+6] = (float)(temp3L * factor);
+            pOutputSamples[i*8+7] = (float)(temp3R * factor);
+        }
+    }
+
+    for (drflac_uint64 i = (frameCount4 << 2); i < frameCount; ++i) {    // Leftover frames (fewer than 4).
+        drflac_uint32 mid  = (drflac_uint32)pInputSamples0[i] << wasted0;
+        drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << wasted1;
+        // Double mid and take its lowest bit from the lowest bit of side.
+        mid = (mid << 1) | (side & 0x01);
+
+        pOutputSamples[i*2+0] = (float)((drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) * factor);
+        pOutputSamples[i*2+1] = (float)((drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) * factor);
+    }
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    drflac_assert(pFlac->bitsPerSample <= 24);
+    // SSE2 mid/side decode, four frames per iteration, writing interleaved left/right floats.
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    // Samples are scaled by 1/8388608 (2^-23), which is why 8 is subtracted from the shift below.
+    float factor = 1.0f / 8388608.0f;
+    __m128 factor128 = _mm_set1_ps(1.0f / 8388608.0f);
+
+    int shift = unusedBitsPerSample - 8;
+    if (shift == 0) {
+        for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+            __m128i inputSample0 = _mm_loadu_si128((const __m128i*)pInputSamples0 + i);
+            __m128i inputSample1 = _mm_loadu_si128((const __m128i*)pInputSamples1 + i);
+
+            __m128i mid  = _mm_slli_epi32(inputSample0, pFlac->currentFrame.subframes[0].wastedBitsPerSample);
+            __m128i side = _mm_slli_epi32(inputSample1, pFlac->currentFrame.subframes[1].wastedBitsPerSample);
+            // mid = (mid << 1) | (side & 1): mid is doubled and its lowest bit is taken from side.
+            mid = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
+
+            __m128i tempL = _mm_add_epi32(mid, side);
+            __m128i tempR = _mm_sub_epi32(mid, side);
+
+            // Signed shift right by 1, emulated with a logical shift plus the original sign bit OR-ed back in.
+            tempL = _mm_or_si128(_mm_srli_epi32(tempL, 1), _mm_and_si128(tempL, _mm_set1_epi32(0x80000000)));
+            tempR = _mm_or_si128(_mm_srli_epi32(tempR, 1), _mm_and_si128(tempR, _mm_set1_epi32(0x80000000)));
+
+            __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(tempL), factor128);
+            __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(tempR), factor128);
+
+            pOutputSamples[i*8+0] = ((float*)&leftf)[0];
+            pOutputSamples[i*8+1] = ((float*)&rightf)[0];
+            pOutputSamples[i*8+2] = ((float*)&leftf)[1];
+            pOutputSamples[i*8+3] = ((float*)&rightf)[1];
+            pOutputSamples[i*8+4] = ((float*)&leftf)[2];
+            pOutputSamples[i*8+5] = ((float*)&rightf)[2];
+            pOutputSamples[i*8+6] = ((float*)&leftf)[3];
+            pOutputSamples[i*8+7] = ((float*)&rightf)[3];
+        }
+        // Leftover frames (fewer than 4) are decoded one at a time with plain integer arithmetic.
+        for (drflac_uint64 i = (frameCount4 << 2); i < frameCount; ++i) {
+            int mid  = pInputSamples0[i] << pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+            int side = pInputSamples1[i] << pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+                        
+            mid = (((drflac_uint32)mid) << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = (float)(((mid + side) >> 1) * factor);
+            pOutputSamples[i*2+1] = (float)(((mid - side) >> 1) * factor);
+        }
+    } else {
+        for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+            __m128i inputSample0 = _mm_loadu_si128((const __m128i*)pInputSamples0 + i);
+            __m128i inputSample1 = _mm_loadu_si128((const __m128i*)pInputSamples1 + i);
+
+            __m128i mid  = _mm_slli_epi32(inputSample0, pFlac->currentFrame.subframes[0].wastedBitsPerSample);
+            __m128i side = _mm_slli_epi32(inputSample1, pFlac->currentFrame.subframes[1].wastedBitsPerSample);
+            // mid = (mid << 1) | (side & 1): mid is doubled and its lowest bit is taken from side.
+            mid = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
+            // The logical ">> 1" clears the top bit, but the "<< shift" that follows pushes that bit out again. NOTE(review): relies on shift >= 1 here - confirm.
+            __m128i tempL = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(mid, side), 1), shift);
+            __m128i tempR = _mm_slli_epi32(_mm_srli_epi32(_mm_sub_epi32(mid, side), 1), shift);
+
+            __m128 leftf  = _mm_mul_ps(_mm_cvtepi32_ps(tempL), factor128);
+            __m128 rightf = _mm_mul_ps(_mm_cvtepi32_ps(tempR), factor128);
+
+            pOutputSamples[i*8+0] = ((float*)&leftf)[0];
+            pOutputSamples[i*8+1] = ((float*)&rightf)[0];
+            pOutputSamples[i*8+2] = ((float*)&leftf)[1];
+            pOutputSamples[i*8+3] = ((float*)&rightf)[1];
+            pOutputSamples[i*8+4] = ((float*)&leftf)[2];
+            pOutputSamples[i*8+5] = ((float*)&rightf)[2];
+            pOutputSamples[i*8+6] = ((float*)&leftf)[3];
+            pOutputSamples[i*8+7] = ((float*)&rightf)[3];
+        }
+        // Leftover frames (fewer than 4) are decoded one at a time with plain integer arithmetic.
+        for (drflac_uint64 i = (frameCount4 << 2); i < frameCount; ++i) {
+            int mid  = pInputSamples0[i] << pFlac->currentFrame.subframes[0].wastedBitsPerSample;
+            int side = pInputSamples1[i] << pFlac->currentFrame.subframes[1].wastedBitsPerSample;
+                        
+            mid = (((drflac_uint32)mid) << 1) | (side & 0x01);
+
+            pOutputSamples[i*2+0] = (float)((((mid + side) >> 1) << shift) * factor);
+            pOutputSamples[i*2+1] = (float)((((mid - side) >> 1) << shift) * factor);
+        }
+    }
+}
+#endif
+
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Dispatches mid/side decoding: the SSE2 routine is used when it is compiled in, the drflac__gIsSSE2Supported
+    // flag is set and the stream has at most 24 bits per sample; otherwise the scalar routine is used.
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+        return;
+    }
+#endif
+#if 0
+    drflac_read_pcm_frames_f32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+    drflac_read_pcm_frames_f32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+}
+
+#if 0   // Reference implementation of independent-stereo decoding; not compiled.
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    for (drflac_uint64 iFrame = 0; iFrame < frameCount; iFrame += 1) {  // Input 0 goes to the even output slots, input 1 to the odd ones.
+        pOutputSamples[2*iFrame + 0] = (float)((pInputSamples0[iFrame] << (unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample)) / 2147483648.0);
+        pOutputSamples[2*iFrame + 1] = (float)((pInputSamples1[iFrame] << (unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample)) / 2147483648.0);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Two independent channels -> interleaved floats. Each sample is shifted up to full 32-bit scale and multiplied by 1/2^31.
+    // The shift is done on the unsigned bit pattern because left-shifting a negative signed int is undefined behaviour in C.
+    drflac_uint64 frameCount4 = frameCount >> 2;    // Whole groups of 4 frames for the unrolled loop.
+    float factor = 1 / 2147483648.0;
+    drflac_uint32 shift0 = (drflac_uint32)(unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample);
+    drflac_uint32 shift1 = (drflac_uint32)(unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample);
+
+    for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+        drflac_int32 tempL0 = (drflac_int32)((drflac_uint32)pInputSamples0[i*4+0] << shift0);
+        drflac_int32 tempL1 = (drflac_int32)((drflac_uint32)pInputSamples0[i*4+1] << shift0);
+        drflac_int32 tempL2 = (drflac_int32)((drflac_uint32)pInputSamples0[i*4+2] << shift0);
+        drflac_int32 tempL3 = (drflac_int32)((drflac_uint32)pInputSamples0[i*4+3] << shift0);
+
+        drflac_int32 tempR0 = (drflac_int32)((drflac_uint32)pInputSamples1[i*4+0] << shift1);
+        drflac_int32 tempR1 = (drflac_int32)((drflac_uint32)pInputSamples1[i*4+1] << shift1);
+        drflac_int32 tempR2 = (drflac_int32)((drflac_uint32)pInputSamples1[i*4+2] << shift1);
+        drflac_int32 tempR3 = (drflac_int32)((drflac_uint32)pInputSamples1[i*4+3] << shift1);
+
+        pOutputSamples[i*8+0] = (float)(tempL0 * factor);
+        pOutputSamples[i*8+1] = (float)(tempR0 * factor);
+        pOutputSamples[i*8+2] = (float)(tempL1 * factor);
+        pOutputSamples[i*8+3] = (float)(tempR1 * factor);
+        pOutputSamples[i*8+4] = (float)(tempL2 * factor);
+        pOutputSamples[i*8+5] = (float)(tempR2 * factor);
+        pOutputSamples[i*8+6] = (float)(tempL3 * factor);
+        pOutputSamples[i*8+7] = (float)(tempR3 * factor);
+    }
+
+    for (drflac_uint64 i = (frameCount4 << 2); i < frameCount; ++i) {    // Leftover frames (fewer than 4).
+        pOutputSamples[i*2+0] = (float)((drflac_int32)((drflac_uint32)pInputSamples0[i] << shift0) * factor);
+        pOutputSamples[i*2+1] = (float)((drflac_int32)((drflac_uint32)pInputSamples1[i] << shift1) * factor);
+    }
+}
+
+#if defined(DRFLAC_SUPPORT_SSE2)
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // SSE2 decode of two independent channels into interleaved floats, four frames per iteration.
+    // The "- 8" on the shifts is only valid for streams of 24 bits per sample or fewer, so assert it here
+    // in the same way as the other SSE2 stereo decoders do.
+    drflac_assert(pFlac->bitsPerSample <= 24);
+
+    drflac_uint64 frameCount4 = frameCount >> 2;
+    float factor = 1.0f / 8388608.0f;
+    __m128 factor128 = _mm_set1_ps(1.0f / 8388608.0f);
+
+    int shift0 = (unusedBitsPerSample + pFlac->currentFrame.subframes[0].wastedBitsPerSample) - 8;
+    int shift1 = (unusedBitsPerSample + pFlac->currentFrame.subframes[1].wastedBitsPerSample) - 8;
+
+    for (drflac_uint64 i = 0; i < frameCount4; ++i) {
+        __m128i inputSample0 = _mm_loadu_si128((const __m128i*)pInputSamples0 + i);
+        __m128i inputSample1 = _mm_loadu_si128((const __m128i*)pInputSamples1 + i);
+
+        __m128i i32L = _mm_slli_epi32(inputSample0, shift0);
+        __m128i i32R = _mm_slli_epi32(inputSample1, shift1);
+        __m128 f32L = _mm_mul_ps(_mm_cvtepi32_ps(i32L), factor128);
+        __m128 f32R = _mm_mul_ps(_mm_cvtepi32_ps(i32R), factor128);
+
+        // Interleave into L0 R0 L1 R1 and L2 R2 L3 R3 and write the eight floats with two unaligned stores.
+        _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(f32L, f32R));
+        _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(f32L, f32R));
+    }
+
+    // Leftover frames (fewer than 4). The shift is done on the unsigned bit pattern because left-shifting a
+    // negative signed value is undefined behaviour in C.
+    for (drflac_uint64 i = (frameCount4 << 2); i < frameCount; ++i) {
+        pOutputSamples[i*2+0] = (float)((drflac_int32)((drflac_uint32)pInputSamples0[i] << shift0) * factor);
+        pOutputSamples[i*2+1] = (float)((drflac_int32)((drflac_uint32)pInputSamples1[i] << shift1) * factor);
+    }
+}
+#endif
+
+static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_int32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
+{
+    // Dispatches independent-stereo decoding: the SSE2 routine is used when it is compiled in, the drflac__gIsSSE2Supported
+    // flag is set and the stream has at most 24 bits per sample; otherwise the scalar routine is used.
+#if defined(DRFLAC_SUPPORT_SSE2)
+    if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
+        drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+        return;
+    }
+#endif
+#if 0
+    drflac_read_pcm_frames_f32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#else
+    drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
+#endif
+}
+
+// Reads up to framesToRead PCM frames as interleaved floats into pBufferOut and returns the number of frames read.
+drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut)
+{
+    if (pFlac == NULL || framesToRead == 0) {
+        return 0;
+    }
+
+    // With no output buffer there is nothing to convert, so the request is handled as a forward seek instead.
+    if (pBufferOut == NULL) {
+        return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
+    }
+
+    drflac_uint64 framesRead = 0;
+    while (framesToRead > 0) {
+        // If we've run out of samples in this frame, go to the next.
+        if (pFlac->currentFrame.samplesRemaining == 0) {
+            if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
+                break;  // Couldn't read the next frame, so just break from the loop and return.
+            }
+        } else {
+            unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.header.channelAssignment);
+            drflac_uint64 framesRemainingInPacket = pFlac->currentFrame.samplesRemaining / channelCount;
+            drflac_uint64 iFirstPCMFrame = pFlac->currentFrame.header.blockSize - framesRemainingInPacket;    // Index of the next unread PCM frame within the decoded block.
+            drflac_int32 unusedBitsPerSample = 32 - pFlac->bitsPerSample;
+
+            drflac_uint64 frameCountThisIteration = framesToRead;
+            if (frameCountThisIteration > framesRemainingInPacket) {
+                frameCountThisIteration = framesRemainingInPacket;
+            }
+
+            if (channelCount == 2) {
+                const drflac_int32* pDecodedSamples0 = pFlac->currentFrame.subframes[0].pDecodedSamples + iFirstPCMFrame;
+                const drflac_int32* pDecodedSamples1 = pFlac->currentFrame.subframes[1].pDecodedSamples + iFirstPCMFrame;
+
+                switch (pFlac->currentFrame.header.channelAssignment)
+                {
+                    case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
+                    {
+                        drflac_read_pcm_frames_f32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+                    {
+                        drflac_read_pcm_frames_f32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
+                    {
+                        drflac_read_pcm_frames_f32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+
+                    case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
+                    default:
+                    {
+                        drflac_read_pcm_frames_f32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
+                    } break;
+                }
+            } else {
+                // Generic interleaving. The shift is done on the unsigned bit pattern because left-shifting a negative signed value is undefined in C.
+                for (drflac_uint64 i = 0; i < frameCountThisIteration; ++i) {
+                    for (unsigned int j = 0; j < channelCount; ++j) {
+                        pBufferOut[(i*channelCount)+j] = (float)((drflac_int32)((drflac_uint32)(pFlac->currentFrame.subframes[j].pDecodedSamples[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFrame.subframes[j].wastedBitsPerSample)) / 2147483648.0);
+                    }
+                }
+            }
+
+            drflac_uint64 samplesReadThisIteration = frameCountThisIteration * channelCount;
+            framesRead                += frameCountThisIteration;
+            pBufferOut                += samplesReadThisIteration;
+            framesToRead              -= frameCountThisIteration;
+            pFlac->currentSample      += samplesReadThisIteration;
+            pFlac->currentFrame.samplesRemaining -= (unsigned int)samplesReadThisIteration;
+        }
+    }
+
+    return framesRead;
+}
+
 drflac_bool32 drflac_seek_to_sample(drflac* pFlac, drflac_uint64 sampleIndex)
 {
     if (pFlac == NULL) {
@@ -5765,6 +7381,72 @@ drflac_bool32 drflac_seek_to_sample(drflac* pFlac, drflac_uint64 sampleIndex)
     }
 }
 
+drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
+{
+    if (pFlac == NULL) {
+        return DRFLAC_FALSE;
+    }
+
+    // If we don't know where the first frame begins then we can't seek. This will happen when the STREAMINFO block was not present
+    // when the decoder was opened.
+    if (pFlac->firstFramePos == 0) {
+        return DRFLAC_FALSE;
+    }
+
+    if (pcmFrameIndex == 0) {
+        pFlac->currentSample = 0;
+        return drflac__seek_to_first_frame(pFlac);
+    } else {
+        drflac_bool32 wasSuccessful = DRFLAC_FALSE;
+
+        // Clamp to the last PCM frame, but only when the total is non-zero: with a total of 0 the "- 1" would wrap around to a huge index.
+        if (pFlac->totalPCMFrameCount > 0 && pcmFrameIndex >= pFlac->totalPCMFrameCount) {
+            pcmFrameIndex  = pFlac->totalPCMFrameCount - 1;
+        }
+        drflac_uint64 targetSample = pcmFrameIndex*pFlac->channels;    // Target position in interleaved samples, the unit used by currentSample.
+
+        // If the target sample and the current sample are in the same frame we just move the position. Offsets stay 64-bit so a distant target is never mistaken for a nearby one.
+        if (targetSample > pFlac->currentSample) {    // Forward.
+            drflac_uint64 offset = targetSample - pFlac->currentSample;
+            if (pFlac->currentFrame.samplesRemaining >  offset) {
+                pFlac->currentFrame.samplesRemaining -= (drflac_uint32)offset;    // Safe: offset is smaller than samplesRemaining here.
+                pFlac->currentSample = targetSample;
+                return DRFLAC_TRUE;
+            }
+        } else {
+            // Backward.
+            drflac_uint64 offsetAbs = pFlac->currentSample - targetSample;
+            drflac_uint32 currentFrameSampleCount = pFlac->currentFrame.header.blockSize * drflac__get_channel_count_from_channel_assignment(pFlac->currentFrame.header.channelAssignment);
+            drflac_uint32 currentFrameSamplesConsumed = (drflac_uint32)(currentFrameSampleCount - pFlac->currentFrame.samplesRemaining);
+            if (currentFrameSamplesConsumed > offsetAbs) {
+                pFlac->currentFrame.samplesRemaining += (drflac_uint32)offsetAbs;    // Safe: offsetAbs is smaller than a 32-bit value here.
+                pFlac->currentSample = targetSample;
+                return DRFLAC_TRUE;
+            }
+        }
+
+        // Different techniques depending on encapsulation. Using the native FLAC seektable with Ogg encapsulation is a bit awkward so
+        // we'll instead use Ogg's natural seeking facility.
+#ifndef DR_FLAC_NO_OGG
+        if (pFlac->container == drflac_container_ogg)
+        {
+            wasSuccessful = drflac_ogg__seek_to_sample(pFlac, targetSample);
+        }
+        else
+#endif
+        {
+            // First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower.
+            wasSuccessful = drflac__seek_to_sample__seek_table(pFlac, targetSample);
+            if (!wasSuccessful) {
+                wasSuccessful = drflac__seek_to_sample__brute_force(pFlac, targetSample);
+            }
+        }
+
+        pFlac->currentSample = targetSample;
+        return wasSuccessful;
+    }
+}
+
 
 
 //// High Level APIs ////
@@ -5781,15 +7463,15 @@ drflac_bool32 drflac_seek_to_sample(drflac* pFlac, drflac_uint64 sampleIndex)
 
 
 // Using a macro as the definition of the drflac__full_decode_and_close_*() API family. Sue me.
-#define DRFLAC_DEFINE_FULL_DECODE_AND_CLOSE(extension, type) \
-static type* drflac__full_decode_and_close_ ## extension (drflac* pFlac, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)\
+#define DRFLAC_DEFINE_FULL_READ_AND_CLOSE(extension, type) \
+static type* drflac__full_read_and_close_ ## extension (drflac* pFlac, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)\
 {                                                                                                                                                                   \
     drflac_assert(pFlac != NULL);                                                                                                                                   \
                                                                                                                                                                     \
     type* pSampleData = NULL;                                                                                                                                       \
-    drflac_uint64 totalSampleCount = pFlac->totalSampleCount;                                                                                                       \
+    drflac_uint64 totalPCMFrameCount = pFlac->totalPCMFrameCount;                                                                                                   \
                                                                                                                                                                     \
-    if (totalSampleCount == 0) {                                                                                                                                    \
+    if (totalPCMFrameCount == 0) {                                                                                                                                  \
         type buffer[4096];                                                                                                                                          \
                                                                                                                                                                     \
         size_t sampleDataBufferSize = sizeof(buffer);                                                                                                               \
@@ -5798,9 +7480,9 @@ static type* drflac__full_decode_and_close_ ## extension (drflac* pFlac, unsigne
             goto on_error;                                                                                                                                          \
         }                                                                                                                                                           \
                                                                                                                                                                     \
-        drflac_uint64 samplesRead;                                                                                                                                  \
-        while ((samplesRead = (drflac_uint64)drflac_read_##extension(pFlac, sizeof(buffer)/sizeof(buffer[0]), buffer)) > 0) {                                       \
-            if (((totalSampleCount + samplesRead) * sizeof(type)) > sampleDataBufferSize) {                                                                         \
+        drflac_uint64 pcmFramesRead;                                                                                                                                \
+        while ((pcmFramesRead = (drflac_uint64)drflac_read_pcm_frames_##extension(pFlac, sizeof(buffer)/sizeof(buffer[0])/pFlac->channels, buffer)) > 0) {          \
+            if (((totalPCMFrameCount + pcmFramesRead) * pFlac->channels * sizeof(type)) > sampleDataBufferSize) {                                                   \
                 sampleDataBufferSize *= 2;                                                                                                                          \
                 type* pNewSampleData = (type*)DRFLAC_REALLOC(pSampleData, sampleDataBufferSize);                                                                    \
                 if (pNewSampleData == NULL) {                                                                                                                       \
@@ -5811,15 +7493,15 @@ static type* drflac__full_decode_and_close_ ## extension (drflac* pFlac, unsigne
                 pSampleData = pNewSampleData;                                                                                                                       \
             }                                                                                                                                                       \
                                                                                                                                                                     \
-            drflac_copy_memory(pSampleData + totalSampleCount, buffer, (size_t)(samplesRead*sizeof(type)));                                                         \
-            totalSampleCount += samplesRead;                                                                                                                        \
+            drflac_copy_memory(pSampleData + (totalPCMFrameCount*pFlac->channels), buffer, (size_t)(pcmFramesRead*pFlac->channels*sizeof(type)));                   \
+            totalPCMFrameCount += pcmFramesRead;                                                                                                                    \
         }                                                                                                                                                           \
                                                                                                                                                                     \
         /* At this point everything should be decoded, but we just want to fill the unused part buffer with silence - need to                                       \
            protect those ears from random noise! */                                                                                                                 \
-        drflac_zero_memory(pSampleData + totalSampleCount, (size_t)(sampleDataBufferSize - totalSampleCount*sizeof(type)));                                         \
+        drflac_zero_memory(pSampleData + (totalPCMFrameCount*pFlac->channels), (size_t)(sampleDataBufferSize - totalPCMFrameCount*pFlac->channels*sizeof(type)));   \
     } else {                                                                                                                                                        \
-        drflac_uint64 dataSize = totalSampleCount * sizeof(type);                                                                                                   \
+        drflac_uint64 dataSize = totalPCMFrameCount*pFlac->channels*sizeof(type);                                                                                   \
         if (dataSize > DRFLAC_SIZE_MAX) {                                                                                                                           \
             goto on_error;  /* The decoded data is too big. */                                                                                                      \
         }                                                                                                                                                           \
@@ -5829,12 +7511,12 @@ static type* drflac__full_decode_and_close_ ## extension (drflac* pFlac, unsigne
             goto on_error;                                                                                                                                          \
         }                                                                                                                                                           \
                                                                                                                                                                     \
-        totalSampleCount = drflac_read_##extension(pFlac, pFlac->totalSampleCount, pSampleData);                                                                    \
+        totalPCMFrameCount = drflac_read_pcm_frames_##extension(pFlac, pFlac->totalPCMFrameCount, pSampleData);                                                     \
     }                                                                                                                                                               \
                                                                                                                                                                     \
     if (sampleRateOut) *sampleRateOut = pFlac->sampleRate;                                                                                                          \
     if (channelsOut) *channelsOut = pFlac->channels;                                                                                                                \
-    if (totalSampleCountOut) *totalSampleCountOut = totalSampleCount;                                                                                               \
+    if (totalPCMFrameCountOut) *totalPCMFrameCountOut = totalPCMFrameCount;                                                                                         \
                                                                                                                                                                     \
     drflac_close(pFlac);                                                                                                                                            \
     return pSampleData;                                                                                                                                             \
@@ -5844,141 +7526,356 @@ on_error:
     return NULL;                                                                                                                                                    \
 }
 
-DRFLAC_DEFINE_FULL_DECODE_AND_CLOSE(s32, drflac_int32)
-DRFLAC_DEFINE_FULL_DECODE_AND_CLOSE(s16, drflac_int16)
-DRFLAC_DEFINE_FULL_DECODE_AND_CLOSE(f32, float)
+DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s32, drflac_int32)
+DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s16, drflac_int16)
+DRFLAC_DEFINE_FULL_READ_AND_CLOSE(f32, float)
 
-drflac_int32* drflac_open_and_decode_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)
 {
     // Safety.
-    if (sampleRate) *sampleRate = 0;
-    if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalPCMFrameCountOut) *totalPCMFrameCountOut = 0;
 
     drflac* pFlac = drflac_open(onRead, onSeek, pUserData);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_s32(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_s32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
 }
 
-drflac_int16* drflac_open_and_decode_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int32* drflac_open_and_decode_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
 {
     // Safety.
-    if (sampleRate) *sampleRate = 0;
-    if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
 
-    drflac* pFlac = drflac_open(onRead, onSeek, pUserData);
-    if (pFlac == NULL) {
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    drflac_int32* pResult = drflac_open_and_read_pcm_frames_s32(onRead, onSeek, pUserData, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_s16(pFlac, channels, sampleRate, totalSampleCount);
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
 }
 
-float* drflac_open_and_decode_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+
+
+drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)
 {
     // Safety.
-    if (sampleRate) *sampleRate = 0;
-    if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalPCMFrameCountOut) *totalPCMFrameCountOut = 0;
 
     drflac* pFlac = drflac_open(onRead, onSeek, pUserData);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_f32(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_s16(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
+}
+
+drflac_int16* drflac_open_and_decode_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    drflac_int16* pResult = drflac_open_and_read_pcm_frames_s16(onRead, onSeek, pUserData, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
+}
+
+
+float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalPCMFrameCountOut) *totalPCMFrameCountOut = 0;
+
+    drflac* pFlac = drflac_open(onRead, onSeek, pUserData);
+    if (pFlac == NULL) {
+        return NULL;
+    }
+
+    return drflac__full_read_and_close_f32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
+}
+
+float* drflac_open_and_decode_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    float* pResult = drflac_open_and_read_pcm_frames_f32(onRead, onSeek, pUserData, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
 }
 
 #ifndef DR_FLAC_NO_STDIO
-drflac_int32* drflac_open_and_decode_file_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int32* drflac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (totalPCMFrameCount) *totalPCMFrameCount = 0;
 
     drflac* pFlac = drflac_open_file(filename);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_s32(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
 }
 
-drflac_int16* drflac_open_and_decode_file_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int32* drflac_open_and_decode_file_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
 {
-    if (sampleRate) *sampleRate = 0;
-    if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
 
-    drflac* pFlac = drflac_open_file(filename);
-    if (pFlac == NULL) {
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    drflac_int32* pResult = drflac_open_file_and_read_pcm_frames_s32(filename, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_s16(pFlac, channels, sampleRate, totalSampleCount);
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
 }
 
-float* drflac_open_and_decode_file_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+
+drflac_int16* drflac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (totalPCMFrameCount) *totalPCMFrameCount = 0;
 
     drflac* pFlac = drflac_open_file(filename);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_f32(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
+}
+
+drflac_int16* drflac_open_and_decode_file_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    drflac_int16* pResult = drflac_open_file_and_read_pcm_frames_s16(filename, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
+}
+
+
+float* drflac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount)
+{
+    if (sampleRate) *sampleRate = 0;
+    if (channels) *channels = 0;
+    if (totalPCMFrameCount) *totalPCMFrameCount = 0;
+
+    drflac* pFlac = drflac_open_file(filename);
+    if (pFlac == NULL) {
+        return NULL;
+    }
+
+    return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
+}
+
+float* drflac_open_and_decode_file_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    float* pResult = drflac_open_file_and_read_pcm_frames_f32(filename, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
 }
 #endif
 
-drflac_int32* drflac_open_and_decode_memory_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (totalPCMFrameCount) *totalPCMFrameCount = 0;
 
     drflac* pFlac = drflac_open_memory(data, dataSize);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_s32(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
 }
 
-drflac_int16* drflac_open_and_decode_memory_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int32* drflac_open_and_decode_memory_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    drflac_int32* pResult = drflac_open_memory_and_read_pcm_frames_s32(data, dataSize, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
+}
+
+
+drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (totalPCMFrameCount) *totalPCMFrameCount = 0;
 
     drflac* pFlac = drflac_open_memory(data, dataSize);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_s16(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
 }
 
-float* drflac_open_and_decode_memory_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalSampleCount)
+drflac_int16* drflac_open_and_decode_memory_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    drflac_int16* pResult = drflac_open_memory_and_read_pcm_frames_s16(data, dataSize, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
+}
+
+
+float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
-    if (totalSampleCount) *totalSampleCount = 0;
+    if (totalPCMFrameCount) *totalPCMFrameCount = 0;
 
     drflac* pFlac = drflac_open_memory(data, dataSize);
     if (pFlac == NULL) {
         return NULL;
     }
 
-    return drflac__full_decode_and_close_f32(pFlac, channels, sampleRate, totalSampleCount);
+    return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
 }
 
+float* drflac_open_and_decode_memory_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalSampleCountOut)
+{
+    // Safety.
+    if (channelsOut) *channelsOut = 0;
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalSampleCountOut) *totalSampleCountOut = 0;
+
+    unsigned int channels;
+    unsigned int sampleRate;
+    drflac_uint64 totalPCMFrameCount;
+
+    float* pResult = drflac_open_memory_and_read_pcm_frames_f32(data, dataSize, &channels, &sampleRate, &totalPCMFrameCount);
+    if (pResult == NULL) {
+        return NULL;
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalSampleCountOut) *totalSampleCountOut = totalPCMFrameCount * channels;
+
+    return pResult;
+}
+
+
 void drflac_free(void* pSampleDataReturnedByOpenAndDecode)
 {
     DRFLAC_FREE(pSampleDataReturnedByOpenAndDecode);
@@ -6061,6 +7958,17 @@ drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter,
 
 // REVISION HISTORY
 //
+// v0.11.0 - 2018-12-xx
+//   - API CHANGE: Deprecated drflac_read_s32(), drflac_read_s16() and drflac_read_f32() and replaced them with 
+//     drflac_read_pcm_frames_s32(), drflac_read_pcm_frames_s16() and drflac_read_pcm_frames_f32(). The new APIs take
+//     and return PCM frame counts instead of sample counts. To upgrade you will need to change the input count by
+//     dividing it by the channel count, and then do the same with the return value.
+//   - API CHANGE: Deprecated drflac_seek_to_sample() and replaced with drflac_seek_to_pcm_frame(). Same rules as
+//     the changes to drflac_read_*() apply.
+//   - API CHANGE: Deprecated drflac_open_and_decode_*() and replaced with drflac_open_*_and_read_*(). Same rules as
+//     the changes to drflac_read_*() apply.
+//   - Optimizations.
+//
 // v0.10.0 - 2018-09-11
 //   - Remove the DR_FLAC_NO_WIN32_IO option and the Win32 file IO functionality. If you need to use Win32 file IO you
 //     need to do it yourself via the callback API.
diff --git a/extras/dr_mp3.h b/extras/dr_mp3.h
index cd8920a1..e951cbdb 100644
--- a/extras/dr_mp3.h
+++ b/extras/dr_mp3.h
@@ -1,5 +1,5 @@
 // MP3 audio decoder. Public domain. See "unlicense" statement at the end of this file.
-// dr_mp3 - v0.3.2 - 2018-09-11
+// dr_mp3 - v0.4.0 - 2018-xx-xx
 //
 // David Reid - mackron@gmail.com
 //
@@ -22,7 +22,7 @@
 //
 //     ...
 //
-//     drmp3_uint64 framesRead = drmp3_read_f32(pMP3, framesToRead, pFrames);
+//     drmp3_uint64 framesRead = drmp3_read_pcm_frames_f32(pMP3, framesToRead, pFrames);
 //
 // The drmp3 object is transparent so you can get access to the channel count and sample rate like so:
 //
@@ -35,12 +35,12 @@
 // The example above initializes a decoder from a file, but you can also initialize it from a block of memory and read and seek
 // callbacks with drmp3_init_memory() and drmp3_init() respectively.
 //
-// You do need to do any annoying memory management when reading PCM frames - this is all managed internally. You can request
-// any number of PCM frames in each call to drmp3_read_f32() and it will return as many PCM frames as it can, up to the requested
-// amount.
+// You do not need to do any annoying memory management when reading PCM frames - this is all managed internally. You can request
+// any number of PCM frames in each call to drmp3_read_pcm_frames_f32() and it will return as many PCM frames as it can, up to the
+// requested amount.
 //
-// You can also decode an entire file in one go with drmp3_open_and_decode_f32(), drmp3_open_and_decode_memory_f32() and
-// drmp3_open_and_decode_file_f32().
+// You can also decode an entire file in one go with drmp3_open_and_read_f32(), drmp3_open_memory_and_read_f32() and
+// drmp3_open_file_and_read_f32().
 //
 //
 // OPTIONS
@@ -52,11 +52,6 @@
 //
 // #define DR_MP3_NO_SIMD
 //   Disable SIMD optimizations.
-//
-//
-// LIMITATIONS
-// ===========
-// - Seeking is extremely inefficient.
 
 #ifndef dr_mp3_h
 #define dr_mp3_h
@@ -92,7 +87,8 @@ typedef drmp3_uint32     drmp3_bool32;
 #define DRMP3_TRUE       1
 #define DRMP3_FALSE      0
 
-#define DRMP3_MAX_SAMPLES_PER_FRAME (1152*2)
+#define DRMP3_MAX_PCM_FRAMES_PER_MP3_FRAME  1152
+#define DRMP3_MAX_SAMPLES_PER_FRAME         (DRMP3_MAX_PCM_FRAMES_PER_MP3_FRAME*2)
 
 
 // Low Level Push API
@@ -162,7 +158,7 @@ struct drmp3_src
     {
         struct
         {
-            float alpha;
+            double alpha;
             drmp3_bool32 isPrevFramesLoaded : 1;
             drmp3_bool32 isNextFramesLoaded : 1;
         } linear;
@@ -175,6 +171,14 @@ typedef enum
     drmp3_seek_origin_current
 } drmp3_seek_origin;
 
+typedef struct
+{
+    drmp3_uint64 seekPosInBytes;        // Points to the first byte of an MP3 frame.
+    drmp3_uint64 pcmFrameIndex;         // The index of the PCM frame this seek point targets.
+    drmp3_uint16 mp3FramesToDiscard;    // The number of whole MP3 frames to be discarded before pcmFramesToDiscard.
+    drmp3_uint16 pcmFramesToDiscard;    // The number of leading samples to read and discard. These are discarded after mp3FramesToDiscard.
+} drmp3_seek_point;
+
 // Callback for when data is read. Return value is the number of bytes actually read.
 //
 // pUserData   [in]  The user data that was passed to drmp3_init(), drmp3_open() and family.
@@ -214,12 +218,16 @@ typedef struct
     drmp3_read_proc onRead;
     drmp3_seek_proc onSeek;
     void* pUserData;
-    drmp3_uint32 frameChannels;     // The number of channels in the currently loaded MP3 frame. Internal use only.
-    drmp3_uint32 frameSampleRate;   // The sample rate of the currently loaded MP3 frame. Internal use only.
-    drmp3_uint32 framesConsumed;
-    drmp3_uint32 framesRemaining;
-    drmp3_uint8 frames[sizeof(float)*DRMP3_MAX_SAMPLES_PER_FRAME];  // <-- Multipled by sizeof(float) to ensure there's enough room for DR_MP3_FLOAT_OUTPUT.
+    drmp3_uint32 mp3FrameChannels;      // The number of channels in the currently loaded MP3 frame. Internal use only.
+    drmp3_uint32 mp3FrameSampleRate;    // The sample rate of the currently loaded MP3 frame. Internal use only.
+    drmp3_uint32 pcmFramesConsumedInMP3Frame;
+    drmp3_uint32 pcmFramesRemainingInMP3Frame;
+    drmp3_uint8 pcmFrames[sizeof(float)*DRMP3_MAX_SAMPLES_PER_FRAME];  // <-- Multiplied by sizeof(float) to ensure there's enough room for DR_MP3_FLOAT_OUTPUT.
+    drmp3_uint64 currentPCMFrame;       // The current PCM frame, globally, based on the output sample rate. Mainly used for seeking.
+    drmp3_uint64 streamCursor;          // The current byte the decoder is sitting on in the raw stream.
     drmp3_src src;
+    drmp3_seek_point* pSeekPoints;      // NULL by default. Set with drmp3_bind_seek_table(). Memory is owned by the client. dr_mp3 will never attempt to free this pointer.
+    drmp3_uint32 seekPointCount;        // The number of items in pSeekPoints. When set to 0, no seek table is assumed. Defaults to zero.
     size_t dataSize;
     size_t dataCapacity;
     drmp3_uint8* pData;
@@ -268,12 +276,38 @@ void drmp3_uninit(drmp3* pMP3);
 // Reads PCM frames as interleaved 32-bit IEEE floating point PCM.
 //
 // Note that framesToRead specifies the number of PCM frames to read, _not_ the number of MP3 frames.
-drmp3_uint64 drmp3_read_f32(drmp3* pMP3, drmp3_uint64 framesToRead, float* pBufferOut);
+drmp3_uint64 drmp3_read_pcm_frames_f32(drmp3* pMP3, drmp3_uint64 framesToRead, float* pBufferOut);
 
 // Seeks to a specific frame.
 //
 // Note that this is _not_ an MP3 frame, but rather a PCM frame.
-drmp3_bool32 drmp3_seek_to_frame(drmp3* pMP3, drmp3_uint64 frameIndex);
+drmp3_bool32 drmp3_seek_to_pcm_frame(drmp3* pMP3, drmp3_uint64 frameIndex);
+
+// Calculates the total number of PCM frames in the MP3 stream. Cannot be used for infinite streams such as internet
+// radio. Runs in linear time. Returns 0 on error.
+drmp3_uint64 drmp3_get_pcm_frame_count(drmp3* pMP3);
+
+// Calculates the total number of MP3 frames in the MP3 stream. Cannot be used for infinite streams such as internet
+// radio. Runs in linear time. Returns 0 on error.
+drmp3_uint64 drmp3_get_mp3_frame_count(drmp3* pMP3);
+
+// Calculates the seekpoints based on PCM frames. This is slow.
+//
+// pSeekPointCount is a pointer to a uint32 containing the seekpoint count. On input it contains the desired count.
+// On output it contains the actual count. The reason for this design is that the client may request too many
+// seekpoints, in which case dr_mp3 will return a corrected count.
+//
+// Note that seektable seeking is not quite sample exact when the MP3 stream contains inconsistent sample rates.
+drmp3_bool32 drmp3_calculate_seek_points(drmp3* pMP3, drmp3_uint32* pSeekPointCount, drmp3_seek_point* pSeekPoints);
+
+// Binds a seek table to the decoder.
+//
+// This does _not_ make a copy of pSeekPoints - it only references it. It is up to the application to ensure this
+// remains valid while it is bound to the decoder.
+//
+// Use drmp3_calculate_seek_points() to calculate the seek points.
+drmp3_bool32 drmp3_bind_seek_table(drmp3* pMP3, drmp3_uint32 seekPointCount, drmp3_seek_point* pSeekPoints);
+
 
 
 // Opens an decodes an entire MP3 stream as a single operation.
@@ -281,10 +315,10 @@ drmp3_bool32 drmp3_seek_to_frame(drmp3* pMP3, drmp3_uint64 frameIndex);
 // pConfig is both an input and output. On input it contains what you want. On output it contains what you got.
 //
 // Free the returned pointer with drmp3_free().
-float* drmp3_open_and_decode_f32(drmp3_read_proc onRead, drmp3_seek_proc onSeek, void* pUserData, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount);
-float* drmp3_open_and_decode_memory_f32(const void* pData, size_t dataSize, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount);
+float* drmp3_open_and_read_f32(drmp3_read_proc onRead, drmp3_seek_proc onSeek, void* pUserData, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount);
+float* drmp3_open_memory_and_read_f32(const void* pData, size_t dataSize, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount);
 #ifndef DR_MP3_NO_STDIO
-float* drmp3_open_and_decode_file_f32(const char* filePath, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount);
+float* drmp3_open_file_and_read_f32(const char* filePath, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount);
 #endif
 
 // Frees any memory that was allocated by a public drmp3 API.
@@ -1958,11 +1992,6 @@ int drmp3dec_decode_frame(drmp3dec *dec, const unsigned char *mp3, int mp3_bytes
     info->layer = 4 - DRMP3_HDR_GET_LAYER(hdr);
     info->bitrate_kbps = drmp3_hdr_bitrate_kbps(hdr);
 
-    if (!pcm)
-    {
-        return drmp3_hdr_frame_samples(hdr);
-    }
-
     drmp3_bs_init(bs_frame, hdr + DRMP3_HDR_SIZE, frame_size - DRMP3_HDR_SIZE);
     if (DRMP3_HDR_IS_CRC(hdr))
     {
@@ -1978,7 +2007,7 @@ int drmp3dec_decode_frame(drmp3dec *dec, const unsigned char *mp3, int mp3_bytes
             return 0;
         }
         success = drmp3_L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin);
-        if (success)
+        if (success && pcm != NULL)
         {
             for (igr = 0; igr < (DRMP3_HDR_TEST_MPEG1(hdr) ? 2 : 1); igr++, pcm = DRMP3_OFFSET_PTR(pcm, sizeof(drmp3d_sample_t)*576*info->channels))
             {
@@ -1993,6 +2022,10 @@ int drmp3dec_decode_frame(drmp3dec *dec, const unsigned char *mp3, int mp3_bytes
 #ifdef DR_MP3_ONLY_MP3
         return 0;
 #else
+        if (pcm == NULL) {
+            return drmp3_hdr_frame_samples(hdr);
+        }
+
         drmp3_L12_scale_info sci[1];
         drmp3_L12_read_scale_info(hdr, bs_frame, sci);
 
@@ -2015,6 +2048,7 @@ int drmp3dec_decode_frame(drmp3dec *dec, const unsigned char *mp3, int mp3_bytes
         }
 #endif
     }
+
     return success*drmp3_hdr_frame_samples(dec->header);
 }
 
@@ -2097,10 +2131,13 @@ void drmp3dec_f32_to_s16(const float *in, drmp3_int16 *out, int num_samples)
 
 // Options.
 #ifndef DR_MP3_DEFAULT_CHANNELS
-#define DR_MP3_DEFAULT_CHANNELS      2
+#define DR_MP3_DEFAULT_CHANNELS         2
 #endif
 #ifndef DR_MP3_DEFAULT_SAMPLE_RATE
-#define DR_MP3_DEFAULT_SAMPLE_RATE   44100
+#define DR_MP3_DEFAULT_SAMPLE_RATE      44100
+#endif
+#ifndef DRMP3_SEEK_LEADING_MP3_FRAMES
+#define DRMP3_SEEK_LEADING_MP3_FRAMES   2
 #endif
 
 
@@ -2323,7 +2360,7 @@ drmp3_uint64 drmp3_src_read_frames_linear(drmp3_src* pSRC, drmp3_uint64 frameCou
         pSRC->algo.linear.isNextFramesLoaded = DRMP3_TRUE;
     }
 
-    float factor = (float)pSRC->config.sampleRateIn / pSRC->config.sampleRateOut;
+    double factor = (double)pSRC->config.sampleRateIn / pSRC->config.sampleRateOut;
 
     drmp3_uint64 totalFramesRead = 0;
     while (frameCount > 0) {
@@ -2331,7 +2368,7 @@ drmp3_uint64 drmp3_src_read_frames_linear(drmp3_src* pSRC, drmp3_uint64 frameCou
         float* pPrevFrame = pSRC->bin;
         float* pNextFrame = pSRC->bin + pSRC->config.channels;
 
-        drmp3_blend_f32((float*)pFramesOut, pPrevFrame, pNextFrame, pSRC->algo.linear.alpha, pSRC->config.channels);
+        drmp3_blend_f32((float*)pFramesOut, pPrevFrame, pNextFrame, (float)pSRC->algo.linear.alpha, pSRC->config.channels);
 
         pSRC->algo.linear.alpha += factor;
 
@@ -2376,35 +2413,91 @@ drmp3_uint64 drmp3_src_read_frames_linear(drmp3_src* pSRC, drmp3_uint64 frameCou
 }
 
 
+static size_t drmp3__on_read(drmp3* pMP3, void* pBufferOut, size_t bytesToRead)   // Wraps the onRead callback so that streamCursor tracks how many bytes have been pulled from the stream.
+{
+    size_t bytesRead = pMP3->onRead(pMP3->pUserData, pBufferOut, bytesToRead);
+    pMP3->streamCursor += bytesRead;   // Advance by the count the callback reported, not by bytesToRead.
+    return bytesRead;
+}
 
-static drmp3_bool32 drmp3_decode_next_frame(drmp3* pMP3)
+static drmp3_bool32 drmp3__on_seek(drmp3* pMP3, int offset, drmp3_seek_origin origin)   // Wraps the onSeek callback and mirrors a successful seek into streamCursor.
+{
+    drmp3_assert(offset >= 0);   // Only non-negative offsets are expected, for either origin.
+
+    if (!pMP3->onSeek(pMP3->pUserData, offset, origin)) {
+        return DRMP3_FALSE;   // Callback reported failure; streamCursor is left unchanged.
+    }
+
+    if (origin == drmp3_seek_origin_start) {
+        pMP3->streamCursor = (drmp3_uint64)offset;   // Absolute seek: the cursor becomes the offset.
+    } else {
+        pMP3->streamCursor += offset;   // Any other origin is treated as relative to the current cursor.
+    }
+
+    return DRMP3_TRUE;
+}
+
+static drmp3_bool32 drmp3__on_seek_64(drmp3* pMP3, drmp3_uint64 offset, drmp3_seek_origin origin)   // 64-bit seek built from one or more int-sized drmp3__on_seek() calls.
+{
+    if (offset <= 0x7FFFFFFF) {
+        return drmp3__on_seek(pMP3, (int)offset, origin);   // Fits in a single call; origin is passed through as given.
+    }
+
+
+    // Getting here "offset" is too large for a 32-bit integer. We just keep seeking forward until we hit the offset.
+    if (!drmp3__on_seek(pMP3, 0x7FFFFFFF, drmp3_seek_origin_start)) {   // NOTE(review): this path always starts from drmp3_seek_origin_start and ignores "origin" - confirm callers only pass start.
+        return DRMP3_FALSE;
+    }
+
+    offset -= 0x7FFFFFFF;
+    while (offset > 0) {
+        if (offset <= 0x7FFFFFFF) {
+            if (!drmp3__on_seek(pMP3, (int)offset, drmp3_seek_origin_current)) {   // Final hop: the remainder now fits in an int.
+                return DRMP3_FALSE;
+            }
+            offset = 0;
+        } else {
+            if (!drmp3__on_seek(pMP3, 0x7FFFFFFF, drmp3_seek_origin_current)) {   // Intermediate hop of the largest positive int.
+                return DRMP3_FALSE;
+            }
+            offset -= 0x7FFFFFFF;
+        }
+    }
+
+    return DRMP3_TRUE;
+}
+
+
+
+
+static drmp3_uint32 drmp3_decode_next_frame_ex(drmp3* pMP3, drmp3d_sample_t* pPCMFrames, drmp3_bool32 discard)
 {
     drmp3_assert(pMP3 != NULL);
     drmp3_assert(pMP3->onRead != NULL);
 
     if (pMP3->atEnd) {
-        return DRMP3_FALSE;
+        return 0;
     }
 
-    do
-    {
+    drmp3_uint32 pcmFramesRead = 0;
+    do {
         // minimp3 recommends doing data submission in 16K chunks. If we don't have at least 16K bytes available, get more.
         if (pMP3->dataSize < DRMP3_DATA_CHUNK_SIZE) {
             if (pMP3->dataCapacity < DRMP3_DATA_CHUNK_SIZE) {
                 pMP3->dataCapacity = DRMP3_DATA_CHUNK_SIZE;
                 drmp3_uint8* pNewData = (drmp3_uint8*)drmp3_realloc(pMP3->pData, pMP3->dataCapacity);
                 if (pNewData == NULL) {
-                    return DRMP3_FALSE; // Out of memory.
+                    return 0; // Out of memory.
                 }
 
                 pMP3->pData = pNewData;
             }
 
-            size_t bytesRead = pMP3->onRead(pMP3->pUserData, pMP3->pData + pMP3->dataSize, (pMP3->dataCapacity - pMP3->dataSize));
+            size_t bytesRead = drmp3__on_read(pMP3, pMP3->pData + pMP3->dataSize, (pMP3->dataCapacity - pMP3->dataSize));
             if (bytesRead == 0) {
                 if (pMP3->dataSize == 0) {
                     pMP3->atEnd = DRMP3_TRUE;
-                    return DRMP3_FALSE; // No data.
+                    return 0; // No data.
                 }
             }
 
@@ -2413,51 +2506,81 @@ static drmp3_bool32 drmp3_decode_next_frame(drmp3* pMP3)
 
         if (pMP3->dataSize > INT_MAX) {
             pMP3->atEnd = DRMP3_TRUE;
-            return DRMP3_FALSE; // File too big.
+            return 0; // File too big.
         }
 
         drmp3dec_frame_info info;
-        drmp3_uint32 samplesRead = drmp3dec_decode_frame(&pMP3->decoder, pMP3->pData, (int)pMP3->dataSize, (drmp3d_sample_t*)pMP3->frames, &info);    // <-- Safe size_t -> int conversion thanks to the check above.
-        if (samplesRead != 0) {
-            size_t leftoverDataSize = (pMP3->dataSize - (size_t)info.frame_bytes);
-            for (size_t i = 0; i < leftoverDataSize; ++i) {
-                pMP3->pData[i] = pMP3->pData[i + (size_t)info.frame_bytes];
-            }
-                
+        pcmFramesRead = drmp3dec_decode_frame(&pMP3->decoder, pMP3->pData, (int)pMP3->dataSize, pPCMFrames, &info);    // <-- Safe size_t -> int conversion thanks to the check above.
+        
+        // Consume the data.
+        size_t leftoverDataSize = (pMP3->dataSize - (size_t)info.frame_bytes);
+        if (info.frame_bytes > 0) {
+            memmove(pMP3->pData, pMP3->pData + info.frame_bytes, leftoverDataSize);
             pMP3->dataSize = leftoverDataSize;
-            pMP3->framesConsumed = 0;
-            pMP3->framesRemaining = samplesRead;
-            pMP3->frameChannels = info.channels;
-            pMP3->frameSampleRate = info.hz;
-            drmp3_src_set_input_sample_rate(&pMP3->src, pMP3->frameSampleRate);
+        }
+
+        // pcmFramesRead will be equal to 0 if decoding failed. If it is zero and info.frame_bytes > 0 then we have successfully
+        // decoded the frame. A special case is if we are wanting to discard the frame, in which case we return successfully.
+        if (pcmFramesRead > 0 || (info.frame_bytes > 0 && discard)) {
+            pcmFramesRead = drmp3_hdr_frame_samples(pMP3->decoder.header);
+            pMP3->pcmFramesConsumedInMP3Frame = 0;
+            pMP3->pcmFramesRemainingInMP3Frame = pcmFramesRead;
+            pMP3->mp3FrameChannels = info.channels;
+            pMP3->mp3FrameSampleRate = info.hz;
+            drmp3_src_set_input_sample_rate(&pMP3->src, pMP3->mp3FrameSampleRate);
             break;
-        } else {
+        } else if (info.frame_bytes == 0) {
             // Need more data. minimp3 recommends doing data submission in 16K chunks.
             if (pMP3->dataCapacity == pMP3->dataSize) {
                 // No room. Expand.
                 pMP3->dataCapacity += DRMP3_DATA_CHUNK_SIZE;
                 drmp3_uint8* pNewData = (drmp3_uint8*)drmp3_realloc(pMP3->pData, pMP3->dataCapacity);
                 if (pNewData == NULL) {
-                    return DRMP3_FALSE; // Out of memory.
+                    return 0; // Out of memory.
                 }
 
                 pMP3->pData = pNewData;
             }
 
             // Fill in a chunk.
-            size_t bytesRead = pMP3->onRead(pMP3->pUserData, pMP3->pData + pMP3->dataSize, (pMP3->dataCapacity - pMP3->dataSize));
+            size_t bytesRead = drmp3__on_read(pMP3, pMP3->pData + pMP3->dataSize, (pMP3->dataCapacity - pMP3->dataSize));
             if (bytesRead == 0) {
                 pMP3->atEnd = DRMP3_TRUE;
-                return DRMP3_FALSE; // Error reading more data.
+                return 0; // Error reading more data.
             }
 
             pMP3->dataSize += bytesRead;
         }
     } while (DRMP3_TRUE);
 
-    return DRMP3_TRUE;
+    return pcmFramesRead;
 }
 
+static drmp3_uint32 drmp3_decode_next_frame(drmp3* pMP3)   // Decodes the next MP3 frame into pMP3->pcmFrames with discard disabled; returns whatever drmp3_decode_next_frame_ex() returns.
+{
+    drmp3_assert(pMP3 != NULL);
+    return drmp3_decode_next_frame_ex(pMP3, (drmp3d_sample_t*)pMP3->pcmFrames, DRMP3_FALSE);
+}
+
+#if 0
+static drmp3_uint32 drmp3_seek_next_frame(drmp3* pMP3)
+{
+    drmp3_assert(pMP3 != NULL);
+
+    drmp3_uint32 pcmFrameCount = drmp3_decode_next_frame_ex(pMP3, NULL);
+    if (pcmFrameCount == 0) {
+        return 0;
+    }
+
+    // We have essentially just skipped past the frame, so just set the remaining samples to 0.
+    pMP3->currentPCMFrame             += pcmFrameCount;
+    pMP3->pcmFramesConsumedInMP3Frame  = pcmFrameCount;
+    pMP3->pcmFramesRemainingInMP3Frame = 0;
+
+    return pcmFrameCount;
+}
+#endif
+
 static drmp3_uint64 drmp3_read_src(drmp3_src* pSRC, drmp3_uint64 frameCount, void* pFramesOut, void* pUserData)
 {
     drmp3* pMP3 = (drmp3*)pUserData;
@@ -2465,64 +2588,64 @@ static drmp3_uint64 drmp3_read_src(drmp3_src* pSRC, drmp3_uint64 frameCount, voi
     drmp3_assert(pMP3->onRead != NULL);
 
     float* pFramesOutF = (float*)pFramesOut;
-    drmp3_uint32 totalFramesRead = 0;
+    drmp3_uint64 totalFramesRead = 0;
 
     while (frameCount > 0) {
         // Read from the in-memory buffer first.
-        while (pMP3->framesRemaining > 0 && frameCount > 0) {
-            drmp3d_sample_t* frames = (drmp3d_sample_t*)pMP3->frames;
+        while (pMP3->pcmFramesRemainingInMP3Frame > 0 && frameCount > 0) {
+            drmp3d_sample_t* frames = (drmp3d_sample_t*)pMP3->pcmFrames;
 #ifndef DR_MP3_FLOAT_OUTPUT
-            if (pMP3->frameChannels == 1) {
+            if (pMP3->mp3FrameChannels == 1) {
                 if (pMP3->channels == 1) {
                     // Mono -> Mono.
-                    pFramesOutF[0] = frames[pMP3->framesConsumed] / 32768.0f;
+                    pFramesOutF[0] = frames[pMP3->pcmFramesConsumedInMP3Frame] / 32768.0f;
                 } else {
                     // Mono -> Stereo.
-                    pFramesOutF[0] = frames[pMP3->framesConsumed] / 32768.0f;
-                    pFramesOutF[1] = frames[pMP3->framesConsumed] / 32768.0f;
+                    pFramesOutF[0] = frames[pMP3->pcmFramesConsumedInMP3Frame] / 32768.0f;
+                    pFramesOutF[1] = frames[pMP3->pcmFramesConsumedInMP3Frame] / 32768.0f;
                 }
             } else {
                 if (pMP3->channels == 1) {
                     // Stereo -> Mono
                     float sample = 0;
-                    sample += frames[(pMP3->framesConsumed*pMP3->frameChannels)+0] / 32768.0f;
-                    sample += frames[(pMP3->framesConsumed*pMP3->frameChannels)+1] / 32768.0f;
+                    sample += frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+0] / 32768.0f;
+                    sample += frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+1] / 32768.0f;
                     pFramesOutF[0] = sample * 0.5f;
                 } else {
                     // Stereo -> Stereo
-                    pFramesOutF[0] = frames[(pMP3->framesConsumed*pMP3->frameChannels)+0] / 32768.0f;
-                    pFramesOutF[1] = frames[(pMP3->framesConsumed*pMP3->frameChannels)+1] / 32768.0f;
+                    pFramesOutF[0] = frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+0] / 32768.0f;
+                    pFramesOutF[1] = frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+1] / 32768.0f;
                 }
             }
 #else
-            if (pMP3->frameChannels == 1) {
+            if (pMP3->mp3FrameChannels == 1) {
                 if (pMP3->channels == 1) {
                     // Mono -> Mono.
-                    pFramesOutF[0] = frames[pMP3->framesConsumed];
+                    pFramesOutF[0] = frames[pMP3->pcmFramesConsumedInMP3Frame];
                 } else {
                     // Mono -> Stereo.
-                    pFramesOutF[0] = frames[pMP3->framesConsumed];
-                    pFramesOutF[1] = frames[pMP3->framesConsumed];
+                    pFramesOutF[0] = frames[pMP3->pcmFramesConsumedInMP3Frame];
+                    pFramesOutF[1] = frames[pMP3->pcmFramesConsumedInMP3Frame];
                 }
             } else {
                 if (pMP3->channels == 1) {
                     // Stereo -> Mono
                     float sample = 0;
-                    sample += frames[(pMP3->framesConsumed*pMP3->frameChannels)+0];
-                    sample += frames[(pMP3->framesConsumed*pMP3->frameChannels)+1];
+                    sample += frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+0];
+                    sample += frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+1];
                     pFramesOutF[0] = sample * 0.5f;
                 } else {
                     // Stereo -> Stereo
-                    pFramesOutF[0] = frames[(pMP3->framesConsumed*pMP3->frameChannels)+0];
-                    pFramesOutF[1] = frames[(pMP3->framesConsumed*pMP3->frameChannels)+1];
+                    pFramesOutF[0] = frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+0];
+                    pFramesOutF[1] = frames[(pMP3->pcmFramesConsumedInMP3Frame*pMP3->mp3FrameChannels)+1];
                 }
             }
 #endif
 
-            pMP3->framesConsumed += 1;
-            pMP3->framesRemaining -= 1;
-            frameCount -= 1;
+            pMP3->pcmFramesConsumedInMP3Frame += 1;
+            pMP3->pcmFramesRemainingInMP3Frame -= 1;
             totalFramesRead += 1;
+            frameCount -= 1;
             pFramesOutF += pSRC->config.channels;
         }
 
@@ -2530,11 +2653,11 @@ static drmp3_uint64 drmp3_read_src(drmp3_src* pSRC, drmp3_uint64 frameCount, voi
             break;
         }
 
-        drmp3_assert(pMP3->framesRemaining == 0);
+        drmp3_assert(pMP3->pcmFramesRemainingInMP3Frame == 0);
 
         // At this point we have exhausted our in-memory buffer so we need to re-fill. Note that the sample rate may have changed
         // at this point which means we'll also need to update our sample rate conversion pipeline.
-        if (!drmp3_decode_next_frame(pMP3)) {
+        if (drmp3_decode_next_frame(pMP3) == 0) {
             break;
         }
     }
@@ -2710,7 +2833,9 @@ drmp3_bool32 drmp3_init_file(drmp3* pMP3, const char* filePath, const drmp3_conf
 
 void drmp3_uninit(drmp3* pMP3)
 {
-    if (pMP3 == NULL) return;
+    if (pMP3 == NULL) {
+        return;
+    }
     
 #ifndef DR_MP3_NO_STDIO
     if (pMP3->onRead == drmp3__on_read_stdio) {
@@ -2721,9 +2846,11 @@ void drmp3_uninit(drmp3* pMP3)
     drmp3_free(pMP3->pData);
 }
 
-drmp3_uint64 drmp3_read_f32(drmp3* pMP3, drmp3_uint64 framesToRead, float* pBufferOut)
+drmp3_uint64 drmp3_read_pcm_frames_f32(drmp3* pMP3, drmp3_uint64 framesToRead, float* pBufferOut)
 {
-    if (pMP3 == NULL || pMP3->onRead == NULL) return 0;
+    if (pMP3 == NULL || pMP3->onRead == NULL) {
+        return 0;
+    }
 
     drmp3_uint64 totalFramesRead = 0;
 
@@ -2735,7 +2862,7 @@ drmp3_uint64 drmp3_read_f32(drmp3* pMP3, drmp3_uint64 framesToRead, float* pBuff
                 framesToReadRightNow = framesToRead;
             }
 
-            drmp3_uint64 framesJustRead = drmp3_read_f32(pMP3, framesToReadRightNow, temp);
+            drmp3_uint64 framesJustRead = drmp3_read_pcm_frames_f32(pMP3, framesToReadRightNow, temp);
             if (framesJustRead == 0) {
                 break;
             }
@@ -2745,40 +2872,471 @@ drmp3_uint64 drmp3_read_f32(drmp3* pMP3, drmp3_uint64 framesToRead, float* pBuff
         }
     } else {
         totalFramesRead = drmp3_src_read_frames_ex(&pMP3->src, framesToRead, pBufferOut, DRMP3_TRUE);
+        pMP3->currentPCMFrame += totalFramesRead;
     }
 
     return totalFramesRead;
 }
 
-drmp3_bool32 drmp3_seek_to_frame(drmp3* pMP3, drmp3_uint64 frameIndex)
+void drmp3_reset(drmp3* pMP3)
 {
-    if (pMP3 == NULL || pMP3->onSeek == NULL) return DRMP3_FALSE;
+    drmp3_assert(pMP3 != NULL);
+
+    pMP3->pcmFramesConsumedInMP3Frame = 0;
+    pMP3->pcmFramesRemainingInMP3Frame = 0;
+    pMP3->currentPCMFrame = 0;
+    pMP3->dataSize = 0;
+    pMP3->atEnd = DRMP3_FALSE;
+    pMP3->src.bin[0] = 0;
+    pMP3->src.bin[1] = 0;
+    pMP3->src.bin[2] = 0;
+    pMP3->src.bin[3] = 0;
+    pMP3->src.cache.cachedFrameCount = 0;
+    pMP3->src.cache.iNextFrame = 0;
+    pMP3->src.algo.linear.alpha = 0;
+    pMP3->src.algo.linear.isNextFramesLoaded = 0;
+    pMP3->src.algo.linear.isPrevFramesLoaded = 0;
+    //drmp3_zero_object(&pMP3->decoder);
+    drmp3dec_init(&pMP3->decoder);
+}
+
+drmp3_bool32 drmp3_seek_to_start_of_stream(drmp3* pMP3)
+{
+    drmp3_assert(pMP3 != NULL);
+    drmp3_assert(pMP3->onSeek != NULL);
 
     // Seek to the start of the stream to begin with.
-    if (!pMP3->onSeek(pMP3->pUserData, 0, drmp3_seek_origin_start)) {
+    if (!drmp3__on_seek(pMP3, 0, drmp3_seek_origin_start)) {
         return DRMP3_FALSE;
     }
 
     // Clear any cached data.
-    pMP3->framesConsumed = 0;
-    pMP3->framesRemaining = 0;
-    pMP3->dataSize = 0;
-    pMP3->atEnd = DRMP3_FALSE;
+    drmp3_reset(pMP3);
+    return DRMP3_TRUE;
+}
 
-    // TODO: Optimize.
-    //
-    // This is inefficient. We simply read frames from the start of the stream.
-    drmp3_uint64 framesRead = drmp3_read_f32(pMP3, frameIndex, NULL);
-    if (framesRead != frameIndex) {
+float drmp3_get_cached_pcm_frame_count_from_src(drmp3* pMP3)   // Returns the SRC cache's cachedFrameCount minus iNextFrame, plus the linear resampler's alpha, as a float.
+{
+    return (pMP3->src.cache.cachedFrameCount - pMP3->src.cache.iNextFrame) + (float)pMP3->src.algo.linear.alpha;
+}
+
+float drmp3_get_pcm_frames_remaining_in_mp3_frame(drmp3* pMP3)   // Returns (SRC-cached count + pcmFramesRemainingInMP3Frame) scaled by sampleRateOut / sampleRateIn.
+{
+    float factor = (float)pMP3->src.config.sampleRateOut / (float)pMP3->src.config.sampleRateIn;
+    float frameCountPreSRC = drmp3_get_cached_pcm_frame_count_from_src(pMP3) + pMP3->pcmFramesRemainingInMP3Frame;   // Count before the rate ratio is applied.
+    return frameCountPreSRC * factor;
+}
+
+// NOTE ON SEEKING
+// ===============
+// The seeking code below is a complete mess and is broken for cases when the sample rate changes. The problem
+// is with the resampling and the crappy resampler used by dr_mp3. What needs to happen is the following:
+//
+// 1) The resampler needs to be replaced.
+// 2) The resampler has state which needs to be updated whenever an MP3 frame is decoded outside of
+//    drmp3_read_pcm_frames_f32(). The resampler needs an API to "flush" some imaginary input so that its
+//    state is updated accordingly.
+
+drmp3_bool32 drmp3_seek_forward_by_pcm_frames__brute_force(drmp3* pMP3, drmp3_uint64 frameOffset)   // Moves forward by frameOffset PCM frames; returns DRMP3_FALSE if fewer frames than that could be read.
+{
+#if 0   // Disabled frame-skipping variant, kept for reference. The #else branch below is what actually compiles.
+    // MP3 is a bit annoying when it comes to seeking because of the bit reservoir. It basically means that an MP3 frame can possibly
+    // depend on some of the data of prior frames. This means it's not as simple as seeking to the first byte of the MP3 frame that
+    // contains the sample because that MP3 frame will need the data from the previous MP3 frame (which we just seeked past!). To
+    // resolve this we seek past a number of MP3 frames up to a point, and then read-and-discard the remainder.
+    drmp3_uint64 maxFramesToReadAndDiscard = (drmp3_uint64)(DRMP3_MAX_PCM_FRAMES_PER_MP3_FRAME * 3 * ((float)pMP3->src.config.sampleRateOut / (float)pMP3->src.config.sampleRateIn));
+
+    // Now get rid of leading whole frames.
+    while (frameOffset > maxFramesToReadAndDiscard) {
+        float        pcmFramesRemainingInCurrentMP3FrameF = drmp3_get_pcm_frames_remaining_in_mp3_frame(pMP3);
+        drmp3_uint32 pcmFramesRemainingInCurrentMP3Frame  = (drmp3_uint32)pcmFramesRemainingInCurrentMP3FrameF;
+        if (frameOffset > pcmFramesRemainingInCurrentMP3Frame) {
+            frameOffset                       -= pcmFramesRemainingInCurrentMP3Frame;
+            pMP3->currentPCMFrame             += pcmFramesRemainingInCurrentMP3Frame;
+            pMP3->pcmFramesConsumedInMP3Frame += pMP3->pcmFramesRemainingInMP3Frame;
+            pMP3->pcmFramesRemainingInMP3Frame = 0;
+        } else {
+            break;
+        }
+
+        drmp3_uint32 pcmFrameCount = drmp3_decode_next_frame_ex(pMP3, pMP3->pcmFrames, DRMP3_FALSE);
+        if (pcmFrameCount == 0) {
+            break;
+        }
+    }
+
+    // The last step is to read-and-discard any remaining PCM frames to make it sample-exact.
+    drmp3_uint64 framesRead = drmp3_read_pcm_frames_f32(pMP3, frameOffset, NULL);
+    if (framesRead != frameOffset) {
         return DRMP3_FALSE;
     }
+#else
+    // Just using a dumb read-and-discard for now pending updates to the resampler.
+    drmp3_uint64 framesRead = drmp3_read_pcm_frames_f32(pMP3, frameOffset, NULL);   // NULL output buffer: the frames are read and thrown away.
+    if (framesRead != frameOffset) {
+        return DRMP3_FALSE;   // Fewer frames were read than requested.
+    }
+#endif
+
+    return DRMP3_TRUE;
+}
+
+drmp3_bool32 drmp3_seek_to_pcm_frame__brute_force(drmp3* pMP3, drmp3_uint64 frameIndex)   // Seeks to frameIndex without a seek table by reading and discarding PCM frames.
+{
+    drmp3_assert(pMP3 != NULL);
+
+    if (frameIndex == pMP3->currentPCMFrame) {
+        return DRMP3_TRUE;   // Already at the requested frame.
+    }
+
+    // If we're moving forward we just read from where we're at. Otherwise we need to move back to the start of
+    // the stream and read from the beginning.
+    //drmp3_uint64 framesToReadAndDiscard;
+    if (frameIndex < pMP3->currentPCMFrame) {
+        // Moving backward. Move to the start of the stream and then move forward.
+        if (!drmp3_seek_to_start_of_stream(pMP3)) {
+            return DRMP3_FALSE;
+        }
+    }
+
+    drmp3_assert(frameIndex >= pMP3->currentPCMFrame);
+    return drmp3_seek_forward_by_pcm_frames__brute_force(pMP3, (frameIndex - pMP3->currentPCMFrame));
+}
+
+drmp3_bool32 drmp3_find_closest_seek_point(drmp3* pMP3, drmp3_uint64 frameIndex, drmp3_uint32* pSeekPointIndex)   // Writes the index of the seek point just before the first one past frameIndex; assumes ascending pcmFrameIndex order - TODO confirm.
+{
+    drmp3_assert(pSeekPointIndex != NULL);
+
+    if (frameIndex < pMP3->pSeekPoints[0].pcmFrameIndex) {
+        return DRMP3_FALSE;   // Target precedes the first seek point; *pSeekPointIndex is not written.
+    }
+
+    // Linear search for simplicity to begin with while I'm getting this thing working. Once it's all working change this to a binary search.
+    for (drmp3_uint32 iSeekPoint = 0; iSeekPoint < pMP3->seekPointCount; ++iSeekPoint) {
+        if (pMP3->pSeekPoints[iSeekPoint].pcmFrameIndex > frameIndex) {
+            break;  // Found it.
+        }
+
+        *pSeekPointIndex = iSeekPoint;   // Remember the latest seek point that is not past frameIndex.
+    }
+
+    return DRMP3_TRUE;
+}
+
+drmp3_bool32 drmp3_seek_to_pcm_frame__seek_table(drmp3* pMP3, drmp3_uint64 frameIndex)   // Seeks via the bound seek table: jump to the prior seek point, discard its lead-in frames, then read-and-discard up to frameIndex.
+{
+    drmp3_assert(pMP3 != NULL);
+    drmp3_assert(pMP3->pSeekPoints != NULL);
+    drmp3_assert(pMP3->seekPointCount > 0);
+
+    drmp3_seek_point seekPoint;
+
+    // If there is no prior seekpoint it means the target PCM frame comes before the first seek point. Just assume a seekpoint at the start of the file in this case.
+    drmp3_uint32 priorSeekPointIndex;
+    if (drmp3_find_closest_seek_point(pMP3, frameIndex, &priorSeekPointIndex)) {
+        seekPoint = pMP3->pSeekPoints[priorSeekPointIndex];
+    } else {
+        seekPoint.seekPosInBytes     = 0;
+        seekPoint.pcmFrameIndex      = 0;
+        seekPoint.mp3FramesToDiscard = 0;
+        seekPoint.pcmFramesToDiscard = 0;
+    }
+
+    // First thing to do is seek to the first byte of the relevant MP3 frame.
+    if (!drmp3__on_seek_64(pMP3, seekPoint.seekPosInBytes, drmp3_seek_origin_start)) {
+        return DRMP3_FALSE; // Failed to seek.
+    }
+
+    // Clear any cached data.
+    drmp3_reset(pMP3);
+
+    // Whole MP3 frames need to be discarded first.
+    for (drmp3_uint16 iMP3Frame = 0; iMP3Frame < seekPoint.mp3FramesToDiscard; ++iMP3Frame) {
+        // Pass in non-null for the last frame because we want to ensure the sample rate converter is preloaded correctly.
+        drmp3d_sample_t* pPCMFrames = NULL;
+        if (iMP3Frame == seekPoint.mp3FramesToDiscard-1) {
+            pPCMFrames = (drmp3d_sample_t*)pMP3->pcmFrames;
+        }
+
+        // We first need to decode the next frame, and then we need to flush the resampler.
+        drmp3_uint32 pcmFramesReadPreSRC = drmp3_decode_next_frame_ex(pMP3, pPCMFrames, DRMP3_TRUE);
+        if (pcmFramesReadPreSRC == 0) {
+            return DRMP3_FALSE;   // Could not get through the lead-in frames.
+        }
+    }
+
+    // We seeked to an MP3 frame in the raw stream so we need to make sure the current PCM frame is set correctly.
+    pMP3->currentPCMFrame = seekPoint.pcmFrameIndex - seekPoint.pcmFramesToDiscard;
+
+    // Update resampler. This is wrong. Need to instead update it on a per MP3 frame basis. Also broken for cases when
+    // the sample rate is being reduced in my testing. Should work fine when the input and output sample rate is the same
+    // or a clean multiple.
+    pMP3->src.algo.linear.alpha = pMP3->currentPCMFrame * ((double)pMP3->src.config.sampleRateIn / pMP3->src.config.sampleRateOut);
+    pMP3->src.algo.linear.alpha = pMP3->src.algo.linear.alpha - (drmp3_uint32)(pMP3->src.algo.linear.alpha);   // Keep only the fractional part.
+    if (pMP3->src.algo.linear.alpha > 0) {
+        pMP3->src.algo.linear.isPrevFramesLoaded = 1;
+    }
+
+    // Now at this point we can follow the same process as the brute force technique where we just skip over unnecessary MP3 frames and then
+    // read-and-discard at least 2 whole MP3 frames.
+    drmp3_uint64 leftoverFrames = frameIndex - pMP3->currentPCMFrame;
+    return drmp3_seek_forward_by_pcm_frames__brute_force(pMP3, leftoverFrames);
+}
+
+drmp3_bool32 drmp3_seek_to_pcm_frame(drmp3* pMP3, drmp3_uint64 frameIndex)   // Seek entry point: picks the start-of-stream, seek-table or brute-force path.
+{
+    if (pMP3 == NULL || pMP3->onSeek == NULL) {
+        return DRMP3_FALSE;   // Seeking needs a decoder and an onSeek callback.
+    }
+
+    if (frameIndex == 0) {
+        return drmp3_seek_to_start_of_stream(pMP3);   // Frame 0 needs neither the seek table nor read-and-discard.
+    }
+
+    // Use the seek table if we have one.
+    if (pMP3->pSeekPoints != NULL && pMP3->seekPointCount > 0) {
+        return drmp3_seek_to_pcm_frame__seek_table(pMP3, frameIndex);
+    } else {
+        return drmp3_seek_to_pcm_frame__brute_force(pMP3, frameIndex);
+    }
+}
+
+drmp3_bool32 drmp3_get_mp3_and_pcm_frame_count(drmp3* pMP3, drmp3_uint64* pMP3FrameCount, drmp3_uint64* pPCMFrameCount)   // Counts MP3 frames and output-rate PCM frames by walking the whole stream; either output pointer may be NULL.
+{
+    if (pMP3 == NULL) {
+        return DRMP3_FALSE;
+    }
+
+    // The way this works is we move back to the start of the stream, iterate over each MP3 frame and calculate the frame count based
+    // on our output sample rate, then seek back to the PCM frame we were sitting on before calling this function.
+
+    // The stream must support seeking for this to work.
+    if (pMP3->onSeek == NULL) {
+        return DRMP3_FALSE;
+    }
+
+    // We'll need to seek back to where we were, so grab the PCM frame we're currently sitting on so we can restore later.
+    drmp3_uint64 currentPCMFrame = pMP3->currentPCMFrame;
+    
+    if (!drmp3_seek_to_start_of_stream(pMP3)) {
+        return DRMP3_FALSE;
+    }
+
+    drmp3_uint64 totalPCMFrameCount = 0;
+    drmp3_uint64 totalMP3FrameCount = 0;
+
+    float totalPCMFrameCountFractionalPart = 0; // <-- With resampling there will be a fractional part to each MP3 frame that we need to accumulate.
+    for (;;) {
+        drmp3_uint32 pcmFramesInCurrentMP3FrameIn = drmp3_decode_next_frame_ex(pMP3, NULL, DRMP3_FALSE);   // NULL output buffer: only the per-frame count is used here.
+        if (pcmFramesInCurrentMP3FrameIn == 0) {
+            break;   // A zero return ends the walk.
+        }
+
+        float srcRatio = (float)pMP3->mp3FrameSampleRate / (float)pMP3->sampleRate;
+        drmp3_assert(srcRatio > 0);
+
+        float        pcmFramesInCurrentMP3FrameOutF = totalPCMFrameCountFractionalPart + (pcmFramesInCurrentMP3FrameIn / srcRatio);
+        drmp3_uint32 pcmFramesInCurrentMP3FrameOut  = (drmp3_uint32)pcmFramesInCurrentMP3FrameOutF;
+        totalPCMFrameCountFractionalPart = pcmFramesInCurrentMP3FrameOutF - pcmFramesInCurrentMP3FrameOut;   // Carry the remainder into the next frame.
+        totalPCMFrameCount += pcmFramesInCurrentMP3FrameOut;
+        totalMP3FrameCount += 1;
+    }
+
+    // Finally, we need to seek back to where we were.
+    if (!drmp3_seek_to_start_of_stream(pMP3)) {
+        return DRMP3_FALSE;
+    }
+
+    if (!drmp3_seek_to_pcm_frame(pMP3, currentPCMFrame)) {
+        return DRMP3_FALSE;
+    }
+
+    if (pMP3FrameCount != NULL) {
+        *pMP3FrameCount = totalMP3FrameCount;
+    }
+    if (pPCMFrameCount != NULL) {
+        *pPCMFrameCount = totalPCMFrameCount;
+    }
+
+    return DRMP3_TRUE;
+}
+
+drmp3_uint64 drmp3_get_pcm_frame_count(drmp3* pMP3)   // Returns the total PCM frame count from drmp3_get_mp3_and_pcm_frame_count(), or 0 if that call fails.
+{
+    drmp3_uint64 totalPCMFrameCount;
+    if (!drmp3_get_mp3_and_pcm_frame_count(pMP3, NULL, &totalPCMFrameCount)) {
+        return 0;   // Failure is reported as 0, the same value an empty stream would produce.
+    }
+
+    return totalPCMFrameCount;
+}
+
+drmp3_uint64 drmp3_get_mp3_frame_count(drmp3* pMP3)   // Returns the total MP3 frame count from drmp3_get_mp3_and_pcm_frame_count(), or 0 if that call fails.
+{
+    drmp3_uint64 totalMP3FrameCount;
+    if (!drmp3_get_mp3_and_pcm_frame_count(pMP3, &totalMP3FrameCount, NULL)) {
+        return 0;   // Failure is reported as 0, the same value an empty stream would produce.
+    }
+
+    return totalMP3FrameCount;
+}
+
+void drmp3__accumulate_running_pcm_frame_count(drmp3* pMP3, drmp3_uint32 pcmFrameCountIn, drmp3_uint64* pRunningPCMFrameCount, float* pRunningPCMFrameCountFractionalPart)   // Adds one MP3 frame's rate-converted frame count to a running total, carrying the fractional remainder between calls.
+{
+    float srcRatio = (float)pMP3->mp3FrameSampleRate / (float)pMP3->sampleRate;
+    drmp3_assert(srcRatio > 0);
+
+    float        pcmFrameCountOutF = *pRunningPCMFrameCountFractionalPart + (pcmFrameCountIn / srcRatio);   // Previous remainder plus this frame's converted count.
+    drmp3_uint32 pcmFrameCountOut  = (drmp3_uint32)pcmFrameCountOutF;   // Whole frames only.
+    *pRunningPCMFrameCountFractionalPart = pcmFrameCountOutF - pcmFrameCountOut;   // What is left over is carried to the next call.
+    *pRunningPCMFrameCount += pcmFrameCountOut;
+}
+
+typedef struct   // Per-MP3-frame bookkeeping used by drmp3_calculate_seek_points().
+{
+    drmp3_uint64 bytePos;       // <-- Filled from streamCursor minus the buffered dataSize just before the frame is decoded.
+    drmp3_uint64 pcmFrameIndex; // <-- After sample rate conversion.
+} drmp3__seeking_mp3_frame_info;
+
+// Fills pSeekPoints with up to *pSeekPointCount seek points spaced evenly (in PCM frames) across the stream by decoding
+// every MP3 frame once. On success *pSeekPointCount receives the number of seek points actually written. Returns
+// DRMP3_FALSE when an argument is NULL, *pSeekPointCount is 0, or when counting frames, seeking or decoding a leading frame fails.
+drmp3_bool32 drmp3_calculate_seek_points(drmp3* pMP3, drmp3_uint32* pSeekPointCount, drmp3_seek_point* pSeekPoints)
+{
+    if (pMP3 == NULL || pSeekPointCount == NULL || pSeekPoints == NULL) {
+        return DRMP3_FALSE; // Invalid args.
+    }
+
+    drmp3_uint32 seekPointCount = *pSeekPointCount;
+    if (seekPointCount == 0) {
+        return DRMP3_FALSE;  // The client has requested no seek points. Consider this to be invalid arguments since the client has probably not intended this.
+    }
+
+    // We'll need to seek back to the current sample after calculating the seekpoints so we need to go ahead and grab the current location at the top.
+    drmp3_uint64 currentPCMFrame = pMP3->currentPCMFrame;
+
+    // We never produce more seek points than there are MP3 frames, and the count is limited to 32 bits.
+    drmp3_uint64 totalMP3FrameCount;
+    drmp3_uint64 totalPCMFrameCount;
+    if (!drmp3_get_mp3_and_pcm_frame_count(pMP3, &totalMP3FrameCount, &totalPCMFrameCount)) {
+        return DRMP3_FALSE;
+    }
+
+    // If there's less than DRMP3_SEEK_LEADING_MP3_FRAMES+1 frames we just report 1 seek point which will be the very start of the stream.
+    if (totalMP3FrameCount < DRMP3_SEEK_LEADING_MP3_FRAMES+1) {
+        seekPointCount = 1;
+        pSeekPoints[0].seekPosInBytes     = 0;
+        pSeekPoints[0].pcmFrameIndex      = 0;
+        pSeekPoints[0].mp3FramesToDiscard = 0;
+        pSeekPoints[0].pcmFramesToDiscard = 0;
+    } else {
+        if (seekPointCount > totalMP3FrameCount-1) {
+            seekPointCount = (drmp3_uint32)(totalMP3FrameCount-1);
+        }
+
+        drmp3_uint64 pcmFramesBetweenSeekPoints = totalPCMFrameCount / ((drmp3_uint64)seekPointCount+1);  // 64-bit divisor so that seekPointCount+1 cannot wrap around to 0.
+
+        // Rewind to the start of the stream. The seek points are then calculated by enumerating over each MP3 frame from there.
+        if (!drmp3_seek_to_start_of_stream(pMP3)) {
+            return DRMP3_FALSE;
+        }
+
+        // We need to cache the byte positions of the previous MP3 frames. As a new MP3 frame is iterated, we cycle the byte positions in this
+        // array. The value in the first item in this array is the byte position that will be reported in the next seek point.
+        drmp3__seeking_mp3_frame_info mp3FrameInfo[DRMP3_SEEK_LEADING_MP3_FRAMES+1];
+
+        drmp3_uint64 runningPCMFrameCount = 0;
+        float runningPCMFrameCountFractionalPart = 0;
+
+        // We need to initialize the array of MP3 byte positions for the leading MP3 frames.
+        for (int iMP3Frame = 0; iMP3Frame < DRMP3_SEEK_LEADING_MP3_FRAMES+1; ++iMP3Frame) {
+            // The byte position of the next frame will be the stream's cursor position, minus whatever is sitting in the buffer.
+            drmp3_assert(pMP3->streamCursor >= pMP3->dataSize);
+            mp3FrameInfo[iMP3Frame].bytePos       = pMP3->streamCursor - pMP3->dataSize;
+            mp3FrameInfo[iMP3Frame].pcmFrameIndex = runningPCMFrameCount;
+
+            // We need to get information about this frame so we can know how many samples it contained.
+            drmp3_uint32 pcmFramesInCurrentMP3FrameIn = drmp3_decode_next_frame_ex(pMP3, NULL, DRMP3_FALSE);
+            if (pcmFramesInCurrentMP3FrameIn == 0) {
+                return DRMP3_FALSE; // This should never happen. NOTE(review): the decoder is not seeked back to currentPCMFrame on this path.
+            }
+
+            drmp3__accumulate_running_pcm_frame_count(pMP3, pcmFramesInCurrentMP3FrameIn, &runningPCMFrameCount, &runningPCMFrameCountFractionalPart);
+        }
+
+        // The leading MP3 frames are now cached, so we can start iterating over each seek point and calculate them.
+        drmp3_uint64 nextTargetPCMFrame = 0;
+        for (drmp3_uint32 iSeekPoint = 0; iSeekPoint < seekPointCount; ++iSeekPoint) {
+            nextTargetPCMFrame += pcmFramesBetweenSeekPoints;
+
+            for (;;) {
+                if (nextTargetPCMFrame < runningPCMFrameCount) {
+                    // The next seek point is in the current MP3 frame.
+                    pSeekPoints[iSeekPoint].seekPosInBytes     = mp3FrameInfo[0].bytePos;
+                    pSeekPoints[iSeekPoint].pcmFrameIndex      = nextTargetPCMFrame;
+                    pSeekPoints[iSeekPoint].mp3FramesToDiscard = DRMP3_SEEK_LEADING_MP3_FRAMES;
+                    pSeekPoints[iSeekPoint].pcmFramesToDiscard = (drmp3_uint16)(nextTargetPCMFrame - mp3FrameInfo[DRMP3_SEEK_LEADING_MP3_FRAMES-1].pcmFrameIndex);
+                    break;
+                } else {
+                    // The next seek point is not in the current MP3 frame, so continue on to the next one. First, cycle the cached MP3 frame info.
+                    for (size_t i = 0; i < drmp3_countof(mp3FrameInfo)-1; ++i) {
+                        mp3FrameInfo[i] = mp3FrameInfo[i+1];
+                    }
+
+                    // Record where the MP3 frame that is about to be decoded starts.
+                    mp3FrameInfo[drmp3_countof(mp3FrameInfo)-1].bytePos       = pMP3->streamCursor - pMP3->dataSize;
+                    mp3FrameInfo[drmp3_countof(mp3FrameInfo)-1].pcmFrameIndex = runningPCMFrameCount;
+
+                    // Go to the next MP3 frame. This shouldn't ever fail, but just in case it does we just set the seek point and break. If it happens, it
+                    // should only ever do it for the last seek point.
+                    drmp3_uint32 pcmFramesInCurrentMP3FrameIn = drmp3_decode_next_frame_ex(pMP3, NULL, DRMP3_TRUE);
+                    if (pcmFramesInCurrentMP3FrameIn == 0) {
+                        pSeekPoints[iSeekPoint].seekPosInBytes     = mp3FrameInfo[0].bytePos;
+                        pSeekPoints[iSeekPoint].pcmFrameIndex      = nextTargetPCMFrame;
+                        pSeekPoints[iSeekPoint].mp3FramesToDiscard = DRMP3_SEEK_LEADING_MP3_FRAMES;
+                        pSeekPoints[iSeekPoint].pcmFramesToDiscard = (drmp3_uint16)(nextTargetPCMFrame - mp3FrameInfo[DRMP3_SEEK_LEADING_MP3_FRAMES-1].pcmFrameIndex);
+                        break;
+                    }
+
+                    drmp3__accumulate_running_pcm_frame_count(pMP3, pcmFramesInCurrentMP3FrameIn, &runningPCMFrameCount, &runningPCMFrameCountFractionalPart);
+                }
+            }
+        }
+
+        // Finally, we need to seek back to where we were.
+        if (!drmp3_seek_to_start_of_stream(pMP3)) {
+            return DRMP3_FALSE;
+        }
+        if (!drmp3_seek_to_pcm_frame(pMP3, currentPCMFrame)) {
+            return DRMP3_FALSE;
+        }
+    }
+
+    *pSeekPointCount = seekPointCount;
+    return DRMP3_TRUE;
+}
+
+// Attaches a seek table to the decoder, or detaches the one currently attached. Only the pointer and the count are
+// stored; the seek points themselves are not copied.
+//
+// Returns DRMP3_FALSE only when pMP3 is NULL.
+drmp3_bool32 drmp3_bind_seek_table(drmp3* pMP3, drmp3_uint32 seekPointCount, drmp3_seek_point* pSeekPoints)
+{
+    if (pMP3 == NULL) {
+        return DRMP3_FALSE;
+    }
+
+    // A NULL table or a count of zero is a request to unbind, in which case both members are cleared.
+    drmp3_bool32 isBinding = (seekPointCount != 0 && pSeekPoints != NULL);
+
+    pMP3->seekPointCount = isBinding ? seekPointCount : 0;
+    pMP3->pSeekPoints    = isBinding ? pSeekPoints    : NULL;
 
     return DRMP3_TRUE;
 }
 
 
-
-float* drmp3__full_decode_and_close_f32(drmp3* pMP3, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
+float* drmp3__full_read_and_close_f32(drmp3* pMP3, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
 {
     drmp3_assert(pMP3 != NULL);
 
@@ -2789,7 +3347,7 @@ float* drmp3__full_decode_and_close_f32(drmp3* pMP3, drmp3_config* pConfig, drmp
     float temp[4096];
     for (;;) {
         drmp3_uint64 framesToReadRightNow = drmp3_countof(temp) / pMP3->channels;
-        drmp3_uint64 framesJustRead = drmp3_read_f32(pMP3, framesToReadRightNow, temp);
+        drmp3_uint64 framesJustRead = drmp3_read_pcm_frames_f32(pMP3, framesToReadRightNow, temp);
         if (framesJustRead == 0) {
             break;
         }
@@ -2835,35 +3393,35 @@ float* drmp3__full_decode_and_close_f32(drmp3* pMP3, drmp3_config* pConfig, drmp
     return pFrames;
 }
 
-float* drmp3_open_and_decode_f32(drmp3_read_proc onRead, drmp3_seek_proc onSeek, void* pUserData, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
+float* drmp3_open_and_read_f32(drmp3_read_proc onRead, drmp3_seek_proc onSeek, void* pUserData, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
 {
     drmp3 mp3;
     if (!drmp3_init(&mp3, onRead, onSeek, pUserData, pConfig)) {
         return NULL;
     }
 
-    return drmp3__full_decode_and_close_f32(&mp3, pConfig, pTotalFrameCount);
+    return drmp3__full_read_and_close_f32(&mp3, pConfig, pTotalFrameCount);
 }
 
-float* drmp3_open_and_decode_memory_f32(const void* pData, size_t dataSize, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
+float* drmp3_open_memory_and_read_f32(const void* pData, size_t dataSize, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
 {
     drmp3 mp3;
     if (!drmp3_init_memory(&mp3, pData, dataSize, pConfig)) {
         return NULL;
     }
 
-    return drmp3__full_decode_and_close_f32(&mp3, pConfig, pTotalFrameCount);
+    return drmp3__full_read_and_close_f32(&mp3, pConfig, pTotalFrameCount);
 }
 
 #ifndef DR_MP3_NO_STDIO
-float* drmp3_open_and_decode_file_f32(const char* filePath, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
+float* drmp3_open_file_and_read_f32(const char* filePath, drmp3_config* pConfig, drmp3_uint64* pTotalFrameCount)
 {
     drmp3 mp3;
     if (!drmp3_init_file(&mp3, filePath, pConfig)) {
         return NULL;
     }
 
-    return drmp3__full_decode_and_close_f32(&mp3, pConfig, pTotalFrameCount);
+    return drmp3__full_read_and_close_f32(&mp3, pConfig, pTotalFrameCount);
 }
 #endif
 
@@ -2890,7 +3448,18 @@ void drmp3_free(void* p)
 
 
 // REVISION HISTORY
-// ===============
+// ================
+//
+// v0.4.0 - 2018-xx-xx
+//   - API CHANGE: Rename some APIs:
+//     - drmp3_read_f32 -> drmp3_read_pcm_frames_f32
+//     - drmp3_seek_to_frame -> drmp3_seek_to_pcm_frame
+//     - drmp3_open_and_decode_f32 -> drmp3_open_and_read_f32
+//     - drmp3_open_and_decode_memory_f32 -> drmp3_open_memory_and_read_f32
+//     - drmp3_open_and_decode_file_f32 -> drmp3_open_file_and_read_f32
+//   - Add drmp3_get_pcm_frame_count().
+//   - Add drmp3_get_mp3_frame_count().
+//   - Improve seeking performance.
 //
 // v0.3.2 - 2018-09-11
 //   - Fix a couple of memory leaks.
diff --git a/extras/dr_wav.h b/extras/dr_wav.h
index 6aff80a9..c139472d 100644
--- a/extras/dr_wav.h
+++ b/extras/dr_wav.h
@@ -1,5 +1,5 @@
 // WAV audio loader and writer. Public domain. See "unlicense" statement at the end of this file.
-// dr_wav - v0.8.5 - 2018-09-11
+// dr_wav - v0.9.0-dev - 2018-xx-xx
 //
 // David Reid - mackron@gmail.com
 
@@ -40,7 +40,7 @@
 //     unsigned int channels;
 //     unsigned int sampleRate;
 //     drwav_uint64 totalSampleCount;
-//     float* pSampleData = drwav_open_and_read_file_s32("my_song.wav", &channels, &sampleRate, &totalSampleCount);
+//     float* pSampleData = drwav_open_file_and_read_f32("my_song.wav", &channels, &sampleRate, &totalSampleCount);
 //     if (pSampleData == NULL) {
 //         // Error opening and reading WAV file.
 //     }
@@ -154,6 +154,14 @@ extern "C" {
 #define DR_WAVE_FORMAT_DVI_ADPCM    0x11
 #define DR_WAVE_FORMAT_EXTENSIBLE   0xFFFE
 
+// Constants.
+#ifndef DRWAV_MAX_SMPL_LOOPS
+#define DRWAV_MAX_SMPL_LOOPS        1
+#endif
+
+// Flags to pass into drwav_init_ex(), etc.
+#define DRWAV_SEQUENTIAL            0x00000001
+
 typedef enum
 {
     drwav_seek_origin_start,
@@ -166,6 +174,22 @@ typedef enum
     drwav_container_w64
 } drwav_container;
 
+typedef struct
+{
+    union
+    {
+        drwav_uint8 fourcc[4];  // Chunk ID as read from a RIFF container.
+        drwav_uint8 guid[16];   // Chunk ID as read from any other container (W64).
+    } id;
+
+    // The size in bytes of the chunk. For W64 the 24 byte chunk header has already been subtracted.
+    drwav_uint64 sizeInBytes;
+
+    // RIFF = 2 byte alignment.
+    // W64  = 8 byte alignment.
+    unsigned int paddingSize;
+} drwav_chunk_header;
+
 // Callback for when data is read. Return value is the number of bytes actually read.
 //
 // pUserData   [in]  The user data that was passed to drwav_init(), drwav_open() and family.
@@ -201,6 +225,22 @@ typedef size_t (* drwav_write_proc)(void* pUserData, const void* pData, size_t b
 // will be either drwav_seek_origin_start or drwav_seek_origin_current.
 typedef drwav_bool32 (* drwav_seek_proc)(void* pUserData, int offset, drwav_seek_origin origin);
 
+// Callback for when drwav_init_ex/drwav_open_ex finds a chunk.
+//
+// pChunkUserData    [in] The user data that was passed to the pChunkUserData parameter of drwav_init_ex(), drwav_open_ex() and family.
+// onRead            [in] A pointer to the function to call when reading.
+// onSeek            [in] A pointer to the function to call when seeking.
+// pReadSeekUserData [in] The user data that was passed to the pReadSeekUserData parameter of drwav_init_ex(), drwav_open_ex() and family.
+// pChunkHeader      [in] A pointer to an object containing basic header information about the chunk. Use this to identify the chunk.
+//
+// Returns the number of bytes read + seeked.
+//
+// To read data from the chunk, call onRead(), passing in pReadSeekUserData as the first parameter. Do the same
+// for seeking with onSeek(). The return value must be the total number of bytes you have read _plus_ seeked.
+//
+// You must not attempt to read beyond the boundary of the chunk.
+typedef drwav_uint64 (* drwav_chunk_proc)(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_chunk_header* pChunkHeader);
+
 // Structure for internal use. Only used for loaders opened with drwav_open_memory().
 typedef struct
 {
@@ -264,6 +304,30 @@ typedef struct
     drwav_uint8 subFormat[16];
 } drwav_fmt;
 
+typedef struct
+{
+    drwav_uint32 cuePointId;
+    drwav_uint32 type;
+    drwav_uint32 start;
+    drwav_uint32 end;
+    drwav_uint32 fraction;
+    drwav_uint32 playCount;
+} drwav_smpl_loop;  // Element type of drwav_smpl.loops. NOTE(review): fields presumably mirror one loop record of the "smpl" chunk - confirm against the chunk reader.
+
+ typedef struct
+{
+    drwav_uint32 manufacturer;
+    drwav_uint32 product;
+    drwav_uint32 samplePeriod;
+    drwav_uint32 midiUnityNotes;
+    drwav_uint32 midiPitchFraction;
+    drwav_uint32 smpteFormat;
+    drwav_uint32 smpteOffset;
+    drwav_uint32 numSampleLoops;
+    drwav_uint32 samplerData;
+    drwav_smpl_loop loops[DRWAV_MAX_SMPL_LOOPS];    // Fixed capacity. DRWAV_MAX_SMPL_LOOPS defaults to 1 unless it is defined before this point.
+} drwav_smpl;   // Type of the "smpl chunk" member of drwav.
+
 typedef struct
 {
     // A pointer to the function to call when more data is needed.
@@ -324,6 +388,10 @@ typedef struct
     drwav_bool32 isSequentialWrite;
 
 
+    // smpl chunk.
+    drwav_smpl smpl;
+
+
     // A hack to avoid a DRWAV_MALLOC() when opening a decoder with drwav_open_memory().
     drwav__memory_stream memoryStream;
     drwav__memory_stream_write memoryStreamWrite;
@@ -359,9 +427,13 @@ typedef struct
 
 // Initializes a pre-allocated drwav object.
 //
-// onRead    [in]           The function to call when data needs to be read from the client.
-// onSeek    [in]           The function to call when the read position of the client data needs to move.
-// pUserData [in, optional] A pointer to application defined data that will be passed to onRead and onSeek.
+// pWav                         [out]          A pointer to the drwav object being initialized.
+// onRead                       [in]           The function to call when data needs to be read from the client.
+// onSeek                       [in]           The function to call when the read position of the client data needs to move.
+// onChunk                      [in, optional] The function to call when a chunk is enumerated at initialization time.
+// pUserData, pReadSeekUserData [in, optional] A pointer to application defined data that will be passed to onRead and onSeek.
+// pChunkUserData               [in, optional] A pointer to application defined data that will be passed to onChunk.
+// flags                        [in, optional] A set of flags for controlling how things are loaded.
 //
 // Returns true if successful; false otherwise.
 //
@@ -373,8 +445,18 @@ typedef struct
 // If you want dr_wav to manage the memory allocation for you, consider using drwav_open() instead. This will allocate
 // a drwav object on the heap and return a pointer to it.
 //
+// Possible values for flags:
+//   DRWAV_SEQUENTIAL: Never perform a backwards seek while loading. This disables the chunk callback and will cause this function
+//                     to return as soon as the data chunk is found. Any chunks after the data chunk will be ignored.
+//
+// drwav_init() is equivalent to "drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0);".
+//
+// The onChunk callback is not called for the WAVE or FMT chunks. The contents of the FMT chunk can be read from pWav->fmt
+// after the function returns.
+//
 // See also: drwav_init_file(), drwav_init_memory(), drwav_uninit()
 drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData);
+drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags);
 
 // Initializes a pre-allocated drwav object for writing.
 //
@@ -421,8 +503,9 @@ void drwav_uninit(drwav* pWav);
 // This is different from drwav_init() in that it will allocate the drwav object for you via DRWAV_MALLOC() before
 // initializing it.
 //
-// See also: drwav_open_file(), drwav_open_memory(), drwav_close()
+// See also: drwav_init(), drwav_open_file(), drwav_open_memory(), drwav_close()
 drwav* drwav_open(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData);
+drwav* drwav_open_ex(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags);
 
 // Opens a wav file for writing using the given callbacks.
 //
@@ -473,11 +556,18 @@ size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut);
 // using a compressed format consider using drwav_read_raw() or drwav_read_s16/s32/f32/etc().
 drwav_uint64 drwav_read(drwav* pWav, drwav_uint64 samplesToRead, void* pBufferOut);
 
+// Same as drwav_read(), except works on PCM frames instead of samples. Returns the number of PCM
+// frames read.
+drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
+
 // Seeks to the given sample.
 //
 // Returns true if successful; false otherwise.
 drwav_bool32 drwav_seek_to_sample(drwav* pWav, drwav_uint64 sample);
 
+// Same as drwav_seek_to_sample() except works on PCM frames.
+drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex);
+
 
 // Writes raw audio data.
 //
@@ -489,6 +579,8 @@ size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData);
 // Returns the number of samples written.
 drwav_uint64 drwav_write(drwav* pWav, drwav_uint64 samplesToWrite, const void* pData);
 
+// Same as drwav_write(), but works on PCM frames instead of samples. Returns the number of PCM frames written.
+drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
 
 
 //// Conversion Utilities ////
@@ -501,6 +593,9 @@ drwav_uint64 drwav_write(drwav* pWav, drwav_uint64 samplesToWrite, const void* p
 // If the return value is less than <samplesToRead> it means the end of the file has been reached.
 drwav_uint64 drwav_read_s16(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
 
+// Same as drwav_read_s16(), except works on PCM frames instead of samples.
+drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
+
 // Low-level function for converting unsigned 8-bit PCM samples to signed 16-bit PCM samples.
 void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
 
@@ -530,6 +625,9 @@ void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sample
 // If the return value is less than <samplesToRead> it means the end of the file has been reached.
 drwav_uint64 drwav_read_f32(drwav* pWav, drwav_uint64 samplesToRead, float* pBufferOut);
 
+// Same as drwav_read_f32(), except works on PCM frames instead of samples.
+drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
+
 // Low-level function for converting unsigned 8-bit PCM samples to IEEE 32-bit floating point samples.
 void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
 
@@ -559,6 +657,9 @@ void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
 // If the return value is less than <samplesToRead> it means the end of the file has been reached.
 drwav_uint64 drwav_read_s32(drwav* pWav, drwav_uint64 samplesToRead, drwav_int32* pBufferOut);
 
+// Same as drwav_read_s32(), except works on PCM frames instead of samples.
+drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
+
 // Low-level function for converting unsigned 8-bit PCM samples to signed 32-bit PCM samples.
 void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
 
@@ -593,6 +694,7 @@ void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sample
 // objects because the operating system may restrict the number of file handles an application can have open at
 // any given time.
 drwav_bool32 drwav_init_file(drwav* pWav, const char* filename);
+drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags);
 
 // Helper for initializing a wave file for writing using stdio.
 //
@@ -608,6 +710,7 @@ drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename,
 // objects because the operating system may restrict the number of file handles an application can have open at
 // any given time.
 drwav* drwav_open_file(const char* filename);
+drwav* drwav_open_file_ex(const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags);
 
 // Helper for opening a wave file for writing using stdio.
 //
@@ -626,6 +729,7 @@ drwav* drwav_open_file_write_sequential(const char* filename, const drwav_data_f
 //
 // The buffer should contain the contents of the entire wave file, not just the sample data.
 drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize);
+drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags);
 
 // Helper for initializing a writer which outputs data to a memory buffer.
 //
@@ -643,6 +747,7 @@ drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size
 //
 // The buffer should contain the contents of the entire wave file, not just the sample data.
 drwav* drwav_open_memory(const void* data, size_t dataSize);
+drwav* drwav_open_memory_ex(const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags);
 
 // Helper for opening a writer which outputs data to a memory buffer.
 //
@@ -657,19 +762,28 @@ drwav* drwav_open_memory_write_sequential(void** ppData, size_t* pDataSize, cons
 #ifndef DR_WAV_NO_CONVERSION_API
 // Opens and reads a wav file in a single operation.
 drwav_int16* drwav_open_and_read_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
 float* drwav_open_and_read_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
 drwav_int32* drwav_open_and_read_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
 #ifndef DR_WAV_NO_STDIO
 // Opens and decodes a wav file in a single operation.
-drwav_int16* drwav_open_and_read_file_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
-float* drwav_open_and_read_file_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
-drwav_int32* drwav_open_and_read_file_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int16* drwav_open_file_and_read_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
+float* drwav_open_file_and_read_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
+drwav_int32* drwav_open_file_and_read_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
 #endif
 
 // Opens and decodes a wav file from a block of memory in a single operation.
-drwav_int16* drwav_open_and_read_memory_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
-float* drwav_open_and_read_memory_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
-drwav_int32* drwav_open_and_read_memory_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int16* drwav_open_memory_and_read_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
+float* drwav_open_memory_and_read_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
+drwav_int32* drwav_open_memory_and_read_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount);
+drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount);
 #endif
 
 // Frees data that was allocated internally by dr_wav.
@@ -727,6 +841,13 @@ void drwav_free(void* pDataReturnedByOpenAndRead);
 #define drwav_copy_memory                  DRWAV_COPY_MEMORY
 #define drwav_zero_memory                  DRWAV_ZERO_MEMORY
 
+typedef drwav_int32 drwav_result;
+#define DRWAV_SUCCESS            0
+#define DRWAV_ERROR             -1
+#define DRWAV_INVALID_ARGS      -2
+#define DRWAV_INVALID_OPERATION -3
+#define DRWAV_INVALID_FILE      -100
+#define DRWAV_EOF               -101
 
 #define DRWAV_MAX_SIMD_VECTOR_SIZE         64  // 64 for AVX-512 in the future.
 
@@ -756,6 +877,7 @@ static const drwav_uint8 drwavGUID_W64_JUNK[16] = {0x6A,0x75,0x6E,0x6B, 0xF3,0xA
 static const drwav_uint8 drwavGUID_W64_FMT [16] = {0x66,0x6D,0x74,0x20, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    // 20746D66-ACF3-11D3-8CD1-00C04F8EDB8A
 static const drwav_uint8 drwavGUID_W64_FACT[16] = {0x66,0x61,0x63,0x74, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    // 74636166-ACF3-11D3-8CD1-00C04F8EDB8A
 static const drwav_uint8 drwavGUID_W64_DATA[16] = {0x64,0x61,0x74,0x61, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    // 61746164-ACF3-11D3-8CD1-00C04F8EDB8A
+static const drwav_uint8 drwavGUID_W64_SMPL[16] = {0x73,0x6D,0x70,0x6C, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    // 6C706D73-ACF3-11D3-8CD1-00C04F8EDB8A
 
 static DRWAV_INLINE drwav_bool32 drwav__guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16])
 {
@@ -823,39 +945,21 @@ static DRWAV_INLINE drwav_bool32 drwav__is_compressed_format_tag(drwav_uint16 fo
         formatTag == DR_WAVE_FORMAT_DVI_ADPCM;
 }
 
-
 drwav_uint64 drwav_read_s16__msadpcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
 drwav_uint64 drwav_read_s16__ima(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
 drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData);
 drwav* drwav_open_write__internal(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData);
 
-typedef struct
-{
-    union
-    {
-        drwav_uint8 fourcc[4];
-        drwav_uint8 guid[16];
-    } id;
-
-    // The size in bytes of the chunk.
-    drwav_uint64 sizeInBytes;
-
-    // RIFF = 2 byte alignment.
-    // W64  = 8 byte alignment.
-    unsigned int paddingSize;
-
-} drwav__chunk_header;
-
-static drwav_bool32 drwav__read_chunk_header(drwav_read_proc onRead, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav__chunk_header* pHeaderOut)
+static drwav_result drwav__read_chunk_header(drwav_read_proc onRead, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_chunk_header* pHeaderOut)
 {
     if (container == drwav_container_riff) {
         if (onRead(pUserData, pHeaderOut->id.fourcc, 4) != 4) {
-            return DRWAV_FALSE;
+            return DRWAV_EOF;
         }
 
         unsigned char sizeInBytes[4];
         if (onRead(pUserData, sizeInBytes, 4) != 4) {
-            return DRWAV_FALSE;
+            return DRWAV_INVALID_FILE;
         }
 
         pHeaderOut->sizeInBytes = drwav__bytes_to_u32(sizeInBytes);
@@ -863,12 +967,12 @@ static drwav_bool32 drwav__read_chunk_header(drwav_read_proc onRead, void* pUser
         *pRunningBytesReadOut += 8;
     } else {
         if (onRead(pUserData, pHeaderOut->id.guid, 16) != 16) {
-            return DRWAV_FALSE;
+            return DRWAV_EOF;
         }
 
         unsigned char sizeInBytes[8];
         if (onRead(pUserData, sizeInBytes, 8) != 8) {
-            return DRWAV_FALSE;
+            return DRWAV_INVALID_FILE;
         }
 
         pHeaderOut->sizeInBytes = drwav__bytes_to_u64(sizeInBytes) - 24;    // <-- Subtract 24 because w64 includes the size of the header.
@@ -876,7 +980,7 @@ static drwav_bool32 drwav__read_chunk_header(drwav_read_proc onRead, void* pUser
         *pRunningBytesReadOut += 24;
     }
 
-    return DRWAV_TRUE;
+    return DRWAV_SUCCESS;
 }
 
 static drwav_bool32 drwav__seek_forward(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData)
@@ -899,11 +1003,38 @@ static drwav_bool32 drwav__seek_forward(drwav_seek_proc onSeek, drwav_uint64 off
     return DRWAV_TRUE;
 }
 
+// Seeks to an absolute 64-bit byte offset using a callback that only accepts a signed 32-bit offset per call.
+//
+// onSeek    [in] The seek callback. It is never asked to move more than 0x7FFFFFFF bytes in one call.
+// offset    [in] The target position, in bytes from the start of the stream.
+// pUserData [in] Passed through to onSeek unchanged.
+//
+// Returns the result of the last onSeek() call, or DRWAV_FALSE as soon as an intermediate seek fails.
+static drwav_bool32 drwav__seek_from_start(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData)
+{
+    const drwav_uint64 maxStep = 0x7FFFFFFF;
+
+    // Only the very first call is relative to the start. Every later call continues from the current position.
+    drwav_seek_origin origin = drwav_seek_origin_start;
+
+    // Consume the offset in full-sized steps until what is left fits into a single call.
+    while (offset > maxStep) {
+        if (!onSeek(pUserData, 0x7FFFFFFF, origin)) {
+            return DRWAV_FALSE;
+        }
+
+        offset -= maxStep;
+        origin  = drwav_seek_origin_current;
+    }
+
+    return onSeek(pUserData, (int)offset, origin);
+}
+
 
 static drwav_bool32 drwav__read_fmt(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_fmt* fmtOut)
 {
-    drwav__chunk_header header;
-    if (!drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header)) {
+    drwav_chunk_header header;
+    if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) {
         return DRWAV_FALSE;
     }
 
@@ -916,7 +1047,7 @@ static drwav_bool32 drwav__read_fmt(drwav_read_proc onRead, drwav_seek_proc onSe
         *pRunningBytesReadOut += header.sizeInBytes + header.paddingSize;
 
         // Try the next header.
-        if (!drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header)) {
+        if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) {
             return DRWAV_FALSE;
         }
     }
@@ -1041,13 +1172,18 @@ static drwav_bool32 drwav__on_seek_stdio(void* pUserData, int offset, drwav_seek
 }
 
 drwav_bool32 drwav_init_file(drwav* pWav, const char* filename)
+{
+    return drwav_init_file_ex(pWav, filename, NULL, NULL, 0);
+}
+
+drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags)
 {
     FILE* pFile = drwav_fopen(filename, "rb");
     if (pFile == NULL) {
         return DRWAV_FALSE;
     }
 
-    return drwav_init(pWav, drwav__on_read_stdio, drwav__on_seek_stdio, (void*)pFile);
+    return drwav_init_ex(pWav, drwav__on_read_stdio, drwav__on_seek_stdio, onChunk, (void*)pFile, pChunkUserData, flags);
 }
 
 
@@ -1072,13 +1208,18 @@ drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename,
 }
 
 drwav* drwav_open_file(const char* filename)
+{
+    return drwav_open_file_ex(filename, NULL, NULL, 0);
+}
+
+drwav* drwav_open_file_ex(const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags)
 {
     FILE* pFile = drwav_fopen(filename, "rb");
     if (pFile == NULL) {
         return DRWAV_FALSE;
     }
 
-    drwav* pWav = drwav_open(drwav__on_read_stdio, drwav__on_seek_stdio, (void*)pFile);
+    drwav* pWav = drwav_open_ex(drwav__on_read_stdio, drwav__on_seek_stdio, onChunk, (void*)pFile, pChunkUserData, flags);
     if (pWav == NULL) {
         fclose(pFile);
         return NULL;
@@ -1232,6 +1373,11 @@ static drwav_bool32 drwav__on_seek_memory_write(void* pUserData, int offset, drw
 }
 
 drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize)
+{
+    return drwav_init_memory_ex(pWav, data, dataSize, NULL, NULL, 0);
+}
+
+drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags)
 {
     if (data == NULL || dataSize == 0) {
         return DRWAV_FALSE;
@@ -1243,7 +1389,7 @@ drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize)
     memoryStream.dataSize = dataSize;
     memoryStream.currentReadPos = 0;
 
-    if (!drwav_init(pWav, drwav__on_read_memory, drwav__on_seek_memory, (void*)&memoryStream)) {
+    if (!drwav_init_ex(pWav, drwav__on_read_memory, drwav__on_seek_memory, onChunk, (void*)&memoryStream, pChunkUserData, flags)) {
         return DRWAV_FALSE;
     }
 
@@ -1291,6 +1437,11 @@ drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size
 
 
 drwav* drwav_open_memory(const void* data, size_t dataSize)
+{
+    return drwav_open_memory_ex(data, dataSize, NULL, NULL, 0);
+}
+
+drwav* drwav_open_memory_ex(const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags)
 {
     if (data == NULL || dataSize == 0) {
         return NULL;
@@ -1302,7 +1453,7 @@ drwav* drwav_open_memory(const void* data, size_t dataSize)
     memoryStream.dataSize = dataSize;
     memoryStream.currentReadPos = 0;
 
-    drwav* pWav = drwav_open(drwav__on_read_memory, drwav__on_seek_memory, (void*)&memoryStream);
+    drwav* pWav = drwav_open_ex(drwav__on_read_memory, drwav__on_seek_memory, onChunk, (void*)&memoryStream, pChunkUserData, flags);
     if (pWav == NULL) {
         return NULL;
     }
@@ -1351,19 +1502,58 @@ drwav* drwav_open_memory_write_sequential(void** ppData, size_t* pDataSize, cons
 }
 
 
+size_t drwav__on_read(drwav_read_proc onRead, void* pUserData, void* pBufferOut, size_t bytesToRead, drwav_uint64* pCursor)
+{
+    drwav_assert(onRead != NULL);
+    drwav_assert(pCursor != NULL);
+    // Read through the client callback and advance the caller's running byte position by the amount actually read.
+    size_t bytesRead = onRead(pUserData, pBufferOut, bytesToRead);
+    *pCursor += bytesRead;
+    return bytesRead;   // The callback's byte count is passed back unchanged so callers can compare it to the request.
+}
+
+drwav_bool32 drwav__on_seek(drwav_seek_proc onSeek, void* pUserData, int offset, drwav_seek_origin origin, drwav_uint64* pCursor)
+{
+    drwav_assert(onSeek != NULL);
+    drwav_assert(pCursor != NULL);
+    // Seek through the client callback first; *pCursor is left untouched when the callback reports failure.
+    if (!onSeek(pUserData, offset, origin)) {
+        return DRWAV_FALSE;
+    }
+    // Mirror the successful seek in the tracked position: absolute for a seek from the start, relative otherwise.
+    if (origin == drwav_seek_origin_start) {
+        *pCursor = offset;
+    } else {
+        *pCursor += offset;
+    }
+    // The callback succeeded and the cursor has been updated to match.
+    return DRWAV_TRUE;
+}
+
+
 drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData)
+{
+    return drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0);
+}
+
+drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags)
 {
     if (onRead == NULL || onSeek == NULL) {
         return DRWAV_FALSE;
     }
 
-    drwav_zero_memory(pWav, sizeof(*pWav));
+    drwav_uint64 cursor = 0;    // <-- Keeps track of the byte position so we can seek to specific locations.
+    drwav_bool32 sequential = (flags & DRWAV_SEQUENTIAL) != 0;
 
+    drwav_zero_memory(pWav, sizeof(*pWav));
+    pWav->onRead    = onRead;
+    pWav->onSeek    = onSeek;
+    pWav->pUserData = pReadSeekUserData;
 
     // The first 4 bytes should be the RIFF identifier.
     unsigned char riff[4];
-    if (onRead(pUserData, riff, sizeof(riff)) != sizeof(riff)) {
-        return DRWAV_FALSE;    // Failed to read data.
+    if (drwav__on_read(onRead, pReadSeekUserData, riff, sizeof(riff), &cursor) != sizeof(riff)) {
+        return DRWAV_FALSE;
     }
 
     // The first 4 bytes can be used to identify the container. For RIFF files it will start with "RIFF" and for
@@ -1375,7 +1565,7 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
 
         // Check the rest of the GUID for validity.
         drwav_uint8 riff2[12];
-        if (onRead(pUserData, riff2, sizeof(riff2)) != sizeof(riff2)) {
+        if (drwav__on_read(onRead, pReadSeekUserData, riff2, sizeof(riff2), &cursor) != sizeof(riff2)) {
             return DRWAV_FALSE;
         }
 
@@ -1392,7 +1582,7 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
     if (pWav->container == drwav_container_riff) {
         // RIFF/WAVE
         unsigned char chunkSizeBytes[4];
-        if (onRead(pUserData, chunkSizeBytes, sizeof(chunkSizeBytes)) != sizeof(chunkSizeBytes)) {
+        if (drwav__on_read(onRead, pReadSeekUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
             return DRWAV_FALSE;
         }
 
@@ -1402,19 +1592,17 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
         }
 
         unsigned char wave[4];
-        if (onRead(pUserData, wave, sizeof(wave)) != sizeof(wave)) {
+        if (drwav__on_read(onRead, pReadSeekUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
             return DRWAV_FALSE;
         }
 
         if (!drwav__fourcc_equal(wave, "WAVE")) {
             return DRWAV_FALSE;    // Expecting "WAVE".
         }
-
-        pWav->dataChunkDataPos = 4 + sizeof(chunkSizeBytes) + sizeof(wave);
     } else {
         // W64
         unsigned char chunkSize[8];
-        if (onRead(pUserData, chunkSize, sizeof(chunkSize)) != sizeof(chunkSize)) {
+        if (drwav__on_read(onRead, pReadSeekUserData, chunkSize, sizeof(chunkSize), &cursor) != sizeof(chunkSize)) {
             return DRWAV_FALSE;
         }
 
@@ -1423,21 +1611,19 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
         }
 
         drwav_uint8 wave[16];
-        if (onRead(pUserData, wave, sizeof(wave)) != sizeof(wave)) {
+        if (drwav__on_read(onRead, pReadSeekUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
             return DRWAV_FALSE;
         }
 
         if (!drwav__guid_equal(wave, drwavGUID_W64_WAVE)) {
             return DRWAV_FALSE;
         }
-
-        pWav->dataChunkDataPos = 16 + sizeof(chunkSize) + sizeof(wave);
     }
 
 
     // The next bytes should be the "fmt " chunk.
     drwav_fmt fmt;
-    if (!drwav__read_fmt(onRead, onSeek, pUserData, pWav->container, &pWav->dataChunkDataPos, &fmt)) {
+    if (!drwav__read_fmt(onRead, onSeek, pReadSeekUserData, pWav->container, &cursor, &fmt)) {
         return DRWAV_FALSE;    // Failed to read the "fmt " chunk.
     }
 
@@ -1454,37 +1640,80 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
     }
 
 
+
     drwav_uint64 sampleCountFromFactChunk = 0;
 
+    // We need to enumerate over each chunk for two reasons:
+    //   1) The "data" chunk may not be the next one
+    //   2) We may want to report each chunk back to the client
+    //
+    // In order to correctly report each chunk back to the client we will need to keep looping until the end of the file.
+    drwav_bool32 foundDataChunk = DRWAV_FALSE;
+    drwav_uint64 dataChunkSize = 0;
+
     // The next chunk we care about is the "data" chunk. This is not necessarily the next chunk so we'll need to loop.
-    drwav_uint64 dataSize;
+    drwav_uint64 chunkSize = 0;
     for (;;)
     {
-        drwav__chunk_header header;
-        if (!drwav__read_chunk_header(onRead, pUserData, pWav->container, &pWav->dataChunkDataPos, &header)) {
-            return DRWAV_FALSE;
+        drwav_chunk_header header;
+        drwav_result result = drwav__read_chunk_header(onRead, pReadSeekUserData, pWav->container, &cursor, &header);
+        if (result != DRWAV_SUCCESS) {
+            if (!foundDataChunk) {
+                return DRWAV_FALSE;
+            } else {
+                break;  // Probably at the end of the file. Get out of the loop.
+            }
         }
 
-        dataSize = header.sizeInBytes;
+        // Tell the client about this chunk.
+        if (!sequential && onChunk != NULL) {
+            drwav_uint64 callbackBytesRead = onChunk(pChunkUserData, onRead, onSeek, pReadSeekUserData, &header);
+
+            // dr_wav may need to read the contents of the chunk, so we now need to seek back to the position before
+            // we called the callback.
+            if (callbackBytesRead > 0) {
+                if (!drwav__seek_from_start(onSeek, cursor, pReadSeekUserData)) {
+                    return DRWAV_FALSE;
+                }
+            }
+        }
+        
+
+        if (!foundDataChunk) {
+            pWav->dataChunkDataPos = cursor;
+        }
+
+        chunkSize = header.sizeInBytes;
         if (pWav->container == drwav_container_riff) {
             if (drwav__fourcc_equal(header.id.fourcc, "data")) {
-                break;
+                foundDataChunk = DRWAV_TRUE;
+                dataChunkSize = chunkSize;
             }
         } else {
             if (drwav__guid_equal(header.id.guid, drwavGUID_W64_DATA)) {
-                break;
+                foundDataChunk = DRWAV_TRUE;
+                dataChunkSize = chunkSize;
             }
         }
 
+        // If at this point we have found the data chunk and we're running in sequential mode, we need to break out of this loop. The reason for
+        // this is that we would otherwise require a backwards seek which sequential mode forbids.
+        if (foundDataChunk && sequential) {
+            break;
+        }
+
         // Optional. Get the total sample count from the FACT chunk. This is useful for compressed formats.
         if (pWav->container == drwav_container_riff) {
             if (drwav__fourcc_equal(header.id.fourcc, "fact")) {
                 drwav_uint32 sampleCount;
-                if (onRead(pUserData, &sampleCount, 4) != 4) {
+                if (drwav__on_read(onRead, pReadSeekUserData, &sampleCount, 4, &cursor) != 4) {
                     return DRWAV_FALSE;
                 }
-                pWav->dataChunkDataPos += 4;
-                dataSize -= 4;
+                chunkSize -= 4;
+
+                if (!foundDataChunk) {
+                    pWav->dataChunkDataPos = cursor;
+                }
 
                 // The sample count in the "fact" chunk is either unreliable, or I'm not understanding it properly. For now I am only enabling this
                 // for Microsoft ADPCM formats.
@@ -1496,52 +1725,118 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
             }
         } else {
             if (drwav__guid_equal(header.id.guid, drwavGUID_W64_FACT)) {
-                if (onRead(pUserData, &sampleCountFromFactChunk, 8) != 8) {
+                if (drwav__on_read(onRead, pReadSeekUserData, &sampleCountFromFactChunk, 8, &cursor) != 8) {
                     return DRWAV_FALSE;
                 }
-                pWav->dataChunkDataPos += 8;
-                dataSize -= 8;
+                chunkSize -= 8;
+
+                if (!foundDataChunk) {
+                    pWav->dataChunkDataPos = cursor;
+                }
             }
         }
 
-        // If we get here it means we didn't find the "data" chunk. Seek past it.
+        // "smpl" chunk.
+        if (pWav->container == drwav_container_riff) {
+            if (drwav__fourcc_equal(header.id.fourcc, "smpl")) {
+                unsigned char smplHeaderData[36];    // 36 = size of the smpl header section, not including the loop data.
+                if (chunkSize >= sizeof(smplHeaderData)) {
+                    drwav_uint64 bytesJustRead = drwav__on_read(onRead, pReadSeekUserData, smplHeaderData, sizeof(smplHeaderData), &cursor);
+                    chunkSize -= bytesJustRead;
+
+                    if (bytesJustRead == sizeof(smplHeaderData)) {
+                        pWav->smpl.manufacturer      = drwav__bytes_to_u32(smplHeaderData+0);
+                        pWav->smpl.product           = drwav__bytes_to_u32(smplHeaderData+4);
+                        pWav->smpl.samplePeriod      = drwav__bytes_to_u32(smplHeaderData+8);
+                        pWav->smpl.midiUnityNotes    = drwav__bytes_to_u32(smplHeaderData+12);
+                        pWav->smpl.midiPitchFraction = drwav__bytes_to_u32(smplHeaderData+16);
+                        pWav->smpl.smpteFormat       = drwav__bytes_to_u32(smplHeaderData+20);
+                        pWav->smpl.smpteOffset       = drwav__bytes_to_u32(smplHeaderData+24);
+                        pWav->smpl.numSampleLoops    = drwav__bytes_to_u32(smplHeaderData+28);
+                        pWav->smpl.samplerData       = drwav__bytes_to_u32(smplHeaderData+32);
+
+                        for (drwav_uint32 iLoop = 0; iLoop < pWav->smpl.numSampleLoops && iLoop < drwav_countof(pWav->smpl.loops); ++iLoop) {
+                            unsigned char smplLoopData[24];  // 24 = size of a loop section in the smpl chunk.
+                            bytesJustRead = drwav__on_read(onRead, pReadSeekUserData, smplLoopData, sizeof(smplLoopData), &cursor);
+                            chunkSize -= bytesJustRead;
+
+                            if (bytesJustRead == sizeof(smplLoopData)) {
+                                pWav->smpl.loops[iLoop].cuePointId = drwav__bytes_to_u32(smplLoopData+0);
+                                pWav->smpl.loops[iLoop].type       = drwav__bytes_to_u32(smplLoopData+4);
+                                pWav->smpl.loops[iLoop].start      = drwav__bytes_to_u32(smplLoopData+8);
+                                pWav->smpl.loops[iLoop].end        = drwav__bytes_to_u32(smplLoopData+12);
+                                pWav->smpl.loops[iLoop].fraction   = drwav__bytes_to_u32(smplLoopData+16);
+                                pWav->smpl.loops[iLoop].playCount  = drwav__bytes_to_u32(smplLoopData+20);
+                            } else {
+                                break;  // Break from the smpl loop for loop.
+                            }
+                        }
+                    }
+                } else {
+                    // Looks like invalid data. Ignore the chunk.
+                }
+            }
+        } else {
+            if (drwav__guid_equal(header.id.guid, drwavGUID_W64_SMPL)) {
+                // This path will be hit when a W64 WAV file contains a smpl chunk. I don't have a sample file to test this path, so a contribution
+                // is welcome to add support for this.
+            }
+        }
 
         // Make sure we seek past the padding.
-        dataSize += header.paddingSize;
-        drwav__seek_forward(onSeek, dataSize, pUserData);
-        pWav->dataChunkDataPos += dataSize;
+        chunkSize += header.paddingSize;
+        if (!drwav__seek_forward(onSeek, chunkSize, pReadSeekUserData)) {
+            break;
+        }
+        cursor += chunkSize;
+
+        if (!foundDataChunk) {
+            pWav->dataChunkDataPos = cursor;
+        }
     }
 
+    // If we haven't found a data chunk, return an error.
+    if (!foundDataChunk) {
+        return DRWAV_FALSE;
+    }
+
+    // We may have moved past the data chunk. If so we need to move back. If running in sequential mode we can assume we are already sitting on the data chunk.
+    if (!sequential) {
+        if (!drwav__seek_from_start(onSeek, pWav->dataChunkDataPos, pReadSeekUserData)) {
+            return DRWAV_FALSE;
+        }
+        cursor = pWav->dataChunkDataPos;
+    }
+    
+
     // At this point we should be sitting on the first byte of the raw audio data.
 
-    pWav->onRead              = onRead;
-    pWav->onSeek              = onSeek;
-    pWav->pUserData           = pUserData;
     pWav->fmt                 = fmt;
     pWav->sampleRate          = fmt.sampleRate;
     pWav->channels            = fmt.channels;
     pWav->bitsPerSample       = fmt.bitsPerSample;
-    pWav->bytesPerSample      = fmt.blockAlign / fmt.channels;
-    pWav->bytesRemaining      = dataSize;
+    pWav->bytesRemaining      = dataChunkSize;
     pWav->translatedFormatTag = translatedFormatTag;
-    pWav->dataChunkDataSize   = dataSize;
+    pWav->dataChunkDataSize   = dataChunkSize;
 
-    // The bytes per sample should never be 0 at this point. This would indicate an invalid WAV file.
-    if (pWav->bytesPerSample == 0) {
-        return DRWAV_FALSE;
+    // The number of bytes per sample is based on the bits per sample or the block align. We prioritize floor(bitsPerSample/8), but if
+    // this is zero or the bits per sample is not a multiple of 8 we need to fall back to the block align.
+    pWav->bytesPerSample = pWav->bitsPerSample/8;
+    if (pWav->bytesPerSample == 0 || (pWav->bitsPerSample & 0x7) != 0 /*|| pWav->bytesPerSample < fmt.blockAlign/fmt.channels*/) {
+        pWav->bytesPerSample = fmt.blockAlign/fmt.channels;
     }
 
     if (sampleCountFromFactChunk != 0) {
         pWav->totalSampleCount = sampleCountFromFactChunk * fmt.channels;
     } else {
-        pWav->totalSampleCount = dataSize / pWav->bytesPerSample;
+        pWav->totalSampleCount = dataChunkSize / pWav->bytesPerSample;
 
         if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-            drwav_uint64 blockCount = dataSize / fmt.blockAlign;
+            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
             pWav->totalSampleCount = (blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2;  // x2 because two samples per byte.
         }
         if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-            drwav_uint64 blockCount = dataSize / fmt.blockAlign;
+            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
             pWav->totalSampleCount = ((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels);
         }
     }
@@ -1566,11 +1861,11 @@ drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onS
     // always include the sample count. This little block of code below is only used to emulate the libsndfile logic so I can properly run my
     // correctness tests against libsndfile, and is disabled by default.
     if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-        drwav_uint64 blockCount = dataSize / fmt.blockAlign;
+        drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
         pWav->totalSampleCount = (blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2;  // x2 because two samples per byte.
     }
     if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-        drwav_uint64 blockCount = dataSize / fmt.blockAlign;
+        drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
         pWav->totalSampleCount = ((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels);
     }
 #endif
@@ -1819,13 +2114,18 @@ void drwav_uninit(drwav* pWav)
 
 
 drwav* drwav_open(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData)
+{
+    return drwav_open_ex(onRead, onSeek, NULL, pUserData, NULL, 0);
+}
+
+drwav* drwav_open_ex(drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags)
 {
     drwav* pWav = (drwav*)DRWAV_MALLOC(sizeof(*pWav));
     if (pWav == NULL) {
         return NULL;
     }
 
-    if (!drwav_init(pWav, onRead, onSeek, pUserData)) {
+    if (!drwav_init_ex(pWav, onRead, onSeek, onChunk, pReadSeekUserData, pChunkUserData, flags)) {
         DRWAV_FREE(pWav);
         return NULL;
     }
@@ -1884,7 +2184,7 @@ size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut)
 
 drwav_uint64 drwav_read(drwav* pWav, drwav_uint64 samplesToRead, void* pBufferOut)
 {
-    if (pWav == NULL || samplesToRead == 0 || pBufferOut == NULL) {
+    if (pWav == NULL || samplesToRead == 0 || pBufferOut == NULL || pWav->bytesPerSample == 0) {
         return 0;
     }
 
@@ -1902,6 +2202,11 @@ drwav_uint64 drwav_read(drwav* pWav, drwav_uint64 samplesToRead, void* pBufferOu
     return bytesRead / pWav->bytesPerSample;
 }
 
+drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
+{   // Frame-based drwav_read(). Returns 0 for a NULL decoder or zero channels (previously a NULL dereference / division by zero).
+    return (pWav == NULL || pWav->channels == 0) ? 0 : drwav_read(pWav, framesToRead * pWav->channels, pBufferOut) / pWav->channels;
+}
+
 drwav_bool32 drwav_seek_to_first_sample(drwav* pWav)
 {
     if (pWav->onWrite != NULL) {
@@ -2015,6 +2320,11 @@ drwav_bool32 drwav_seek_to_sample(drwav* pWav, drwav_uint64 sample)
     return DRWAV_TRUE;
 }
 
+drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex)
+{   // Frame-based drwav_seek_to_sample(). A NULL decoder now yields DRWAV_FALSE instead of being dereferenced.
+    return (pWav == NULL) ? DRWAV_FALSE : drwav_seek_to_sample(pWav, targetFrameIndex * pWav->channels);
+}
+
 
 size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData)
 {
@@ -2060,6 +2370,11 @@ drwav_uint64 drwav_write(drwav* pWav, drwav_uint64 samplesToWrite, const void* p
     return (bytesWritten * 8) / pWav->bitsPerSample;
 }
 
+drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
+{   // Frame-based drwav_write(). Returns 0 for a NULL encoder or zero channels (previously a NULL dereference / division by zero).
+    return (pWav == NULL || pWav->channels == 0) ? 0 : drwav_write(pWav, framesToWrite * pWav->channels, pData) / pWav->channels;
+}
+
 
 
 drwav_uint64 drwav_read_s16__msadpcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut)
@@ -2482,9 +2797,13 @@ static void drwav__ieee_to_s16(drwav_int16* pOut, const unsigned char* pIn, size
 drwav_uint64 drwav_read_s16__pcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut)
 {
     // Fast path.
-    if (pWav->bytesPerSample == 2) {
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 16) {
         return drwav_read(pWav, samplesToRead, pBufferOut);
     }
+    
+    if (pWav->bytesPerSample == 0) {
+        return 0;
+    }
 
     drwav_uint64 totalSamplesRead = 0;
     unsigned char sampleData[4096];
@@ -2506,6 +2825,10 @@ drwav_uint64 drwav_read_s16__pcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_
 
 drwav_uint64 drwav_read_s16__ieee(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut)
 {
+    if (pWav->bytesPerSample == 0) {
+        return 0;
+    }
+
     drwav_uint64 totalSamplesRead = 0;
     unsigned char sampleData[4096];
     while (samplesToRead > 0) {
@@ -2526,6 +2849,10 @@ drwav_uint64 drwav_read_s16__ieee(drwav* pWav, drwav_uint64 samplesToRead, drwav
 
 drwav_uint64 drwav_read_s16__alaw(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut)
 {
+    if (pWav->bytesPerSample == 0) {
+        return 0;
+    }
+
     drwav_uint64 totalSamplesRead = 0;
     unsigned char sampleData[4096];
     while (samplesToRead > 0) {
@@ -2546,6 +2873,10 @@ drwav_uint64 drwav_read_s16__alaw(drwav* pWav, drwav_uint64 samplesToRead, drwav
 
 drwav_uint64 drwav_read_s16__mulaw(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut)
 {
+    if (pWav->bytesPerSample == 0) {
+        return 0;
+    }
+
     drwav_uint64 totalSamplesRead = 0;
     unsigned char sampleData[4096];
     while (samplesToRead > 0) {
@@ -2602,6 +2933,11 @@ drwav_uint64 drwav_read_s16(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16
     return 0;
 }
 
+drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
+{   // Frame-based drwav_read_s16(). Returns 0 for a NULL decoder or zero channels (previously a NULL dereference / division by zero).
+    return (pWav == NULL || pWav->channels == 0) ? 0 : drwav_read_s16(pWav, framesToRead * pWav->channels, pBufferOut) / pWav->channels;
+}
+
 void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
 {
     int r;
@@ -2812,10 +3148,10 @@ drwav_uint64 drwav_read_f32__ima(drwav* pWav, drwav_uint64 samplesToRead, float*
 drwav_uint64 drwav_read_f32__ieee(drwav* pWav, drwav_uint64 samplesToRead, float* pBufferOut)
 {
     // Fast path.
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bytesPerSample == 4) {
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bitsPerSample == 32) {
         return drwav_read(pWav, samplesToRead, pBufferOut);
     }
-
+    
     if (pWav->bytesPerSample == 0) {
         return 0;
     }
@@ -2924,6 +3260,11 @@ drwav_uint64 drwav_read_f32(drwav* pWav, drwav_uint64 samplesToRead, float* pBuf
     return 0;
 }
 
+drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
+{   // Frame-based drwav_read_f32(). Returns 0 for a NULL decoder or zero channels (previously a NULL dereference / division by zero).
+    return (pWav == NULL || pWav->channels == 0) ? 0 : drwav_read_f32(pWav, framesToRead * pWav->channels, pBufferOut) / pWav->channels;
+}
+
 void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
 {
     if (pOut == NULL || pIn == NULL) {
@@ -3085,10 +3426,10 @@ static void drwav__ieee_to_s32(drwav_int32* pOut, const unsigned char* pIn, size
 drwav_uint64 drwav_read_s32__pcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int32* pBufferOut)
 {
     // Fast path.
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bytesPerSample == 4) {
+    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 32) {
         return drwav_read(pWav, samplesToRead, pBufferOut);
     }
-
+    
     if (pWav->bytesPerSample == 0) {
         return 0;
     }
@@ -3266,6 +3607,11 @@ drwav_uint64 drwav_read_s32(drwav* pWav, drwav_uint64 samplesToRead, drwav_int32
     return 0;
 }
 
+drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
+{   // Frame-based drwav_read_s32(). Returns 0 for a NULL decoder or zero channels (previously a NULL dereference / division by zero).
+    return (pWav == NULL || pWav->channels == 0) ? 0 : drwav_read_s32(pWav, framesToRead * pWav->channels, pBufferOut) / pWav->channels;
+}
+
 void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
 {
     if (pOut == NULL || pIn == NULL) {
@@ -3446,8 +3792,8 @@ drwav_int32* drwav__read_and_close_s32(drwav* pWav, unsigned int* channels, unsi
 
 drwav_int16* drwav_open_and_read_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
-    if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
+    if (sampleRate) *sampleRate = 0;
     if (totalSampleCount) *totalSampleCount = 0;
 
     drwav wav;
@@ -3458,6 +3804,27 @@ drwav_int16* drwav_open_and_read_s16(drwav_read_proc onRead, drwav_seek_proc onS
     return drwav__read_and_close_s16(&wav, channels, sampleRate, totalSampleCount);
 }
 
+drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{
+    // Frame-count flavour of drwav_open_and_read_s16(): same returned buffer, but the count is total samples / channels.
+    unsigned int channelCount;
+    unsigned int rate;
+    drwav_uint64 sampleCount;
+
+    // Every requested output starts at zero so that it reads as 0 when decoding fails.
+    if (channelsOut != NULL) { *channelsOut = 0; }
+    if (sampleRateOut != NULL) { *sampleRateOut = 0; }
+    if (totalFrameCountOut != NULL) { *totalFrameCountOut = 0; }
+
+    drwav_int16* pSamples = drwav_open_and_read_s16(onRead, onSeek, pUserData, &channelCount, &rate, &sampleCount);
+    if (pSamples != NULL) {
+        if (channelsOut != NULL) { *channelsOut = channelCount; }
+        if (sampleRateOut != NULL) { *sampleRateOut = rate; }
+        if (totalFrameCountOut != NULL) { *totalFrameCountOut = sampleCount / channelCount; }
+    }
+    return pSamples;
+}
+
 float* drwav_open_and_read_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
@@ -3472,6 +3839,27 @@ float* drwav_open_and_read_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, v
     return drwav__read_and_close_f32(&wav, channels, sampleRate, totalSampleCount);
 }
 
+float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{
+    // Frame-count flavour of drwav_open_and_read_f32(): same returned buffer, but the count is total samples / channels.
+    unsigned int channelCount;
+    unsigned int rate;
+    drwav_uint64 sampleCount;
+
+    // Every requested output starts at zero so that it reads as 0 when decoding fails.
+    if (channelsOut != NULL) { *channelsOut = 0; }
+    if (sampleRateOut != NULL) { *sampleRateOut = 0; }
+    if (totalFrameCountOut != NULL) { *totalFrameCountOut = 0; }
+
+    float* pSamples = drwav_open_and_read_f32(onRead, onSeek, pUserData, &channelCount, &rate, &sampleCount);
+    if (pSamples != NULL) {
+        if (channelsOut != NULL) { *channelsOut = channelCount; }
+        if (sampleRateOut != NULL) { *sampleRateOut = rate; }
+        if (totalFrameCountOut != NULL) { *totalFrameCountOut = sampleCount / channelCount; }
+    }
+    return pSamples;
+}
+
 drwav_int32* drwav_open_and_read_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
@@ -3486,8 +3874,29 @@ drwav_int32* drwav_open_and_read_s32(drwav_read_proc onRead, drwav_seek_proc onS
     return drwav__read_and_close_s32(&wav, channels, sampleRate, totalSampleCount);
 }
 
+drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{
+    // Frame-count flavour of drwav_open_and_read_s32(): same returned buffer, but the count is total samples / channels.
+    unsigned int channelCount;
+    unsigned int rate;
+    drwav_uint64 sampleCount;
+
+    // Every requested output starts at zero so that it reads as 0 when decoding fails.
+    if (channelsOut != NULL) { *channelsOut = 0; }
+    if (sampleRateOut != NULL) { *sampleRateOut = 0; }
+    if (totalFrameCountOut != NULL) { *totalFrameCountOut = 0; }
+
+    drwav_int32* pSamples = drwav_open_and_read_s32(onRead, onSeek, pUserData, &channelCount, &rate, &sampleCount);
+    if (pSamples != NULL) {
+        if (channelsOut != NULL) { *channelsOut = channelCount; }
+        if (sampleRateOut != NULL) { *sampleRateOut = rate; }
+        if (totalFrameCountOut != NULL) { *totalFrameCountOut = sampleCount / channelCount; }
+    }
+    return pSamples;
+}
+
 #ifndef DR_WAV_NO_STDIO
-drwav_int16* drwav_open_and_read_file_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
+drwav_int16* drwav_open_file_and_read_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
@@ -3501,7 +3910,28 @@ drwav_int16* drwav_open_and_read_file_s16(const char* filename, unsigned int* ch
     return drwav__read_and_close_s16(&wav, channels, sampleRate, totalSampleCount);
 }
 
-float* drwav_open_and_read_file_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
+drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{   // Wraps drwav_open_file_and_read_s16() and reports the length in PCM frames rather than samples. Each *Out pointer may be NULL.
+    if (channelsOut) *channelsOut = 0;                  // Clear every supplied output first so it reads as 0 on the NULL-return path below.
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalFrameCountOut) *totalFrameCountOut = 0;
+
+    unsigned int channels;          // Locals receive the callee's results; the caller's optional pointers are never handed to it.
+    unsigned int sampleRate;
+    drwav_uint64 totalSampleCount;
+    drwav_int16* result = drwav_open_file_and_read_s16(filename, &channels, &sampleRate, &totalSampleCount);
+    if (result == NULL) {
+        return NULL;    // Outputs keep the zeros written above.
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalFrameCountOut) *totalFrameCountOut = totalSampleCount / channels;  // Samples -> frames. NOTE(review): assumes channels != 0 whenever the callee succeeds - confirm.
+
+    return result;      // The buffer returned by drwav_open_file_and_read_s16() is passed through untouched.
+}
+
+float* drwav_open_file_and_read_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
@@ -3515,7 +3945,28 @@ float* drwav_open_and_read_file_f32(const char* filename, unsigned int* channels
     return drwav__read_and_close_f32(&wav, channels, sampleRate, totalSampleCount);
 }
 
-drwav_int32* drwav_open_and_read_file_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
+float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{   // Wraps drwav_open_file_and_read_f32() and reports the length in PCM frames rather than samples. Each *Out pointer may be NULL.
+    if (channelsOut) *channelsOut = 0;                  // Clear every supplied output first so it reads as 0 on the NULL-return path below.
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalFrameCountOut) *totalFrameCountOut = 0;
+
+    unsigned int channels;          // Locals receive the callee's results; the caller's optional pointers are never handed to it.
+    unsigned int sampleRate;
+    drwav_uint64 totalSampleCount;
+    float* result = drwav_open_file_and_read_f32(filename, &channels, &sampleRate, &totalSampleCount);
+    if (result == NULL) {
+        return NULL;    // Outputs keep the zeros written above.
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalFrameCountOut) *totalFrameCountOut = totalSampleCount / channels;  // Samples -> frames. NOTE(review): assumes channels != 0 whenever the callee succeeds - confirm.
+
+    return result;      // The buffer returned by drwav_open_file_and_read_f32() is passed through untouched.
+}
+
+drwav_int32* drwav_open_file_and_read_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
@@ -3528,9 +3979,30 @@ drwav_int32* drwav_open_and_read_file_s32(const char* filename, unsigned int* ch
 
     return drwav__read_and_close_s32(&wav, channels, sampleRate, totalSampleCount);
 }
+
+drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{   // Wraps drwav_open_file_and_read_s32() and reports the length in PCM frames rather than samples. Each *Out pointer may be NULL.
+    if (channelsOut) *channelsOut = 0;                  // Clear every supplied output first so it reads as 0 on the NULL-return path below.
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalFrameCountOut) *totalFrameCountOut = 0;
+
+    unsigned int channels;          // Locals receive the callee's results; the caller's optional pointers are never handed to it.
+    unsigned int sampleRate;
+    drwav_uint64 totalSampleCount;
+    drwav_int32* result = drwav_open_file_and_read_s32(filename, &channels, &sampleRate, &totalSampleCount);
+    if (result == NULL) {
+        return NULL;    // Outputs keep the zeros written above.
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalFrameCountOut) *totalFrameCountOut = totalSampleCount / channels;  // Samples -> frames. NOTE(review): assumes channels != 0 whenever the callee succeeds - confirm.
+
+    return result;      // The buffer returned by drwav_open_file_and_read_s32() is passed through untouched.
+}
 #endif
 
-drwav_int16* drwav_open_and_read_memory_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
+drwav_int16* drwav_open_memory_and_read_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
@@ -3544,7 +4016,28 @@ drwav_int16* drwav_open_and_read_memory_s16(const void* data, size_t dataSize, u
     return drwav__read_and_close_s16(&wav, channels, sampleRate, totalSampleCount);
 }
 
-float* drwav_open_and_read_memory_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
+drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{   // Wraps drwav_open_memory_and_read_s16() and reports the length in PCM frames rather than samples. Each *Out pointer may be NULL.
+    if (channelsOut) *channelsOut = 0;                  // Clear every supplied output first so it reads as 0 on the NULL-return path below.
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalFrameCountOut) *totalFrameCountOut = 0;
+
+    unsigned int channels;          // Locals receive the callee's results; the caller's optional pointers are never handed to it.
+    unsigned int sampleRate;
+    drwav_uint64 totalSampleCount;
+    drwav_int16* result = drwav_open_memory_and_read_s16(data, dataSize, &channels, &sampleRate, &totalSampleCount);
+    if (result == NULL) {
+        return NULL;    // Outputs keep the zeros written above.
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalFrameCountOut) *totalFrameCountOut = totalSampleCount / channels;  // Samples -> frames. NOTE(review): assumes channels != 0 whenever the callee succeeds - confirm.
+
+    return result;      // The buffer returned by drwav_open_memory_and_read_s16() is passed through untouched.
+}
+
+float* drwav_open_memory_and_read_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
@@ -3558,7 +4051,28 @@ float* drwav_open_and_read_memory_f32(const void* data, size_t dataSize, unsigne
     return drwav__read_and_close_f32(&wav, channels, sampleRate, totalSampleCount);
 }
 
-drwav_int32* drwav_open_and_read_memory_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
+float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{   // Wraps drwav_open_memory_and_read_f32() and reports the length in PCM frames rather than samples. Each *Out pointer may be NULL.
+    if (channelsOut) *channelsOut = 0;                  // Clear every supplied output first so it reads as 0 on the NULL-return path below.
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalFrameCountOut) *totalFrameCountOut = 0;
+
+    unsigned int channels;          // Locals receive the callee's results; the caller's optional pointers are never handed to it.
+    unsigned int sampleRate;
+    drwav_uint64 totalSampleCount;
+    float* result = drwav_open_memory_and_read_f32(data, dataSize, &channels, &sampleRate, &totalSampleCount);
+    if (result == NULL) {
+        return NULL;    // Outputs keep the zeros written above.
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalFrameCountOut) *totalFrameCountOut = totalSampleCount / channels;  // Samples -> frames. NOTE(review): assumes channels != 0 whenever the callee succeeds - confirm.
+
+    return result;      // The buffer returned by drwav_open_memory_and_read_f32() is passed through untouched.
+}
+
+drwav_int32* drwav_open_memory_and_read_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalSampleCount)
 {
     if (sampleRate) *sampleRate = 0;
     if (channels) *channels = 0;
@@ -3571,6 +4085,27 @@ drwav_int32* drwav_open_and_read_memory_s32(const void* data, size_t dataSize, u
 
     return drwav__read_and_close_s32(&wav, channels, sampleRate, totalSampleCount);
 }
+
+drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut)
+{   // Wraps drwav_open_memory_and_read_s32() and reports the length in PCM frames rather than samples. Each *Out pointer may be NULL.
+    if (channelsOut) *channelsOut = 0;                  // Clear every supplied output first so it reads as 0 on the NULL-return path below.
+    if (sampleRateOut) *sampleRateOut = 0;
+    if (totalFrameCountOut) *totalFrameCountOut = 0;
+
+    unsigned int channels;          // Locals receive the callee's results; the caller's optional pointers are never handed to it.
+    unsigned int sampleRate;
+    drwav_uint64 totalSampleCount;
+    drwav_int32* result = drwav_open_memory_and_read_s32(data, dataSize, &channels, &sampleRate, &totalSampleCount);
+    if (result == NULL) {
+        return NULL;    // Outputs keep the zeros written above.
+    }
+
+    if (channelsOut) *channelsOut = channels;
+    if (sampleRateOut) *sampleRateOut = sampleRate;
+    if (totalFrameCountOut) *totalFrameCountOut = totalSampleCount / channels;  // Samples -> frames. NOTE(review): assumes channels != 0 whenever the callee succeeds - confirm.
+
+    return result;      // The buffer returned by drwav_open_memory_and_read_s32() is passed through untouched.
+}
 #endif  //DR_WAV_NO_CONVERSION_API
 
 
@@ -3584,6 +4119,15 @@ void drwav_free(void* pDataReturnedByOpenAndRead)
 
 // REVISION HISTORY
 //
+// v0.9.0 - 2018-xx-xx
+//   - API CHANGE: Rename drwav_open_and_read_file_*() to drwav_open_file_and_read_*().
+//   - API CHANGE: Rename drwav_open_and_read_memory_*() to drwav_open_memory_and_read_*().
+//   - Add built-in support for smpl chunks.
+//   - Add support for firing a callback for each chunk in the file at initialization time.
+//     - This is enabled through the drwav_init_ex(), etc. family of APIs.
+//   - Add new reading APIs for reading by PCM frames instead of samples.
+//   - Handle invalid FMT chunks more robustly.
+//
 // v0.8.5 - 2018-09-11
 //   - Const correctness.
 //   - Fix a potential stack overflow.
diff --git a/mini_al.h b/mini_al.h
index 4dbc17f8..08210c15 100644
--- a/mini_al.h
+++ b/mini_al.h
@@ -1,5 +1,5 @@
 // Audio playback and capture library. Public domain. See "unlicense" statement at the end of this file.
-// mini_al - v0.8.13 - 2018-12-04
+// mini_al - v0.8.14 - 2018-12-16
 //
 // David Reid - davidreidsoftware@gmail.com
 
@@ -18247,10 +18247,16 @@ void mal_device_uninit__opensl(mal_device* pDevice)
 
     // Uninit device.
     if (pDevice->type == mal_device_type_playback) {
-        if (pDevice->opensl.pAudioPlayerObj) MAL_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->Destroy((SLObjectItf)pDevice->opensl.pAudioPlayerObj);
-        if (pDevice->opensl.pOutputMixObj) MAL_OPENSL_OBJ(pDevice->opensl.pOutputMixObj)->Destroy((SLObjectItf)pDevice->opensl.pOutputMixObj);
+        if (pDevice->opensl.pAudioPlayerObj) {
+            MAL_OPENSL_OBJ(pDevice->opensl.pAudioPlayerObj)->Destroy((SLObjectItf)pDevice->opensl.pAudioPlayerObj);
+        }
+        if (pDevice->opensl.pOutputMixObj) {
+            MAL_OPENSL_OBJ(pDevice->opensl.pOutputMixObj)->Destroy((SLObjectItf)pDevice->opensl.pOutputMixObj);
+        }
     } else {
-        if (pDevice->opensl.pAudioRecorderObj) MAL_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->Destroy((SLObjectItf)pDevice->opensl.pAudioRecorderObj);
+        if (pDevice->opensl.pAudioRecorderObj) {
+            MAL_OPENSL_OBJ(pDevice->opensl.pAudioRecorderObj)->Destroy((SLObjectItf)pDevice->opensl.pAudioRecorderObj);
+        }
     }
 
     mal_free(pDevice->opensl.pBuffer);
@@ -27083,7 +27089,7 @@ mal_result mal_decoder_internal_on_seek_to_frame__wav(mal_decoder* pDecoder, mal
     drwav* pWav = (drwav*)pDecoder->pInternalDecoder;
     mal_assert(pWav != NULL);
 
-    drwav_bool32 result = drwav_seek_to_sample(pWav, frameIndex*pWav->channels);
+    drwav_bool32 result = drwav_seek_to_pcm_frame(pWav, frameIndex);
     if (result) {
         return MAL_SUCCESS;
     } else {
@@ -27108,9 +27114,9 @@ mal_uint32 mal_decoder_internal_on_read_frames__wav(mal_dsp* pDSP, mal_uint32 fr
     mal_assert(pWav != NULL);
 
     switch (pDecoder->internalFormat) {
-        case mal_format_s16: return (mal_uint32)drwav_read_s16(pWav, frameCount*pDecoder->internalChannels, (drwav_int16*)pSamplesOut) / pDecoder->internalChannels;
-        case mal_format_s32: return (mal_uint32)drwav_read_s32(pWav, frameCount*pDecoder->internalChannels, (drwav_int32*)pSamplesOut) / pDecoder->internalChannels;
-        case mal_format_f32: return (mal_uint32)drwav_read_f32(pWav, frameCount*pDecoder->internalChannels,       (float*)pSamplesOut) / pDecoder->internalChannels;
+        case mal_format_s16: return (mal_uint32)drwav_read_pcm_frames_s16(pWav, frameCount, (drwav_int16*)pSamplesOut);
+        case mal_format_s32: return (mal_uint32)drwav_read_pcm_frames_s32(pWav, frameCount, (drwav_int32*)pSamplesOut);
+        case mal_format_f32: return (mal_uint32)drwav_read_pcm_frames_f32(pWav, frameCount,       (float*)pSamplesOut);
         default: break;
     }
 
@@ -27210,7 +27216,7 @@ mal_result mal_decoder_internal_on_seek_to_frame__flac(mal_decoder* pDecoder, ma
     drflac* pFlac = (drflac*)pDecoder->pInternalDecoder;
     mal_assert(pFlac != NULL);
 
-    drflac_bool32 result = drflac_seek_to_sample(pFlac, frameIndex*pFlac->channels);
+    drflac_bool32 result = drflac_seek_to_pcm_frame(pFlac, frameIndex);
     if (result) {
         return MAL_SUCCESS;
     } else {
@@ -27235,7 +27241,7 @@ mal_uint32 mal_decoder_internal_on_read_frames__flac(mal_dsp* pDSP, mal_uint32 f
     drflac* pFlac = (drflac*)pDecoder->pInternalDecoder;
     mal_assert(pFlac != NULL);
 
-    return (mal_uint32)drflac_read_s32(pFlac, frameCount*pDecoder->internalChannels, (drflac_int32*)pSamplesOut) / pDecoder->internalChannels;
+    return (mal_uint32)drflac_read_pcm_frames_s32(pFlac, frameCount, (drflac_int32*)pSamplesOut);
 }
 
 mal_result mal_decoder_init_flac__internal(const mal_decoder_config* pConfig, mal_decoder* pDecoder)
@@ -27574,7 +27580,7 @@ mal_result mal_decoder_internal_on_seek_to_frame__mp3(mal_decoder* pDecoder, mal
     drmp3* pMP3 = (drmp3*)pDecoder->pInternalDecoder;
     mal_assert(pMP3 != NULL);
 
-    drmp3_bool32 result = drmp3_seek_to_frame(pMP3, frameIndex);
+    drmp3_bool32 result = drmp3_seek_to_pcm_frame(pMP3, frameIndex);
     if (result) {
         return MAL_SUCCESS;
     } else {
@@ -27600,7 +27606,7 @@ mal_uint32 mal_decoder_internal_on_read_frames__mp3(mal_dsp* pDSP, mal_uint32 fr
     drmp3* pMP3 = (drmp3*)pDecoder->pInternalDecoder;
     mal_assert(pMP3 != NULL);
 
-    return (mal_uint32)drmp3_read_f32(pMP3, frameCount, (float*)pSamplesOut);
+    return (mal_uint32)drmp3_read_pcm_frames_f32(pMP3, frameCount, (float*)pSamplesOut);
 }
 
 mal_result mal_decoder_init_mp3__internal(const mal_decoder_config* pConfig, mal_decoder* pDecoder)
@@ -28484,6 +28490,10 @@ mal_uint64 mal_sine_wave_read_ex(mal_sine_wave* pSineWave, mal_uint64 frameCount
 // REVISION HISTORY
 // ================
 //
+// v0.8.14 - 2018-12-16
+//   - Core Audio: Fix a bug where the device state is not set correctly after stopping.
+//   - Update decoders to use updated APIs in dr_flac, dr_mp3 and dr_wav.
+//
 // v0.8.13 - 2018-12-04
 //   - Core Audio: Fix a bug with channel mapping.
 //   - Fix a bug with channel routing where the back/left and back/right channels have the wrong weight.