diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d21a859..253b4d17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -883,6 +883,9 @@ if(MINIAUDIO_BUILD_TESTS) add_miniaudio_test(miniaudio_generation generation/generation.c) add_test(NAME miniaudio_generation COMMAND miniaudio_generation) + add_miniaudio_test(miniaudio_profiling profiling/profiling.c) + #add_test(NAME miniaudio_profiling COMMAND miniaudio_profiling) + add_miniaudio_test(miniaudio_resampling resampling/resampling.c) #add_test(NAME miniaudio_resampling COMMAND miniaudio_resampling) if(TARGET samplerate) diff --git a/miniaudio.h b/miniaudio.h index 04349df9..0f612298 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -52899,6 +52899,16 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ return; /* Invalid args. */ } + /* + You shouldn't really be calling this for mono streams, but I can imagine a situation where it is called + generically without the caller explicitly checking the channel count. We'll implement this in as a + memcpy() to keep it fast. + */ + if (channels == 1) { + MA_COPY_MEMORY(ppDeinterleavedPCMFrames[0], pInterleavedPCMFrames, frameCount * ma_get_bytes_per_frame(format, channels)); + return; + } + /* For efficiency we do this per format. */ switch (format) { case ma_format_u8: @@ -52906,7 +52916,64 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ const ma_uint8* pSrcU8 = (const ma_uint8*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + iPCMFrame = 0; + + /* Specialization for stereo. We can do 4 frames at a time here. */ + if (channels == 2) { + ma_uint32* pDstU32_0 = (ma_uint32*)ppDeinterleavedPCMFrames[0]; + ma_uint32* pDstU32_1 = (ma_uint32*)ppDeinterleavedPCMFrames[1]; + ma_uint64 frameCount4 = frameCount >> 2; + + /* Check alignment of buffers. Use aligned fast path only if all buffers are 4-byte aligned. 
*/ + if (((ma_uintptr)pSrcU8 & 3) == 0 && ((ma_uintptr)pDstU32_0 & 3) == 0 && ((ma_uintptr)pDstU32_1 & 3) == 0) { + #if 1 + { + const ma_uint32* pSrcU32Running = (const ma_uint32*)pSrcU8; + + for (iPCMFrame = 0; iPCMFrame < frameCount4; iPCMFrame += 1) { + ma_uint32 src0; + ma_uint32 src1; + ma_uint32 dst0; + ma_uint32 dst1; + + src0 = pSrcU32Running[0]; + src1 = pSrcU32Running[1]; + + dst0 = ((src0 & 0x00FF0000) >> 8) | ((src0 & 0x000000FF) >> 0) | ((src1 & 0x00FF0000) << 8) | ((src1 & 0x000000FF) << 16); + dst1 = ((src0 & 0xFF000000) >> 16) | ((src0 & 0x0000FF00) >> 8) | ((src1 & 0xFF000000) << 0) | ((src1 & 0x0000FF00) << 8); + + pDstU32_0[iPCMFrame] = dst0; + pDstU32_1[iPCMFrame] = dst1; + + pSrcU32Running += 2; + } + + iPCMFrame *= 4; + } + #else + { + const ma_uint8* pSrcU8Running = (const ma_uint8*)pSrcU8; + + for (iPCMFrame = 0; iPCMFrame + 4 < frameCount; iPCMFrame += 4) { + ma_uint32 dst0; + ma_uint32 dst1; + + dst0 = (pSrcU8Running[0] << 0) | (pSrcU8Running[2] << 8) | (pSrcU8Running[4] << 16) | (pSrcU8Running[6] << 24); + dst1 = (pSrcU8Running[1] << 0) | (pSrcU8Running[3] << 8) | (pSrcU8Running[5] << 16) | (pSrcU8Running[7] << 24); + + *pDstU32_0 = dst0; + *pDstU32_1 = dst1; + + pSrcU8Running += 4 * 2; + pDstU32_0 += 1; + pDstU32_1 += 1; + } + } + #endif + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { ma_uint32 iChannel; for (iChannel = 0; iChannel < channels; iChannel += 1) { ma_uint8* pDstU8 = (ma_uint8*)ppDeinterleavedPCMFrames[iChannel]; @@ -52919,7 +52986,65 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ { const ma_int16* pSrcS16 = (const ma_int16*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + + iPCMFrame = 0; + + /* Specialization for stereo. We can do 2 frames at a time here. 
*/ + if (channels == 2) { + ma_uint32* pDstU32_0 = (ma_uint32*)ppDeinterleavedPCMFrames[0]; + ma_uint32* pDstU32_1 = (ma_uint32*)ppDeinterleavedPCMFrames[1]; + ma_uint64 frameCount2 = frameCount >> 1; + + /* Check alignment of buffers. Use aligned fast path only if all buffers are 4-byte aligned. */ + if (((ma_uintptr)pSrcS16 & 3) == 0 && ((ma_uintptr)pDstU32_0 & 3) == 0 && ((ma_uintptr)pDstU32_1 & 3) == 0) { + #if 1 + { + const ma_uint32* pSrcU32Running = (const ma_uint32*)pSrcS16; + + for (iPCMFrame = 0; iPCMFrame < frameCount2; iPCMFrame += 1) { + ma_uint32 src0; + ma_uint32 src1; + ma_uint32 dst0; + ma_uint32 dst1; + + src0 = pSrcU32Running[0]; + src1 = pSrcU32Running[1]; + + dst0 = ((src0 & 0x0000FFFF) << 0) | ((src1 & 0x0000FFFF) << 16); + dst1 = ((src0 & 0xFFFF0000) >> 16) | ((src1 & 0xFFFF0000) >> 0); + + pDstU32_0[iPCMFrame] = dst0; + pDstU32_1[iPCMFrame] = dst1; + + pSrcU32Running += 2; + } + + iPCMFrame *= 2; + } + #else + { + const ma_uint16* pSrcU16Running = (const ma_uint16*)pSrcS16; + + for (iPCMFrame = 0; iPCMFrame + 2 < frameCount; iPCMFrame += 2) { + ma_uint32 dst0; + ma_uint32 dst1; + + dst0 = (pSrcU16Running[0] << 0) | (pSrcU16Running[2] << 16); + dst1 = (pSrcU16Running[1] << 0) | (pSrcU16Running[3] << 16); + + *pDstU32_0 = dst0; + *pDstU32_1 = dst1; + + pSrcU16Running += 2 * 2; + pDstU32_0 += 1; + pDstU32_1 += 1; + } + } + #endif + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { ma_uint32 iChannel; for (iChannel = 0; iChannel < channels; iChannel += 1) { ma_int16* pDstS16 = (ma_int16*)ppDeinterleavedPCMFrames[iChannel]; @@ -52932,7 +53057,24 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ { const float* pSrcF32 = (const float*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + + iPCMFrame = 0; + + /* Specialization for stereo. 
*/ + if (channels == 2) { + float* pDstF32_0 = (float*)ppDeinterleavedPCMFrames[0]; + float* pDstF32_1 = (float*)ppDeinterleavedPCMFrames[1]; + const float* pSrcF32Running = (const float*)pSrcF32; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + pDstF32_0[iPCMFrame] = pSrcF32Running[0]; + pDstF32_1[iPCMFrame] = pSrcF32Running[1]; + + pSrcF32Running += 2; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { ma_uint32 iChannel; for (iChannel = 0; iChannel < channels; iChannel += 1) { float* pDstF32 = (float*)ppDeinterleavedPCMFrames[iChannel]; @@ -52945,7 +53087,24 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ { const ma_int32* pSrcS32 = (const ma_int32*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + + iPCMFrame = 0; + + /* Specialization for stereo. */ + if (channels == 2) { + ma_uint32* pDstU32_0 = (ma_uint32*)ppDeinterleavedPCMFrames[0]; + ma_uint32* pDstU32_1 = (ma_uint32*)ppDeinterleavedPCMFrames[1]; + const ma_uint32* pSrcU32Running = (const ma_uint32*)pSrcS32; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + pDstU32_0[iPCMFrame] = pSrcU32Running[0]; + pDstU32_1[iPCMFrame] = pSrcU32Running[1]; + + pSrcU32Running += 2; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { ma_uint32 iChannel; for (iChannel = 0; iChannel < channels; iChannel += 1) { ma_int32* pDstS32 = (ma_int32*)ppDeinterleavedPCMFrames[iChannel]; @@ -52958,7 +53117,30 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ { const ma_uint8* pSrcS24 = (const ma_uint8*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + + iPCMFrame = 0; + + if (channels == 2) { + ma_uint8* pDstS24_0 = (ma_uint8*)ppDeinterleavedPCMFrames[0]; + ma_uint8* pDstS24_1 = (ma_uint8*)ppDeinterleavedPCMFrames[1]; + const ma_uint8* pSrcS24Running = (const ma_uint8*)pSrcS24; + + for (; 
iPCMFrame < frameCount; iPCMFrame += 1) {
+                    pDstS24_0[0] = pSrcS24Running[0];
+                    pDstS24_0[1] = pSrcS24Running[1];
+                    pDstS24_0[2] = pSrcS24Running[2];
+
+                    pDstS24_1[0] = pSrcS24Running[3];
+                    pDstS24_1[1] = pSrcS24Running[4];
+                    pDstS24_1[2] = pSrcS24Running[5];
+
+                    pSrcS24Running += 6;
+                    pDstS24_0 += 3;
+                    pDstS24_1 += 3;
+                }
+            }
+
+            for (; iPCMFrame < frameCount; iPCMFrame += 1) {
                 ma_uint32 iChannel;
                 for (iChannel = 0; iChannel < channels; iChannel += 1) {
                     ma_uint8* pDstS24 = (ma_uint8*)ppDeinterleavedPCMFrames[iChannel];
diff --git a/tests/profiling/profiling.c b/tests/profiling/profiling.c
new file mode 100644
index 00000000..bb791678
--- /dev/null
+++ b/tests/profiling/profiling.c
@@ -0,0 +1,198 @@
+#if 1
+#include "../../miniaudio.c"
+#else
+#define MINIAUDIO_IMPLEMENTATION
+#include "../../miniaudio-11.h"
+#endif
+
+/* Returns a short printable name for a sample format, or "unknown" for anything unrecognized. */
+const char* format_short_name(ma_format format)
+{
+    switch (format)
+    {
+        case ma_format_u8:  return "u8";
+        case ma_format_s16: return "s16";
+        case ma_format_s32: return "s32";
+        case ma_format_s24: return "s24";
+        case ma_format_f32: return "f32";
+        default:            return "unknown";
+    }
+}
+
+/*
+Naive sample-by-sample deinterleaver used as the ground truth when verifying
+ma_deinterleave_pcm_frames(). Every sample is moved with memcpy(), so there is no per-format logic and
+no typed loads or stores through the caller's buffers.
+*/
+void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void* pInterleavedPCMFrames, void** ppDeinterleavedPCMFrames)
+{
+    ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format);
+    ma_uint64 iPCMFrame;
+    for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
+        ma_uint32 iChannel;
+        for (iChannel = 0; iChannel < channels; iChannel += 1) {
+            void* pDst = ma_offset_ptr(ppDeinterleavedPCMFrames[iChannel], iPCMFrame*sampleSizeInBytes);
+            const void* pSrc = ma_offset_ptr(pInterleavedPCMFrames, (iPCMFrame*channels+iChannel)*sampleSizeInBytes);
+            memcpy(pDst, pSrc, sampleSizeInBytes);
+        }
+    }
+}
+
+/*
+Fills a buffer of frameCount interleaved frames with an incrementing byte pattern (0, 1, ... 255, 0, ...).
+Adjacent bytes always differ, which makes misplaced samples easy to spot when comparing buffers.
+*/
+void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount)
+{
+    ma_uint8 v = 0;
+    ma_uint32 bpf = ma_get_bytes_per_frame(format, channels);
+    ma_uint32 iByte;
+
+    /* Just fill byte-by-byte. */
+    for (iByte = 0; iByte < frameCount * bpf; iByte += 1) {
+        ((ma_uint8*)pBuffer)[iByte] = v;
+        v += 1; /* Just let this overflow. */
+    }
+}
+
+/*
+Deinterleaves the same debug pattern with both the reference implementation and
+ma_deinterleave_pcm_frames() and compares the output channel by channel.
+
+Returns MA_TRUE if every channel matches byte-for-byte, MA_FALSE otherwise.
+*/
+ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels)
+{
+    ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. */
+    ma_uint32 bpf = ma_get_bytes_per_frame(format, channels);
+    ma_bool32 result = MA_TRUE;
+    void* pInterleavedReference;
+    void* pDeinterleavedReference[MA_MAX_CHANNELS];
+    void* pInterleavedOptimized;
+    void* pDeinterleavedOptimized[MA_MAX_CHANNELS];
+    ma_uint32 iChannel;
+
+    pInterleavedReference = ma_malloc(frameCount * bpf, NULL);
+    pInterleavedOptimized = ma_malloc(frameCount * bpf, NULL);
+
+    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+        pDeinterleavedReference[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
+        pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
+    }
+
+    /* fill_debug_frames() takes (buffer, format, channels, frameCount). The count is 64-bit here, so narrow it explicitly. */
+    fill_debug_frames(pInterleavedReference, format, channels, (ma_uint32)frameCount);
+    MA_COPY_MEMORY(pInterleavedOptimized, pInterleavedReference, frameCount * bpf);
+
+
+    deinterleave_reference    (format, channels, frameCount, pInterleavedReference, pDeinterleavedReference);
+    ma_deinterleave_pcm_frames(format, channels, frameCount, pInterleavedOptimized, pDeinterleavedOptimized);
+
+    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+        if (memcmp(pDeinterleavedReference[iChannel], pDeinterleavedOptimized[iChannel], frameCount * ma_get_bytes_per_sample(format)) != 0) {
+            result = MA_FALSE; /* Don't return from here. The buffers still need to be freed below. */
+            break;
+        }
+    }
+
+
+    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+        ma_free(pDeinterleavedReference[iChannel], NULL);
+        ma_free(pDeinterleavedOptimized[iChannel], NULL);
+    }
+
+    ma_free(pInterleavedReference, NULL);
+    ma_free(pInterleavedOptimized, NULL);
+
+    return result;
+}
+
+/*
+Verifies and then times ma_deinterleave_pcm_frames() for one format and channel count combination.
+
+Prints "<format> <channels>: " followed by either "FAILED" when verification fails (in which case
+nothing is timed) or the number of seconds taken to run the deinterleaver iterationCount times over
+frameCount frames.
+*/
+void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
+{
+    ma_uint64 frameCount = 1024 * 1024;
+    ma_uint32 iterationCount = 1000;
+    ma_uint32 bpf = ma_get_bytes_per_frame(format, channels);
+    void* pInterleaved;
+    void* pDeinterleaved[MA_MAX_CHANNELS];
+    ma_uint32 iChannel;
+    ma_timer timer;
+    double startTime;
+
+    printf("%s %u: ", format_short_name(format), channels);
+
+    if (verify_deinterleaving_by_format(format, channels) == MA_FALSE) {
+        printf("FAILED\n");
+        return;
+    }
+
+
+    ma_timer_init(&timer);
+
+    pInterleaved = ma_malloc(frameCount * bpf, NULL);
+
+    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+        pDeinterleaved[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
+    }
+
+    ma_debug_fill_pcm_frames_with_sine_wave((float*)pInterleaved, frameCount, format, channels, 48000); /* The float* cast is to work around an API bug in miniaudio v0.11. It's harmless. */
+
+
+    startTime = ma_timer_get_time_in_seconds(&timer);
+    {
+        ma_uint32 i;
+        for (i = 0; i < iterationCount; i += 1) {
+            ma_deinterleave_pcm_frames(format, channels, frameCount, pInterleaved, pDeinterleaved);
+        }
+    }
+    printf("%f\n", ma_timer_get_time_in_seconds(&timer) - startTime);
+
+
+    for (iChannel = 0; iChannel < channels; iChannel += 1) {
+        ma_free(pDeinterleaved[iChannel], NULL);
+    }
+
+    ma_free(pInterleaved, NULL);
+}
+
+/* Runs the deinterleaving verification and benchmark for every sample format with 2, 1 and 3 channels. */
+void profile_deinterleaving(void)
+{
+    /* Stereo has an optimized code path. */
+    profile_deinterleaving_by_format(ma_format_u8,  2);
+    profile_deinterleaving_by_format(ma_format_s16, 2);
+    profile_deinterleaving_by_format(ma_format_f32, 2);
+    profile_deinterleaving_by_format(ma_format_s32, 2);
+    profile_deinterleaving_by_format(ma_format_s24, 2);
+
+    /* We have a special case for mono streams so make sure we have coverage of that case. */
+    profile_deinterleaving_by_format(ma_format_u8,  1);
+    profile_deinterleaving_by_format(ma_format_s16, 1);
+    profile_deinterleaving_by_format(ma_format_f32, 1);
+    profile_deinterleaving_by_format(ma_format_s32, 1);
+    profile_deinterleaving_by_format(ma_format_s24, 1);
+
+    /* Channels > 2 run on a generic code path. */
+    profile_deinterleaving_by_format(ma_format_u8,  3);
+    profile_deinterleaving_by_format(ma_format_s16, 3);
+    profile_deinterleaving_by_format(ma_format_f32, 3);
+    profile_deinterleaving_by_format(ma_format_s32, 3);
+    profile_deinterleaving_by_format(ma_format_s24, 3);
+}
+
+/* Entry point. The command line arguments are not used. */
+int main(int argc, char** argv)
+{
+    profile_deinterleaving();
+
+    (void)argc;
+    (void)argv;
+
+    return 0;
+}