diff --git a/miniaudio.h b/miniaudio.h index 0f612298..55242913 100644 --- a/miniaudio.h +++ b/miniaudio.h @@ -52901,8 +52901,8 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ /* You shouldn't really be calling this for mono streams, but I can imagine a situation where it is called - generically without the caller explicitly checking the channel count. We'll implement this in as a - memcpy() to keep it fast. + generically without the caller explicitly checking the channel count. We'll implement this as a memcpy() + to keep it fast. */ if (channels == 1) { MA_COPY_MEMORY(ppDeinterleavedPCMFrames[0], pInterleavedPCMFrames, frameCount * ma_get_bytes_per_frame(format, channels)); @@ -53170,15 +53170,76 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ MA_API void ma_interleave_pcm_frames(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames) { + if (ppDeinterleavedPCMFrames == NULL || pInterleavedPCMFrames == NULL) { + return; /* Invalid args. */ + } + + /* + You shouldn't really be calling this for mono streams, but I can imagine a situation where it is called + generically without the caller explicitly checking the channel count. We'll implement this as a memcpy() + to keep it fast. + */ + if (channels == 1) { + MA_COPY_MEMORY(pInterleavedPCMFrames, ppDeinterleavedPCMFrames[0], frameCount * ma_get_bytes_per_frame(format, channels)); + return; + } + switch (format) { + case ma_format_u8: + { + ma_uint8* pDstU8 = (ma_uint8*)pInterleavedPCMFrames; + ma_uint64 iPCMFrame; + + iPCMFrame = 0; + + /* Specialization for stereo. 
*/ + if (channels == 2) { + const ma_uint8* pSrcU8_0 = (const ma_uint8*)ppDeinterleavedPCMFrames[0]; + const ma_uint8* pSrcU8_1 = (const ma_uint8*)ppDeinterleavedPCMFrames[1]; + ma_uint8* pDstU8Running = (ma_uint8*)pDstU8; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + pDstU8Running[0] = pSrcU8_0[iPCMFrame]; + pDstU8Running[1] = pSrcU8_1[iPCMFrame]; + + pDstU8Running += 2; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { + ma_uint32 iChannel; + for (iChannel = 0; iChannel < channels; iChannel += 1) { + const ma_uint8* pSrcU8 = (const ma_uint8*)ppDeinterleavedPCMFrames[iChannel]; + pDstU8[iPCMFrame*channels+iChannel] = pSrcU8[iPCMFrame]; + } + } + } break; + case ma_format_s16: { ma_int16* pDstS16 = (ma_int16*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) { + + iPCMFrame = 0; + + /* Specialization for stereo. */ + if (channels == 2) { + const ma_int16* pSrcS16_0 = (const ma_int16*)ppDeinterleavedPCMFrames[0]; + const ma_int16* pSrcS16_1 = (const ma_int16*)ppDeinterleavedPCMFrames[1]; + ma_int16* pDstS16Running = (ma_int16*)pDstS16; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + pDstS16Running[0] = pSrcS16_0[iPCMFrame]; + pDstS16Running[1] = pSrcS16_1[iPCMFrame]; + + pDstS16Running += 2; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { ma_uint32 iChannel; - for (iChannel = 0; iChannel < channels; ++iChannel) { + for (iChannel = 0; iChannel < channels; iChannel += 1) { const ma_int16* pSrcS16 = (const ma_int16*)ppDeinterleavedPCMFrames[iChannel]; pDstS16[iPCMFrame*channels+iChannel] = pSrcS16[iPCMFrame]; } @@ -53189,15 +53250,100 @@ MA_API void ma_interleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ui { float* pDstF32 = (float*)pInterleavedPCMFrames; ma_uint64 iPCMFrame; - for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) { + + iPCMFrame = 0; + + /* Specialization for stereo. 
*/ + if (channels == 2) { + const float* pSrcF32_0 = (const float*)ppDeinterleavedPCMFrames[0]; + const float* pSrcF32_1 = (const float*)ppDeinterleavedPCMFrames[1]; + float* pDstF32Running = (float*)pDstF32; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + pDstF32Running[0] = pSrcF32_0[iPCMFrame]; + pDstF32Running[1] = pSrcF32_1[iPCMFrame]; + + pDstF32Running += 2; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { ma_uint32 iChannel; - for (iChannel = 0; iChannel < channels; ++iChannel) { + for (iChannel = 0; iChannel < channels; iChannel += 1) { const float* pSrcF32 = (const float*)ppDeinterleavedPCMFrames[iChannel]; pDstF32[iPCMFrame*channels+iChannel] = pSrcF32[iPCMFrame]; } } } break; + case ma_format_s32: + { + ma_int32* pDstS32 = (ma_int32*)pInterleavedPCMFrames; + ma_uint64 iPCMFrame; + + iPCMFrame = 0; + + /* Specialization for stereo. */ + if (channels == 2) { + const ma_uint32* pSrcU32_0 = (const ma_uint32*)ppDeinterleavedPCMFrames[0]; + const ma_uint32* pSrcU32_1 = (const ma_uint32*)ppDeinterleavedPCMFrames[1]; + ma_uint32* pDstU32Running = (ma_uint32*)pDstS32; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + pDstU32Running[0] = pSrcU32_0[iPCMFrame]; + pDstU32Running[1] = pSrcU32_1[iPCMFrame]; + + pDstU32Running += 2; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { + ma_uint32 iChannel; + for (iChannel = 0; iChannel < channels; iChannel += 1) { + const ma_int32* pSrcS32 = (const ma_int32*)ppDeinterleavedPCMFrames[iChannel]; + pDstS32[iPCMFrame*channels+iChannel] = pSrcS32[iPCMFrame]; + } + } + } break; + + case ma_format_s24: + { + ma_uint8* pDstS24 = (ma_uint8*)pInterleavedPCMFrames; + ma_uint64 iPCMFrame; + + iPCMFrame = 0; + + if (channels == 2) { + const ma_uint8* pSrcS24_0 = (const ma_uint8*)ppDeinterleavedPCMFrames[0]; + const ma_uint8* pSrcS24_1 = (const ma_uint8*)ppDeinterleavedPCMFrames[1]; + ma_uint8* pDstS24Running = (ma_uint8*)pDstS24; + + for (; iPCMFrame < frameCount; iPCMFrame 
+= 1) { + pDstS24Running[0] = pSrcS24_0[0]; + pDstS24Running[1] = pSrcS24_0[1]; + pDstS24Running[2] = pSrcS24_0[2]; + + pDstS24Running[3] = pSrcS24_1[0]; + pDstS24Running[4] = pSrcS24_1[1]; + pDstS24Running[5] = pSrcS24_1[2]; + + pDstS24Running += 6; + pSrcS24_0 += 3; + pSrcS24_1 += 3; + } + } + + for (; iPCMFrame < frameCount; iPCMFrame += 1) { + ma_uint32 iChannel; + for (iChannel = 0; iChannel < channels; iChannel += 1) { + const ma_uint8* pSrcS24 = (const ma_uint8*)ppDeinterleavedPCMFrames[iChannel]; + pDstS24[(iPCMFrame*channels+iChannel)*3 + 0] = pSrcS24[iPCMFrame*3 + 0]; + pDstS24[(iPCMFrame*channels+iChannel)*3 + 1] = pSrcS24[iPCMFrame*3 + 1]; + pDstS24[(iPCMFrame*channels+iChannel)*3 + 2] = pSrcS24[iPCMFrame*3 + 2]; + } + } + } break; + default: { ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format); diff --git a/tests/profiling/profiling.c b/tests/profiling/profiling.c index dfa89a1a..6493f21d 100644 --- a/tests/profiling/profiling.c +++ b/tests/profiling/profiling.c @@ -18,6 +18,40 @@ const char* format_short_name(ma_format format) } } +void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount, ma_uint8 valueOffset) +{ + ma_uint8 v = valueOffset; + ma_uint32 bpf = ma_get_bytes_per_frame(format, channels); + ma_uint32 iByte; + + /* Just fill byte-by-byte. */ + for (iByte = 0; iByte < frameCount * bpf; iByte += 1) { + ((ma_uint8*)pBuffer)[iByte] = v; + v += 1; /* Just let this overflow. 
*/ + } +} + +ma_bool32 compare_interleaved_deinterleaved(ma_format format, ma_uint32 channels, ma_uint32 frameCount, void* pInterleaved, const void** ppDeinterleaved) +{ + ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format); + ma_uint32 iPCMFrame; + + for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) { + ma_uint32 iChannel; + for (iChannel = 0; iChannel < channels; iChannel += 1) { + const void* pDeinterleavedSample = ma_offset_ptr(ppDeinterleaved[iChannel], iPCMFrame * sampleSizeInBytes); + const void* pInterleavedSample = ma_offset_ptr(pInterleaved, (iPCMFrame * channels + iChannel) * sampleSizeInBytes); + + if (memcmp(pDeinterleavedSample, pInterleavedSample, sampleSizeInBytes) != 0) { + return MA_FALSE; + } + } + } + + return MA_TRUE; +} + + void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void* pInterleavedPCMFrames, void** ppDeinterleavedPCMFrames) { ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format); @@ -32,19 +66,6 @@ void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 fram } } -void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount) -{ - ma_uint8 v = 0; - ma_uint32 bpf = ma_get_bytes_per_frame(format, channels); - ma_uint32 iByte; - - /* Just fill byte-by-byte. */ - for (iByte = 0; iByte < frameCount * bpf; iByte += 1) { - ((ma_uint8*)pBuffer)[iByte] = v; - v += 1; /* Just let this overflow. */ - } -} - ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels) { ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. 
*/ @@ -63,7 +84,7 @@ ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels) pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL); } - fill_debug_frames(pInterleavedReference, format, channels, frameCount); + fill_debug_frames(pInterleavedReference, format, channels, frameCount, 0); MA_COPY_MEMORY(pInterleavedOptimized, pInterleavedReference, frameCount * bpf); @@ -99,7 +120,7 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels) ma_timer timer; double startTime; - printf("%s %u: ", format_short_name(format), channels); + printf("Deinterleave: %s %u: ", format_short_name(format), channels); if (verify_deinterleaving_by_format(format, channels) == MA_FALSE) { printf("FAILED\n"); @@ -115,7 +136,7 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels) pDeinterleaved[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL); } - ma_debug_fill_pcm_frames_with_sine_wave((float*)pInterleaved, frameCount, format, channels, 48000); /* The float* cast is to work around an API bug in miniaudio v0.11. It's harmless. */ + fill_debug_frames(pInterleaved, format, channels, frameCount, 0); startTime = ma_timer_get_time_in_seconds(&timer); @@ -127,6 +148,14 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels) } printf("%f\n", ma_timer_get_time_in_seconds(&timer) - startTime); + /* + I think Clang can recognize that we're not actually doing anything with the output data of our tests and then + optimizes out the entire thing. We'll do a simple comparison here. 
+ */ + if (compare_interleaved_deinterleaved(format, channels, frameCount, pInterleaved, (const void**)pDeinterleaved) == MA_FALSE) { + printf("FAILED VERIFICATION\n"); + } + for (iChannel = 0; iChannel < channels; iChannel += 1) { ma_free(pDeinterleaved[iChannel], NULL); @@ -159,9 +188,148 @@ void profile_deinterleaving(void) profile_deinterleaving_by_format(ma_format_s24, 3); } + +void interleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames) +{ + ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format); + ma_uint64 iPCMFrame; + for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) { + ma_uint32 iChannel; + for (iChannel = 0; iChannel < channels; ++iChannel) { + void* pDst = ma_offset_ptr(pInterleavedPCMFrames, (iPCMFrame*channels+iChannel)*sampleSizeInBytes); + const void* pSrc = ma_offset_ptr(ppDeinterleavedPCMFrames[iChannel], iPCMFrame*sampleSizeInBytes); + memcpy(pDst, pSrc, sampleSizeInBytes); + } + } +} + +ma_bool32 verify_interleaving_by_format(ma_format format, ma_uint32 channels) +{ + ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. */ + ma_uint32 bpf = ma_get_bytes_per_frame(format, channels); + void* pDeinterleavedReference[MA_MAX_CHANNELS]; + void* pInterleavedReference; + void* pDeinterleavedOptimized[MA_MAX_CHANNELS]; + void* pInterleavedOptimized; + ma_uint32 iChannel; + + pInterleavedReference = ma_malloc(frameCount * bpf, NULL); + pInterleavedOptimized = ma_malloc(frameCount * bpf, NULL); + + for (iChannel = 0; iChannel < channels; iChannel += 1) { + pDeinterleavedReference[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL); + pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL); + } + + /* Fill deinterleaved buffers with test data. 
*/ + for (iChannel = 0; iChannel < channels; iChannel += 1) { + fill_debug_frames(pDeinterleavedReference[iChannel], format, 1, frameCount, (ma_uint8)iChannel); /* Last parameter is to ensure each channel position has different values. */ + MA_COPY_MEMORY(pDeinterleavedOptimized[iChannel], pDeinterleavedReference[iChannel], frameCount * ma_get_bytes_per_sample(format)); + } + + + interleave_reference (format, channels, frameCount, (const void**)pDeinterleavedReference, pInterleavedReference); + ma_interleave_pcm_frames(format, channels, frameCount, (const void**)pDeinterleavedOptimized, pInterleavedOptimized); + + if (memcmp(pInterleavedReference, pInterleavedOptimized, frameCount * bpf) != 0) { + return MA_FALSE; + } + + + for (iChannel = 0; iChannel < channels; iChannel += 1) { + ma_free(pDeinterleavedReference[iChannel], NULL); + ma_free(pDeinterleavedOptimized[iChannel], NULL); + } + + ma_free(pInterleavedReference, NULL); + ma_free(pInterleavedOptimized, NULL); + + return MA_TRUE; +} + +void profile_interleaving_by_format(ma_format format, ma_uint32 channels) +{ + ma_uint64 frameCount = 1024 * 1024; + ma_uint32 iterationCount = 1000; + ma_uint32 bpf = ma_get_bytes_per_frame(format, channels); + void* pDeinterleaved[MA_MAX_CHANNELS]; + void* pInterleaved; + ma_uint32 iChannel; + ma_timer timer; + double startTime; + + printf("Interleave: %s %u: ", format_short_name(format), channels); + + if (verify_interleaving_by_format(format, channels) == MA_FALSE) { + printf("FAILED\n"); + return; + } + + + ma_timer_init(&timer); + + pInterleaved = ma_malloc(frameCount * bpf, NULL); + + for (iChannel = 0; iChannel < channels; iChannel += 1) { + pDeinterleaved[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL); + fill_debug_frames(pDeinterleaved[iChannel], format, 1, frameCount, (ma_uint8)iChannel); + } + + + startTime = ma_timer_get_time_in_seconds(&timer); + { + ma_uint32 i; + for (i = 0; i < iterationCount; i += 1) { + 
ma_interleave_pcm_frames(format, channels, frameCount, (const void**)pDeinterleaved, pInterleaved); + } + } + printf("%f\n", ma_timer_get_time_in_seconds(&timer) - startTime); + + /* + I think Clang can recognize that we're not actually doing anything with the output data of our tests and then + optimizes out the entire thing. We'll do a simple comparison here. + */ + if (compare_interleaved_deinterleaved(format, channels, frameCount, pInterleaved, (const void**)pDeinterleaved) == MA_FALSE) { + printf("FAILED VERIFICATION\n"); + } + + + for (iChannel = 0; iChannel < channels; iChannel += 1) { + ma_free(pDeinterleaved[iChannel], NULL); + } + + ma_free(pInterleaved, NULL); +} + +void profile_interleaving(void) +{ + /* Stereo has an optimized code path. */ + profile_interleaving_by_format(ma_format_u8, 2); + profile_interleaving_by_format(ma_format_s16, 2); + profile_interleaving_by_format(ma_format_f32, 2); + profile_interleaving_by_format(ma_format_s32, 2); + profile_interleaving_by_format(ma_format_s24, 2); + + /* We have a special case for mono streams so make sure we have coverage of that case. */ + profile_interleaving_by_format(ma_format_u8, 1); + profile_interleaving_by_format(ma_format_s16, 1); + profile_interleaving_by_format(ma_format_f32, 1); + profile_interleaving_by_format(ma_format_s32, 1); + profile_interleaving_by_format(ma_format_s24, 1); + + /* Channels > 2 run on a generic code path. */ + profile_interleaving_by_format(ma_format_u8, 3); + profile_interleaving_by_format(ma_format_s16, 3); + profile_interleaving_by_format(ma_format_f32, 3); + profile_interleaving_by_format(ma_format_s32, 3); + profile_interleaving_by_format(ma_format_s24, 3); +} + + int main(int argc, char** argv) { profile_deinterleaving(); + profile_interleaving(); (void)argc; (void)argv;