Optimizations to ma_interleave_pcm_frames().

This commit is contained in:
David Reid
2026-02-16 06:52:12 +10:00
parent 6851858937
commit e490db3085
2 changed files with 336 additions and 22 deletions
+152 -6
View File
@@ -52901,8 +52901,8 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
/*
You shouldn't really be calling this for mono streams, but I can imagine a situation where it is called
generically without the caller explicitly checking the channel count. We'll implement this in as a
memcpy() to keep it fast.
generically without the caller explicitly checking the channel count. We'll implement this as a memcpy()
to keep it fast.
*/
if (channels == 1) {
MA_COPY_MEMORY(ppDeinterleavedPCMFrames[0], pInterleavedPCMFrames, frameCount * ma_get_bytes_per_frame(format, channels));
@@ -53170,15 +53170,76 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
MA_API void ma_interleave_pcm_frames(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames)
{
if (ppDeinterleavedPCMFrames == NULL || pInterleavedPCMFrames == NULL) {
return; /* Invalid args. */
}
/*
You shouldn't really be calling this for mono streams, but I can imagine a situation where it is called
generically without the caller explicitly checking the channel count. We'll implement this as a memcpy()
to keep it fast.
*/
if (channels == 1) {
MA_COPY_MEMORY(pInterleavedPCMFrames, ppDeinterleavedPCMFrames[0], frameCount * ma_get_bytes_per_frame(format, channels));
return;
}
switch (format)
{
case ma_format_u8:
{
ma_uint8* pDstU8 = (ma_uint8*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
iPCMFrame = 0;
/* Specialization for stereo. */
if (channels == 2) {
const ma_uint8* pSrcU8_0 = (const ma_uint8*)ppDeinterleavedPCMFrames[0];
const ma_uint8* pSrcU8_1 = (const ma_uint8*)ppDeinterleavedPCMFrames[1];
ma_uint8* pDstU8Running = (ma_uint8*)pDstU8;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstU8Running[0] = pSrcU8_0[iPCMFrame];
pDstU8Running[1] = pSrcU8_1[iPCMFrame];
pDstU8Running += 2;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
const ma_uint8* pSrcU8 = (const ma_uint8*)ppDeinterleavedPCMFrames[iChannel];
pDstU8[iPCMFrame*channels+iChannel] = pSrcU8[iPCMFrame];
}
}
} break;
case ma_format_s16:
{
ma_int16* pDstS16 = (ma_int16*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
iPCMFrame = 0;
/* Specialization for stereo. */
if (channels == 2) {
const ma_int16* pSrcS16_0 = (const ma_int16*)ppDeinterleavedPCMFrames[0];
const ma_int16* pSrcS16_1 = (const ma_int16*)ppDeinterleavedPCMFrames[1];
ma_int16* pDstS16Running = (ma_int16*)pDstS16;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstS16Running[0] = pSrcS16_0[iPCMFrame];
pDstS16Running[1] = pSrcS16_1[iPCMFrame];
pDstS16Running += 2;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; ++iChannel) {
for (iChannel = 0; iChannel < channels; iChannel += 1) {
const ma_int16* pSrcS16 = (const ma_int16*)ppDeinterleavedPCMFrames[iChannel];
pDstS16[iPCMFrame*channels+iChannel] = pSrcS16[iPCMFrame];
}
@@ -53189,15 +53250,100 @@ MA_API void ma_interleave_pcm_frames(ma_format format, ma_uint32 channels, ma_ui
{
float* pDstF32 = (float*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; ++iPCMFrame) {
iPCMFrame = 0;
/* Specialization for stereo. */
if (channels == 2) {
const float* pSrcF32_0 = (const float*)ppDeinterleavedPCMFrames[0];
const float* pSrcF32_1 = (const float*)ppDeinterleavedPCMFrames[1];
float* pDstF32Running = (float*)pDstF32;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstF32Running[0] = pSrcF32_0[iPCMFrame];
pDstF32Running[1] = pSrcF32_1[iPCMFrame];
pDstF32Running += 2;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; ++iChannel) {
for (iChannel = 0; iChannel < channels; iChannel += 1) {
const float* pSrcF32 = (const float*)ppDeinterleavedPCMFrames[iChannel];
pDstF32[iPCMFrame*channels+iChannel] = pSrcF32[iPCMFrame];
}
}
} break;
case ma_format_s32:
{
ma_int32* pDstS32 = (ma_int32*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
iPCMFrame = 0;
/* Specialization for stereo. */
if (channels == 2) {
const ma_uint32* pSrcU32_0 = (const ma_uint32*)ppDeinterleavedPCMFrames[0];
const ma_uint32* pSrcU32_1 = (const ma_uint32*)ppDeinterleavedPCMFrames[1];
ma_uint32* pDstU32Running = (ma_uint32*)pDstS32;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstU32Running[0] = pSrcU32_0[iPCMFrame];
pDstU32Running[1] = pSrcU32_1[iPCMFrame];
pDstU32Running += 2;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
const ma_int32* pSrcS32 = (const ma_int32*)ppDeinterleavedPCMFrames[iChannel];
pDstS32[iPCMFrame*channels+iChannel] = pSrcS32[iPCMFrame];
}
}
} break;
case ma_format_s24:
{
ma_uint8* pDstS24 = (ma_uint8*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
iPCMFrame = 0;
if (channels == 2) {
const ma_uint8* pSrcS24_0 = (const ma_uint8*)ppDeinterleavedPCMFrames[0];
const ma_uint8* pSrcS24_1 = (const ma_uint8*)ppDeinterleavedPCMFrames[1];
ma_uint8* pDstS24Running = (ma_uint8*)pDstS24;
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstS24Running[0] = pSrcS24_0[0];
pDstS24Running[1] = pSrcS24_0[1];
pDstS24Running[2] = pSrcS24_0[2];
pDstS24Running[3] = pSrcS24_1[0];
pDstS24Running[4] = pSrcS24_1[1];
pDstS24Running[5] = pSrcS24_1[2];
pDstS24Running += 6;
pSrcS24_0 += 3;
pSrcS24_1 += 3;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
const ma_uint8* pSrcS24 = (const ma_uint8*)ppDeinterleavedPCMFrames[iChannel];
pDstS24[(iPCMFrame*channels+iChannel)*3 + 0] = pSrcS24[iPCMFrame*3 + 0];
pDstS24[(iPCMFrame*channels+iChannel)*3 + 1] = pSrcS24[iPCMFrame*3 + 1];
pDstS24[(iPCMFrame*channels+iChannel)*3 + 2] = pSrcS24[iPCMFrame*3 + 2];
}
}
} break;
default:
{
ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format);
+184 -16
View File
@@ -18,6 +18,40 @@ const char* format_short_name(ma_format format)
}
}
/*
Fills a buffer with a predictable byte pattern for use as test data.

The first byte written is valueOffset. Each following byte is one greater than the previous one,
wrapping back around to 0 after 255.

pBuffer     - Destination buffer. Written one byte at a time.
format      - Sample format, used together with channels to look up the size of a frame.
channels    - Channel count, used together with format to look up the size of a frame.
frameCount  - Number of frames to fill. The number of bytes written is frameCount multiplied by
              the value returned by ma_get_bytes_per_frame(format, channels).
valueOffset - Value of the first byte. Passing a different offset per buffer gives each buffer
              a different pattern.
*/
void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount, ma_uint8 valueOffset)
{
    ma_uint8* pBytes = (ma_uint8*)pBuffer;
    ma_uint32 bytesPerFrame = ma_get_bytes_per_frame(format, channels);
    ma_uint32 totalBytes = frameCount * bytesPerFrame;
    ma_uint32 offset = 0;
    ma_uint8 nextValue = valueOffset;

    while (offset < totalBytes) {
        pBytes[offset] = nextValue;
        nextValue += 1; /* Wrapping past 255 is intentional. */
        offset += 1;
    }
}
/*
Checks that an interleaved buffer and a set of per-channel buffers contain the same samples.

For every frame and channel, the sample at position (frame * channels + channel) in pInterleaved
is compared byte-for-byte against the sample at position (frame) in ppDeinterleaved[channel].
The size of a sample is taken from ma_get_bytes_per_sample(format).

Returns MA_TRUE if every sample matches. Returns MA_FALSE as soon as a mismatch is found.
*/
ma_bool32 compare_interleaved_deinterleaved(ma_format format, ma_uint32 channels, ma_uint32 frameCount, void* pInterleaved, const void** ppDeinterleaved)
{
    ma_uint32 bytesPerSample = ma_get_bytes_per_sample(format);
    ma_uint32 iFrame;
    ma_uint32 iChan;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        for (iChan = 0; iChan < channels; iChan += 1) {
            ma_uint32 planarOffset = iFrame * bytesPerSample;
            ma_uint32 packedOffset = (iFrame * channels + iChan) * bytesPerSample;
            const void* pPlanarSample = ma_offset_ptr(ppDeinterleaved[iChan], planarOffset);
            const void* pPackedSample = ma_offset_ptr(pInterleaved, packedOffset);

            if (memcmp(pPlanarSample, pPackedSample, bytesPerSample) == 0) {
                continue;   /* This sample matches. Move on to the next one. */
            }

            return MA_FALSE;
        }
    }

    return MA_TRUE;
}
void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void* pInterleavedPCMFrames, void** ppDeinterleavedPCMFrames)
{
ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format);
@@ -32,19 +66,6 @@ void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 fram
}
}
/*
Fills a buffer with a predictable byte pattern for use as test data.

The bytes written are 0, 1, 2, ... 255, 0, 1, ... and so on. The number of bytes written is
frameCount multiplied by the value returned by ma_get_bytes_per_frame(format, channels).
*/
void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount)
{
    ma_uint8* pBytes = (ma_uint8*)pBuffer;
    ma_uint32 bytesPerFrame = ma_get_bytes_per_frame(format, channels);
    ma_uint32 totalBytes = frameCount * bytesPerFrame;
    ma_uint32 offset;

    for (offset = 0; offset < totalBytes; offset += 1) {
        pBytes[offset] = (ma_uint8)(offset & 0xFF); /* Only the low 8 bits are kept, so the pattern repeats every 256 bytes. */
    }
}
ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels)
{
ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. */
@@ -63,7 +84,7 @@ ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels)
pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
}
fill_debug_frames(pInterleavedReference, format, channels, frameCount);
fill_debug_frames(pInterleavedReference, format, channels, frameCount, 0);
MA_COPY_MEMORY(pInterleavedOptimized, pInterleavedReference, frameCount * bpf);
@@ -99,7 +120,7 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
ma_timer timer;
double startTime;
printf("%s %u: ", format_short_name(format), channels);
printf("Deinterleave: %s %u: ", format_short_name(format), channels);
if (verify_deinterleaving_by_format(format, channels) == MA_FALSE) {
printf("FAILED\n");
@@ -115,7 +136,7 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
pDeinterleaved[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
}
ma_debug_fill_pcm_frames_with_sine_wave((float*)pInterleaved, frameCount, format, channels, 48000); /* The float* cast is to work around an API bug in miniaudio v0.11. It's harmless. */
fill_debug_frames(pInterleaved, format, channels, frameCount, 0);
startTime = ma_timer_get_time_in_seconds(&timer);
@@ -127,6 +148,14 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
}
printf("%f\n", ma_timer_get_time_in_seconds(&timer) - startTime);
/*
I think Clang can recognize that we're not actually doing anything with the output data of our tests and then
optimize out the entire thing. We'll do a simple comparison here.
*/
if (compare_interleaved_deinterleaved(format, channels, frameCount, pInterleaved, (const void**)pDeinterleaved) == MA_FALSE) {
printf("FAILED VERIFICATION\n");
}
for (iChannel = 0; iChannel < channels; iChannel += 1) {
ma_free(pDeinterleaved[iChannel], NULL);
@@ -159,9 +188,148 @@ void profile_deinterleaving(void)
profile_deinterleaving_by_format(ma_format_s24, 3);
}
/*
Straightforward interleaving routine used as the known-good reference when checking
ma_interleave_pcm_frames().

Samples are copied one at a time with memcpy(). The output is ordered frame by frame, and within
each frame channel by channel. The size of a sample is taken from ma_get_bytes_per_sample(format).

format                   - Sample format. Only used to look up the sample size.
channels                 - Number of entries read from ppDeinterleavedPCMFrames.
frameCount               - Number of frames to interleave.
ppDeinterleavedPCMFrames - One input buffer per channel.
pInterleavedPCMFrames    - Output buffer.
*/
void interleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames)
{
    ma_uint32 bytesPerSample = ma_get_bytes_per_sample(format);
    ma_uint8* pDstRunning = (ma_uint8*)pInterleavedPCMFrames;
    ma_uint64 iFrame;
    ma_uint32 iChan;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        for (iChan = 0; iChan < channels; iChan += 1) {
            const void* pSrcSample = ma_offset_ptr(ppDeinterleavedPCMFrames[iChan], iFrame*bytesPerSample);

            memcpy(pDstRunning, pSrcSample, bytesPerSample);

            /* The output is written strictly in order, so just step forward one sample each time. */
            pDstRunning += bytesPerSample;
        }
    }
}
/*
Verifies that ma_interleave_pcm_frames() produces exactly the same output as interleave_reference()
for the given format and channel count.

Two identical sets of per-channel input buffers are filled with test data. One set is run through
the reference implementation and the other through ma_interleave_pcm_frames(). The two interleaved
outputs are then compared byte-for-byte.

format   - Sample format to test.
channels - Number of channels to test. Must not exceed MA_MAX_CHANNELS because the per-channel
           buffer pointers are stored in fixed size arrays of that length.

Returns MA_TRUE if both outputs are identical, MA_FALSE otherwise. All buffers allocated by this
function are freed before returning, regardless of the result.
*/
ma_bool32 verify_interleaving_by_format(ma_format format, ma_uint32 channels)
{
    ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. */
    ma_uint32 bpf = ma_get_bytes_per_frame(format, channels);
    void* pDeinterleavedReference[MA_MAX_CHANNELS];
    void* pInterleavedReference;
    void* pDeinterleavedOptimized[MA_MAX_CHANNELS];
    void* pInterleavedOptimized;
    ma_uint32 iChannel;
    ma_bool32 result;

    pInterleavedReference = ma_malloc(frameCount * bpf, NULL);
    pInterleavedOptimized = ma_malloc(frameCount * bpf, NULL);

    for (iChannel = 0; iChannel < channels; iChannel += 1) {
        pDeinterleavedReference[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
        pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
    }

    /* Fill deinterleaved buffers with test data. */
    for (iChannel = 0; iChannel < channels; iChannel += 1) {
        fill_debug_frames(pDeinterleavedReference[iChannel], format, 1, frameCount, (ma_uint8)iChannel); /* Last parameter is to ensure each channel position has different values. */
        MA_COPY_MEMORY(pDeinterleavedOptimized[iChannel], pDeinterleavedReference[iChannel], frameCount * ma_get_bytes_per_sample(format));
    }

    interleave_reference    (format, channels, frameCount, (const void**)pDeinterleavedReference, pInterleavedReference);
    ma_interleave_pcm_frames(format, channels, frameCount, (const void**)pDeinterleavedOptimized, pInterleavedOptimized);

    /*
    Record the result instead of returning straight away. Returning from here on a mismatch would
    skip the cleanup below and leak every buffer allocated above.
    */
    if (memcmp(pInterleavedReference, pInterleavedOptimized, frameCount * bpf) != 0) {
        result = MA_FALSE;
    } else {
        result = MA_TRUE;
    }

    for (iChannel = 0; iChannel < channels; iChannel += 1) {
        ma_free(pDeinterleavedReference[iChannel], NULL);
        ma_free(pDeinterleavedOptimized[iChannel], NULL);
    }

    ma_free(pInterleavedReference, NULL);
    ma_free(pInterleavedOptimized, NULL);

    return result;
}
/*
Verifies and then times ma_interleave_pcm_frames() for one format / channel count combination.

A label is printed first. If verify_interleaving_by_format() fails, "FAILED" is printed and the
function returns without timing anything. Otherwise 1024*1024 frames are interleaved 1000 times
and the elapsed time in seconds is printed.

format   - Sample format to profile.
channels - Number of channels to profile. Must not exceed MA_MAX_CHANNELS because the per-channel
           buffer pointers are stored in a fixed size array of that length.
*/
void profile_interleaving_by_format(ma_format format, ma_uint32 channels)
{
    ma_uint64 frameCount = 1024 * 1024;
    ma_uint32 iterationCount = 1000;
    ma_uint32 bytesPerFrame = ma_get_bytes_per_frame(format, channels);
    void* pPlanar[MA_MAX_CHANNELS];
    void* pPacked;
    ma_uint32 iChan;
    ma_uint32 iterationsLeft;
    ma_timer timer;
    double timeBefore;
    double elapsed;
    ma_bool32 outputMatches;

    printf("Interleave: %s %u: ", format_short_name(format), channels);

    /* There is no point in timing an implementation that produces the wrong output. */
    if (verify_interleaving_by_format(format, channels) == MA_FALSE) {
        printf("FAILED\n");
        return;
    }

    ma_timer_init(&timer);

    /* One interleaved output buffer, plus one filled input buffer for each channel. */
    pPacked = ma_malloc(frameCount * bytesPerFrame, NULL);

    for (iChan = 0; iChan < channels; iChan += 1) {
        pPlanar[iChan] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
        fill_debug_frames(pPlanar[iChan], format, 1, frameCount, (ma_uint8)iChan);
    }

    /* Timed section. */
    timeBefore = ma_timer_get_time_in_seconds(&timer);

    iterationsLeft = iterationCount;
    while (iterationsLeft > 0) {
        ma_interleave_pcm_frames(format, channels, frameCount, (const void**)pPlanar, pPacked);
        iterationsLeft -= 1;
    }

    elapsed = ma_timer_get_time_in_seconds(&timer) - timeBefore;
    printf("%f\n", elapsed);

    /*
    Clang may be able to see that the output of the timed loop is never used and optimize the
    whole thing away. Doing a simple comparison on the output here guards against that.
    */
    outputMatches = compare_interleaved_deinterleaved(format, channels, frameCount, pPacked, (const void**)pPlanar);
    if (outputMatches == MA_FALSE) {
        printf("FAILED VERIFICATION\n");
    }

    for (iChan = 0; iChan < channels; iChan += 1) {
        ma_free(pPlanar[iChan], NULL);
    }

    ma_free(pPacked, NULL);
}
void profile_interleaving(void)
{
/* Stereo has an optimized code path. */
profile_interleaving_by_format(ma_format_u8, 2);
profile_interleaving_by_format(ma_format_s16, 2);
profile_interleaving_by_format(ma_format_f32, 2);
profile_interleaving_by_format(ma_format_s32, 2);
profile_interleaving_by_format(ma_format_s24, 2);
/* We have a special case for mono streams so make sure we have coverage of that case. */
profile_interleaving_by_format(ma_format_u8, 1);
profile_interleaving_by_format(ma_format_s16, 1);
profile_interleaving_by_format(ma_format_f32, 1);
profile_interleaving_by_format(ma_format_s32, 1);
profile_interleaving_by_format(ma_format_s24, 1);
/* Channels > 2 run on a generic code path. */
profile_interleaving_by_format(ma_format_u8, 3);
profile_interleaving_by_format(ma_format_s16, 3);
profile_interleaving_by_format(ma_format_f32, 3);
profile_interleaving_by_format(ma_format_s32, 3);
profile_interleaving_by_format(ma_format_s24, 3);
}
int main(int argc, char** argv)
{
profile_deinterleaving();
profile_interleaving();
(void)argc;
(void)argv;