Optimizations to ma_deinterleave_pcm_frames() for stereo.

This commit is contained in:
David Reid
2026-02-15 21:54:29 +10:00
parent 01e5042bfb
commit 242cbf4d8c
3 changed files with 360 additions and 5 deletions
+187 -5
View File
@@ -52899,6 +52899,16 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
return; /* Invalid args. */
}
/*
You shouldn't really be calling this for mono streams, but I can imagine a situation where it is called
generically without the caller explicitly checking the channel count. We'll implement this as a
memcpy() to keep it fast.
*/
if (channels == 1) {
MA_COPY_MEMORY(ppDeinterleavedPCMFrames[0], pInterleavedPCMFrames, frameCount * ma_get_bytes_per_frame(format, channels));
return;
}
/* For efficiency we do this per format. */
switch (format) {
case ma_format_u8:
@@ -52906,7 +52916,64 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
const ma_uint8* pSrcU8 = (const ma_uint8*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
iPCMFrame = 0;
/* Specialization for stereo. We can do 4 frames at a time here. */
if (channels == 2) {
ma_uint32* pDstU32_0 = (ma_uint32*)ppDeinterleavedPCMFrames[0];
ma_uint32* pDstU32_1 = (ma_uint32*)ppDeinterleavedPCMFrames[1];
ma_uint64 frameCount4 = frameCount >> 2;
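/* Check alignment of buffers. Use aligned fast path only if all buffers are 4-byte aligned. NOTE(review): the mask/shift packing below yields the correct output byte order only when 32-bit loads/stores are little-endian - confirm big-endian targets are handled or excluded. */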
if (((ma_uintptr)pSrcU8 & 3) == 0 && ((ma_uintptr)pDstU32_0 & 3) == 0 && ((ma_uintptr)pDstU32_1 & 3) == 0) {
#if 1
{
const ma_uint32* pSrcU32Running = (const ma_uint32*)pSrcU8;
for (iPCMFrame = 0; iPCMFrame < frameCount4; iPCMFrame += 1) {
ma_uint32 src0;
ma_uint32 src1;
ma_uint32 dst0;
ma_uint32 dst1;
src0 = pSrcU32Running[0];
src1 = pSrcU32Running[1];
dst0 = ((src0 & 0x00FF0000) >> 8) | ((src0 & 0x000000FF) >> 0) | ((src1 & 0x00FF0000) << 8) | ((src1 & 0x000000FF) << 16);
dst1 = ((src0 & 0xFF000000) >> 16) | ((src0 & 0x0000FF00) >> 8) | ((src1 & 0xFF000000) << 0) | ((src1 & 0x0000FF00) << 8);
pDstU32_0[iPCMFrame] = dst0;
pDstU32_1[iPCMFrame] = dst1;
pSrcU32Running += 2;
}
iPCMFrame *= 4;
}
#else
{
const ma_uint8* pSrcU8Running = (const ma_uint8*)pSrcU8;
for (iPCMFrame = 0; iPCMFrame + 4 < frameCount; iPCMFrame += 4) {
ma_uint32 dst0;
ma_uint32 dst1;
dst0 = (pSrcU8Running[0] << 0) | (pSrcU8Running[2] << 8) | (pSrcU8Running[4] << 16) | (pSrcU8Running[6] << 24);
dst1 = (pSrcU8Running[1] << 0) | (pSrcU8Running[3] << 8) | (pSrcU8Running[5] << 16) | (pSrcU8Running[7] << 24);
*pDstU32_0 = dst0;
*pDstU32_1 = dst1;
pSrcU8Running += 4 * 2;
pDstU32_0 += 1;
pDstU32_1 += 1;
}
}
#endif
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
ma_uint8* pDstU8 = (ma_uint8*)ppDeinterleavedPCMFrames[iChannel];
@@ -52919,7 +52986,65 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
{
const ma_int16* pSrcS16 = (const ma_int16*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
iPCMFrame = 0;
/* Specialization for stereo. We can do 2 frames at a time here. */
if (channels == 2) {
ma_uint32* pDstU32_0 = (ma_uint32*)ppDeinterleavedPCMFrames[0];
ma_uint32* pDstU32_1 = (ma_uint32*)ppDeinterleavedPCMFrames[1];
ma_uint64 frameCount2 = frameCount >> 1;
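/* Check alignment of buffers. Use aligned fast path only if all buffers are 4-byte aligned. NOTE(review): the mask/shift packing below places samples in the correct order only when 32-bit loads/stores are little-endian - confirm big-endian targets are handled or excluded. */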
if (((ma_uintptr)pSrcS16 & 3) == 0 && ((ma_uintptr)pDstU32_0 & 3) == 0 && ((ma_uintptr)pDstU32_1 & 3) == 0) {
#if 1
{
const ma_uint32* pSrcU32Running = (const ma_uint32*)pSrcS16;
for (iPCMFrame = 0; iPCMFrame < frameCount2; iPCMFrame += 1) {
ma_uint32 src0;
ma_uint32 src1;
ma_uint32 dst0;
ma_uint32 dst1;
src0 = pSrcU32Running[0];
src1 = pSrcU32Running[1];
dst0 = ((src0 & 0x0000FFFF) << 0) | ((src1 & 0x0000FFFF) << 16);
dst1 = ((src0 & 0xFFFF0000) >> 16) | ((src1 & 0xFFFF0000) >> 0);
pDstU32_0[iPCMFrame] = dst0;
pDstU32_1[iPCMFrame] = dst1;
pSrcU32Running += 2;
}
iPCMFrame *= 2;
}
#else
{
const ma_uint16* pSrcU16Running = (const ma_uint16*)pSrcS16;
for (iPCMFrame = 0; iPCMFrame + 2 < frameCount; iPCMFrame += 2) {
ma_uint32 dst0;
ma_uint32 dst1;
dst0 = (pSrcU16Running[0] << 0) | (pSrcU16Running[2] << 16);
dst1 = (pSrcU16Running[1] << 0) | (pSrcU16Running[3] << 16);
*pDstU32_0 = dst0;
*pDstU32_1 = dst1;
pSrcU16Running += 2 * 2;
pDstU32_0 += 1;
pDstU32_1 += 1;
}
}
#endif
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
ma_int16* pDstS16 = (ma_int16*)ppDeinterleavedPCMFrames[iChannel];
@@ -52932,7 +53057,24 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
{
const float* pSrcF32 = (const float*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
iPCMFrame = 0;
/* Specialization for stereo. */
if (channels == 2) {
float* pDstF32_0 = (float*)ppDeinterleavedPCMFrames[0];
float* pDstF32_1 = (float*)ppDeinterleavedPCMFrames[1];
const float* pSrcF32Running = (const float*)pSrcF32;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstF32_0[iPCMFrame] = pSrcF32Running[0];
pDstF32_1[iPCMFrame] = pSrcF32Running[1];
pSrcF32Running += 2;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
float* pDstF32 = (float*)ppDeinterleavedPCMFrames[iChannel];
@@ -52945,7 +53087,24 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
{
const ma_int32* pSrcS32 = (const ma_int32*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
iPCMFrame = 0;
/* Specialization for stereo. */
if (channels == 2) {
ma_uint32* pDstU32_0 = (ma_uint32*)ppDeinterleavedPCMFrames[0];
ma_uint32* pDstU32_1 = (ma_uint32*)ppDeinterleavedPCMFrames[1];
const ma_uint32* pSrcU32Running = (const ma_uint32*)pSrcS32;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstU32_0[iPCMFrame] = pSrcU32Running[0];
pDstU32_1[iPCMFrame] = pSrcU32Running[1];
pSrcU32Running += 2;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
ma_int32* pDstS32 = (ma_int32*)ppDeinterleavedPCMFrames[iChannel];
@@ -52958,7 +53117,30 @@ MA_API void ma_deinterleave_pcm_frames(ma_format format, ma_uint32 channels, ma_
{
const ma_uint8* pSrcS24 = (const ma_uint8*)pInterleavedPCMFrames;
ma_uint64 iPCMFrame;
for (iPCMFrame = 0; iPCMFrame < frameCount; iPCMFrame += 1) {
iPCMFrame = 0;
if (channels == 2) {
ma_uint8* pDstS24_0 = (ma_uint8*)ppDeinterleavedPCMFrames[0];
ma_uint8* pDstS24_1 = (ma_uint8*)ppDeinterleavedPCMFrames[1];
const ma_uint8* pSrcS24Running = (const ma_uint8*)pSrcS24;
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
pDstS24_0[0] = pSrcS24Running[0];
pDstS24_0[1] = pSrcS24Running[1];
pDstS24_0[2] = pSrcS24Running[2];
pDstS24_1[0] = pSrcS24Running[3];
pDstS24_1[1] = pSrcS24Running[4];
pDstS24_1[2] = pSrcS24Running[5];
pSrcS24Running += 6;
pDstS24_0 += 3;
pDstS24_1 += 3;
}
}
for (; iPCMFrame < frameCount; iPCMFrame += 1) {
ma_uint32 iChannel;
for (iChannel = 0; iChannel < channels; iChannel += 1) {
ma_uint8* pDstS24 = (ma_uint8*)ppDeinterleavedPCMFrames[iChannel];