mirror of
https://github.com/mackron/miniaudio.git
synced 2026-04-22 00:06:59 +02:00
Optimizations to ma_interleave_pcm_frames().
This commit is contained in:
+184
-16
@@ -18,6 +18,40 @@ const char* format_short_name(ma_format format)
|
||||
}
|
||||
}
|
||||
|
||||
/*
Fills a buffer with a deterministic byte pattern for use as test data.

The buffer is treated as raw bytes. The first byte written is `valueOffset` and every following byte is one greater
than the previous one, wrapping around on 8-bit overflow. The number of bytes written is
`frameCount * ma_get_bytes_per_frame(format, channels)`.
*/
void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount, ma_uint8 valueOffset)
{
    ma_uint8* pBytes = (ma_uint8*)pBuffer;
    ma_uint32 byteCount = frameCount * ma_get_bytes_per_frame(format, channels);
    ma_uint32 i;

    for (i = 0; i < byteCount; i += 1) {
        /* Truncating to ma_uint8 is intentional: it makes the pattern wrap around. */
        pBytes[i] = (ma_uint8)(valueOffset + i);
    }
}
|
||||
|
||||
/*
Checks that an interleaved buffer and a set of per-channel (deinterleaved) buffers contain the same samples.

For every frame `f` and channel `c`, the sample at index `f` of ppDeinterleaved[c] is compared byte-for-byte against
the sample at index `f*channels + c` of pInterleaved. The sample size is ma_get_bytes_per_sample(format).

Returns MA_FALSE as soon as a mismatching sample is found, MA_TRUE if none is found.
*/
ma_bool32 compare_interleaved_deinterleaved(ma_format format, ma_uint32 channels, ma_uint32 frameCount, void* pInterleaved, const void** ppDeinterleaved)
{
    const ma_uint32 bps = ma_get_bytes_per_sample(format);
    ma_uint32 iFrame;
    ma_uint32 iChan;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        for (iChan = 0; iChan < channels; iChan += 1) {
            ma_uint32 planarOffset = iFrame * bps;
            ma_uint32 packedOffset = (iFrame * channels + iChan) * bps;
            const ma_uint8* pPlanarSample = (const ma_uint8*)ppDeinterleaved[iChan] + planarOffset;
            const ma_uint8* pPackedSample = (const ma_uint8*)pInterleaved + packedOffset;

            if (memcmp(pPlanarSample, pPackedSample, bps) != 0) {
                return MA_FALSE;
            }
        }
    }

    return MA_TRUE;
}
|
||||
|
||||
|
||||
void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void* pInterleavedPCMFrames, void** ppDeinterleavedPCMFrames)
|
||||
{
|
||||
ma_uint32 sampleSizeInBytes = ma_get_bytes_per_sample(format);
|
||||
@@ -32,19 +66,6 @@ void deinterleave_reference(ma_format format, ma_uint32 channels, ma_uint64 fram
|
||||
}
|
||||
}
|
||||
|
||||
/*
Four-parameter form of fill_debug_frames(): fills the buffer byte-by-byte with a pattern that starts at 0 and goes up
by one per byte, wrapping around on 8-bit overflow. The number of bytes written is
`frameCount * ma_get_bytes_per_frame(format, channels)`.

NOTE(review): this looks like the pre-change definition shown on the removed side of the diff - confirm before
relying on it.
*/
void fill_debug_frames(void* pBuffer, ma_format format, ma_uint32 channels, ma_uint32 frameCount)
{
    ma_uint8* pDst = (ma_uint8*)pBuffer;
    ma_uint32 bytesTotal = frameCount * ma_get_bytes_per_frame(format, channels);
    ma_uint32 n;

    for (n = 0; n < bytesTotal; n += 1) {
        /* Truncating to ma_uint8 is intentional: it makes the pattern wrap around. */
        pDst[n] = (ma_uint8)n;
    }
}
|
||||
|
||||
ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels)
|
||||
{
|
||||
ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. */
|
||||
@@ -63,7 +84,7 @@ ma_bool32 verify_deinterleaving_by_format(ma_format format, ma_uint32 channels)
|
||||
pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
|
||||
}
|
||||
|
||||
fill_debug_frames(pInterleavedReference, format, channels, frameCount);
|
||||
fill_debug_frames(pInterleavedReference, format, channels, frameCount, 0);
|
||||
MA_COPY_MEMORY(pInterleavedOptimized, pInterleavedReference, frameCount * bpf);
|
||||
|
||||
|
||||
@@ -99,7 +120,7 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
|
||||
ma_timer timer;
|
||||
double startTime;
|
||||
|
||||
printf("%s %u: ", format_short_name(format), channels);
|
||||
printf("Deinterleave: %s %u: ", format_short_name(format), channels);
|
||||
|
||||
if (verify_deinterleaving_by_format(format, channels) == MA_FALSE) {
|
||||
printf("FAILED\n");
|
||||
@@ -115,7 +136,7 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
|
||||
pDeinterleaved[iChannel] = ma_malloc(frameCount * ma_get_bytes_per_sample(format), NULL);
|
||||
}
|
||||
|
||||
ma_debug_fill_pcm_frames_with_sine_wave((float*)pInterleaved, frameCount, format, channels, 48000); /* The float* cast is to work around an API bug in miniaudio v0.11. It's harmless. */
|
||||
fill_debug_frames(pInterleaved, format, channels, frameCount, 0);
|
||||
|
||||
|
||||
startTime = ma_timer_get_time_in_seconds(&timer);
|
||||
@@ -127,6 +148,14 @@ void profile_deinterleaving_by_format(ma_format format, ma_uint32 channels)
|
||||
}
|
||||
printf("%f\n", ma_timer_get_time_in_seconds(&timer) - startTime);
|
||||
|
||||
/*
|
||||
I think Clang can recognize that we're not actually doing anything with the output data of our tests and then
|
||||
optimizes out the entire thing. We'll do a simple comparison here.
|
||||
*/
|
||||
if (compare_interleaved_deinterleaved(format, channels, frameCount, pInterleaved, (const void**)pDeinterleaved) == MA_FALSE) {
|
||||
printf("FAILED VERIFICATION\n");
|
||||
}
|
||||
|
||||
|
||||
for (iChannel = 0; iChannel < channels; iChannel += 1) {
|
||||
ma_free(pDeinterleaved[iChannel], NULL);
|
||||
@@ -159,9 +188,148 @@ void profile_deinterleaving(void)
|
||||
profile_deinterleaving_by_format(ma_format_s24, 3);
|
||||
}
|
||||
|
||||
|
||||
/*
Plain, unoptimized interleaving routine used as the baseline that ma_interleave_pcm_frames() is checked against.

For every frame `f` and channel `c`, one sample is copied from index `f` of ppDeinterleavedPCMFrames[c] to index
`f*channels + c` of pInterleavedPCMFrames. The sample size is ma_get_bytes_per_sample(format).
*/
void interleave_reference(ma_format format, ma_uint32 channels, ma_uint64 frameCount, const void** ppDeinterleavedPCMFrames, void* pInterleavedPCMFrames)
{
    const ma_uint32 bps = ma_get_bytes_per_sample(format);
    ma_uint64 iFrame;
    ma_uint32 iChan;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        for (iChan = 0; iChan < channels; iChan += 1) {
            ma_uint64 dstOffset = (iFrame * channels + iChan) * bps;
            ma_uint64 srcOffset = iFrame * bps;

            memcpy((ma_uint8*)pInterleavedPCMFrames + dstOffset, (const ma_uint8*)ppDeinterleavedPCMFrames[iChan] + srcOffset, bps);
        }
    }
}
|
||||
|
||||
/*
Verifies ma_interleave_pcm_frames() against interleave_reference() for the given format and channel count.

Each channel gets its own deinterleaved buffer filled with a byte pattern whose starting value is the channel index,
so that different channel positions hold different data. The same input is then interleaved once by the reference
implementation and once by ma_interleave_pcm_frames(), and the two interleaved outputs are compared byte-for-byte.

Returns MA_TRUE when the outputs are identical and MA_FALSE otherwise. Every buffer allocated here is released before
returning, on both the success and the failure path.
*/
ma_bool32 verify_interleaving_by_format(ma_format format, ma_uint32 channels)
{
    ma_uint64 frameCount = 1023; /* <-- Make this odd so we can test that the tail is handled properly from internal loop unrolling. */
    ma_uint32 bpf = ma_get_bytes_per_frame(format, channels);
    ma_uint32 bps = ma_get_bytes_per_sample(format);
    void* pDeinterleavedReference[MA_MAX_CHANNELS];
    void* pInterleavedReference;
    void* pDeinterleavedOptimized[MA_MAX_CHANNELS];
    void* pInterleavedOptimized;
    ma_uint32 iChannel;
    ma_bool32 result;

    pInterleavedReference = ma_malloc(frameCount * bpf, NULL);
    pInterleavedOptimized = ma_malloc(frameCount * bpf, NULL);

    for (iChannel = 0; iChannel < channels; iChannel += 1) {
        pDeinterleavedReference[iChannel] = ma_malloc(frameCount * bps, NULL);
        pDeinterleavedOptimized[iChannel] = ma_malloc(frameCount * bps, NULL);
    }

    /* Fill deinterleaved buffers with test data. */
    for (iChannel = 0; iChannel < channels; iChannel += 1) {
        fill_debug_frames(pDeinterleavedReference[iChannel], format, 1, frameCount, (ma_uint8)iChannel); /* Last parameter is to ensure each channel position has different values. */
        MA_COPY_MEMORY(pDeinterleavedOptimized[iChannel], pDeinterleavedReference[iChannel], frameCount * bps);
    }

    interleave_reference    (format, channels, frameCount, (const void**)pDeinterleavedReference, pInterleavedReference);
    ma_interleave_pcm_frames(format, channels, frameCount, (const void**)pDeinterleavedOptimized, pInterleavedOptimized);

    /* Record the outcome instead of returning right away so the buffers below are freed on a mismatch as well. */
    if (memcmp(pInterleavedReference, pInterleavedOptimized, frameCount * bpf) == 0) {
        result = MA_TRUE;
    } else {
        result = MA_FALSE;
    }

    for (iChannel = 0; iChannel < channels; iChannel += 1) {
        ma_free(pDeinterleavedReference[iChannel], NULL);
        ma_free(pDeinterleavedOptimized[iChannel], NULL);
    }

    ma_free(pInterleavedReference, NULL);
    ma_free(pInterleavedOptimized, NULL);

    return result;
}
|
||||
|
||||
/*
Times ma_interleave_pcm_frames() for one format / channel count combination and prints the result on a single line.

The line starts with an "Interleave: <format> <channels>: " label. If verify_interleaving_by_format() reports a
failure, "FAILED" is printed and nothing is timed. Otherwise the elapsed time in seconds for all iterations is
printed. After timing, the interleaved output is compared against the per-channel inputs and "FAILED VERIFICATION"
is printed if they do not match.
*/
void profile_interleaving_by_format(ma_format format, ma_uint32 channels)
{
    ma_uint64 frameCount = 1024 * 1024;
    ma_uint32 iterationsRemaining = 1000;
    ma_uint32 bytesPerFrame = ma_get_bytes_per_frame(format, channels);
    ma_uint32 bytesPerSample;
    void* pPlanar[MA_MAX_CHANNELS];
    void* pPacked;
    ma_uint32 c;
    ma_timer timer;
    double t0;
    double elapsed;

    printf("Interleave: %s %u: ", format_short_name(format), channels);

    /* Don't bother timing an implementation that produces wrong output. */
    if (!verify_interleaving_by_format(format, channels)) {
        printf("FAILED\n");
        return;
    }

    ma_timer_init(&timer);

    pPacked = ma_malloc(frameCount * bytesPerFrame, NULL);

    /* One input buffer per channel, each starting its byte pattern at the channel index. */
    bytesPerSample = ma_get_bytes_per_sample(format);
    for (c = 0; c < channels; c += 1) {
        pPlanar[c] = ma_malloc(frameCount * bytesPerSample, NULL);
        fill_debug_frames(pPlanar[c], format, 1, frameCount, (ma_uint8)c);
    }

    t0 = ma_timer_get_time_in_seconds(&timer);
    while (iterationsRemaining > 0) {
        ma_interleave_pcm_frames(format, channels, frameCount, (const void**)pPlanar, pPacked);
        iterationsRemaining -= 1;
    }
    elapsed = ma_timer_get_time_in_seconds(&timer) - t0;

    printf("%f\n", elapsed);

    /*
    Clang may notice that the output of the timed loop is never used and optimize the whole thing away. Doing a simple
    comparison on the output here guards against that.
    */
    if (compare_interleaved_deinterleaved(format, channels, frameCount, pPacked, (const void**)pPlanar) == MA_FALSE) {
        printf("FAILED VERIFICATION\n");
    }

    for (c = 0; c < channels; c += 1) {
        ma_free(pPlanar[c], NULL);
    }

    ma_free(pPacked, NULL);
}
|
||||
|
||||
void profile_interleaving(void)
|
||||
{
|
||||
/* Stereo has an optimized code path. */
|
||||
profile_interleaving_by_format(ma_format_u8, 2);
|
||||
profile_interleaving_by_format(ma_format_s16, 2);
|
||||
profile_interleaving_by_format(ma_format_f32, 2);
|
||||
profile_interleaving_by_format(ma_format_s32, 2);
|
||||
profile_interleaving_by_format(ma_format_s24, 2);
|
||||
|
||||
/* We have a special case for mono streams so make sure we have coverage of that case. */
|
||||
profile_interleaving_by_format(ma_format_u8, 1);
|
||||
profile_interleaving_by_format(ma_format_s16, 1);
|
||||
profile_interleaving_by_format(ma_format_f32, 1);
|
||||
profile_interleaving_by_format(ma_format_s32, 1);
|
||||
profile_interleaving_by_format(ma_format_s24, 1);
|
||||
|
||||
/* Channels > 2 run on a generic code path. */
|
||||
profile_interleaving_by_format(ma_format_u8, 3);
|
||||
profile_interleaving_by_format(ma_format_s16, 3);
|
||||
profile_interleaving_by_format(ma_format_f32, 3);
|
||||
profile_interleaving_by_format(ma_format_s32, 3);
|
||||
profile_interleaving_by_format(ma_format_s24, 3);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
profile_deinterleaving();
|
||||
profile_interleaving();
|
||||
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
Reference in New Issue
Block a user