mirror of
https://github.com/mackron/miniaudio.git
synced 2026-04-23 00:34:03 +02:00
Initial work on SSE2 optimizations for sample rate conversion.
This commit is contained in:
@@ -3211,9 +3211,20 @@ static MAL_INLINE float mal_mix_f32(float x, float y, float a)
|
|||||||
}
|
}
|
||||||
/*
Linear interpolation between x and y: computes x + (y - x)*a.

The subtract, multiply and add are written as three separate statements, which is the same
sequence of operations performed by mal_mix_f32_fast__sse2().
*/
static MAL_INLINE float mal_mix_f32_fast(float x, float y, float a)
{
    float delta  = (y - x);
    float scaled = delta*a;
    return x + scaled;
}
|
||||||
|
|
||||||
|
#if defined(MAL_SUPPORT_SSE2)
|
||||||
|
/*
Four-lane version of mal_mix_f32_fast(): computes x + (y - x)*a independently for each of the
four packed floats.
*/
static MAL_INLINE __m128 mal_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a)
{
    __m128 delta  = _mm_sub_ps(y, x);
    __m128 scaled = _mm_mul_ps(delta, a);
    return _mm_add_ps(x, scaled);
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
static MAL_INLINE double mal_mix_f64(double x, double y, double a)
|
static MAL_INLINE double mal_mix_f64(double x, double y, double a)
|
||||||
{
|
{
|
||||||
return x*(1-a) + y*a;
|
return x*(1-a) + y*a;
|
||||||
@@ -3384,7 +3395,7 @@ void mal_timer_init(mal_timer* pTimer)
|
|||||||
|
|
||||||
LARGE_INTEGER counter;
|
LARGE_INTEGER counter;
|
||||||
QueryPerformanceCounter(&counter);
|
QueryPerformanceCounter(&counter);
|
||||||
pTimer->counter = (mal_uint64)counter.QuadPart;
|
pTimer->counter = counter.QuadPart;
|
||||||
}
|
}
|
||||||
|
|
||||||
double mal_timer_get_time_in_seconds(mal_timer* pTimer)
|
double mal_timer_get_time_in_seconds(mal_timer* pTimer)
|
||||||
@@ -3394,7 +3405,7 @@ double mal_timer_get_time_in_seconds(mal_timer* pTimer)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (counter.QuadPart - pTimer->counter) / (double)g_mal_TimerFrequency.QuadPart;
|
return (double)(counter.QuadPart - pTimer->counter) / g_mal_TimerFrequency.QuadPart;
|
||||||
}
|
}
|
||||||
#elif defined(MAL_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
|
#elif defined(MAL_APPLE) && (__MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
|
||||||
uint64_t g_mal_TimerFrequency = 0;
|
uint64_t g_mal_TimerFrequency = 0;
|
||||||
@@ -19677,7 +19688,7 @@ void mal_src__build_sinc_table__sinc(mal_src* pSRC)
|
|||||||
mal_assert(pSRC != NULL);
|
mal_assert(pSRC != NULL);
|
||||||
|
|
||||||
pSRC->sinc.table[0] = 1.0f;
|
pSRC->sinc.table[0] = 1.0f;
|
||||||
for (int i = 1; i < mal_countof(pSRC->sinc.table); i += 1) {
|
for (mal_uint32 i = 1; i < mal_countof(pSRC->sinc.table); i += 1) {
|
||||||
double x = i*MAL_PI_D / MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION;
|
double x = i*MAL_PI_D / MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION;
|
||||||
pSRC->sinc.table[i] = (float)(sin(x)/x);
|
pSRC->sinc.table[i] = (float)(sin(x)/x);
|
||||||
}
|
}
|
||||||
@@ -19693,7 +19704,7 @@ void mal_src__build_sinc_table__hann(mal_src* pSRC)
|
|||||||
{
|
{
|
||||||
mal_src__build_sinc_table__sinc(pSRC);
|
mal_src__build_sinc_table__sinc(pSRC);
|
||||||
|
|
||||||
for (int i = 0; i < mal_countof(pSRC->sinc.table); i += 1) {
|
for (mal_uint32 i = 0; i < mal_countof(pSRC->sinc.table); i += 1) {
|
||||||
double x = pSRC->sinc.table[i];
|
double x = pSRC->sinc.table[i];
|
||||||
double N = MAL_SRC_SINC_MAX_WINDOW_WIDTH*2;
|
double N = MAL_SRC_SINC_MAX_WINDOW_WIDTH*2;
|
||||||
double n = ((double)(i) / MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION) + MAL_SRC_SINC_MAX_WINDOW_WIDTH;
|
double n = ((double)(i) / MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION) + MAL_SRC_SINC_MAX_WINDOW_WIDTH;
|
||||||
@@ -20070,7 +20081,7 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC,
|
|||||||
|
|
||||||
float xabs = (float)fabs(x);
|
float xabs = (float)fabs(x);
|
||||||
if (xabs >= MAL_SRC_SINC_MAX_WINDOW_WIDTH /*pSRC->config.sinc.windowWidth*/) {
|
if (xabs >= MAL_SRC_SINC_MAX_WINDOW_WIDTH /*pSRC->config.sinc.windowWidth*/) {
|
||||||
return 0;
|
xabs = 1; // <-- A non-zero integer will always return 0.
|
||||||
}
|
}
|
||||||
|
|
||||||
xabs = xabs * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION;
|
xabs = xabs * MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION;
|
||||||
@@ -20084,6 +20095,60 @@ static MAL_INLINE float mal_src_sinc__interpolation_factor(const mal_src* pSRC,
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(MAL_SUPPORT_SSE2)
|
||||||
|
/*
Returns the absolute value of each of the four packed floats in x.

The sign bit of every lane is cleared by AND-ing with 0x7FFFFFFF. The mask is built directly as a
vector constant rather than being read from a writable static array through a pointer cast.
*/
static MAL_INLINE __m128 mal_fabsf_sse2(__m128 x)
{
    const __m128 absMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return _mm_and_ps(absMask, x);
}
|
||||||
|
|
||||||
|
/*
Truncates each of the four packed floats in x towards zero by converting to 32-bit integers
(with truncation) and back to float.
*/
static MAL_INLINE __m128 mal_truncf_sse2(__m128 x)
{
    __m128i truncated = _mm_cvttps_epi32(x);
    return _mm_cvtepi32_ps(truncated);
}
|
||||||
|
|
||||||
|
/*
Four-lane version of mal_src_sinc__interpolation_factor().

pSRC - the sample rate converter whose sinc lookup table (pSRC->sinc.table) is sampled.
x    - pointer to four packed positions at which to evaluate the table.

Returns the four linearly interpolated table values.

The comparison uses _mm_cmple_ps() so that this path only depends on SSE/SSE2 intrinsics; the
generic _mm_cmp_ps() form takes a predicate immediate and is an AVX intrinsic.
*/
static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src* pSRC, __m128* x)
{
    const __m128 windowWidth128 = _mm_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
    const __m128 resolution128  = _mm_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
    const __m128 one            = _mm_set1_ps(1);

    __m128 xabs = mal_fabsf_sse2(*x);

    // if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
    // This is the same substitution the scalar implementation makes for positions outside the window.
    __m128 xcmp = _mm_cmple_ps(windowWidth128, xabs);
    xabs = _mm_or_ps(_mm_and_ps(one, xcmp), _mm_andnot_ps(xcmp, xabs));  // xabs = (xcmp) ? 1 : xabs;

    // Scale into lookup table units and split into an integer index and a fractional part.
    xabs = _mm_mul_ps(xabs, resolution128);
    __m128i ixabs = _mm_cvttps_epi32(xabs);

    // Spill the four indices to memory with an explicit store instead of reading the vector
    // through a casted pointer.
    int index[4];
    _mm_storeu_si128((__m128i*)index, ixabs);

    // _mm_set_ps() takes its arguments from the highest lane to the lowest.
    __m128 lo = _mm_set_ps(
        pSRC->sinc.table[index[3]],
        pSRC->sinc.table[index[2]],
        pSRC->sinc.table[index[1]],
        pSRC->sinc.table[index[0]]
    );

    __m128 hi = _mm_set_ps(
        pSRC->sinc.table[index[3]+1],
        pSRC->sinc.table[index[2]+1],
        pSRC->sinc.table[index[1]+1],
        pSRC->sinc.table[index[0]+1]
    );

    // Interpolate between the two neighbouring table entries using the fractional part.
    __m128 a = _mm_sub_ps(xabs, _mm_cvtepi32_ps(ixabs));
    __m128 r = mal_mix_f32_fast__sse2(lo, hi, a);

    return r;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount, void** ppSamplesOut, void* pUserData)
|
mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount, void** ppSamplesOut, void* pUserData)
|
||||||
{
|
{
|
||||||
mal_assert(pSRC != NULL);
|
mal_assert(pSRC != NULL);
|
||||||
@@ -20122,21 +20187,66 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
|||||||
outputFramesToRead = maxOutputFramesToRead;
|
outputFramesToRead = maxOutputFramesToRead;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float _windowSamplesUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT];
|
||||||
|
float* windowSamples = (float*)(((mal_uintptr)_windowSamplesUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1));
|
||||||
|
|
||||||
|
float _iWindowFUnaligned[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2 + MAL_SIMD_ALIGNMENT];
|
||||||
|
float* iWindowF = (float*)(((mal_uintptr)_iWindowFUnaligned + MAL_SIMD_ALIGNMENT-1) & ~(MAL_SIMD_ALIGNMENT-1));
|
||||||
|
for (mal_int32 i = 0; i < windowWidth2; ++i) {
|
||||||
|
iWindowF[i] = (float)(i - windowWidth);
|
||||||
|
}
|
||||||
|
|
||||||
for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) {
|
for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) {
|
||||||
// Do SRC.
|
// Do SRC.
|
||||||
float timeIn = timeInBeg;
|
float timeIn = timeInBeg;
|
||||||
for (mal_uint32 iSample = 0; iSample < outputFramesToRead; iSample += 1) {
|
for (mal_uint32 iSample = 0; iSample < outputFramesToRead; iSample += 1) {
|
||||||
mal_int32 iTimeIn = (mal_int32)timeIn;
|
|
||||||
|
|
||||||
float sampleOut = 0;
|
float sampleOut = 0;
|
||||||
for (mal_int32 iWindow = -windowWidth+1; iWindow < windowWidth; iWindow += 1) {
|
float iTimeInF = mal_floorf(timeIn);
|
||||||
|
mal_uint32 iTimeIn = (mal_uint32)iTimeInF;
|
||||||
|
|
||||||
|
//mal_int32 iWindowBeg = -windowWidth+1;
|
||||||
|
//mal_int32 iWindowEnd = windowWidth;
|
||||||
|
mal_int32 iWindow = 0;
|
||||||
|
|
||||||
|
// Pre-load the window samples into an aligned buffer to begin with. Need to put these into an aligned buffer to make SIMD easier.
|
||||||
|
windowSamples[0] = 0; // <-- The first sample is always zero.
|
||||||
|
for (mal_int32 i = 1; i < windowWidth2; ++i) {
|
||||||
|
windowSamples[i] = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, i - windowWidth);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(MAL_SUPPORT_SSE2)
|
||||||
|
if (pSRC->useSSE2) {
|
||||||
|
__m128 t = _mm_set1_ps((timeIn - iTimeInF));
|
||||||
|
|
||||||
|
mal_int32 windowWidth4 = windowWidth2 >> 2;
|
||||||
|
for (mal_int32 iWindow4 = 0; iWindow4 < windowWidth4; iWindow4 += 1) {
|
||||||
|
__m128* s = (__m128*)windowSamples + iWindow4;
|
||||||
|
__m128* w = (__m128*)iWindowF + iWindow4;
|
||||||
|
|
||||||
|
__m128 x = _mm_sub_ps(t, *w);
|
||||||
|
__m128 a = mal_src_sinc__interpolation_factor__sse2(pSRC, &x);
|
||||||
|
__m128 r = _mm_mul_ps(*s, a);
|
||||||
|
|
||||||
|
sampleOut += ((float*)(&r))[0];
|
||||||
|
sampleOut += ((float*)(&r))[1];
|
||||||
|
sampleOut += ((float*)(&r))[2];
|
||||||
|
sampleOut += ((float*)(&r))[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
iWindow += windowWidth4 * 4;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Non-SIMD/Reference implementation.
|
||||||
|
for (; iWindow < windowWidth2; iWindow += 1) {
|
||||||
|
float s = windowSamples[iWindow];
|
||||||
|
|
||||||
float t = (timeIn - iTimeIn);
|
float t = (timeIn - iTimeIn);
|
||||||
float w = (float)(iWindow);
|
float w = iWindowF[iWindow];
|
||||||
|
|
||||||
float a = mal_src_sinc__interpolation_factor(pSRC, (t - w));
|
float a = mal_src_sinc__interpolation_factor(pSRC, (t - w));
|
||||||
float s = mal_src_sinc__get_input_sample_from_window(pSRC, iChannel, iTimeIn, iWindow);
|
float r = s * a;
|
||||||
|
|
||||||
sampleOut += s * a;
|
sampleOut += r;
|
||||||
}
|
}
|
||||||
|
|
||||||
ppNextSamplesOut[iChannel][iSample] = (float)sampleOut;
|
ppNextSamplesOut[iChannel][iSample] = (float)sampleOut;
|
||||||
@@ -21902,6 +22012,13 @@ mal_result mal_decoder_init__internal(mal_decoder_read_proc onRead, mal_decoder_
|
|||||||
mal_assert(pConfig != NULL);
|
mal_assert(pConfig != NULL);
|
||||||
mal_assert(pDecoder != NULL);
|
mal_assert(pDecoder != NULL);
|
||||||
|
|
||||||
|
// Silence some warnings in the case that we don't have any decoder backends enabled.
|
||||||
|
(void)onRead;
|
||||||
|
(void)onSeek;
|
||||||
|
(void)pUserData;
|
||||||
|
(void)pConfig;
|
||||||
|
(void)pDecoder;
|
||||||
|
|
||||||
// We use trial and error to open a decoder.
|
// We use trial and error to open a decoder.
|
||||||
mal_result result = MAL_NO_BACKEND;
|
mal_result result = MAL_NO_BACKEND;
|
||||||
|
|
||||||
|
|||||||
+300
-8
@@ -1,6 +1,40 @@
|
|||||||
#define MINI_AL_IMPLEMENTATION
|
#define MINI_AL_IMPLEMENTATION
|
||||||
#include "../mini_al.h"
|
#include "../mini_al.h"
|
||||||
|
|
||||||
|
// The instruction set variants that the profiling tests can exercise.
typedef enum
{
    simd_mode_scalar = 0,
    simd_mode_sse2   = 1,
    simd_mode_avx    = 2,
    simd_mode_avx512 = 3,
    simd_mode_neon   = 4
} simd_mode;

// Returns a human readable name for the given SIMD mode, or "Unknown" for any value that is not
// one of the simd_mode enumerators.
const char* simd_mode_to_string(simd_mode mode)
{
    // Indexed by the numeric value of the simd_mode enumerators declared above.
    static const char* const names[] = {
        "Reference",
        "SSE2",
        "AVX",
        "AVX-512",
        "NEON"
    };

    // The unsigned cast also sends negative values to the "Unknown" path.
    unsigned int index = (unsigned int)mode;
    if (index < sizeof(names) / sizeof(names[0])) {
        return names[index];
    }

    return "Unknown";
}
|
||||||
|
|
||||||
|
// Returns a human readable name for the given sample rate conversion algorithm, or "Unknown" if
// the value is not one of the algorithms handled here.
const char* mal_src_algorithm_to_string(mal_src_algorithm algorithm)
{
    const char* pName = "Unknown";

    if (algorithm == mal_src_algorithm_none) {
        pName = "Passthrough";
    } else if (algorithm == mal_src_algorithm_linear) {
        pName = "Linear";
    } else if (algorithm == mal_src_algorithm_sinc) {
        pName = "Sinc";
    }

    return pName;
}
|
||||||
|
|
||||||
|
|
||||||
float g_ChannelRouterProfilingOutputBenchmark[8][48000];
|
float g_ChannelRouterProfilingOutputBenchmark[8][48000];
|
||||||
float g_ChannelRouterProfilingOutput[8][48000];
|
float g_ChannelRouterProfilingOutput[8][48000];
|
||||||
double g_ChannelRouterTime_Reference = 0;
|
double g_ChannelRouterTime_Reference = 0;
|
||||||
@@ -9,7 +43,7 @@ double g_ChannelRouterTime_AVX = 0;
|
|||||||
double g_ChannelRouterTime_AVX512 = 0;
|
double g_ChannelRouterTime_AVX512 = 0;
|
||||||
double g_ChannelRouterTime_NEON = 0;
|
double g_ChannelRouterTime_NEON = 0;
|
||||||
|
|
||||||
mal_sine_wave sineWave;
|
mal_sine_wave g_sineWave;
|
||||||
|
|
||||||
mal_bool32 channel_router_test(mal_uint32 channels, mal_uint64 frameCount, float** ppFramesA, float** ppFramesB)
|
mal_bool32 channel_router_test(mal_uint32 channels, mal_uint64 frameCount, float** ppFramesA, float** ppFramesB)
|
||||||
{
|
{
|
||||||
@@ -32,8 +66,8 @@ mal_uint32 channel_router_on_read(mal_channel_router* pRouter, mal_uint32 frameC
|
|||||||
float** ppSamplesOutF = (float**)ppSamplesOut;
|
float** ppSamplesOutF = (float**)ppSamplesOut;
|
||||||
|
|
||||||
for (mal_uint32 iChannel = 0; iChannel < pRouter->config.channelsIn; ++iChannel) {
|
for (mal_uint32 iChannel = 0; iChannel < pRouter->config.channelsIn; ++iChannel) {
|
||||||
mal_sine_wave_init(1/(iChannel+1), 400, 48000, &sineWave);
|
mal_sine_wave_init(1/(iChannel+1), 400, 48000, &g_sineWave);
|
||||||
mal_sine_wave_read(&sineWave, frameCount, ppSamplesOutF[iChannel]);
|
mal_sine_wave_read(&g_sineWave, frameCount, ppSamplesOutF[iChannel]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return frameCount;
|
return frameCount;
|
||||||
@@ -75,7 +109,7 @@ int do_profiling__channel_routing()
|
|||||||
ppOutBenchmark[i] = (void*)g_ChannelRouterProfilingOutputBenchmark[i];
|
ppOutBenchmark[i] = (void*)g_ChannelRouterProfilingOutputBenchmark[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
mal_sine_wave_init(1, 400, 48000, &sineWave);
|
mal_sine_wave_init(1, 400, 48000, &g_sineWave);
|
||||||
mal_uint64 framesRead = mal_channel_router_read_deinterleaved(&router, framesToRead, ppOutBenchmark, NULL);
|
mal_uint64 framesRead = mal_channel_router_read_deinterleaved(&router, framesToRead, ppOutBenchmark, NULL);
|
||||||
if (framesRead != framesToRead) {
|
if (framesRead != framesToRead) {
|
||||||
printf("Channel Router: An error occurred while reading benchmark data.\n");
|
printf("Channel Router: An error occurred while reading benchmark data.\n");
|
||||||
@@ -183,9 +217,263 @@ int do_profiling__channel_routing()
|
|||||||
printf("NEON: %.4fms (%.2f%%)\n", g_ChannelRouterTime_NEON*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_NEON*100);
|
printf("NEON: %.4fms (%.2f%%)\n", g_ChannelRouterTime_NEON*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_NEON*100);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 1;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// SRC
|
||||||
|
//
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
float* pFrameData[MAL_MAX_CHANNELS];
|
||||||
|
mal_uint64 frameCount;
|
||||||
|
mal_uint32 channels;
|
||||||
|
double timeTaken;
|
||||||
|
} src_reference_data;
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
float* pFrameData[MAL_MAX_CHANNELS];
|
||||||
|
mal_uint64 frameCount;
|
||||||
|
mal_uint64 iNextFrame;
|
||||||
|
mal_uint32 channels;
|
||||||
|
} src_data;
|
||||||
|
|
||||||
|
// Read callback for the sample rate converter. Supplies deinterleaved input samples from the
// src_data object passed in through pUserData.
//
// pSRC         - the converter requesting data; its channel count decides how many buffers are filled.
// frameCount   - the number of frames being requested.
// ppSamplesOut - one destination buffer per channel.
// pUserData    - the src_data object to read from.
//
// Returns the number of frames actually written, which is less than frameCount once the input
// has been exhausted.
mal_uint32 do_profiling__src__on_read(mal_src* pSRC, mal_uint32 frameCount, void** ppSamplesOut, void* pUserData)
{
    src_data* pBaseData = (src_data*)pUserData;
    mal_assert(pBaseData != NULL);
    mal_assert(pBaseData->iNextFrame <= pBaseData->frameCount);

    // Clamp the request to what is left in the input.
    mal_uint64 framesToRead = frameCount;
    mal_uint64 framesAvailable = pBaseData->frameCount - pBaseData->iNextFrame;
    if (framesToRead > framesAvailable) {
        framesToRead = framesAvailable;
    }

    if (framesToRead > 0) {
        for (mal_uint32 iChannel = 0; iChannel < pSRC->config.channels; iChannel += 1) {
            // Read from the current cursor position. Without the offset every call would hand
            // back the first frames of the buffer again.
            const float* pChannelIn = pBaseData->pFrameData[iChannel] + pBaseData->iNextFrame;
            mal_copy_memory(ppSamplesOut[iChannel], pChannelIn, (size_t)(framesToRead * sizeof(float)));
        }
    }

    pBaseData->iNextFrame += framesToRead;
    return (mal_uint32)framesToRead;
}
|
||||||
|
|
||||||
|
// Initializes pSRC for profiling with the given rates and algorithm, reading its input from
// pBaseData through do_profiling__src__on_read(). Every SIMD path is disabled except the one
// selected by mode; simd_mode_scalar (or any unrecognized mode) leaves all of them disabled.
//
// Returns the result of mal_src_init(). A message is printed on failure.
mal_result init_src(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sampleRateOut, mal_src_algorithm algorithm, simd_mode mode, mal_src* pSRC)
{
    mal_assert(pBaseData != NULL);
    mal_assert(pSRC != NULL);

    mal_src_config config = mal_src_config_init(sampleRateIn, sampleRateOut, pBaseData->channels, do_profiling__src__on_read, pBaseData);
    config.algorithm = algorithm;

    // An odd window width is used on purpose so that the unaligned section of the SIMD
    // implementations gets tested.
    config.sinc.windowWidth = 17;

    // Enable only the requested instruction set.
    config.noSSE2   = (mode == simd_mode_sse2)   ? MAL_FALSE : MAL_TRUE;
    config.noAVX    = (mode == simd_mode_avx)    ? MAL_FALSE : MAL_TRUE;
    config.noAVX512 = (mode == simd_mode_avx512) ? MAL_FALSE : MAL_TRUE;
    config.noNEON   = (mode == simd_mode_neon)   ? MAL_FALSE : MAL_TRUE;

    mal_result initResult = mal_src_init(&config, pSRC);
    if (initResult != MAL_SUCCESS) {
        printf("Failed to initialize sample rate converter.\n");
    }

    return initResult;
}
|
||||||
|
|
||||||
|
// Runs one sample rate conversion with the given SIMD mode, times it, compares the output
// sample-for-sample against the reference output and prints a one-line report.
//
// pBaseData      - the input data; its read cursor is rewound before converting.
// pReferenceData - output and timing of the reference conversion to compare against.
//
// Returns the result of init_src() cast to int, or -2 if an output buffer could not be allocated.
int do_profiling__src__profile_individual(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sampleRateOut, mal_src_algorithm algorithm, simd_mode mode, src_reference_data* pReferenceData)
{
    mal_assert(pBaseData != NULL);
    mal_assert(pReferenceData != NULL);

    mal_result result = MAL_ERROR;

    // Make sure the base data is moved back to the start.
    pBaseData->iNextFrame = 0;

    mal_src src;
    result = init_src(pBaseData, sampleRateIn, sampleRateOut, algorithm, mode, &src);
    if (result != MAL_SUCCESS) {
        return (int)result;
    }


    // Profiling.
    mal_uint64 sz = pReferenceData->frameCount * sizeof(float);
    mal_assert(sz <= SIZE_MAX);

    float* pFrameData[MAL_MAX_CHANNELS];
    for (mal_uint32 iChannel = 0; iChannel < pBaseData->channels; iChannel += 1) {
        pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)sz, MAL_SIMD_ALIGNMENT);
        if (pFrameData[iChannel] == NULL) {
            printf("Out of memory.\n");

            // Release the buffers that were successfully allocated before this one.
            for (mal_uint32 iAllocated = 0; iAllocated < iChannel; iAllocated += 1) {
                mal_aligned_free(pFrameData[iAllocated]);
            }
            return -2;
        }
        mal_zero_memory(pFrameData[iChannel], (size_t)sz);
    }

    mal_timer timer;
    mal_timer_init(&timer);
    double startTime = mal_timer_get_time_in_seconds(&timer);
    {
        mal_src_read_deinterleaved(&src, pReferenceData->frameCount, (void**)pFrameData, pBaseData);
    }
    double timeTaken = mal_timer_get_time_in_seconds(&timer) - startTime;


    // Correctness test. Only channels that exist in both the reference and the local output
    // are compared, so a buffer that was never allocated is never indexed.
    mal_uint32 channelsToCompare = pReferenceData->channels;
    if (channelsToCompare > pBaseData->channels) {
        channelsToCompare = pBaseData->channels;
    }

    mal_bool32 passed = MAL_TRUE;
    for (mal_uint32 iChannel = 0; iChannel < channelsToCompare; iChannel += 1) {
        // The frame index is 64-bit to match the type of frameCount.
        for (mal_uint64 iFrame = 0; iFrame < pReferenceData->frameCount; iFrame += 1) {
            float s0 = pReferenceData->pFrameData[iChannel][iFrame];
            float s1 = pFrameData[iChannel][iFrame];
            if (s0 != s1) {
                printf("(Channel %u, Sample %llu) %f != %f\n", (unsigned int)iChannel, (unsigned long long)iFrame, s0, s1);
                passed = MAL_FALSE;
            }
        }
    }


    // Print results. The percentage is the reference time relative to this run's time.
    if (passed) {
        printf("  [PASSED] ");
    } else {
        printf("  [FAILED] ");
    }
    printf("%s %u -> %u (%s): %.4fms (%.2f%%)\n", mal_src_algorithm_to_string(algorithm), (unsigned int)sampleRateIn, (unsigned int)sampleRateOut, simd_mode_to_string(mode), timeTaken*1000, pReferenceData->timeTaken/timeTaken*100);


    for (mal_uint32 iChannel = 0; iChannel < pBaseData->channels; iChannel += 1) {
        mal_aligned_free(pFrameData[iChannel]);
    }

    return (int)result;
}
|
||||||
|
|
||||||
|
// Profiles one (sampleRateIn -> sampleRateOut, algorithm) combination. A reference conversion is
// generated with the scalar implementation first, then every SIMD mode reported as available is
// profiled and compared against it.
//
// Returns 0 on success, -1 if the output frame count could not be calculated, -2 if a reference
// buffer could not be allocated, or the init_src() result cast to int if the reference converter
// failed to initialize. Buffers allocated by this function are released on every path.
int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sampleRateOut, mal_src_algorithm algorithm)
{
    mal_assert(pBaseData != NULL);

    // Make sure the base data is back at the start.
    pBaseData->iNextFrame = 0;

    src_reference_data referenceData;
    mal_zero_object(&referenceData);
    referenceData.channels = pBaseData->channels;

    // The first thing to do is to perform a sample rate conversion using the scalar/reference implementation. This reference is used to compare
    // the results of the optimized implementation.
    referenceData.frameCount = mal_calculate_frame_count_after_src(sampleRateOut, sampleRateIn, pBaseData->frameCount);
    if (referenceData.frameCount == 0) {
        printf("Failed to calculate output frame count.\n");
        return -1;
    }

    mal_uint64 sz = referenceData.frameCount * sizeof(float);
    mal_assert(sz <= SIZE_MAX);

    for (mal_uint32 iChannel = 0; iChannel < referenceData.channels; iChannel += 1) {
        referenceData.pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)sz, MAL_SIMD_ALIGNMENT);
        if (referenceData.pFrameData[iChannel] == NULL) {
            printf("Out of memory.\n");

            // Release the buffers that were successfully allocated before this one.
            for (mal_uint32 iAllocated = 0; iAllocated < iChannel; iAllocated += 1) {
                mal_aligned_free(referenceData.pFrameData[iAllocated]);
            }
            return -2;
        }
        mal_zero_memory(referenceData.pFrameData[iChannel], (size_t)sz);
    }


    // Generate the reference data.
    mal_src src;
    mal_result result = init_src(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_scalar, &src);
    if (result != MAL_SUCCESS) {
        // Do not leak the reference buffers when the converter cannot be created.
        for (mal_uint32 iChannel = 0; iChannel < referenceData.channels; iChannel += 1) {
            mal_aligned_free(referenceData.pFrameData[iChannel]);
        }
        return (int)result;
    }

    mal_timer timer;
    mal_timer_init(&timer);
    double startTime = mal_timer_get_time_in_seconds(&timer);
    {
        mal_src_read_deinterleaved(&src, referenceData.frameCount, (void**)referenceData.pFrameData, pBaseData);
    }
    referenceData.timeTaken = mal_timer_get_time_in_seconds(&timer) - startTime;


    // Now that we have the reference data to compare against we can go ahead and measure the SIMD optimizations.
    if (mal_has_sse2()) {
        do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData);
    }
    if (mal_has_avx()) {
        do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx, &referenceData);
    }
    if (mal_has_avx512f()) {
        do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx512, &referenceData);
    }
    if (mal_has_neon()) {
        do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_neon, &referenceData);
    }


    for (mal_uint32 iChannel = 0; iChannel < referenceData.channels; iChannel += 1) {
        mal_aligned_free(referenceData.pFrameData[iChannel]);
    }

    return 0;
}
|
||||||
|
|
||||||
|
int do_profiling__src()
|
||||||
|
{
|
||||||
|
printf("Sample Rate Conversion\n");
|
||||||
|
printf("======================\n");
|
||||||
|
|
||||||
|
// Set up base data.
|
||||||
|
src_data baseData;
|
||||||
|
mal_zero_object(&baseData);
|
||||||
|
baseData.channels = 8;
|
||||||
|
baseData.frameCount = 10000;
|
||||||
|
for (mal_uint32 iChannel = 0; iChannel < baseData.channels; ++iChannel) {
|
||||||
|
baseData.pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)(baseData.frameCount * sizeof(float)), MAL_SIMD_ALIGNMENT);
|
||||||
|
if (baseData.pFrameData[iChannel] == NULL) {
|
||||||
|
printf("Out of memory.\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
mal_sine_wave sineWave;
|
||||||
|
mal_sine_wave_init(1.0f, 400 + (iChannel*50), 48000, &sineWave);
|
||||||
|
mal_sine_wave_read(&sineWave, baseData.frameCount, baseData.pFrameData[iChannel]);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Upsampling.
|
||||||
|
do_profiling__src__profile_set(&baseData, 44100, 48000, mal_src_algorithm_sinc);
|
||||||
|
|
||||||
|
// Downsampling.
|
||||||
|
do_profiling__src__profile_set(&baseData, 48000, 44100, mal_src_algorithm_sinc);
|
||||||
|
|
||||||
|
|
||||||
|
for (mal_uint32 iChannel = 0; iChannel < baseData.channels; iChannel += 1) {
|
||||||
|
mal_aligned_free(baseData.pFrameData[iChannel]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char** argv)
|
int main(int argc, char** argv)
|
||||||
{
|
{
|
||||||
(void)argc;
|
(void)argc;
|
||||||
@@ -197,19 +485,16 @@ int main(int argc, char** argv)
|
|||||||
} else {
|
} else {
|
||||||
printf("Has SSE: NO\n");
|
printf("Has SSE: NO\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mal_has_avx()) {
|
if (mal_has_avx()) {
|
||||||
printf("Has AVX: YES\n");
|
printf("Has AVX: YES\n");
|
||||||
} else {
|
} else {
|
||||||
printf("Has AVX: NO\n");
|
printf("Has AVX: NO\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mal_has_avx512f()) {
|
if (mal_has_avx512f()) {
|
||||||
printf("Has AVX-512F: YES\n");
|
printf("Has AVX-512F: YES\n");
|
||||||
} else {
|
} else {
|
||||||
printf("Has AVX-512F: NO\n");
|
printf("Has AVX-512F: NO\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mal_has_neon()) {
|
if (mal_has_neon()) {
|
||||||
printf("Has NEON: YES\n");
|
printf("Has NEON: YES\n");
|
||||||
} else {
|
} else {
|
||||||
@@ -221,7 +506,14 @@ int main(int argc, char** argv)
|
|||||||
|
|
||||||
// Channel routing.
|
// Channel routing.
|
||||||
do_profiling__channel_routing();
|
do_profiling__channel_routing();
|
||||||
|
printf("\n\n");
|
||||||
|
|
||||||
|
// Sample rate conversion.
|
||||||
|
do_profiling__src();
|
||||||
|
printf("\n\n");
|
||||||
|
|
||||||
|
|
||||||
|
printf("Press any key to quit...\n");
|
||||||
getchar();
|
getchar();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
Reference in New Issue
Block a user