mirror of
https://github.com/mackron/miniaudio.git
synced 2026-04-22 00:06:59 +02:00
Drop support for AVX and replace with AVX2.
Rationale for this is that it just makes things simpler for integer operations.
This commit is contained in:
@@ -207,8 +207,8 @@
|
|||||||
// #define MAL_NO_SSE2
|
// #define MAL_NO_SSE2
|
||||||
// Disables SSE2 optimizations.
|
// Disables SSE2 optimizations.
|
||||||
//
|
//
|
||||||
// #define MAL_NO_AVX
|
// #define MAL_NO_AVX2
|
||||||
// Disables AVX optimizations.
|
// Disables AVX2 optimizations.
|
||||||
//
|
//
|
||||||
// #define MAL_NO_AVX512
|
// #define MAL_NO_AVX512
|
||||||
// Disables AVX-512 optimizations.
|
// Disables AVX-512 optimizations.
|
||||||
@@ -813,7 +813,7 @@ typedef struct
|
|||||||
mal_stream_format streamFormatOut;
|
mal_stream_format streamFormatOut;
|
||||||
mal_dither_mode ditherMode;
|
mal_dither_mode ditherMode;
|
||||||
mal_bool32 noSSE2 : 1;
|
mal_bool32 noSSE2 : 1;
|
||||||
mal_bool32 noAVX : 1;
|
mal_bool32 noAVX2 : 1;
|
||||||
mal_bool32 noAVX512 : 1;
|
mal_bool32 noAVX512 : 1;
|
||||||
mal_bool32 noNEON : 1;
|
mal_bool32 noNEON : 1;
|
||||||
mal_format_converter_read_proc onRead;
|
mal_format_converter_read_proc onRead;
|
||||||
@@ -825,7 +825,7 @@ struct mal_format_converter
|
|||||||
{
|
{
|
||||||
mal_format_converter_config config;
|
mal_format_converter_config config;
|
||||||
mal_bool32 useSSE2 : 1;
|
mal_bool32 useSSE2 : 1;
|
||||||
mal_bool32 useAVX : 1;
|
mal_bool32 useAVX2 : 1;
|
||||||
mal_bool32 useAVX512 : 1;
|
mal_bool32 useAVX512 : 1;
|
||||||
mal_bool32 useNEON : 1;
|
mal_bool32 useNEON : 1;
|
||||||
void (* onConvertPCM)(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode);
|
void (* onConvertPCM)(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode);
|
||||||
@@ -846,7 +846,7 @@ typedef struct
|
|||||||
mal_channel channelMapOut[MAL_MAX_CHANNELS];
|
mal_channel channelMapOut[MAL_MAX_CHANNELS];
|
||||||
mal_channel_mix_mode mixingMode;
|
mal_channel_mix_mode mixingMode;
|
||||||
mal_bool32 noSSE2 : 1;
|
mal_bool32 noSSE2 : 1;
|
||||||
mal_bool32 noAVX : 1;
|
mal_bool32 noAVX2 : 1;
|
||||||
mal_bool32 noAVX512 : 1;
|
mal_bool32 noAVX512 : 1;
|
||||||
mal_bool32 noNEON : 1;
|
mal_bool32 noNEON : 1;
|
||||||
mal_channel_router_read_deinterleaved_proc onReadDeinterleaved;
|
mal_channel_router_read_deinterleaved_proc onReadDeinterleaved;
|
||||||
@@ -859,7 +859,7 @@ struct mal_channel_router
|
|||||||
mal_bool32 isPassthrough : 1;
|
mal_bool32 isPassthrough : 1;
|
||||||
mal_bool32 isSimpleShuffle : 1;
|
mal_bool32 isSimpleShuffle : 1;
|
||||||
mal_bool32 useSSE2 : 1;
|
mal_bool32 useSSE2 : 1;
|
||||||
mal_bool32 useAVX : 1;
|
mal_bool32 useAVX2 : 1;
|
||||||
mal_bool32 useAVX512 : 1;
|
mal_bool32 useAVX512 : 1;
|
||||||
mal_bool32 useNEON : 1;
|
mal_bool32 useNEON : 1;
|
||||||
mal_uint8 shuffleTable[MAL_MAX_CHANNELS];
|
mal_uint8 shuffleTable[MAL_MAX_CHANNELS];
|
||||||
@@ -894,7 +894,7 @@ typedef struct
|
|||||||
mal_uint32 channels;
|
mal_uint32 channels;
|
||||||
mal_src_algorithm algorithm;
|
mal_src_algorithm algorithm;
|
||||||
mal_bool32 noSSE2 : 1;
|
mal_bool32 noSSE2 : 1;
|
||||||
mal_bool32 noAVX : 1;
|
mal_bool32 noAVX2 : 1;
|
||||||
mal_bool32 noAVX512 : 1;
|
mal_bool32 noAVX512 : 1;
|
||||||
mal_bool32 noNEON : 1;
|
mal_bool32 noNEON : 1;
|
||||||
mal_src_read_deinterleaved_proc onReadDeinterleaved;
|
mal_src_read_deinterleaved_proc onReadDeinterleaved;
|
||||||
@@ -932,7 +932,7 @@ MAL_ALIGNED_STRUCT(MAL_SIMD_ALIGNMENT) mal_src
|
|||||||
|
|
||||||
mal_src_config config;
|
mal_src_config config;
|
||||||
mal_bool32 useSSE2 : 1;
|
mal_bool32 useSSE2 : 1;
|
||||||
mal_bool32 useAVX : 1;
|
mal_bool32 useAVX2 : 1;
|
||||||
mal_bool32 useAVX512 : 1;
|
mal_bool32 useAVX512 : 1;
|
||||||
mal_bool32 useNEON : 1;
|
mal_bool32 useNEON : 1;
|
||||||
};
|
};
|
||||||
@@ -955,7 +955,7 @@ typedef struct
|
|||||||
mal_src_algorithm srcAlgorithm;
|
mal_src_algorithm srcAlgorithm;
|
||||||
mal_bool32 allowDynamicSampleRate;
|
mal_bool32 allowDynamicSampleRate;
|
||||||
mal_bool32 noSSE2 : 1;
|
mal_bool32 noSSE2 : 1;
|
||||||
mal_bool32 noAVX : 1;
|
mal_bool32 noAVX2 : 1;
|
||||||
mal_bool32 noAVX512 : 1;
|
mal_bool32 noAVX512 : 1;
|
||||||
mal_bool32 noNEON : 1;
|
mal_bool32 noNEON : 1;
|
||||||
mal_dsp_read_proc onRead;
|
mal_dsp_read_proc onRead;
|
||||||
@@ -2485,8 +2485,11 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
|
|||||||
#if !defined(MAL_NO_SSE2) // Assume all MSVC compilers support SSE2 intrinsics.
|
#if !defined(MAL_NO_SSE2) // Assume all MSVC compilers support SSE2 intrinsics.
|
||||||
#define MAL_SUPPORT_SSE2
|
#define MAL_SUPPORT_SSE2
|
||||||
#endif
|
#endif
|
||||||
#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
|
//#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
|
||||||
#define MAL_SUPPORT_AVX
|
// #define MAL_SUPPORT_AVX
|
||||||
|
//#endif
|
||||||
|
#if _MSC_VER >= 1700 && !defined(MAL_NO_AVX2) // 2012
|
||||||
|
#define MAL_SUPPORT_AVX2
|
||||||
#endif
|
#endif
|
||||||
#if _MSC_VER >= 1910 && !defined(MAL_NO_AVX512) // 2017
|
#if _MSC_VER >= 1910 && !defined(MAL_NO_AVX512) // 2017
|
||||||
#define MAL_SUPPORT_AVX512
|
#define MAL_SUPPORT_AVX512
|
||||||
@@ -2496,8 +2499,11 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
|
|||||||
#if defined(__SSE2__) && !defined(MAL_NO_SSE2)
|
#if defined(__SSE2__) && !defined(MAL_NO_SSE2)
|
||||||
#define MAL_SUPPORT_SSE2
|
#define MAL_SUPPORT_SSE2
|
||||||
#endif
|
#endif
|
||||||
#if defined(__AVX__) && !defined(MAL_NO_AVX)
|
//#if defined(__AVX__) && !defined(MAL_NO_AVX)
|
||||||
#define MAL_SUPPORT_AVX
|
// #define MAL_SUPPORT_AVX
|
||||||
|
//#endif
|
||||||
|
#if defined(__AVX2__) && !defined(MAL_NO_AVX2)
|
||||||
|
#define MAL_SUPPORT_AVX2
|
||||||
#endif
|
#endif
|
||||||
#if defined(__AVX512F__) && !defined(MAL_NO_AVX512)
|
#if defined(__AVX512F__) && !defined(MAL_NO_AVX512)
|
||||||
#define MAL_SUPPORT_AVX512
|
#define MAL_SUPPORT_AVX512
|
||||||
@@ -2509,8 +2515,11 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
|
|||||||
#if !defined(MAL_SUPPORT_SSE2) && !defined(MAL_NO_SSE2) && __has_include(<emmintrin.h>)
|
#if !defined(MAL_SUPPORT_SSE2) && !defined(MAL_NO_SSE2) && __has_include(<emmintrin.h>)
|
||||||
#define MAL_SUPPORT_SSE2
|
#define MAL_SUPPORT_SSE2
|
||||||
#endif
|
#endif
|
||||||
#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
|
//#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
|
||||||
#define MAL_SUPPORT_AVX
|
// #define MAL_SUPPORT_AVX
|
||||||
|
//#endif
|
||||||
|
#if !defined(MAL_SUPPORT_AVX2) && !defined(MAL_NO_AVX2) && __has_include(<immintrin.h>)
|
||||||
|
#define MAL_SUPPORT_AVX2
|
||||||
#endif
|
#endif
|
||||||
#if !defined(MAL_SUPPORT_AVX512) && !defined(MAL_NO_AVX512) && __has_include(<zmmintrin.h>)
|
#if !defined(MAL_SUPPORT_AVX512) && !defined(MAL_NO_AVX512) && __has_include(<zmmintrin.h>)
|
||||||
#define MAL_SUPPORT_AVX512
|
#define MAL_SUPPORT_AVX512
|
||||||
@@ -2519,7 +2528,7 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
|
|||||||
|
|
||||||
#if defined(MAL_SUPPORT_AVX512)
|
#if defined(MAL_SUPPORT_AVX512)
|
||||||
#include <immintrin.h> // Not a mistake. Intentionally including <immintrin.h> instead of <zmmintrin.h> because otherwise the compiler will complain.
|
#include <immintrin.h> // Not a mistake. Intentionally including <immintrin.h> instead of <zmmintrin.h> because otherwise the compiler will complain.
|
||||||
#elif defined(MAL_SUPPORT_AVX)
|
#elif defined(MAL_SUPPORT_AVX2) || defined(MAL_SUPPORT_AVX)
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#elif defined(MAL_SUPPORT_SSE2)
|
#elif defined(MAL_SUPPORT_SSE2)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
@@ -2617,6 +2626,7 @@ static MAL_INLINE mal_bool32 mal_has_sse2()
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
static MAL_INLINE mal_bool32 mal_has_avx()
|
static MAL_INLINE mal_bool32 mal_has_avx()
|
||||||
{
|
{
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX)
|
||||||
@@ -2649,6 +2659,42 @@ static MAL_INLINE mal_bool32 mal_has_avx()
|
|||||||
return MAL_FALSE; // No compiler support.
|
return MAL_FALSE; // No compiler support.
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static MAL_INLINE mal_bool32 mal_has_avx2()
|
||||||
|
{
|
||||||
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
|
#if (defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX2)
|
||||||
|
#if defined(_AVX2_) || defined(__AVX2__)
|
||||||
|
return MAL_TRUE; // If the compiler is allowed to freely generate AVX2 code we can assume support.
|
||||||
|
#else
|
||||||
|
// AVX requires both CPU and OS support.
|
||||||
|
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
|
||||||
|
return MAL_FALSE;
|
||||||
|
#else
|
||||||
|
int info1[4];
|
||||||
|
int info7[4];
|
||||||
|
mal_cpuid(info1, 1);
|
||||||
|
mal_cpuid(info7, 7);
|
||||||
|
if (((info1[2] & (1 << 27)) != 0) && ((info7[1] & (1 << 5)) != 0)) {
|
||||||
|
mal_uint64 xrc = mal_xgetbv(0);
|
||||||
|
if ((xrc & 0x06) == 0x06) {
|
||||||
|
return MAL_TRUE;
|
||||||
|
} else {
|
||||||
|
return MAL_FALSE;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return MAL_FALSE;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
return MAL_FALSE; // AVX is only supported on x86 and x64 architectures.
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
return MAL_FALSE; // No compiler support.
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static MAL_INLINE mal_bool32 mal_has_avx512f()
|
static MAL_INLINE mal_bool32 mal_has_avx512f()
|
||||||
{
|
{
|
||||||
@@ -2661,9 +2707,11 @@ static MAL_INLINE mal_bool32 mal_has_avx512f()
|
|||||||
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
|
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
|
||||||
return MAL_FALSE;
|
return MAL_FALSE;
|
||||||
#else
|
#else
|
||||||
int info[4];
|
int info1[4];
|
||||||
mal_cpuid(info, 1);
|
int info7[4];
|
||||||
if (((info[2] & (1 << 27)) != 0) && ((info[1] & (1 << 16)) != 0)) {
|
mal_cpuid(info1, 1);
|
||||||
|
mal_cpuid(info7, 7);
|
||||||
|
if (((info1[2] & (1 << 27)) != 0) && ((info7[1] & (1 << 16)) != 0)) {
|
||||||
mal_uint64 xrc = mal_xgetbv(0);
|
mal_uint64 xrc = mal_xgetbv(0);
|
||||||
if ((xrc & 0xE6) == 0xE6) {
|
if ((xrc & 0xE6) == 0xE6) {
|
||||||
return MAL_TRUE;
|
return MAL_TRUE;
|
||||||
@@ -3223,8 +3271,8 @@ static MAL_INLINE __m128 mal_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a)
|
|||||||
return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a));
|
return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
static MAL_INLINE __m256 mal_mix_f32_fast__avx(__m256 x, __m256 y, __m256 a)
|
static MAL_INLINE __m256 mal_mix_f32_fast__avx2(__m256 x, __m256 y, __m256 a)
|
||||||
{
|
{
|
||||||
return _mm256_add_ps(x, _mm256_mul_ps(_mm256_sub_ps(y, x), a));
|
return _mm256_add_ps(x, _mm256_mul_ps(_mm256_sub_ps(y, x), a));
|
||||||
}
|
}
|
||||||
@@ -17288,8 +17336,8 @@ void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_u8_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_u8_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17346,8 +17394,8 @@ void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_u8_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_u8_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17402,8 +17450,8 @@ void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_u8_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_u8_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17459,13 +17507,13 @@ void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_u8_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_u8_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_SSE2)
|
#if defined(MAL_SUPPORT_AVX512)
|
||||||
void mal_pcm_u8_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_u8_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
@@ -17611,8 +17659,8 @@ void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s16_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s16_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17673,8 +17721,8 @@ void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s16_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s16_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17726,8 +17774,8 @@ void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s16_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s16_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17791,8 +17839,8 @@ void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s16_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s16_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17921,8 +17969,8 @@ void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s24_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s24_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -17992,8 +18040,8 @@ void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s24_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s24_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18053,8 +18101,8 @@ void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s24_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s24_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18118,8 +18166,8 @@ void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s24_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s24_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18255,8 +18303,8 @@ void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s32_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18326,8 +18374,8 @@ void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s32_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18382,8 +18430,8 @@ void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s32_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18453,8 +18501,8 @@ void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_s32_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_s32_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
|
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18576,8 +18624,8 @@ void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_f32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_f32_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
|
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -18775,8 +18823,8 @@ void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_f32_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_int16* dst_s16 = (mal_int16*)dst;
|
mal_int16* dst_s16 = (mal_int16*)dst;
|
||||||
const float* src_f32 = (const float*)src;
|
const float* src_f32 = (const float*)src;
|
||||||
@@ -18790,7 +18838,7 @@ void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
|
|
||||||
mal_uint64 i = 0;
|
mal_uint64 i = 0;
|
||||||
|
|
||||||
// AVX. AVX allows us to output 16 s16's at a time which means our loop is unrolled 16 times.
|
// AVX2. AVX2 allows us to output 16 s16's at a time which means our loop is unrolled 16 times.
|
||||||
mal_uint64 count16 = count >> 4;
|
mal_uint64 count16 = count >> 4;
|
||||||
for (mal_uint64 i16 = 0; i16 < count16; i16 += 1) {
|
for (mal_uint64 i16 = 0; i16 < count16; i16 += 1) {
|
||||||
__m256 d0;
|
__m256 d0;
|
||||||
@@ -18851,7 +18899,7 @@ void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
x0 = _mm256_mul_ps(x0, _mm256_set1_ps(32767.0f));
|
x0 = _mm256_mul_ps(x0, _mm256_set1_ps(32767.0f));
|
||||||
x1 = _mm256_mul_ps(x1, _mm256_set1_ps(32767.0f));
|
x1 = _mm256_mul_ps(x1, _mm256_set1_ps(32767.0f));
|
||||||
|
|
||||||
// Computing the final result is a little more complicated for AVX than SSE.
|
// Computing the final result is a little more complicated for AVX2 than SSE2.
|
||||||
__m256i i0 = _mm256_cvttps_epi32(x0);
|
__m256i i0 = _mm256_cvttps_epi32(x0);
|
||||||
__m256i i1 = _mm256_cvttps_epi32(x1);
|
__m256i i1 = _mm256_cvttps_epi32(x1);
|
||||||
__m256i p0 = _mm256_permute2x128_si256(i0, i1, 32);
|
__m256i p0 = _mm256_permute2x128_si256(i0, i1, 32);
|
||||||
@@ -18878,7 +18926,7 @@ void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_d
|
|||||||
void mal_pcm_f32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_f32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
// TODO: Convert this from AVX to AVX-512.
|
// TODO: Convert this from AVX to AVX-512.
|
||||||
mal_pcm_f32_to_s16__avx(dst, src, count, ditherMode);
|
mal_pcm_f32_to_s16__avx2(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_NEON)
|
#if defined(MAL_SUPPORT_NEON)
|
||||||
@@ -18938,8 +18986,8 @@ void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_f32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_f32_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
|
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -19004,8 +19052,8 @@ void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
|
|||||||
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void mal_pcm_f32_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
void mal_pcm_f32_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
|
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
|
||||||
}
|
}
|
||||||
@@ -19115,7 +19163,7 @@ mal_result mal_format_converter_init(const mal_format_converter_config* pConfig,
|
|||||||
|
|
||||||
// SIMD
|
// SIMD
|
||||||
pConverter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
|
pConverter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
|
||||||
pConverter->useAVX = mal_has_avx() && !pConfig->noAVX;
|
pConverter->useAVX2 = mal_has_avx2() && !pConfig->noAVX2;
|
||||||
pConverter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
|
pConverter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
|
||||||
pConverter->useNEON = mal_has_neon() && !pConfig->noNEON;
|
pConverter->useNEON = mal_has_neon() && !pConfig->noNEON;
|
||||||
|
|
||||||
@@ -19764,7 +19812,7 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
|
|||||||
|
|
||||||
// SIMD
|
// SIMD
|
||||||
pRouter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
|
pRouter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
|
||||||
pRouter->useAVX = mal_has_avx() && !pConfig->noAVX;
|
pRouter->useAVX2 = mal_has_avx2() && !pConfig->noAVX2;
|
||||||
pRouter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
|
pRouter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
|
||||||
pRouter->useNEON = mal_has_neon() && !pConfig->noNEON;
|
pRouter->useNEON = mal_has_neon() && !pConfig->noNEON;
|
||||||
|
|
||||||
@@ -19948,9 +19996,9 @@ static MAL_INLINE mal_bool32 mal_channel_router__can_use_sse2(mal_channel_router
|
|||||||
return pRouter->useSSE2 && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
|
return pRouter->useSSE2 && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
|
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx2(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
|
||||||
{
|
{
|
||||||
return pRouter->useAVX && (((mal_uintptr)pSamplesOut & 31) == 0) && (((mal_uintptr)pSamplesIn & 31) == 0);
|
return pRouter->useAVX2 && (((mal_uintptr)pSamplesOut & 31) == 0) && (((mal_uintptr)pSamplesIn & 31) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx512(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
|
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx512(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
|
||||||
@@ -20017,8 +20065,8 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
if (mal_channel_router__can_use_avx(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
|
if (mal_channel_router__can_use_avx2(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
|
||||||
__m256 weight = _mm256_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
|
__m256 weight = _mm256_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
|
||||||
|
|
||||||
mal_uint64 frameCount8 = frameCount/8;
|
mal_uint64 frameCount8 = frameCount/8;
|
||||||
@@ -20268,7 +20316,7 @@ mal_result mal_src_init(const mal_src_config* pConfig, mal_src* pSRC)
|
|||||||
|
|
||||||
// SIMD
|
// SIMD
|
||||||
pSRC->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
|
pSRC->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
|
||||||
pSRC->useAVX = mal_has_avx() && !pConfig->noAVX;
|
pSRC->useAVX2 = mal_has_avx2() && !pConfig->noAVX2;
|
||||||
pSRC->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
|
pSRC->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
|
||||||
pSRC->useNEON = mal_has_neon() && !pConfig->noNEON;
|
pSRC->useNEON = mal_has_neon() && !pConfig->noNEON;
|
||||||
|
|
||||||
@@ -20682,20 +20730,20 @@ static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src*
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
static MAL_INLINE __m256 mal_fabsf_avx(__m256 x)
|
static MAL_INLINE __m256 mal_fabsf_avx2(__m256 x)
|
||||||
{
|
{
|
||||||
return _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)), x);
|
return _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)), x);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx(const mal_src* pSRC, __m256 x)
|
static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx2(const mal_src* pSRC, __m256 x)
|
||||||
{
|
{
|
||||||
//__m256 windowWidth256 = _mm256_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
|
//__m256 windowWidth256 = _mm256_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
|
||||||
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
||||||
//__m256 one = _mm256_set1_ps(1);
|
//__m256 one = _mm256_set1_ps(1);
|
||||||
|
|
||||||
__m256 xabs = mal_fabsf_avx(x);
|
__m256 xabs = mal_fabsf_avx2(x);
|
||||||
|
|
||||||
// if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
|
// if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
|
||||||
//__m256 xcmp = _mm256_cmp_ps(windowWidth256, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps.
|
//__m256 xcmp = _mm256_cmp_ps(windowWidth256, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps.
|
||||||
@@ -20731,7 +20779,7 @@ static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx(const mal_src*
|
|||||||
pSRC->sinc.table[ixabsv[0]+1]
|
pSRC->sinc.table[ixabsv[0]+1]
|
||||||
);
|
);
|
||||||
|
|
||||||
__m256 r = mal_mix_f32_fast__avx(lo, hi, a);
|
__m256 r = mal_mix_f32_fast__avx2(lo, hi, a);
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
@@ -20799,8 +20847,8 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
if (pSRC->useAVX) {
|
if (pSRC->useAVX2) {
|
||||||
windowWidthSIMD = (windowWidthSIMD + 3) & ~(3);
|
windowWidthSIMD = (windowWidthSIMD + 3) & ~(3);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -20866,8 +20914,8 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
|||||||
windowSamples[i] = pSRC->sinc.input[iChannel][iTimeIn + i];
|
windowSamples[i] = pSRC->sinc.input[iChannel][iTimeIn + i];
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
if (pSRC->useAVX) {
|
if (pSRC->useAVX2) {
|
||||||
__m256i ixabs[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
__m256i ixabs[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
||||||
__m256 a[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
__m256 a[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
|
||||||
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
|
||||||
@@ -20880,7 +20928,7 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
|||||||
__m256 w = *((__m256*)iWindowF + iWindow8);
|
__m256 w = *((__m256*)iWindowF + iWindow8);
|
||||||
|
|
||||||
__m256 xabs = _mm256_sub_ps(t, w);
|
__m256 xabs = _mm256_sub_ps(t, w);
|
||||||
xabs = mal_fabsf_avx(xabs);
|
xabs = mal_fabsf_avx2(xabs);
|
||||||
xabs = _mm256_mul_ps(xabs, resolution256);
|
xabs = _mm256_mul_ps(xabs, resolution256);
|
||||||
|
|
||||||
ixabs[iWindow8] = _mm256_cvttps_epi32(xabs);
|
ixabs[iWindow8] = _mm256_cvttps_epi32(xabs);
|
||||||
@@ -20913,7 +20961,7 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
|
|||||||
);
|
);
|
||||||
|
|
||||||
__m256 s = *((__m256*)windowSamples + iWindow8);
|
__m256 s = *((__m256*)windowSamples + iWindow8);
|
||||||
r = _mm256_add_ps(r, _mm256_mul_ps(s, mal_mix_f32_fast__avx(lo, hi, a[iWindow8])));
|
r = _mm256_add_ps(r, _mm256_mul_ps(s, mal_mix_f32_fast__avx2(lo, hi, a[iWindow8])));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Horizontal add.
|
// Horizontal add.
|
||||||
@@ -21345,7 +21393,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
|
|||||||
);
|
);
|
||||||
preFormatConverterConfig.ditherMode = pConfig->ditherMode;
|
preFormatConverterConfig.ditherMode = pConfig->ditherMode;
|
||||||
preFormatConverterConfig.noSSE2 = pConfig->noSSE2;
|
preFormatConverterConfig.noSSE2 = pConfig->noSSE2;
|
||||||
preFormatConverterConfig.noAVX = pConfig->noAVX;
|
preFormatConverterConfig.noAVX2 = pConfig->noAVX2;
|
||||||
preFormatConverterConfig.noAVX512 = pConfig->noAVX512;
|
preFormatConverterConfig.noAVX512 = pConfig->noAVX512;
|
||||||
preFormatConverterConfig.noNEON = pConfig->noNEON;
|
preFormatConverterConfig.noNEON = pConfig->noNEON;
|
||||||
|
|
||||||
@@ -21364,7 +21412,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
|
|||||||
postFormatConverterConfig.channels = pConfig->channelsOut;
|
postFormatConverterConfig.channels = pConfig->channelsOut;
|
||||||
postFormatConverterConfig.ditherMode = pConfig->ditherMode;
|
postFormatConverterConfig.ditherMode = pConfig->ditherMode;
|
||||||
postFormatConverterConfig.noSSE2 = pConfig->noSSE2;
|
postFormatConverterConfig.noSSE2 = pConfig->noSSE2;
|
||||||
postFormatConverterConfig.noAVX = pConfig->noAVX;
|
postFormatConverterConfig.noAVX2 = pConfig->noAVX2;
|
||||||
postFormatConverterConfig.noAVX512 = pConfig->noAVX512;
|
postFormatConverterConfig.noAVX512 = pConfig->noAVX512;
|
||||||
postFormatConverterConfig.noNEON = pConfig->noNEON;
|
postFormatConverterConfig.noNEON = pConfig->noNEON;
|
||||||
if (pDSP->isPreFormatConversionRequired) {
|
if (pDSP->isPreFormatConversionRequired) {
|
||||||
@@ -21391,7 +21439,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
|
|||||||
);
|
);
|
||||||
srcConfig.algorithm = pConfig->srcAlgorithm;
|
srcConfig.algorithm = pConfig->srcAlgorithm;
|
||||||
srcConfig.noSSE2 = pConfig->noSSE2;
|
srcConfig.noSSE2 = pConfig->noSSE2;
|
||||||
srcConfig.noAVX = pConfig->noAVX;
|
srcConfig.noAVX2 = pConfig->noAVX2;
|
||||||
srcConfig.noAVX512 = pConfig->noAVX512;
|
srcConfig.noAVX512 = pConfig->noAVX512;
|
||||||
srcConfig.noNEON = pConfig->noNEON;
|
srcConfig.noNEON = pConfig->noNEON;
|
||||||
mal_copy_memory(&srcConfig.sinc, &pConfig->sinc, sizeof(pConfig->sinc));
|
mal_copy_memory(&srcConfig.sinc, &pConfig->sinc, sizeof(pConfig->sinc));
|
||||||
@@ -21413,7 +21461,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
|
|||||||
mal_dsp__channel_router_on_read_deinterleaved,
|
mal_dsp__channel_router_on_read_deinterleaved,
|
||||||
pDSP);
|
pDSP);
|
||||||
routerConfig.noSSE2 = pConfig->noSSE2;
|
routerConfig.noSSE2 = pConfig->noSSE2;
|
||||||
routerConfig.noAVX = pConfig->noAVX;
|
routerConfig.noAVX2 = pConfig->noAVX2;
|
||||||
routerConfig.noAVX512 = pConfig->noAVX512;
|
routerConfig.noAVX512 = pConfig->noAVX512;
|
||||||
routerConfig.noNEON = pConfig->noNEON;
|
routerConfig.noNEON = pConfig->noNEON;
|
||||||
|
|
||||||
@@ -21848,7 +21896,7 @@ float mal_calculate_cpu_speed_factor()
|
|||||||
// indication on the speed of the system, but SIMD is used more heavily in the DSP pipeline than in the general case which may make
|
// indication on the speed of the system, but SIMD is used more heavily in the DSP pipeline than in the general case which may make
|
||||||
// the results a little less realistic.
|
// the results a little less realistic.
|
||||||
config.noSSE2 = MAL_TRUE;
|
config.noSSE2 = MAL_TRUE;
|
||||||
config.noAVX = MAL_TRUE;
|
config.noAVX2 = MAL_TRUE;
|
||||||
config.noAVX512 = MAL_TRUE;
|
config.noAVX512 = MAL_TRUE;
|
||||||
config.noNEON = MAL_TRUE;
|
config.noNEON = MAL_TRUE;
|
||||||
|
|
||||||
@@ -23414,12 +23462,13 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSineWave, mal_uint64 count, float*
|
|||||||
// as the backend's internal device, and as such results in a pass-through data transmission pipeline.
|
// as the backend's internal device, and as such results in a pass-through data transmission pipeline.
|
||||||
// - Add support for passing in NULL for the device config in mal_device_init(), which uses a default
|
// - Add support for passing in NULL for the device config in mal_device_init(), which uses a default
|
||||||
// config. This requires manually calling mal_device_set_send/recv_callback().
|
// config. This requires manually calling mal_device_set_send/recv_callback().
|
||||||
|
// - Add support for decoding from raw PCM data (mal_decoder_init_raw(), etc.)
|
||||||
// - Make mal_device_init_ex() more robust.
|
// - Make mal_device_init_ex() more robust.
|
||||||
// - Make some APIs more const-correct.
|
// - Make some APIs more const-correct.
|
||||||
// - Fix errors with OpenAL detection.
|
// - Fix errors with OpenAL detection.
|
||||||
// - Fix some memory leaks.
|
// - Fix some memory leaks.
|
||||||
// - Fix a bug with opening decoders from memory.
|
// - Fix a bug with opening decoders from memory.
|
||||||
// - Add support for decoding from raw PCM data (mal_decoder_init_raw(), etc.)
|
// - Early work on SSE2, AVX2 and NEON optimizations.
|
||||||
// - Miscellaneous bug fixes.
|
// - Miscellaneous bug fixes.
|
||||||
// - Documentation updates.
|
// - Documentation updates.
|
||||||
//
|
//
|
||||||
|
|||||||
+51
-49
@@ -5,7 +5,7 @@ typedef enum
|
|||||||
{
|
{
|
||||||
simd_mode_scalar = 0,
|
simd_mode_scalar = 0,
|
||||||
simd_mode_sse2,
|
simd_mode_sse2,
|
||||||
simd_mode_avx,
|
simd_mode_avx2,
|
||||||
simd_mode_avx512,
|
simd_mode_avx512,
|
||||||
simd_mode_neon
|
simd_mode_neon
|
||||||
} simd_mode;
|
} simd_mode;
|
||||||
@@ -14,8 +14,8 @@ const char* simd_mode_to_string(simd_mode mode)
|
|||||||
{
|
{
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case simd_mode_scalar: return "Reference";
|
case simd_mode_scalar: return "Reference";
|
||||||
case simd_mode_sse2: return "SSE2";
|
case simd_mode_sse2: return "SSE2";
|
||||||
case simd_mode_avx: return "AVX";
|
case simd_mode_avx2: return "AVX2";
|
||||||
case simd_mode_avx512: return "AVX-512";
|
case simd_mode_avx512: return "AVX-512";
|
||||||
case simd_mode_neon: return "NEON";
|
case simd_mode_neon: return "NEON";
|
||||||
}
|
}
|
||||||
@@ -266,7 +266,7 @@ void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_fo
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
|
void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
|
||||||
{
|
{
|
||||||
switch (formatIn)
|
switch (formatIn)
|
||||||
@@ -275,10 +275,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
|
|||||||
{
|
{
|
||||||
switch (formatOut)
|
switch (formatOut)
|
||||||
{
|
{
|
||||||
case mal_format_s16: mal_pcm_u8_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s16: mal_pcm_u8_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s24: mal_pcm_u8_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s24: mal_pcm_u8_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s32: mal_pcm_u8_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s32: mal_pcm_u8_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_f32: mal_pcm_u8_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_f32: mal_pcm_u8_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@@ -287,10 +287,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
|
|||||||
{
|
{
|
||||||
switch (formatOut)
|
switch (formatOut)
|
||||||
{
|
{
|
||||||
case mal_format_u8: mal_pcm_s16_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_u8: mal_pcm_s16_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s24: mal_pcm_s16_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s24: mal_pcm_s16_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s32: mal_pcm_s16_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s32: mal_pcm_s16_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_f32: mal_pcm_s16_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_f32: mal_pcm_s16_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@@ -299,10 +299,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
|
|||||||
{
|
{
|
||||||
switch (formatOut)
|
switch (formatOut)
|
||||||
{
|
{
|
||||||
case mal_format_u8: mal_pcm_s24_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_u8: mal_pcm_s24_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s16: mal_pcm_s24_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s16: mal_pcm_s24_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s32: mal_pcm_s24_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s32: mal_pcm_s24_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_f32: mal_pcm_s24_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_f32: mal_pcm_s24_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@@ -311,10 +311,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
|
|||||||
{
|
{
|
||||||
switch (formatOut)
|
switch (formatOut)
|
||||||
{
|
{
|
||||||
case mal_format_u8: mal_pcm_s32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_u8: mal_pcm_s32_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s16: mal_pcm_s32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s16: mal_pcm_s32_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s24: mal_pcm_s32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s24: mal_pcm_s32_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_f32: mal_pcm_s32_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_f32: mal_pcm_s32_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@@ -323,10 +323,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
|
|||||||
{
|
{
|
||||||
switch (formatOut)
|
switch (formatOut)
|
||||||
{
|
{
|
||||||
case mal_format_u8: mal_pcm_f32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_u8: mal_pcm_f32_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s16: mal_pcm_f32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s16: mal_pcm_f32_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s24: mal_pcm_f32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s24: mal_pcm_f32_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
case mal_format_s32: mal_pcm_f32_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
|
case mal_format_s32: mal_pcm_f32_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@@ -495,8 +495,8 @@ void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format f
|
|||||||
} break;
|
} break;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(MAL_SUPPORT_AVX)
|
#if defined(MAL_SUPPORT_AVX2)
|
||||||
case simd_mode_avx:
|
case simd_mode_avx2:
|
||||||
{
|
{
|
||||||
pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
|
pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
|
||||||
} break;
|
} break;
|
||||||
@@ -515,6 +515,8 @@ void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format f
|
|||||||
pcm_convert__neon(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
|
pcm_convert__neon(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
|
||||||
} break;
|
} break;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
default: break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -611,8 +613,8 @@ int do_profiling__format_conversion__profile_set(mal_format formatIn, mal_format
|
|||||||
if (mal_has_sse2()) {
|
if (mal_has_sse2()) {
|
||||||
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_sse2, pReferenceData, referenceTime);
|
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_sse2, pReferenceData, referenceTime);
|
||||||
}
|
}
|
||||||
if (mal_has_avx()) {
|
if (mal_has_avx2()) {
|
||||||
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx, pReferenceData, referenceTime);
|
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx2, pReferenceData, referenceTime);
|
||||||
}
|
}
|
||||||
if (mal_has_avx512f()) {
|
if (mal_has_avx512f()) {
|
||||||
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx512, pReferenceData, referenceTime);
|
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx512, pReferenceData, referenceTime);
|
||||||
@@ -651,7 +653,7 @@ float g_ChannelRouterProfilingOutputBenchmark[8][48000];
|
|||||||
float g_ChannelRouterProfilingOutput[8][48000];
|
float g_ChannelRouterProfilingOutput[8][48000];
|
||||||
double g_ChannelRouterTime_Reference = 0;
|
double g_ChannelRouterTime_Reference = 0;
|
||||||
double g_ChannelRouterTime_SSE2 = 0;
|
double g_ChannelRouterTime_SSE2 = 0;
|
||||||
double g_ChannelRouterTime_AVX = 0;
|
double g_ChannelRouterTime_AVX2 = 0;
|
||||||
double g_ChannelRouterTime_AVX512 = 0;
|
double g_ChannelRouterTime_AVX512 = 0;
|
||||||
double g_ChannelRouterTime_NEON = 0;
|
double g_ChannelRouterTime_NEON = 0;
|
||||||
|
|
||||||
@@ -709,7 +711,7 @@ int do_profiling__channel_routing()
|
|||||||
router.isPassthrough = MAL_FALSE;
|
router.isPassthrough = MAL_FALSE;
|
||||||
router.isSimpleShuffle = MAL_FALSE;
|
router.isSimpleShuffle = MAL_FALSE;
|
||||||
router.useSSE2 = MAL_FALSE;
|
router.useSSE2 = MAL_FALSE;
|
||||||
router.useAVX = MAL_FALSE;
|
router.useAVX2 = MAL_FALSE;
|
||||||
router.useAVX512 = MAL_FALSE;
|
router.useAVX512 = MAL_FALSE;
|
||||||
router.useNEON = MAL_FALSE;
|
router.useNEON = MAL_FALSE;
|
||||||
|
|
||||||
@@ -781,20 +783,20 @@ int do_profiling__channel_routing()
|
|||||||
printf("SSE2: %.4fms (%.2f%%)\n", g_ChannelRouterTime_SSE2*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_SSE2*100);
|
printf("SSE2: %.4fms (%.2f%%)\n", g_ChannelRouterTime_SSE2*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_SSE2*100);
|
||||||
}
|
}
|
||||||
|
|
||||||
// AVX
|
// AVX2
|
||||||
if (mal_has_avx()) {
|
if (mal_has_avx2()) {
|
||||||
router.useAVX = MAL_TRUE;
|
router.useAVX2 = MAL_TRUE;
|
||||||
mal_timer timer;
|
mal_timer timer;
|
||||||
mal_timer_init(&timer);
|
mal_timer_init(&timer);
|
||||||
double startTime = mal_timer_get_time_in_seconds(&timer);
|
double startTime = mal_timer_get_time_in_seconds(&timer);
|
||||||
|
|
||||||
framesRead = mal_channel_router_read_deinterleaved(&router, framesToRead, ppOut, NULL);
|
framesRead = mal_channel_router_read_deinterleaved(&router, framesToRead, ppOut, NULL);
|
||||||
if (framesRead != framesToRead) {
|
if (framesRead != framesToRead) {
|
||||||
printf("Channel Router: An error occurred while reading AVX data.\n");
|
printf("Channel Router: An error occurred while reading AVX2 data.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
g_ChannelRouterTime_AVX = mal_timer_get_time_in_seconds(&timer) - startTime;
|
g_ChannelRouterTime_AVX2 = mal_timer_get_time_in_seconds(&timer) - startTime;
|
||||||
router.useAVX = MAL_FALSE;
|
router.useAVX2 = MAL_FALSE;
|
||||||
|
|
||||||
if (!channel_router_test(channels, framesRead, (float**)ppOutBenchmark, (float**)ppOut)) {
|
if (!channel_router_test(channels, framesRead, (float**)ppOutBenchmark, (float**)ppOut)) {
|
||||||
printf(" [ERROR] ");
|
printf(" [ERROR] ");
|
||||||
@@ -802,7 +804,7 @@ int do_profiling__channel_routing()
|
|||||||
printf(" [PASSED] ");
|
printf(" [PASSED] ");
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("AVX: %.4fms (%.2f%%)\n", g_ChannelRouterTime_AVX*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_AVX*100);
|
printf("AVX2: %.4fms (%.2f%%)\n", g_ChannelRouterTime_AVX2*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_AVX2*100);
|
||||||
}
|
}
|
||||||
|
|
||||||
// NEON
|
// NEON
|
||||||
@@ -887,12 +889,12 @@ mal_result init_src(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sam
|
|||||||
srcConfig.sinc.windowWidth = 17; // <-- Make this an odd number to test unaligned section in the SIMD implementations.
|
srcConfig.sinc.windowWidth = 17; // <-- Make this an odd number to test unaligned section in the SIMD implementations.
|
||||||
srcConfig.algorithm = algorithm;
|
srcConfig.algorithm = algorithm;
|
||||||
srcConfig.noSSE2 = MAL_TRUE;
|
srcConfig.noSSE2 = MAL_TRUE;
|
||||||
srcConfig.noAVX = MAL_TRUE;
|
srcConfig.noAVX2 = MAL_TRUE;
|
||||||
srcConfig.noAVX512 = MAL_TRUE;
|
srcConfig.noAVX512 = MAL_TRUE;
|
||||||
srcConfig.noNEON = MAL_TRUE;
|
srcConfig.noNEON = MAL_TRUE;
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case simd_mode_sse2: srcConfig.noSSE2 = MAL_FALSE; break;
|
case simd_mode_sse2: srcConfig.noSSE2 = MAL_FALSE; break;
|
||||||
case simd_mode_avx: srcConfig.noAVX = MAL_FALSE; break;
|
case simd_mode_avx2: srcConfig.noAVX2 = MAL_FALSE; break;
|
||||||
case simd_mode_avx512: srcConfig.noAVX512 = MAL_FALSE; break;
|
case simd_mode_avx512: srcConfig.noAVX512 = MAL_FALSE; break;
|
||||||
case simd_mode_neon: srcConfig.noNEON = MAL_FALSE; break;
|
case simd_mode_neon: srcConfig.noNEON = MAL_FALSE; break;
|
||||||
case simd_mode_scalar:
|
case simd_mode_scalar:
|
||||||
@@ -1032,8 +1034,8 @@ int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn,
|
|||||||
if (mal_has_sse2()) {
|
if (mal_has_sse2()) {
|
||||||
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData);
|
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData);
|
||||||
}
|
}
|
||||||
if (mal_has_avx()) {
|
if (mal_has_avx2()) {
|
||||||
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx, &referenceData);
|
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx2, &referenceData);
|
||||||
}
|
}
|
||||||
if (mal_has_avx512f()) {
|
if (mal_has_avx512f()) {
|
||||||
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx512, &referenceData);
|
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx512, &referenceData);
|
||||||
@@ -1115,11 +1117,11 @@ int main(int argc, char** argv)
|
|||||||
//__m128 f1 = _mm_set_ps(-32780, 6, 5, 4);
|
//__m128 f1 = _mm_set_ps(-32780, 6, 5, 4);
|
||||||
//__m128i r = drmath_vf32_to_vi16__sse2(f0, f1);
|
//__m128i r = drmath_vf32_to_vi16__sse2(f0, f1);
|
||||||
|
|
||||||
__m256 f0 = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
|
//__m256 f0 = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
|
||||||
__m256 f1 = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8);
|
//__m256 f1 = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8);
|
||||||
__m256i r = drmath_vf32_to_vi16__avx(f0, f1);
|
//__m256i r = drmath_vf32_to_vi16__avx(f0, f1);
|
||||||
|
//
|
||||||
int a = 5;
|
//int a = 5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1130,10 +1132,10 @@ int main(int argc, char** argv)
|
|||||||
} else {
|
} else {
|
||||||
printf("Has SSE2: NO\n");
|
printf("Has SSE2: NO\n");
|
||||||
}
|
}
|
||||||
if (mal_has_avx()) {
|
if (mal_has_avx2()) {
|
||||||
printf("Has AVX: YES\n");
|
printf("Has AVX2: YES\n");
|
||||||
} else {
|
} else {
|
||||||
printf("Has AVX: NO\n");
|
printf("Has AVX2: NO\n");
|
||||||
}
|
}
|
||||||
if (mal_has_avx512f()) {
|
if (mal_has_avx512f()) {
|
||||||
printf("Has AVX-512F: YES\n");
|
printf("Has AVX-512F: YES\n");
|
||||||
|
|||||||
Reference in New Issue
Block a user