Drop support for AVX and replace with AVX2.

Rationale for this is that it just makes things simpler for integer
operations.
This commit is contained in:
David Reid
2018-05-27 21:01:19 +10:00
parent ef2ad30080
commit 6b988bbc97
2 changed files with 189 additions and 138 deletions
+138 -89
View File
@@ -207,8 +207,8 @@
// #define MAL_NO_SSE2
// Disables SSE2 optimizations.
//
// #define MAL_NO_AVX
// Disables AVX optimizations.
// #define MAL_NO_AVX2
// Disables AVX2 optimizations.
//
// #define MAL_NO_AVX512
// Disables AVX-512 optimizations.
@@ -813,7 +813,7 @@ typedef struct
mal_stream_format streamFormatOut;
mal_dither_mode ditherMode;
mal_bool32 noSSE2 : 1;
mal_bool32 noAVX : 1;
mal_bool32 noAVX2 : 1;
mal_bool32 noAVX512 : 1;
mal_bool32 noNEON : 1;
mal_format_converter_read_proc onRead;
@@ -825,7 +825,7 @@ struct mal_format_converter
{
mal_format_converter_config config;
mal_bool32 useSSE2 : 1;
mal_bool32 useAVX : 1;
mal_bool32 useAVX2 : 1;
mal_bool32 useAVX512 : 1;
mal_bool32 useNEON : 1;
void (* onConvertPCM)(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode);
@@ -846,7 +846,7 @@ typedef struct
mal_channel channelMapOut[MAL_MAX_CHANNELS];
mal_channel_mix_mode mixingMode;
mal_bool32 noSSE2 : 1;
mal_bool32 noAVX : 1;
mal_bool32 noAVX2 : 1;
mal_bool32 noAVX512 : 1;
mal_bool32 noNEON : 1;
mal_channel_router_read_deinterleaved_proc onReadDeinterleaved;
@@ -859,7 +859,7 @@ struct mal_channel_router
mal_bool32 isPassthrough : 1;
mal_bool32 isSimpleShuffle : 1;
mal_bool32 useSSE2 : 1;
mal_bool32 useAVX : 1;
mal_bool32 useAVX2 : 1;
mal_bool32 useAVX512 : 1;
mal_bool32 useNEON : 1;
mal_uint8 shuffleTable[MAL_MAX_CHANNELS];
@@ -894,7 +894,7 @@ typedef struct
mal_uint32 channels;
mal_src_algorithm algorithm;
mal_bool32 noSSE2 : 1;
mal_bool32 noAVX : 1;
mal_bool32 noAVX2 : 1;
mal_bool32 noAVX512 : 1;
mal_bool32 noNEON : 1;
mal_src_read_deinterleaved_proc onReadDeinterleaved;
@@ -932,7 +932,7 @@ MAL_ALIGNED_STRUCT(MAL_SIMD_ALIGNMENT) mal_src
mal_src_config config;
mal_bool32 useSSE2 : 1;
mal_bool32 useAVX : 1;
mal_bool32 useAVX2 : 1;
mal_bool32 useAVX512 : 1;
mal_bool32 useNEON : 1;
};
@@ -955,7 +955,7 @@ typedef struct
mal_src_algorithm srcAlgorithm;
mal_bool32 allowDynamicSampleRate;
mal_bool32 noSSE2 : 1;
mal_bool32 noAVX : 1;
mal_bool32 noAVX2 : 1;
mal_bool32 noAVX512 : 1;
mal_bool32 noNEON : 1;
mal_dsp_read_proc onRead;
@@ -2485,8 +2485,11 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#if !defined(MAL_NO_SSE2) // Assume all MSVC compilers support SSE2 intrinsics.
#define MAL_SUPPORT_SSE2
#endif
#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
#define MAL_SUPPORT_AVX
//#if _MSC_VER >= 1600 && !defined(MAL_NO_AVX) // 2010
// #define MAL_SUPPORT_AVX
//#endif
#if _MSC_VER >= 1700 && !defined(MAL_NO_AVX2) // 2012
#define MAL_SUPPORT_AVX2
#endif
#if _MSC_VER >= 1910 && !defined(MAL_NO_AVX512) // 2017
#define MAL_SUPPORT_AVX512
@@ -2496,8 +2499,11 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#if defined(__SSE2__) && !defined(MAL_NO_SSE2)
#define MAL_SUPPORT_SSE2
#endif
#if defined(__AVX__) && !defined(MAL_NO_AVX)
#define MAL_SUPPORT_AVX
//#if defined(__AVX__) && !defined(MAL_NO_AVX)
// #define MAL_SUPPORT_AVX
//#endif
#if defined(__AVX2__) && !defined(MAL_NO_AVX2)
#define MAL_SUPPORT_AVX2
#endif
#if defined(__AVX512F__) && !defined(MAL_NO_AVX512)
#define MAL_SUPPORT_AVX512
@@ -2509,8 +2515,11 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#if !defined(MAL_SUPPORT_SSE2) && !defined(MAL_NO_SSE2) && __has_include(<emmintrin.h>)
#define MAL_SUPPORT_SSE2
#endif
#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
#define MAL_SUPPORT_AVX
//#if !defined(MAL_SUPPORT_AVX) && !defined(MAL_NO_AVX) && __has_include(<immintrin.h>)
// #define MAL_SUPPORT_AVX
//#endif
#if !defined(MAL_SUPPORT_AVX2) && !defined(MAL_NO_AVX2) && __has_include(<immintrin.h>)
#define MAL_SUPPORT_AVX2
#endif
#if !defined(MAL_SUPPORT_AVX512) && !defined(MAL_NO_AVX512) && __has_include(<zmmintrin.h>)
#define MAL_SUPPORT_AVX512
@@ -2519,7 +2528,7 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSignWave, mal_uint64 count, float*
#if defined(MAL_SUPPORT_AVX512)
#include <immintrin.h> // Not a mistake. Intentionally including <immintrin.h> instead of <zmmintrin.h> because otherwise the compiler will complain.
#elif defined(MAL_SUPPORT_AVX)
#elif defined(MAL_SUPPORT_AVX2) || defined(MAL_SUPPORT_AVX)
#include <immintrin.h>
#elif defined(MAL_SUPPORT_SSE2)
#include <emmintrin.h>
@@ -2617,6 +2626,7 @@ static MAL_INLINE mal_bool32 mal_has_sse2()
#endif
}
#if 0
static MAL_INLINE mal_bool32 mal_has_avx()
{
#if defined(MAL_SUPPORT_AVX)
@@ -2649,6 +2659,42 @@ static MAL_INLINE mal_bool32 mal_has_avx()
return MAL_FALSE; // No compiler support.
#endif
}
#endif
// Reports whether the AVX2 code paths may be used.
//
// The answer is MAL_FALSE when the compiler has no AVX2 support (MAL_SUPPORT_AVX2 is not defined), when
// the target is neither MAL_X64 nor MAL_X86, or when MAL_NO_AVX2 is defined. It is MAL_TRUE with no
// run-time probing when the compiler itself is targeting AVX2 (_AVX2_ or __AVX2__ is defined). In every
// other case the result comes from a CPUID + XGETBV probe, which is reported as MAL_FALSE when either
// facility is compiled out (MAL_NO_CPUID / MAL_NO_XGETBV).
static MAL_INLINE mal_bool32 mal_has_avx2()
{
#if !defined(MAL_SUPPORT_AVX2)
    return MAL_FALSE;   // No compiler support.
#elif !((defined(MAL_X64) || defined(MAL_X86)) && !defined(MAL_NO_AVX2))
    return MAL_FALSE;   // AVX2 is only supported on x86 and x64 architectures.
#elif defined(_AVX2_) || defined(__AVX2__)
    return MAL_TRUE;    // If the compiler is allowed to freely generate AVX2 code we can assume support.
#elif defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
    return MAL_FALSE;   // AVX2 requires both CPU and OS support, and we have no way of querying them.
#else
    int leaf1[4];
    int leaf7[4];
    mal_bool32 cpuReportsSupport;
    mal_uint64 xcr0;

    mal_cpuid(leaf1, 1);
    mal_cpuid(leaf7, 7);

    // Assuming mal_cpuid() fills the array as {EAX, EBX, ECX, EDX}: leaf 1 ECX bit 27 is OSXSAVE and
    // leaf 7 EBX bit 5 is AVX2.
    // NOTE(review): CPUID leaf 7 is sub-leaf indexed - confirm mal_cpuid() queries sub-leaf 0.
    cpuReportsSupport = ((leaf1[2] & (1 << 27)) != 0) && ((leaf7[1] & (1 << 5)) != 0);
    if (!cpuReportsSupport) {
        return MAL_FALSE;
    }

    // Both bits of the 0x06 mask must be set in the value returned by mal_xgetbv(0), i.e. the OS must
    // have enabled the SSE and AVX register state.
    xcr0 = mal_xgetbv(0);
    return ((xcr0 & 0x06) == 0x06) ? MAL_TRUE : MAL_FALSE;
#endif
}
static MAL_INLINE mal_bool32 mal_has_avx512f()
{
@@ -2661,9 +2707,11 @@ static MAL_INLINE mal_bool32 mal_has_avx512f()
#if defined(MAL_NO_CPUID) || defined(MAL_NO_XGETBV)
return MAL_FALSE;
#else
int info[4];
mal_cpuid(info, 1);
if (((info[2] & (1 << 27)) != 0) && ((info[1] & (1 << 16)) != 0)) {
int info1[4];
int info7[4];
mal_cpuid(info1, 1);
mal_cpuid(info7, 7);
if (((info1[2] & (1 << 27)) != 0) && ((info7[1] & (1 << 16)) != 0)) {
mal_uint64 xrc = mal_xgetbv(0);
if ((xrc & 0xE6) == 0xE6) {
return MAL_TRUE;
@@ -3223,8 +3271,8 @@ static MAL_INLINE __m128 mal_mix_f32_fast__sse2(__m128 x, __m128 y, __m128 a)
return _mm_add_ps(x, _mm_mul_ps(_mm_sub_ps(y, x), a));
}
#endif
#if defined(MAL_SUPPORT_AVX)
static MAL_INLINE __m256 mal_mix_f32_fast__avx(__m256 x, __m256 y, __m256 a)
#if defined(MAL_SUPPORT_AVX2)
// Per-lane linear blend of two vectors of 8 floats: returns x + (y - x)*a.
static MAL_INLINE __m256 mal_mix_f32_fast__avx2(__m256 x, __m256 y, __m256 a)
{
    __m256 delta  = _mm256_sub_ps(y, x);
    __m256 scaled = _mm256_mul_ps(delta, a);
    return _mm256_add_ps(x, scaled);
}
@@ -17288,8 +17336,8 @@ void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_u8_to_s16. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_u8_to_s16__optimized().
void mal_pcm_u8_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
@@ -17346,8 +17394,8 @@ void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_u8_to_s24. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_u8_to_s24__optimized().
void mal_pcm_u8_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -17402,8 +17450,8 @@ void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_u8_to_s32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_u8_to_s32__optimized().
void mal_pcm_u8_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -17459,13 +17507,13 @@ void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_u8_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_u8_to_f32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_u8_to_f32__optimized().
void mal_pcm_u8_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_SSE2)
#if defined(MAL_SUPPORT_AVX512)
void mal_pcm_u8_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
@@ -17611,8 +17659,8 @@ void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s16_to_u8. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s16_to_u8__optimized().
void mal_pcm_s16_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -17673,8 +17721,8 @@ void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s16_to_s24. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s16_to_s24__optimized().
void mal_pcm_s16_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -17726,8 +17774,8 @@ void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s16_to_s32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s16_to_s32__optimized().
void mal_pcm_s16_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -17791,8 +17839,8 @@ void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s16_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s16_to_f32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s16_to_f32__optimized().
void mal_pcm_s16_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -17921,8 +17969,8 @@ void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s24_to_u8. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s24_to_u8__optimized().
void mal_pcm_s24_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -17992,8 +18040,8 @@ void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s24_to_s16. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s24_to_s16__optimized().
void mal_pcm_s24_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
@@ -18053,8 +18101,8 @@ void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s24_to_s32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s24_to_s32__optimized().
void mal_pcm_s24_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -18118,8 +18166,8 @@ void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s24_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s24_to_f32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s24_to_f32__optimized().
void mal_pcm_s24_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -18255,8 +18303,8 @@ void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s32_to_u8. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s32_to_u8__optimized().
void mal_pcm_s32_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -18326,8 +18374,8 @@ void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s32_to_s16. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s32_to_s16__optimized().
void mal_pcm_s32_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
@@ -18382,8 +18430,8 @@ void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s32_to_s24. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s32_to_s24__optimized().
void mal_pcm_s32_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -18453,8 +18501,8 @@ void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_s32_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_s32_to_f32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_s32_to_f32__optimized().
void mal_pcm_s32_to_f32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -18576,8 +18624,8 @@ void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_f32_to_u8. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_f32_to_u8__optimized().
void mal_pcm_f32_to_u8__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -18775,8 +18823,8 @@ void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_
}
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
void mal_pcm_f32_to_s16__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_int16* dst_s16 = (mal_int16*)dst;
const float* src_f32 = (const float*)src;
@@ -18790,7 +18838,7 @@ void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_d
mal_uint64 i = 0;
// AVX. AVX allows us to output 16 s16's at a time which means our loop is unrolled 16 times.
// AVX2. AVX2 allows us to output 16 s16's at a time which means our loop is unrolled 16 times.
mal_uint64 count16 = count >> 4;
for (mal_uint64 i16 = 0; i16 < count16; i16 += 1) {
__m256 d0;
@@ -18851,7 +18899,7 @@ void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_d
x0 = _mm256_mul_ps(x0, _mm256_set1_ps(32767.0f));
x1 = _mm256_mul_ps(x1, _mm256_set1_ps(32767.0f));
// Computing the final result is a little more complicated for AVX than SSE.
// Computing the final result is a little more complicated for AVX2 than SSE2.
__m256i i0 = _mm256_cvttps_epi32(x0);
__m256i i1 = _mm256_cvttps_epi32(x1);
__m256i p0 = _mm256_permute2x128_si256(i0, i1, 32);
@@ -18878,7 +18926,7 @@ void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_d
void mal_pcm_f32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
// TODO: Convert this from AVX2 to AVX-512.
mal_pcm_f32_to_s16__avx(dst, src, count, ditherMode);
mal_pcm_f32_to_s16__avx2(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_NEON)
@@ -18938,8 +18986,8 @@ void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_f32_to_s24. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_f32_to_s24__optimized().
void mal_pcm_f32_to_s24__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -19004,8 +19052,8 @@ void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
#endif
#if defined(MAL_SUPPORT_AVX)
void mal_pcm_f32_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
#if defined(MAL_SUPPORT_AVX2)
// AVX2-named entry point for mal_pcm_f32_to_s32. It contains no AVX2 intrinsics: all four arguments are
// forwarded unchanged to mal_pcm_f32_to_s32__optimized().
void mal_pcm_f32_to_s32__avx2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -19115,7 +19163,7 @@ mal_result mal_format_converter_init(const mal_format_converter_config* pConfig,
// SIMD
pConverter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
pConverter->useAVX = mal_has_avx() && !pConfig->noAVX;
pConverter->useAVX2 = mal_has_avx2() && !pConfig->noAVX2;
pConverter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
pConverter->useNEON = mal_has_neon() && !pConfig->noNEON;
@@ -19764,7 +19812,7 @@ mal_result mal_channel_router_init(const mal_channel_router_config* pConfig, mal
// SIMD
pRouter->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
pRouter->useAVX = mal_has_avx() && !pConfig->noAVX;
pRouter->useAVX2 = mal_has_avx2() && !pConfig->noAVX2;
pRouter->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
pRouter->useNEON = mal_has_neon() && !pConfig->noNEON;
@@ -19948,9 +19996,9 @@ static MAL_INLINE mal_bool32 mal_channel_router__can_use_sse2(mal_channel_router
return pRouter->useSSE2 && (((mal_uintptr)pSamplesOut & 15) == 0) && (((mal_uintptr)pSamplesIn & 15) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx2(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
{
return pRouter->useAVX && (((mal_uintptr)pSamplesOut & 31) == 0) && (((mal_uintptr)pSamplesIn & 31) == 0);
return pRouter->useAVX2 && (((mal_uintptr)pSamplesOut & 31) == 0) && (((mal_uintptr)pSamplesIn & 31) == 0);
}
static MAL_INLINE mal_bool32 mal_channel_router__can_use_avx512(mal_channel_router* pRouter, const float* pSamplesOut, const float* pSamplesIn)
@@ -20017,8 +20065,8 @@ void mal_channel_router__do_routing(mal_channel_router* pRouter, mal_uint64 fram
}
else
#endif
#if defined(MAL_SUPPORT_AVX)
if (mal_channel_router__can_use_avx(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
#if defined(MAL_SUPPORT_AVX2)
if (mal_channel_router__can_use_avx2(pRouter, ppSamplesOut[iChannelOut], ppSamplesIn[iChannelIn])) {
__m256 weight = _mm256_set1_ps(pRouter->weights[iChannelIn][iChannelOut]);
mal_uint64 frameCount8 = frameCount/8;
@@ -20268,7 +20316,7 @@ mal_result mal_src_init(const mal_src_config* pConfig, mal_src* pSRC)
// SIMD
pSRC->useSSE2 = mal_has_sse2() && !pConfig->noSSE2;
pSRC->useAVX = mal_has_avx() && !pConfig->noAVX;
pSRC->useAVX2 = mal_has_avx2() && !pConfig->noAVX2;
pSRC->useAVX512 = mal_has_avx512f() && !pConfig->noAVX512;
pSRC->useNEON = mal_has_neon() && !pConfig->noNEON;
@@ -20682,20 +20730,20 @@ static MAL_INLINE __m128 mal_src_sinc__interpolation_factor__sse2(const mal_src*
}
#endif
#if defined(MAL_SUPPORT_AVX)
static MAL_INLINE __m256 mal_fabsf_avx(__m256 x)
#if defined(MAL_SUPPORT_AVX2)
// Per-lane absolute value of 8 floats, computed by masking off the top (sign) bit of every 32-bit lane.
static MAL_INLINE __m256 mal_fabsf_avx2(__m256 x)
{
    const __m256i allButSignBit = _mm256_set1_epi32(0x7FFFFFFF);
    return _mm256_and_ps(_mm256_castsi256_ps(allButSignBit), x);
}
#if 0
static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx(const mal_src* pSRC, __m256 x)
static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx2(const mal_src* pSRC, __m256 x)
{
//__m256 windowWidth256 = _mm256_set1_ps(MAL_SRC_SINC_MAX_WINDOW_WIDTH);
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
//__m256 one = _mm256_set1_ps(1);
__m256 xabs = mal_fabsf_avx(x);
__m256 xabs = mal_fabsf_avx2(x);
// if (MAL_SRC_SINC_MAX_WINDOW_WIDTH <= xabs) xabs = 1 else xabs = xabs;
//__m256 xcmp = _mm256_cmp_ps(windowWidth256, xabs, 2); // 2 = Less than or equal = _mm_cmple_ps.
@@ -20731,7 +20779,7 @@ static MAL_INLINE __m256 mal_src_sinc__interpolation_factor__avx(const mal_src*
pSRC->sinc.table[ixabsv[0]+1]
);
__m256 r = mal_mix_f32_fast__avx(lo, hi, a);
__m256 r = mal_mix_f32_fast__avx2(lo, hi, a);
return r;
}
@@ -20799,8 +20847,8 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
}
else
#endif
#if defined(MAL_SUPPORT_AVX)
if (pSRC->useAVX) {
#if defined(MAL_SUPPORT_AVX2)
if (pSRC->useAVX2) {
windowWidthSIMD = (windowWidthSIMD + 3) & ~(3);
}
else
@@ -20866,8 +20914,8 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
windowSamples[i] = pSRC->sinc.input[iChannel][iTimeIn + i];
}
#if defined(MAL_SUPPORT_AVX)
if (pSRC->useAVX) {
#if defined(MAL_SUPPORT_AVX2)
if (pSRC->useAVX2) {
__m256i ixabs[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
__m256 a[MAL_SRC_SINC_MAX_WINDOW_WIDTH*2/8];
__m256 resolution256 = _mm256_set1_ps(MAL_SRC_SINC_LOOKUP_TABLE_RESOLUTION);
@@ -20880,7 +20928,7 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
__m256 w = *((__m256*)iWindowF + iWindow8);
__m256 xabs = _mm256_sub_ps(t, w);
xabs = mal_fabsf_avx(xabs);
xabs = mal_fabsf_avx2(xabs);
xabs = _mm256_mul_ps(xabs, resolution256);
ixabs[iWindow8] = _mm256_cvttps_epi32(xabs);
@@ -20913,7 +20961,7 @@ mal_uint64 mal_src_read_deinterleaved__sinc(mal_src* pSRC, mal_uint64 frameCount
);
__m256 s = *((__m256*)windowSamples + iWindow8);
r = _mm256_add_ps(r, _mm256_mul_ps(s, mal_mix_f32_fast__avx(lo, hi, a[iWindow8])));
r = _mm256_add_ps(r, _mm256_mul_ps(s, mal_mix_f32_fast__avx2(lo, hi, a[iWindow8])));
}
// Horizontal add.
@@ -21345,7 +21393,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
);
preFormatConverterConfig.ditherMode = pConfig->ditherMode;
preFormatConverterConfig.noSSE2 = pConfig->noSSE2;
preFormatConverterConfig.noAVX = pConfig->noAVX;
preFormatConverterConfig.noAVX2 = pConfig->noAVX2;
preFormatConverterConfig.noAVX512 = pConfig->noAVX512;
preFormatConverterConfig.noNEON = pConfig->noNEON;
@@ -21364,7 +21412,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
postFormatConverterConfig.channels = pConfig->channelsOut;
postFormatConverterConfig.ditherMode = pConfig->ditherMode;
postFormatConverterConfig.noSSE2 = pConfig->noSSE2;
postFormatConverterConfig.noAVX = pConfig->noAVX;
postFormatConverterConfig.noAVX2 = pConfig->noAVX2;
postFormatConverterConfig.noAVX512 = pConfig->noAVX512;
postFormatConverterConfig.noNEON = pConfig->noNEON;
if (pDSP->isPreFormatConversionRequired) {
@@ -21391,7 +21439,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
);
srcConfig.algorithm = pConfig->srcAlgorithm;
srcConfig.noSSE2 = pConfig->noSSE2;
srcConfig.noAVX = pConfig->noAVX;
srcConfig.noAVX2 = pConfig->noAVX2;
srcConfig.noAVX512 = pConfig->noAVX512;
srcConfig.noNEON = pConfig->noNEON;
mal_copy_memory(&srcConfig.sinc, &pConfig->sinc, sizeof(pConfig->sinc));
@@ -21413,7 +21461,7 @@ mal_result mal_dsp_init(const mal_dsp_config* pConfig, mal_dsp* pDSP)
mal_dsp__channel_router_on_read_deinterleaved,
pDSP);
routerConfig.noSSE2 = pConfig->noSSE2;
routerConfig.noAVX = pConfig->noAVX;
routerConfig.noAVX2 = pConfig->noAVX2;
routerConfig.noAVX512 = pConfig->noAVX512;
routerConfig.noNEON = pConfig->noNEON;
@@ -21848,7 +21896,7 @@ float mal_calculate_cpu_speed_factor()
// indication on the speed of the system, but SIMD is used more heavily in the DSP pipeline than in the general case which may make
// the results a little less realistic.
config.noSSE2 = MAL_TRUE;
config.noAVX = MAL_TRUE;
config.noAVX2 = MAL_TRUE;
config.noAVX512 = MAL_TRUE;
config.noNEON = MAL_TRUE;
@@ -23414,12 +23462,13 @@ mal_uint64 mal_sine_wave_read(mal_sine_wave* pSineWave, mal_uint64 count, float*
// as the backend's internal device, and as such results in a pass-through data transmission pipeline.
// - Add support for passing in NULL for the device config in mal_device_init(), which uses a default
// config. This requires manually calling mal_device_set_send/recv_callback().
// - Add support for decoding from raw PCM data (mal_decoder_init_raw(), etc.)
// - Make mal_device_init_ex() more robust.
// - Make some APIs more const-correct.
// - Fix errors with OpenAL detection.
// - Fix some memory leaks.
// - Fix a bug with opening decoders from memory.
// - Add support for decoding from raw PCM data (mal_decoder_init_raw(), etc.)
// - Early work on SSE2, AVX2 and NEON optimizations.
// - Miscellaneous bug fixes.
// - Documentation updates.
//
+50 -48
View File
@@ -5,7 +5,7 @@ typedef enum
{
simd_mode_scalar = 0,
simd_mode_sse2,
simd_mode_avx,
simd_mode_avx2,
simd_mode_avx512,
simd_mode_neon
} simd_mode;
@@ -15,7 +15,7 @@ const char* simd_mode_to_string(simd_mode mode)
switch (mode) {
case simd_mode_scalar: return "Reference";
case simd_mode_sse2: return "SSE2";
case simd_mode_avx: return "AVX";
case simd_mode_avx2: return "AVX2";
case simd_mode_avx512: return "AVX-512";
case simd_mode_neon: return "NEON";
}
@@ -266,7 +266,7 @@ void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_fo
}
#endif
#if defined(MAL_SUPPORT_AVX)
#if defined(MAL_SUPPORT_AVX2)
void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
{
switch (formatIn)
@@ -275,10 +275,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
{
switch (formatOut)
{
case mal_format_s16: mal_pcm_u8_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_u8_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_u8_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_u8_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_u8_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_u8_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_u8_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_u8_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
@@ -287,10 +287,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s16_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s16_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s16_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s16_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_u8: mal_pcm_s16_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s16_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s16_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s16_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
@@ -299,10 +299,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s24_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s24_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s24_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s24_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_u8: mal_pcm_s24_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s24_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_s24_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s24_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
@@ -311,10 +311,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_s32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s32_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_u8: mal_pcm_s32_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_s32_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_s32_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_f32: mal_pcm_s32_to_f32__avx2(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
@@ -323,10 +323,10 @@ void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_for
{
switch (formatOut)
{
case mal_format_u8: mal_pcm_f32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_f32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_f32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_f32_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_u8: mal_pcm_f32_to_u8__avx2( pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s16: mal_pcm_f32_to_s16__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s24: mal_pcm_f32_to_s24__avx2(pOut, pIn, sampleCount, ditherMode); return;
case mal_format_s32: mal_pcm_f32_to_s32__avx2(pOut, pIn, sampleCount, ditherMode); return;
default: break;
}
} break;
@@ -495,8 +495,8 @@ void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format f
} break;
#endif
#if defined(MAL_SUPPORT_AVX)
case simd_mode_avx:
#if defined(MAL_SUPPORT_AVX2)
case simd_mode_avx2:
{
pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
} break;
@@ -515,6 +515,8 @@ void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format f
pcm_convert__neon(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
} break;
#endif
default: break;
}
}
@@ -611,8 +613,8 @@ int do_profiling__format_conversion__profile_set(mal_format formatIn, mal_format
if (mal_has_sse2()) {
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_sse2, pReferenceData, referenceTime);
}
if (mal_has_avx()) {
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx, pReferenceData, referenceTime);
if (mal_has_avx2()) {
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx2, pReferenceData, referenceTime);
}
if (mal_has_avx512f()) {
do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx512, pReferenceData, referenceTime);
@@ -651,7 +653,7 @@ float g_ChannelRouterProfilingOutputBenchmark[8][48000];
float g_ChannelRouterProfilingOutput[8][48000];
double g_ChannelRouterTime_Reference = 0;
double g_ChannelRouterTime_SSE2 = 0;
double g_ChannelRouterTime_AVX = 0;
double g_ChannelRouterTime_AVX2 = 0;
double g_ChannelRouterTime_AVX512 = 0;
double g_ChannelRouterTime_NEON = 0;
@@ -709,7 +711,7 @@ int do_profiling__channel_routing()
router.isPassthrough = MAL_FALSE;
router.isSimpleShuffle = MAL_FALSE;
router.useSSE2 = MAL_FALSE;
router.useAVX = MAL_FALSE;
router.useAVX2 = MAL_FALSE;
router.useAVX512 = MAL_FALSE;
router.useNEON = MAL_FALSE;
@@ -781,20 +783,20 @@ int do_profiling__channel_routing()
printf("SSE2: %.4fms (%.2f%%)\n", g_ChannelRouterTime_SSE2*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_SSE2*100);
}
// AVX
if (mal_has_avx()) {
router.useAVX = MAL_TRUE;
// AVX2
if (mal_has_avx2()) {
router.useAVX2 = MAL_TRUE;
mal_timer timer;
mal_timer_init(&timer);
double startTime = mal_timer_get_time_in_seconds(&timer);
framesRead = mal_channel_router_read_deinterleaved(&router, framesToRead, ppOut, NULL);
if (framesRead != framesToRead) {
printf("Channel Router: An error occurred while reading AVX data.\n");
printf("Channel Router: An error occurred while reading AVX2 data.\n");
}
g_ChannelRouterTime_AVX = mal_timer_get_time_in_seconds(&timer) - startTime;
router.useAVX = MAL_FALSE;
g_ChannelRouterTime_AVX2 = mal_timer_get_time_in_seconds(&timer) - startTime;
router.useAVX2 = MAL_FALSE;
if (!channel_router_test(channels, framesRead, (float**)ppOutBenchmark, (float**)ppOut)) {
printf(" [ERROR] ");
@@ -802,7 +804,7 @@ int do_profiling__channel_routing()
printf(" [PASSED] ");
}
printf("AVX: %.4fms (%.2f%%)\n", g_ChannelRouterTime_AVX*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_AVX*100);
printf("AVX2: %.4fms (%.2f%%)\n", g_ChannelRouterTime_AVX2*1000, g_ChannelRouterTime_Reference/g_ChannelRouterTime_AVX2*100);
}
// NEON
@@ -887,12 +889,12 @@ mal_result init_src(src_data* pBaseData, mal_uint32 sampleRateIn, mal_uint32 sam
srcConfig.sinc.windowWidth = 17; // <-- Make this an odd number to test unaligned section in the SIMD implementations.
srcConfig.algorithm = algorithm;
srcConfig.noSSE2 = MAL_TRUE;
srcConfig.noAVX = MAL_TRUE;
srcConfig.noAVX2 = MAL_TRUE;
srcConfig.noAVX512 = MAL_TRUE;
srcConfig.noNEON = MAL_TRUE;
switch (mode) {
case simd_mode_sse2: srcConfig.noSSE2 = MAL_FALSE; break;
case simd_mode_avx: srcConfig.noAVX = MAL_FALSE; break;
case simd_mode_avx2: srcConfig.noAVX2 = MAL_FALSE; break;
case simd_mode_avx512: srcConfig.noAVX512 = MAL_FALSE; break;
case simd_mode_neon: srcConfig.noNEON = MAL_FALSE; break;
case simd_mode_scalar:
@@ -1032,8 +1034,8 @@ int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn,
if (mal_has_sse2()) {
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData);
}
if (mal_has_avx()) {
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx, &referenceData);
if (mal_has_avx2()) {
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx2, &referenceData);
}
if (mal_has_avx512f()) {
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_avx512, &referenceData);
@@ -1115,11 +1117,11 @@ int main(int argc, char** argv)
//__m128 f1 = _mm_set_ps(-32780, 6, 5, 4);
//__m128i r = drmath_vf32_to_vi16__sse2(f0, f1);
__m256 f0 = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
__m256 f1 = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8);
__m256i r = drmath_vf32_to_vi16__avx(f0, f1);
int a = 5;
//__m256 f0 = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
//__m256 f1 = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8);
//__m256i r = drmath_vf32_to_vi16__avx(f0, f1);
//
//int a = 5;
}
@@ -1130,10 +1132,10 @@ int main(int argc, char** argv)
} else {
printf("Has SSE2: NO\n");
}
if (mal_has_avx()) {
printf("Has AVX: YES\n");
if (mal_has_avx2()) {
printf("Has AVX2: YES\n");
} else {
printf("Has AVX: NO\n");
printf("Has AVX2: NO\n");
}
if (mal_has_avx512f()) {
printf("Has AVX-512F: YES\n");