diff --git a/mini_al.h b/mini_al.h index 10b5c361..2d3088e1 100644 --- a/mini_al.h +++ b/mini_al.h @@ -3306,16 +3306,25 @@ static MAL_INLINE mal_int32 mal_rand_range_s32(mal_int32 lo, mal_int32 hi) } +static MAL_INLINE float mal_dither_f32_rectangle(float ditherMin, float ditherMax) +{ + return mal_rand_range_f32(ditherMin, ditherMax); +} + +static MAL_INLINE float mal_dither_f32_triangle(float ditherMin, float ditherMax) +{ + float a = mal_rand_range_f32(ditherMin, 0); + float b = mal_rand_range_f32(0, ditherMax); + return a + b; +} + static MAL_INLINE float mal_dither_f32(mal_dither_mode ditherMode, float ditherMin, float ditherMax) { if (ditherMode == mal_dither_mode_rectangle) { - float a = mal_rand_range_f32(ditherMin, ditherMax); - return a; + return mal_dither_f32_rectangle(ditherMin, ditherMax); } if (ditherMode == mal_dither_mode_triangle) { - float a = mal_rand_range_f32(ditherMin, 0); - float b = mal_rand_range_f32(0, ditherMax); - return a + b; + return mal_dither_f32_triangle(ditherMin, ditherMax); } return 0; @@ -17273,8 +17282,8 @@ void mal_pcm_u8_to_s16__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_u8_to_s16__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_u8_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode); } @@ -17284,13 +17293,9 @@ void mal_pcm_u8_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_u8_to_s16__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_u8_to_s16__sse(dst, src, count, ditherMode); #else mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17317,8 +17322,8 @@ void mal_pcm_u8_to_s24__optimized(void* dst, const void* src, mal_uint64 
count, mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_u8_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); } @@ -17329,8 +17334,8 @@ void mal_pcm_u8_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_ #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode); #else -#ifdef MAL_USE_SSE - mal_pcm_u8_to_s24__sse(dst, src, count, ditherMode); +#if defined(MAL_SUPPORT_SSE2) + mal_pcm_u8_to_s24__sse2(dst, src, count, ditherMode); #else mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); #endif @@ -17359,8 +17364,8 @@ void mal_pcm_u8_to_s32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_u8_to_s32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_u8_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode); } @@ -17370,13 +17375,9 @@ void mal_pcm_u8_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_u8_to_s32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_u8_to_s32__sse(dst, src, count, ditherMode); #else mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17402,8 +17403,8 @@ void mal_pcm_u8_to_f32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_u8_to_f32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_u8_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if 
defined(MAL_SUPPORT_SSE2) +void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode); } @@ -17413,13 +17414,9 @@ void mal_pcm_u8_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_u8_to_f32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_u8_to_f32__sse(dst, src, count, ditherMode); #else mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17540,8 +17537,8 @@ void mal_pcm_s16_to_u8__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s16_to_u8__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s16_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode); } @@ -17551,13 +17548,9 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s16_to_u8__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s16_to_u8__sse(dst, src, count, ditherMode); #else mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17588,8 +17581,8 @@ void mal_pcm_s16_to_s24__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s16_to_s24__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s16_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode); } @@ -17599,13 +17592,9 @@ void mal_pcm_s16_to_s24(void* dst, const void* src, 
mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s16_to_s24__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s16_to_s24__sse(dst, src, count, ditherMode); #else mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17627,8 +17616,8 @@ void mal_pcm_s16_to_s32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s16_to_s32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s16_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode); } @@ -17638,13 +17627,9 @@ void mal_pcm_s16_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s16_to_s32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s16_to_s32__sse(dst, src, count, ditherMode); #else mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17678,8 +17663,8 @@ void mal_pcm_s16_to_f32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s16_to_f32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s16_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode); } @@ -17689,13 +17674,9 @@ void mal_pcm_s16_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s16_to_f32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s16_to_f32__sse(dst, src, count, ditherMode); #else mal_pcm_s16_to_f32__optimized(dst, src, count, 
ditherMode); #endif -#endif } @@ -17794,8 +17775,8 @@ void mal_pcm_s24_to_u8__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s24_to_u8__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s24_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode); } @@ -17805,13 +17786,9 @@ void mal_pcm_s24_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s24_to_u8__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s24_to_u8__sse(dst, src, count, ditherMode); #else mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17851,8 +17828,8 @@ void mal_pcm_s24_to_s16__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s24_to_s16__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s24_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode); } @@ -17862,13 +17839,9 @@ void mal_pcm_s24_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s24_to_s16__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s24_to_s16__sse(dst, src, count, ditherMode); #else mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17898,8 +17871,8 @@ void mal_pcm_s24_to_s32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s24_to_s32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s24_to_s32__sse(void* dst, const void* src, 
mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode); } @@ -17909,13 +17882,9 @@ void mal_pcm_s24_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s24_to_s32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s24_to_s32__sse(dst, src, count, ditherMode); #else mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17949,8 +17918,8 @@ void mal_pcm_s24_to_f32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s24_to_f32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s24_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode); } @@ -17960,13 +17929,9 @@ void mal_pcm_s24_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s24_to_f32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s24_to_f32__sse(dst, src, count, ditherMode); #else mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18072,8 +18037,8 @@ void mal_pcm_s32_to_u8__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s32_to_u8__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s32_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode); } @@ -18083,13 +18048,9 @@ 
void mal_pcm_s32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s32_to_u8__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s32_to_u8__sse(dst, src, count, ditherMode); #else mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18129,8 +18090,8 @@ void mal_pcm_s32_to_s16__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s32_to_s16__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s32_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode); } @@ -18140,13 +18101,9 @@ void mal_pcm_s32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s32_to_s16__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s32_to_s16__sse(dst, src, count, ditherMode); #else mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18171,8 +18128,8 @@ void mal_pcm_s32_to_s24__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s32_to_s24__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s32_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode); } @@ -18182,13 +18139,9 @@ void mal_pcm_s32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s32_to_s24__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s32_to_s24__sse(dst, src, count, ditherMode); 
#else mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18228,8 +18181,8 @@ void mal_pcm_s32_to_f32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_s32_to_f32__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_s32_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode); } @@ -18239,13 +18192,9 @@ void mal_pcm_s32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_s32_to_f32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_s32_to_f32__sse(dst, src, count, ditherMode); #else mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18337,8 +18286,8 @@ void mal_pcm_f32_to_u8__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_f32_to_u8__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_f32_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode); } @@ -18348,13 +18297,9 @@ void mal_pcm_f32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_ { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_f32_to_u8__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_f32_to_u8__sse(dst, src, count, ditherMode); #else mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18392,13 +18337,144 @@ void mal_pcm_f32_to_s16__reference(void* dst, const void* src, mal_uint64 count, void mal_pcm_f32_to_s16__optimized(void* dst, const void* src, mal_uint64 count, 
mal_dither_mode ditherMode) { - mal_pcm_f32_to_s16__reference(dst, src, count, ditherMode); + mal_int16* dst_s16 = (mal_int16*)dst; + const float* src_f32 = (const float*)src; + + float ditherMin = 0; + float ditherMax = 0; + if (ditherMode != mal_dither_mode_none) { + ditherMin = 1.0f / -32768; + ditherMax = 1.0f / 32767; + } + + mal_uint64 i = 0; + + // Unrolled. + mal_uint64 count4 = count >> 2; + for (mal_uint64 i4 = 0; i4 < count4; i4 += 1) { + float d0 = mal_dither_f32(ditherMode, ditherMin, ditherMax); + float d1 = mal_dither_f32(ditherMode, ditherMin, ditherMax); + float d2 = mal_dither_f32(ditherMode, ditherMin, ditherMax); + float d3 = mal_dither_f32(ditherMode, ditherMin, ditherMax); + + float x0 = src_f32[i+0]; + float x1 = src_f32[i+1]; + float x2 = src_f32[i+2]; + float x3 = src_f32[i+3]; + + x0 = x0 + d0; + x1 = x1 + d1; + x2 = x2 + d2; + x3 = x3 + d3; + + x0 = ((x0 < -1) ? -1 : ((x0 > 1) ? 1 : x0)); + x1 = ((x1 < -1) ? -1 : ((x1 > 1) ? 1 : x1)); + x2 = ((x2 < -1) ? -1 : ((x2 > 1) ? 1 : x2)); + x3 = ((x3 < -1) ? -1 : ((x3 > 1) ? 1 : x3)); + + x0 = x0 * 32767.0f; + x1 = x1 * 32767.0f; + x2 = x2 * 32767.0f; + x3 = x3 * 32767.0f; + + dst_s16[i+0] = (mal_int16)x0; + dst_s16[i+1] = (mal_int16)x1; + dst_s16[i+2] = (mal_int16)x2; + dst_s16[i+3] = (mal_int16)x3; + + i += 4; + } + + // Leftover. + for (; i < count; i += 1) { + float x = src_f32[i]; + x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax); + x = ((x < -1) ? -1 : ((x > 1) ? 
1 : x)); // clip + x = x * 32767.0f; // -1..1 to -32767..32767 + + dst_s16[i] = (mal_int16)x; + } } -#ifdef MAL_USE_SSE -void mal_pcm_f32_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { +#if 1 + mal_int16* dst_s16 = (mal_int16*)dst; + const float* src_f32 = (const float*)src; + + float ditherMin = 0; + float ditherMax = 0; + if (ditherMode != mal_dither_mode_none) { + ditherMin = 1.0f / -32768; + ditherMax = 1.0f / 32767; + } + + mal_uint64 i = 0; + + // SSE2. SSE allows us to output 8 s16's at a time which means our loop is unrolled 8 times. + mal_uint64 count8 = count >> 3; + for (mal_uint64 i8 = 0; i8 < count8; i8 += 1) { + __m128 d0; + __m128 d1; + if (ditherMode == mal_dither_mode_none) { + d0 = _mm_set1_ps(0); + d1 = _mm_set1_ps(0); + } else if (ditherMode == mal_dither_mode_rectangle) { + d0 = _mm_set_ps( + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax) + ); + d1 = _mm_set_ps( + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax) + ); + } else { + d0 = _mm_set_ps( + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax) + ); + d1 = _mm_set_ps( + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax) + ); + } + + __m128 x0 = _mm_loadu_ps(src_f32 + i); // unaligned load: src is a caller-supplied pointer with no 16-byte alignment guarantee + __m128 x1 = _mm_loadu_ps(src_f32 + i + 4); // next four floats of this group of eight + + x0 = _mm_add_ps(x0, 
d0); + x1 = _mm_add_ps(x1, d1); + + x0 = _mm_mul_ps(_mm_min_ps(_mm_max_ps(x0, _mm_set1_ps(-1.0f)), _mm_set1_ps(1.0f)), _mm_set1_ps(32767.0f)); // clip to -1..1 like the scalar and leftover paths, then scale + x1 = _mm_mul_ps(_mm_min_ps(_mm_max_ps(x1, _mm_set1_ps(-1.0f)), _mm_set1_ps(1.0f)), _mm_set1_ps(32767.0f)); // without the clip a far out-of-range input converts to INT_MIN and packs to -32768 + + _mm_storeu_si128((__m128i*)(dst_s16 + i), _mm_packs_epi32(_mm_cvtps_epi32(x0), _mm_cvtps_epi32(x1))); // unaligned store: dst carries no 16-byte alignment guarantee + + i += 8; + } + + + // Leftover. + for (; i < count; i += 1) { + float x = src_f32[i]; + x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax); + x = ((x < -1) ? -1 : ((x > 1) ? 1 : x)); // clip + x = x * 32767.0f; // -1..1 to -32767..32767 + + dst_s16[i] = (mal_int16)x; + } +#else mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode); +#endif } #endif @@ -18406,13 +18482,9 @@ void mal_pcm_f32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_f32_to_s16__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_f32_to_s16__sse(dst, src, count, ditherMode); #else mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18450,8 +18522,8 @@ void mal_pcm_f32_to_s24__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_f32_to_s24__reference(dst, src, count, ditherMode); } -#ifdef MAL_USE_SSE -void mal_pcm_f32_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode); } @@ -18461,13 +18533,9 @@ void mal_pcm_f32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_f32_to_s24__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_f32_to_s24__sse(dst, src, count, ditherMode); #else mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -18502,8 +18570,8 @@ void mal_pcm_f32_to_s32__optimized(void* dst, const void* src, mal_uint64 count, mal_pcm_f32_to_s32__reference(dst, src, count, ditherMode); } -#ifdef 
MAL_USE_SSE -void mal_pcm_f32_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +#if defined(MAL_SUPPORT_SSE2) +void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode); } @@ -18513,13 +18581,9 @@ void mal_pcm_f32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_f32_to_s32__reference(dst, src, count, ditherMode); -#else -#ifdef MAL_USE_SSE - mal_pcm_f32_to_s32__sse(dst, src, count, ditherMode); #else mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode); #endif -#endif } diff --git a/tests/mal_profiling.c b/tests/mal_profiling.c index ae6c6f90..fa9e89a8 100644 --- a/tests/mal_profiling.c +++ b/tests/mal_profiling.c @@ -34,6 +34,429 @@ const char* mal_src_algorithm_to_string(mal_src_algorithm algorithm) return "Unknown"; } +const char* mal_dither_mode_to_string(mal_dither_mode ditherMode) +{ + switch (ditherMode) { + case mal_dither_mode_none: return "None"; + case mal_dither_mode_rectangle: return "Rectangle"; + case mal_dither_mode_triangle: return "Triangle"; + } + + return "Unknown"; // value outside the three cases above; same fallback text as mal_src_algorithm_to_string() +} + + + + +/////////////////////////////////////////////////////////////////////////////// +// +// Format Conversion +// +/////////////////////////////////////////////////////////////////////////////// +typedef struct +{ + void* pBaseData; + mal_uint64 sampleCount; + mal_uint64 iNextSample; +} format_conversion_data; + +void pcm_convert__reference(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) +{ + switch (formatIn) + { + case mal_format_u8: + { + switch (formatOut) + { + case mal_format_s16: mal_pcm_u8_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_u8_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: 
mal_pcm_u8_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_u8_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s16: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s16_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s16_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s16_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s16_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s24: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s24_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s24_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s24_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s24_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s32_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s32_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s32_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s32_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_f32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_f32_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_f32_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_f32_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return; + case 
mal_format_s32: mal_pcm_f32_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + default: break; + } +} + +void pcm_convert__optimized(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) +{ + switch (formatIn) + { + case mal_format_u8: + { + switch (formatOut) + { + case mal_format_s16: mal_pcm_u8_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_u8_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_u8_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_u8_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s16: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s16_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s16_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s16_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s16_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s24: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s24_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s24_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s24_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s24_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s32_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s32_to_s16__optimized(pOut, pIn, sampleCount, 
ditherMode); return; + case mal_format_s24: mal_pcm_s32_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s32_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_f32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_f32_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_f32_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_f32_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_f32_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + default: break; + } +} + +#if defined(MAL_SUPPORT_SSE2) +void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) +{ + switch (formatIn) + { + case mal_format_u8: + { + switch (formatOut) + { + case mal_format_s16: mal_pcm_u8_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_u8_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_u8_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_u8_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s16: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s16_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s16_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s16_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s16_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s24: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s24_to_u8__sse2( pOut, pIn, sampleCount, 
ditherMode); return; + case mal_format_s16: mal_pcm_s24_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s24_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s24_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s32_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s32_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s32_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s32_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_f32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_f32_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_f32_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_f32_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_f32_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + default: break; + } +} +#endif + +#if defined(MAL_SUPPORT_AVX) +void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) +{ + pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); +} +#endif + +#if defined(MAL_SUPPORT_AVX512) +void pcm_convert__avx512(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) +{ + pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); +} +#endif + +#if defined(MAL_SUPPORT_NEON) +void pcm_convert__neon(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode 
ditherMode) +{ + pcm_convert__reference(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); +} +#endif + +void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode, simd_mode mode) +{ + // For testing, we always reset the seed for dithering so we can get consistent results for comparisons. + mal_seed(1234); + + switch (mode) + { + case simd_mode_scalar: + { + pcm_convert__optimized(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + } break; + +#if defined(MAL_SUPPORT_SSE2) + case simd_mode_sse2: + { + pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + } break; +#endif + +#if defined(MAL_SUPPORT_AVX) + case simd_mode_avx: + { + pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + } break; +#endif + +#if defined(MAL_SUPPORT_AVX512) + case simd_mode_avx512: + { + pcm_convert__avx512(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + } break; +#endif + +#if defined(MAL_SUPPORT_NEON) + case simd_mode_neon: + { + pcm_convert__neon(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + } break; +#endif + } +} + + +int do_profiling__format_conversion__profile_individual(mal_format formatIn, mal_format formatOut, mal_dither_mode ditherMode, const void* pBaseData, mal_uint64 sampleCount, simd_mode mode, const void* pReferenceData, double referenceTime) +{ + void* pTestData = mal_aligned_malloc((size_t)(sampleCount * mal_get_bytes_per_sample(formatOut)), MAL_SIMD_ALIGNMENT); + if (pTestData == NULL) { + printf("Out of memory.\n"); + return -1; + } + + mal_timer timer; + mal_timer_init(&timer); + double timeTaken = mal_timer_get_time_in_seconds(&timer); + { + pcm_convert(pTestData, formatOut, pBaseData, formatIn, sampleCount, ditherMode, mode); + } + timeTaken = mal_timer_get_time_in_seconds(&timer) - timeTaken; + + + // Compare with the reference for correctness. 
+ mal_bool32 passed = MAL_TRUE; + for (mal_uint64 iSample = 0; iSample < sampleCount; ++iSample) { + mal_uint32 bps = mal_get_bytes_per_sample(formatOut); + + // We need to compare on a format by format basis because we allow for very slight deviations in results depending on the output format. + switch (formatOut) + { + case mal_format_s16: + { + mal_int16 a = ((const mal_int16*)pReferenceData)[iSample]; + mal_int16 b = ((const mal_int16*)pTestData)[iSample]; + if (abs(a-b) > 1) { + printf("Incorrect Sample: (%d) %d != %d\n", (int)iSample, a, b); + passed = MAL_FALSE; + } + } break; + + default: + { + if (memcmp(mal_offset_ptr(pReferenceData, iSample*bps), mal_offset_ptr(pTestData, iSample*bps), bps) != 0) { + printf("Incorrect Sample: (%d)\n", (int)iSample); + passed = MAL_FALSE; + } + } break; + } + } + + if (passed) { + printf(" [PASSED] "); + } else { + printf(" [FAILED] "); + } + printf("(Dither = %s) %s -> %s (%s): %.4fms (%.2f%%)\n", mal_dither_mode_to_string(ditherMode), mal_get_format_name(formatIn), mal_get_format_name(formatOut), simd_mode_to_string(mode), timeTaken*1000, referenceTime/timeTaken*100); + + mal_aligned_free(pTestData); + return 0; +} + +int do_profiling__format_conversion__profile_set(mal_format formatIn, mal_format formatOut, mal_dither_mode ditherMode) +{ + // Generate our base data to begin with. This is generated from an f32 sine wave which is converted to formatIn. That then becomes our base data. 
+ mal_uint32 sampleCount = 1000000; + + float* pSourceData = (float*)mal_aligned_malloc(sampleCount*sizeof(*pSourceData), MAL_SIMD_ALIGNMENT); + if (pSourceData == NULL) { + printf("Out of memory.\n"); + return -1; + } + + mal_sine_wave sineWave; + mal_sine_wave_init(1.0, 400, 48000, &sineWave); + mal_sine_wave_read(&sineWave, sampleCount, pSourceData); + + void* pBaseData = mal_aligned_malloc(sampleCount * mal_get_bytes_per_sample(formatIn), MAL_SIMD_ALIGNMENT); + mal_pcm_convert(pBaseData, formatIn, pSourceData, mal_format_f32, sampleCount, mal_dither_mode_none); + + + // Reference first so we can get a benchmark. + void* pReferenceData = mal_aligned_malloc(sampleCount * mal_get_bytes_per_sample(formatOut), MAL_SIMD_ALIGNMENT); + mal_timer timer; + mal_timer_init(&timer); + double referenceTime = mal_timer_get_time_in_seconds(&timer); + { + pcm_convert__reference(pReferenceData, formatOut, pBaseData, formatIn, sampleCount, ditherMode); + } + referenceTime = mal_timer_get_time_in_seconds(&timer) - referenceTime; + + + // Here is where each optimized implementation is profiled. 
+ do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_scalar, pReferenceData, referenceTime); + + if (mal_has_sse2()) { + do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_sse2, pReferenceData, referenceTime); + } + if (mal_has_avx()) { + do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx, pReferenceData, referenceTime); + } + if (mal_has_avx512f()) { + do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx512, pReferenceData, referenceTime); + } + if (mal_has_neon()) { + do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_neon, pReferenceData, referenceTime); + } + + + + mal_aligned_free(pReferenceData); + mal_aligned_free(pBaseData); + mal_aligned_free(pSourceData); + return 0; +} + +int do_profiling__format_conversion() +{ + // First we need to generate our base data. + + + do_profiling__format_conversion__profile_set(mal_format_f32, mal_format_s16, mal_dither_mode_none); + + return 0; +} + + + +/////////////////////////////////////////////////////////////////////////////// +// +// Channel Routing +// +/////////////////////////////////////////////////////////////////////////////// float g_ChannelRouterProfilingOutputBenchmark[8][48000]; float g_ChannelRouterProfilingOutput[8][48000]; @@ -416,6 +839,7 @@ int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn, // Now that we have the reference data to compare against we can go ahead and measure the SIMD optimizations. 
// This helper uses SSE2 intrinsics, so it is only compiled when SSE2 support is available. The file also
// carries MAL_SUPPORT_NEON code paths, which means it has to build on targets without these intrinsics.
#if defined(MAL_SUPPORT_SSE2)
// Converts two 4xf32 vectors to one 8xi16 vector with signed saturation.
//
// Each input is first converted to 4 x i32 with _mm_cvtps_epi32(), then both are packed into a single vector
// with _mm_packs_epi32(); f32_0 is passed as the first pack operand and f32_1 as the second.
static inline __m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1)
{
    __m128i i32_0 = _mm_cvtps_epi32(f32_0);
    __m128i i32_1 = _mm_cvtps_epi32(f32_1);
    return _mm_packs_epi32(i32_0, i32_1);
}
#endif
do_profiling__channel_routing(); printf("\n\n"); diff --git a/tests/mal_test_0.vcxproj b/tests/mal_test_0.vcxproj index 3aee3154..173ab933 100644 --- a/tests/mal_test_0.vcxproj +++ b/tests/mal_test_0.vcxproj @@ -141,7 +141,7 @@ %(AdditionalIncludeDirectories) MultiThreadedDebug Default - NoExtensions + NotSet Console @@ -162,7 +162,7 @@ %(AdditionalIncludeDirectories) MultiThreadedDebug Default - NoExtensions + NotSet Console @@ -183,6 +183,7 @@ %(AdditionalIncludeDirectories) MultiThreadedDebug Default + NotSet Console @@ -202,7 +203,7 @@ true %(AdditionalIncludeDirectories) Default - NoExtensions + NotSet Console @@ -226,7 +227,7 @@ true %(AdditionalIncludeDirectories) Default - NoExtensions + NotSet Console @@ -250,6 +251,7 @@ true %(AdditionalIncludeDirectories) Default + NotSet Console @@ -269,21 +271,21 @@ true - true - true - true - true - true - true - - false - false false + false false false false + + true + true + true + true + true + true + true true