diff --git a/mini_al.h b/mini_al.h index 2d3088e1..934b5ab7 100644 --- a/mini_al.h +++ b/mini_al.h @@ -17288,6 +17288,24 @@ void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_u8_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_u8_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_u8_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_u8_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17328,18 +17346,32 @@ void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_u8_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_u8_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_u8_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_u8_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { #ifdef MAL_USE_REFERENCE_CONVERSION_APIS mal_pcm_u8_to_s24__reference(dst, src, count, 
ditherMode); -#else -#if defined(MAL_SUPPORT_SSE2) - mal_pcm_u8_to_s24__sse2(dst, src, count, ditherMode); #else mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode); #endif -#endif } @@ -17370,6 +17402,24 @@ void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_u8_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_u8_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_u8_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_u8_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17409,6 +17459,24 @@ void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_u8_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_u8_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_u8_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_u8_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ 
-17543,6 +17611,24 @@ void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s16_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s16_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s16_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17587,6 +17673,24 @@ void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s16_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s16_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s16_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s16_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17622,6 +17726,24 @@ void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode); } #endif +#if 
defined(MAL_SUPPORT_AVX) +void mal_pcm_s16_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s16_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s16_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s16_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17669,6 +17791,24 @@ void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s16_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s16_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s16_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s16_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17781,6 +17921,24 @@ void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s24_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_u8__optimized(dst, src, 
count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s24_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s24_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s24_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17834,6 +17992,24 @@ void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s24_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s24_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s24_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s24_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17877,6 +18053,24 @@ void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s24_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s24_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + 
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s24_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s24_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -17924,6 +18118,24 @@ void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s24_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s24_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s24_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s24_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -18043,6 +18255,24 @@ void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s32_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s32_to_u8__neon(void* dst, const void* src, mal_uint64 count, 
mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -18096,6 +18326,24 @@ void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s32_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -18134,6 +18382,24 @@ void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s32_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s32_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s32_to_s24(void* dst, const void* src, mal_uint64 count, 
mal_dither_mode ditherMode) { @@ -18187,6 +18453,24 @@ void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_s32_to_f32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_s32_to_f32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_s32_to_f32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_s32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -18292,6 +18576,24 @@ void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_d mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_f32_to_u8__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_f32_to_u8__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_f32_to_u8__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_f32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -18399,7 +18701,6 @@ void mal_pcm_f32_to_s16__optimized(void* dst, const void* src, mal_uint64 count, #if defined(MAL_SUPPORT_SSE2) void 
mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { -#if 1 mal_int16* dst_s16 = (mal_int16*)dst; const float* src_f32 = (const float*)src; @@ -18457,7 +18758,7 @@ void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_ x0 = _mm_mul_ps(x0, _mm_set1_ps(32767.0f)); x1 = _mm_mul_ps(x1, _mm_set1_ps(32767.0f)); - *((__m128i*)(dst_s16 + i)) = _mm_packs_epi32(_mm_cvtps_epi32(x0), _mm_cvtps_epi32(x1)); + *((__m128i*)(dst_s16 + i)) = _mm_packs_epi32(_mm_cvttps_epi32(x0), _mm_cvttps_epi32(x1)); i += 8; } @@ -18472,9 +18773,118 @@ void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_ dst_s16[i] = (mal_int16)x; } -#else - mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode); +} #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_f32_to_s16__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_int16* dst_s16 = (mal_int16*)dst; + const float* src_f32 = (const float*)src; + + float ditherMin = 0; + float ditherMax = 0; + if (ditherMode != mal_dither_mode_none) { + ditherMin = 1.0f / -32768; + ditherMax = 1.0f / 32767; + } + + mal_uint64 i = 0; + + // AVX. AVX allows us to output 16 s16's at a time which means our loop is unrolled 16 times. 
+ mal_uint64 count16 = count >> 4; + for (mal_uint64 i16 = 0; i16 < count16; i16 += 1) { + __m256 d0; + __m256 d1; + if (ditherMode == mal_dither_mode_none) { + d0 = _mm256_set1_ps(0); + d1 = _mm256_set1_ps(0); + } else if (ditherMode == mal_dither_mode_rectangle) { + d0 = _mm256_set_ps( + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax) + ); + d1 = _mm256_set_ps( + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax), + mal_dither_f32_rectangle(ditherMin, ditherMax) + ); + } else { + d0 = _mm256_set_ps( + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax) + ); + d1 = _mm256_set_ps( + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax), + mal_dither_f32_triangle(ditherMin, ditherMax) + ); + } + + __m256 x0 = 
_mm256_loadu_ps(src_f32 + i + 0); + __m256 x1 = _mm256_loadu_ps(src_f32 + i + 8); // unaligned loads: src is not checked for 32-byte alignment + + x0 = _mm256_add_ps(x0, d0); + x1 = _mm256_add_ps(x1, d1); + + x0 = _mm256_mul_ps(x0, _mm256_set1_ps(32767.0f)); + x1 = _mm256_mul_ps(x1, _mm256_set1_ps(32767.0f)); + + // The 256-bit pack works per 128-bit lane, so regroup first: p0 = low halves of i0/i1, p1 = high halves. NOTE(review): permute2x128_si256 and the 256-bit packs_epi32 are AVX2 instructions; confirm MAL_SUPPORT_AVX implies AVX2. + __m256i i0 = _mm256_cvttps_epi32(x0); + __m256i i1 = _mm256_cvttps_epi32(x1); + __m256i p0 = _mm256_permute2x128_si256(i0, i1, 32); + __m256i p1 = _mm256_permute2x128_si256(i0, i1, 49); + __m256i r = _mm256_packs_epi32(p0, p1); + + _mm256_storeu_si256((__m256i*)(dst_s16 + i), r); // unaligned store: dst is not checked for 32-byte alignment + i += 16; + } + + + // Leftover samples (fewer than 16), converted one at a time. + for (; i < count; i += 1) { + float x = src_f32[i]; + x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax); + x = ((x < -1) ? -1 : ((x > 1) ? 1 : x)); // clip + x = x * 32767.0f; // -1..1 to -32767..32767 + + dst_s16[i] = (mal_int16)x; + } +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_f32_to_s16__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + // TODO: Convert this from AVX to AVX-512. 
+ mal_pcm_f32_to_s16__avx(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_f32_to_s16__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode); } #endif @@ -18528,6 +18938,24 @@ void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_f32_to_s24__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_f32_to_s24__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_f32_to_s24__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode); +} +#endif void mal_pcm_f32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { @@ -18576,6 +19004,24 @@ void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_ mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode); } #endif +#if defined(MAL_SUPPORT_AVX) +void mal_pcm_f32_to_s32__avx(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_AVX512) +void mal_pcm_f32_to_s32__avx512(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode); +} +#endif +#if defined(MAL_SUPPORT_NEON) +void mal_pcm_f32_to_s32__neon(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) +{ + mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode); +} 
+#endif void mal_pcm_f32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode) { diff --git a/tests/mal_profiling.c b/tests/mal_profiling.c index fa9e89a8..ab95bdf6 100644 --- a/tests/mal_profiling.c +++ b/tests/mal_profiling.c @@ -269,21 +269,210 @@ void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_fo #if defined(MAL_SUPPORT_AVX) void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) { - pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + switch (formatIn) + { + case mal_format_u8: + { + switch (formatOut) + { + case mal_format_s16: mal_pcm_u8_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_u8_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_u8_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_u8_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s16: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s16_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s16_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s16_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s16_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s24: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s24_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s24_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s24_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s24_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + 
case mal_format_s32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s32_to_f32__avx(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_f32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_f32_to_u8__avx( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_f32_to_s16__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_f32_to_s24__avx(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_f32_to_s32__avx(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + default: break; + } } #endif #if defined(MAL_SUPPORT_AVX512) void pcm_convert__avx512(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) { - pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + switch (formatIn) + { + case mal_format_u8: + { + switch (formatOut) + { + case mal_format_s16: mal_pcm_u8_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_u8_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_u8_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_u8_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s16: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s16_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s16_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s16_to_s32__avx512(pOut, pIn, sampleCount, 
ditherMode); return; + case mal_format_f32: mal_pcm_s16_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s24: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s24_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s24_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s24_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s24_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s32_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s32_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s32_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s32_to_f32__avx512(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_f32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_f32_to_u8__avx512( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_f32_to_s16__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_f32_to_s24__avx512(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_f32_to_s32__avx512(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + default: break; + } } #endif #if defined(MAL_SUPPORT_NEON) void pcm_convert__neon(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode) { - pcm_convert__reference(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); + switch (formatIn) + { + case mal_format_u8: + { + switch (formatOut) + { + case mal_format_s16: mal_pcm_u8_to_s16__neon(pOut, pIn, sampleCount, 
ditherMode); return; + case mal_format_s24: mal_pcm_u8_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_u8_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_u8_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s16: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s16_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s16_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s16_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s16_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s24: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s24_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s24_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_s24_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s24_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_s32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_s32_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_s32_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_s32_to_s24__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_f32: mal_pcm_s32_to_f32__neon(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + case mal_format_f32: + { + switch (formatOut) + { + case mal_format_u8: mal_pcm_f32_to_u8__neon( pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s16: mal_pcm_f32_to_s16__neon(pOut, pIn, sampleCount, ditherMode); return; + case mal_format_s24: mal_pcm_f32_to_s24__neon(pOut, 
pIn, sampleCount, ditherMode); return; + case mal_format_s32: mal_pcm_f32_to_s32__neon(pOut, pIn, sampleCount, ditherMode); return; + default: break; + } + } break; + + default: break; + } } #endif @@ -359,7 +548,7 @@ int do_profiling__format_conversion__profile_individual(mal_format formatIn, mal { mal_int16 a = ((const mal_int16*)pReferenceData)[iSample]; mal_int16 b = ((const mal_int16*)pTestData)[iSample]; - if (abs(a-b) > 1) { + if (abs(a-b) > 0) { printf("Incorrect Sample: (%d) %d != %d\n", (int)iSample, a, b); passed = MAL_FALSE; } @@ -900,9 +1089,19 @@ int do_profiling__src() // Converts two 4xf32 vectors to one 8xi16 vector with signed saturation. -static inline __m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1) +__m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1) { - return _mm_packs_epi32(_mm_cvtps_epi32(f32_0), _mm_cvtps_epi32(f32_1)); + return _mm_packs_epi32(_mm_cvttps_epi32(f32_0), _mm_cvttps_epi32(f32_1)); +} + +__m256i drmath_vf32_to_vi16__avx(__m256 f32_0, __m256 f32_1) +{ + __m256i i0 = _mm256_cvttps_epi32(f32_0); + __m256i i1 = _mm256_cvttps_epi32(f32_1); + __m256i p0 = _mm256_permute2x128_si256(i0, i1, 32); + __m256i p1 = _mm256_permute2x128_si256(i0, i1, 49); + __m256i r = _mm256_packs_epi32(p0, p1); + return r; } int main(int argc, char** argv) @@ -916,7 +1115,11 @@ int main(int argc, char** argv) //__m128 f1 = _mm_set_ps(-32780, 6, 5, 4); //__m128i r = drmath_vf32_to_vi16__sse2(f0, f1); - //int a = 5; + __m256 f0 = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0); + __m256 f1 = _mm256_set_ps(15, 14, 13, 12, 11, 10, 9, 8); + __m256i r = drmath_vf32_to_vi16__avx(f0, f1); + + int a = 5; }