diff --git a/mini_al.h b/mini_al.h
index 10b5c361..2d3088e1 100644
--- a/mini_al.h
+++ b/mini_al.h
@@ -3306,16 +3306,25 @@ static MAL_INLINE mal_int32 mal_rand_range_s32(mal_int32 lo, mal_int32 hi)
}
+static MAL_INLINE float mal_dither_f32_rectangle(float ditherMin, float ditherMax)
+{
+ return mal_rand_range_f32(ditherMin, ditherMax); // rectangle dither: a single mal_rand_range_f32 draw over the caller's bounds
+}
+
+static MAL_INLINE float mal_dither_f32_triangle(float ditherMin, float ditherMax)
+{
+    const float belowZero = mal_rand_range_f32(ditherMin, 0);  // first draw:  ditherMin..0
+    const float aboveZero = mal_rand_range_f32(0, ditherMax);  // second draw: 0..ditherMax
+    return belowZero + aboveZero;                              // triangle dither is the sum of the two draws
+}
+
static MAL_INLINE float mal_dither_f32(mal_dither_mode ditherMode, float ditherMin, float ditherMax)
{
if (ditherMode == mal_dither_mode_rectangle) {
- float a = mal_rand_range_f32(ditherMin, ditherMax);
- return a;
+ return mal_dither_f32_rectangle(ditherMin, ditherMax);
}
if (ditherMode == mal_dither_mode_triangle) {
- float a = mal_rand_range_f32(ditherMin, 0);
- float b = mal_rand_range_f32(0, ditherMax);
- return a + b;
+ return mal_dither_f32_triangle(ditherMin, ditherMax);
}
return 0;
@@ -17273,8 +17282,8 @@ void mal_pcm_u8_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_s16__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_u8_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_u8_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
}
@@ -17284,13 +17293,9 @@ void mal_pcm_u8_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s16__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_u8_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s16__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17317,8 +17322,8 @@ void mal_pcm_u8_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_u8_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_u8_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -17329,8 +17334,8 @@ void mal_pcm_u8_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither_
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s24__reference(dst, src, count, ditherMode);
#else
-#ifdef MAL_USE_SSE
- mal_pcm_u8_to_s24__sse(dst, src, count, ditherMode);
+#if defined(MAL_SUPPORT_SSE2)
+ mal_pcm_u8_to_s24__sse2(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s24__optimized(dst, src, count, ditherMode);
#endif
@@ -17359,8 +17364,8 @@ void mal_pcm_u8_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_s32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_u8_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_u8_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -17370,13 +17375,9 @@ void mal_pcm_u8_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_s32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_u8_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_s32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17402,8 +17403,8 @@ void mal_pcm_u8_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_u8_to_f32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_u8_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_u8_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -17413,13 +17414,9 @@ void mal_pcm_u8_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_u8_to_f32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_u8_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_u8_to_f32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17540,8 +17537,8 @@ void mal_pcm_s16_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_u8__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s16_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s16_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -17551,13 +17548,9 @@ void mal_pcm_s16_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_u8__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s16_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_u8__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17588,8 +17581,8 @@ void mal_pcm_s16_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_s24__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s16_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s16_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -17599,13 +17592,9 @@ void mal_pcm_s16_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_s24__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s16_to_s24__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_s24__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17627,8 +17616,8 @@ void mal_pcm_s16_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_s32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s16_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s16_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -17638,13 +17627,9 @@ void mal_pcm_s16_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_s32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s16_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_s32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17678,8 +17663,8 @@ void mal_pcm_s16_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s16_to_f32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s16_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s16_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -17689,13 +17674,9 @@ void mal_pcm_s16_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s16_to_f32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s16_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s16_to_f32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17794,8 +17775,8 @@ void mal_pcm_s24_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_u8__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s24_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s24_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -17805,13 +17786,9 @@ void mal_pcm_s24_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_u8__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s24_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_u8__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17851,8 +17828,8 @@ void mal_pcm_s24_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_s16__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s24_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s24_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
}
@@ -17862,13 +17839,9 @@ void mal_pcm_s24_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_s16__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s24_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_s16__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17898,8 +17871,8 @@ void mal_pcm_s24_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_s32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s24_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s24_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -17909,13 +17882,9 @@ void mal_pcm_s24_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_s32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s24_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_s32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -17949,8 +17918,8 @@ void mal_pcm_s24_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s24_to_f32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s24_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s24_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -17960,13 +17929,9 @@ void mal_pcm_s24_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s24_to_f32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s24_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s24_to_f32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18072,8 +18037,8 @@ void mal_pcm_s32_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_u8__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s32_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -18083,13 +18048,9 @@ void mal_pcm_s32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_u8__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s32_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_u8__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18129,8 +18090,8 @@ void mal_pcm_s32_to_s16__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_s16__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s32_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
}
@@ -18140,13 +18101,9 @@ void mal_pcm_s32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_s16__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s32_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_s16__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18171,8 +18128,8 @@ void mal_pcm_s32_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_s24__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s32_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -18182,13 +18139,9 @@ void mal_pcm_s32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_s24__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s32_to_s24__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_s24__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18228,8 +18181,8 @@ void mal_pcm_s32_to_f32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_s32_to_f32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_s32_to_f32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_s32_to_f32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
}
@@ -18239,13 +18192,9 @@ void mal_pcm_s32_to_f32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_s32_to_f32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_s32_to_f32__sse(dst, src, count, ditherMode);
#else
mal_pcm_s32_to_f32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18337,8 +18286,8 @@ void mal_pcm_f32_to_u8__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_f32_to_u8__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_f32_to_u8__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_f32_to_u8__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
}
@@ -18348,13 +18297,9 @@ void mal_pcm_f32_to_u8(void* dst, const void* src, mal_uint64 count, mal_dither_
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_u8__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_f32_to_u8__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_u8__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18392,13 +18337,144 @@ void mal_pcm_f32_to_s16__reference(void* dst, const void* src, mal_uint64 count,
void mal_pcm_f32_to_s16__optimized(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
- mal_pcm_f32_to_s16__reference(dst, src, count, ditherMode);
+ mal_int16* dst_s16 = (mal_int16*)dst;
+ const float* src_f32 = (const float*)src;
+ // Scalar f32 -> s16 conversion. Per sample: add dither, clip to -1..1, scale by 32767, cast to mal_int16.
+ float ditherMin = 0;
+ float ditherMax = 0;
+ if (ditherMode != mal_dither_mode_none) {
+ ditherMin = 1.0f / -32768; // dither bounds are about one s16 step either side of zero
+ ditherMax = 1.0f / 32767;
+ }
+
+ mal_uint64 i = 0; // sample index; carried over from the unrolled loop into the leftover loop
+
+ // Unrolled. Runs count/4 times; i4 only counts groups of four, i indexes the first sample of the group.
+ mal_uint64 count4 = count >> 2;
+ for (mal_uint64 i4 = 0; i4 < count4; i4 += 1) {
+ float d0 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
+ float d1 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
+ float d2 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
+ float d3 = mal_dither_f32(ditherMode, ditherMin, ditherMax);
+ // Load four consecutive input samples.
+ float x0 = src_f32[i+0];
+ float x1 = src_f32[i+1];
+ float x2 = src_f32[i+2];
+ float x3 = src_f32[i+3];
+ // Add the dither drawn above.
+ x0 = x0 + d0;
+ x1 = x1 + d1;
+ x2 = x2 + d2;
+ x3 = x3 + d3;
+ // Clip to -1..1.
+ x0 = ((x0 < -1) ? -1 : ((x0 > 1) ? 1 : x0));
+ x1 = ((x1 < -1) ? -1 : ((x1 > 1) ? 1 : x1));
+ x2 = ((x2 < -1) ? -1 : ((x2 > 1) ? 1 : x2));
+ x3 = ((x3 < -1) ? -1 : ((x3 > 1) ? 1 : x3));
+ // Scale -1..1 to -32767..32767.
+ x0 = x0 * 32767.0f;
+ x1 = x1 * 32767.0f;
+ x2 = x2 * 32767.0f;
+ x3 = x3 * 32767.0f;
+ // Cast to integer; the C float-to-integer conversion discards the fractional part.
+ dst_s16[i+0] = (mal_int16)x0;
+ dst_s16[i+1] = (mal_int16)x1;
+ dst_s16[i+2] = (mal_int16)x2;
+ dst_s16[i+3] = (mal_int16)x3;
+
+ i += 4;
+ }
+
+ // Leftover. The remaining count % 4 samples, same steps one sample at a time.
+ for (; i < count; i += 1) {
+ float x = src_f32[i];
+ x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax);
+ x = ((x < -1) ? -1 : ((x > 1) ? 1 : x)); // clip
+ x = x * 32767.0f; // -1..1 to -32767..32767
+
+ dst_s16[i] = (mal_int16)x;
+ }
}
-#ifdef MAL_USE_SSE
-void mal_pcm_f32_to_s16__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+// SSE2 f32 -> s16 conversion. Converts 8 samples per iteration, then finishes the remaining
+// count % 8 samples with the scalar steps.
+//
+// dst        - output buffer receiving `count` s16 samples.
+// src        - input buffer holding `count` f32 samples.
+// count      - number of samples to convert.
+// ditherMode - mal_dither_mode_none adds nothing; the other modes add a random offset to each sample.
+//
+// Neither buffer needs to be 16-byte aligned: the unaligned load/store intrinsics are used. Samples
+// are clipped to -1..1 before scaling, exactly like the scalar loops, so the output stays within
+// -32767..32767 and far out-of-range input cannot hit the 0x80000000 "invalid" result of
+// _mm_cvtps_epi32(). Results can still differ from the scalar path by one step because
+// _mm_cvtps_epi32() rounds using the current MXCSR rounding mode whereas the scalar cast truncates.
+void mal_pcm_f32_to_s16__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
+#if 1   // Set to 0 to fall back to the scalar implementation.
+    mal_int16* dst_s16 = (mal_int16*)dst;
+    const float* src_f32 = (const float*)src;
+
+    float ditherMin = 0;
+    float ditherMax = 0;
+    if (ditherMode != mal_dither_mode_none) {
+        ditherMin = 1.0f / -32768;
+        ditherMax = 1.0f / 32767;
+    }
+
+    mal_uint64 i = 0;
+
+    // Loop-invariant vectors: clip bounds and the s16 scale factor.
+    const __m128 clipLo = _mm_set1_ps(-1.0f);
+    const __m128 clipHi = _mm_set1_ps( 1.0f);
+    const __m128 scale  = _mm_set1_ps(32767.0f);
+
+    // SSE2. One 128-bit store holds 8 s16's, so each iteration converts 8 samples.
+    mal_uint64 count8 = count >> 3;
+    for (mal_uint64 i8 = 0; i8 < count8; i8 += 1) {
+        // Eight dither values, four per vector. _mm_set_ps() takes its arguments highest lane first and
+        // C does not specify argument evaluation order, so the random sequence is not guaranteed to
+        // land on the samples in the same order as in the scalar path.
+        __m128 d0;
+        __m128 d1;
+        if (ditherMode == mal_dither_mode_none) {
+            d0 = _mm_set1_ps(0);
+            d1 = _mm_set1_ps(0);
+        } else if (ditherMode == mal_dither_mode_rectangle) {
+            d0 = _mm_set_ps(
+                mal_dither_f32_rectangle(ditherMin, ditherMax),
+                mal_dither_f32_rectangle(ditherMin, ditherMax),
+                mal_dither_f32_rectangle(ditherMin, ditherMax),
+                mal_dither_f32_rectangle(ditherMin, ditherMax)
+            );
+            d1 = _mm_set_ps(
+                mal_dither_f32_rectangle(ditherMin, ditherMax),
+                mal_dither_f32_rectangle(ditherMin, ditherMax),
+                mal_dither_f32_rectangle(ditherMin, ditherMax),
+                mal_dither_f32_rectangle(ditherMin, ditherMax)
+            );
+        } else {
+            d0 = _mm_set_ps(
+                mal_dither_f32_triangle(ditherMin, ditherMax),
+                mal_dither_f32_triangle(ditherMin, ditherMax),
+                mal_dither_f32_triangle(ditherMin, ditherMax),
+                mal_dither_f32_triangle(ditherMin, ditherMax)
+            );
+            d1 = _mm_set_ps(
+                mal_dither_f32_triangle(ditherMin, ditherMax),
+                mal_dither_f32_triangle(ditherMin, ditherMax),
+                mal_dither_f32_triangle(ditherMin, ditherMax),
+                mal_dither_f32_triangle(ditherMin, ditherMax)
+            );
+        }
+
+        // Unaligned loads: the caller's buffer carries no alignment guarantee, and this also avoids
+        // casting away const on src.
+        __m128 x0 = _mm_loadu_ps(src_f32 + i + 0);
+        __m128 x1 = _mm_loadu_ps(src_f32 + i + 4);
+
+        x0 = _mm_add_ps(x0, d0);
+        x1 = _mm_add_ps(x1, d1);
+
+        // Clip to -1..1, matching the scalar loops.
+        x0 = _mm_max_ps(_mm_min_ps(x0, clipHi), clipLo);
+        x1 = _mm_max_ps(_mm_min_ps(x1, clipHi), clipLo);
+
+        // -1..1 to -32767..32767
+        x0 = _mm_mul_ps(x0, scale);
+        x1 = _mm_mul_ps(x1, scale);
+
+        // Convert both halves to s32, pack them into 8 s16's and store without an alignment requirement.
+        _mm_storeu_si128((__m128i*)(dst_s16 + i), _mm_packs_epi32(_mm_cvtps_epi32(x0), _mm_cvtps_epi32(x1)));
+
+        i += 8;
+    }
+
+
+    // Leftover.
+    for (; i < count; i += 1) {
+        float x = src_f32[i];
+        x = x + mal_dither_f32(ditherMode, ditherMin, ditherMax);
+        x = ((x < -1) ? -1 : ((x > 1) ? 1 : x));    // clip
+        x = x * 32767.0f;                           // -1..1 to -32767..32767
+
+        dst_s16[i] = (mal_int16)x;
+    }
+#else
mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
+#endif
}
#endif
@@ -18406,13 +18482,9 @@ void mal_pcm_f32_to_s16(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_s16__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_f32_to_s16__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_s16__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18450,8 +18522,8 @@ void mal_pcm_f32_to_s24__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_f32_to_s24__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_f32_to_s24__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_f32_to_s24__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
}
@@ -18461,13 +18533,9 @@ void mal_pcm_f32_to_s24(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_s24__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_f32_to_s24__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_s24__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
@@ -18502,8 +18570,8 @@ void mal_pcm_f32_to_s32__optimized(void* dst, const void* src, mal_uint64 count,
mal_pcm_f32_to_s32__reference(dst, src, count, ditherMode);
}
-#ifdef MAL_USE_SSE
-void mal_pcm_f32_to_s32__sse(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
+#if defined(MAL_SUPPORT_SSE2)
+void mal_pcm_f32_to_s32__sse2(void* dst, const void* src, mal_uint64 count, mal_dither_mode ditherMode)
{
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
}
@@ -18513,13 +18581,9 @@ void mal_pcm_f32_to_s32(void* dst, const void* src, mal_uint64 count, mal_dither
{
#ifdef MAL_USE_REFERENCE_CONVERSION_APIS
mal_pcm_f32_to_s32__reference(dst, src, count, ditherMode);
-#else
-#ifdef MAL_USE_SSE
- mal_pcm_f32_to_s32__sse(dst, src, count, ditherMode);
#else
mal_pcm_f32_to_s32__optimized(dst, src, count, ditherMode);
#endif
-#endif
}
diff --git a/tests/mal_profiling.c b/tests/mal_profiling.c
index ae6c6f90..fa9e89a8 100644
--- a/tests/mal_profiling.c
+++ b/tests/mal_profiling.c
@@ -34,6 +34,429 @@ const char* mal_src_algorithm_to_string(mal_src_algorithm algorithm)
return "Unknown";
}
+const char* mal_dither_mode_to_string(mal_dither_mode ditherMode)
+{
+    // Printable name of a dither mode; any value not listed in the switch maps to "Unknown".
+    switch (ditherMode) {
+        case mal_dither_mode_none:      return "None";
+        case mal_dither_mode_rectangle: return "Rectangle";
+        case mal_dither_mode_triangle:  return "Triangle";
+    }
+    return "Unknown";
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Format Conversion
+//
+///////////////////////////////////////////////////////////////////////////////
+typedef struct
+{
+ void* pBaseData; // NOTE(review): presumably the sample buffer being converted - its use is not visible in this excerpt, confirm
+ mal_uint64 sampleCount; // NOTE(review): presumably the number of samples behind pBaseData - confirm against the code that fills this in
+ mal_uint64 iNextSample; // NOTE(review): by its name, a cursor to the next sample to hand out - confirm
+} format_conversion_data;
+
+void pcm_convert__reference(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
+{ // Routes one (formatIn, formatOut) pair to its mal_pcm_<in>_to_<out>__reference converter.
+ switch (formatIn) // outer switch selects the source format, each nested switch the destination format
+ {
+ case mal_format_u8:
+ {
+ switch (formatOut)
+ {
+ case mal_format_s16: mal_pcm_u8_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_u8_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_u8_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_u8_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s16:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s16_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_s16_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_s16_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s16_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s24:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s24_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_s24_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_s24_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s24_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s32:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s32_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_s32_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_s32_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s32_to_f32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_f32:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_f32_to_u8__reference( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_f32_to_s16__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_f32_to_s24__reference(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_f32_to_s32__reference(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+ // Any pair without a case above (including formatIn == formatOut) returns without writing to pOut.
+ default: break;
+ }
+}
+
+void pcm_convert__optimized(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
+{ // Routes one (formatIn, formatOut) pair to its mal_pcm_<in>_to_<out>__optimized converter.
+ switch (formatIn) // outer switch selects the source format, each nested switch the destination format
+ {
+ case mal_format_u8:
+ {
+ switch (formatOut)
+ {
+ case mal_format_s16: mal_pcm_u8_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_u8_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_u8_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_u8_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s16:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s16_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_s16_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_s16_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s16_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s24:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s24_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_s24_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_s24_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s24_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s32:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s32_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_s32_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_s32_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s32_to_f32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_f32:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_f32_to_u8__optimized( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_f32_to_s16__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_f32_to_s24__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_f32_to_s32__optimized(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+ // Any pair without a case above (including formatIn == formatOut) returns without writing to pOut.
+ default: break;
+ }
+}
+
+#if defined(MAL_SUPPORT_SSE2)
+void pcm_convert__sse2(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
+{ // Routes one (formatIn, formatOut) pair to its mal_pcm_<in>_to_<out>__sse2 converter.
+ switch (formatIn) // outer switch selects the source format, each nested switch the destination format
+ {
+ case mal_format_u8:
+ {
+ switch (formatOut)
+ {
+ case mal_format_s16: mal_pcm_u8_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_u8_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_u8_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_u8_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s16:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s16_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_s16_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_s16_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s16_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s24:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s24_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_s24_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_s24_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s24_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_s32:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_s32_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_s32_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_s32_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_f32: mal_pcm_s32_to_f32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+
+ case mal_format_f32:
+ {
+ switch (formatOut)
+ {
+ case mal_format_u8: mal_pcm_f32_to_u8__sse2( pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s16: mal_pcm_f32_to_s16__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s24: mal_pcm_f32_to_s24__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ case mal_format_s32: mal_pcm_f32_to_s32__sse2(pOut, pIn, sampleCount, ditherMode); return;
+ default: break;
+ }
+ } break;
+ // Any pair without a case above (including formatIn == formatOut) returns without writing to pOut.
+ default: break;
+ }
+}
+#endif
+
+#if defined(MAL_SUPPORT_AVX)
+void pcm_convert__avx(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
+{
+ pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); // no AVX-specific code here: reuses the SSE2 dispatcher. NOTE(review): that function only exists when MAL_SUPPORT_SSE2 is defined
+}
+#endif
+
+#if defined(MAL_SUPPORT_AVX512)
+void pcm_convert__avx512(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
+{
+ pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); // no AVX-512-specific code here: reuses the AVX dispatcher. NOTE(review): that function only exists when MAL_SUPPORT_AVX is defined
+}
+#endif
+
+#if defined(MAL_SUPPORT_NEON)
+void pcm_convert__neon(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode)
+{
+ pcm_convert__reference(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode); // no NEON-specific code here: forwards to the reference dispatcher (not the optimized one)
+}
+#endif
+
+// Converts `sampleCount` samples from `formatIn` (at pIn) to `formatOut` (at pOut) using the back end
+// selected by `mode`. A mode whose back end was compiled out, or any mode not listed below, performs
+// no conversion.
+void pcm_convert(void* pOut, mal_format formatOut, const void* pIn, mal_format formatIn, mal_uint64 sampleCount, mal_dither_mode ditherMode, simd_mode mode)
+{
+    // The dither seed is reset on every call so that test runs produce repeatable, comparable output.
+    mal_seed(1234);
+
+    if (mode == simd_mode_scalar) {
+        pcm_convert__optimized(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
+        return;
+    }
+
+#if defined(MAL_SUPPORT_SSE2)
+    if (mode == simd_mode_sse2) {
+        pcm_convert__sse2(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
+        return;
+    }
+#endif
+
+#if defined(MAL_SUPPORT_AVX)
+    if (mode == simd_mode_avx) {
+        pcm_convert__avx(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
+        return;
+    }
+#endif
+
+#if defined(MAL_SUPPORT_AVX512)
+    if (mode == simd_mode_avx512) {
+        pcm_convert__avx512(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
+        return;
+    }
+#endif
+
+#if defined(MAL_SUPPORT_NEON)
+    if (mode == simd_mode_neon) {
+        pcm_convert__neon(pOut, formatOut, pIn, formatIn, sampleCount, ditherMode);
+        return;
+    }
+#endif
+}
+
+
+// Times a single formatIn -> formatOut conversion of pBaseData using the implementation selected by `mode`,
+// checks the result against pReferenceData and prints a one-line [PASSED]/[FAILED] report.
+//
+// Comparison rules: s16 output may differ from the reference by at most 1 per sample; every other output
+// format must match the reference byte for byte. Each mismatching sample is reported on its own line.
+//
+// referenceTime is the duration of the reference run in seconds; the report shows it as a percentage of
+// the time measured here.
+//
+// Returns 0 once the report has been printed (regardless of pass/fail), or -1 if the temporary output
+// buffer could not be allocated.
+int do_profiling__format_conversion__profile_individual(mal_format formatIn, mal_format formatOut, mal_dither_mode ditherMode, const void* pBaseData, mal_uint64 sampleCount, simd_mode mode, const void* pReferenceData, double referenceTime)
+{
+    // The output sample size does not change during the run, so look it up once and reuse it for both the
+    // allocation and the per-sample comparison.
+    const mal_uint32 bps = mal_get_bytes_per_sample(formatOut);
+
+    void* pTestData = mal_aligned_malloc((size_t)(sampleCount * bps), MAL_SIMD_ALIGNMENT);
+    if (pTestData == NULL) {
+        printf("Out of memory.\n");
+        return -1;
+    }
+
+    // Measure only the conversion itself.
+    mal_timer timer;
+    mal_timer_init(&timer);
+    double timeTaken = mal_timer_get_time_in_seconds(&timer);
+    {
+        pcm_convert(pTestData, formatOut, pBaseData, formatIn, sampleCount, ditherMode, mode);
+    }
+    timeTaken = mal_timer_get_time_in_seconds(&timer) - timeTaken;
+
+
+    // Compare with the reference for correctness.
+    mal_bool32 passed = MAL_TRUE;
+    for (mal_uint64 iSample = 0; iSample < sampleCount; ++iSample) {
+        // We need to compare on a format by format basis because we allow for very slight deviations in results depending on the output format.
+        switch (formatOut)
+        {
+            case mal_format_s16:
+            {
+                mal_int16 a = ((const mal_int16*)pReferenceData)[iSample];
+                mal_int16 b = ((const mal_int16*)pTestData)[iSample];
+                if (abs(a-b) > 1) {
+                    printf("Incorrect Sample: (%d) %d != %d\n", (int)iSample, a, b);
+                    passed = MAL_FALSE;
+                }
+            } break;
+
+            default:
+            {
+                if (memcmp(mal_offset_ptr(pReferenceData, iSample*bps), mal_offset_ptr(pTestData, iSample*bps), bps) != 0) {
+                    printf("Incorrect Sample: (%d)\n", (int)iSample);
+                    passed = MAL_FALSE;
+                }
+            } break;
+        }
+    }
+
+    if (passed) {
+        printf(" [PASSED] ");
+    } else {
+        printf(" [FAILED] ");
+    }
+    printf("(Dither = %s) %s -> %s (%s): %.4fms (%.2f%%)\n", mal_dither_mode_to_string(ditherMode), mal_get_format_name(formatIn), mal_get_format_name(formatOut), simd_mode_to_string(mode), timeTaken*1000, referenceTime/timeTaken*100);
+
+    mal_aligned_free(pTestData);
+    return 0;
+}
+
+// Profiles one formatIn -> formatOut conversion with the given dither mode across every available
+// implementation.
+//
+// Steps:
+//   1. Generate an f32 sine wave and convert it (without dithering) to formatIn; this is the base data.
+//   2. Convert the base data with pcm_convert__reference(), timing it. The output and the time become the
+//      baseline for correctness and speed.
+//   3. Run the scalar implementation, then each SIMD implementation the running CPU reports support for,
+//      through do_profiling__format_conversion__profile_individual().
+//
+// Returns 0 on success, or -1 if any of the three working buffers could not be allocated. Buffers that
+// were already allocated are released before returning.
+int do_profiling__format_conversion__profile_set(mal_format formatIn, mal_format formatOut, mal_dither_mode ditherMode)
+{
+    // Generate our base data to begin with. This is generated from an f32 sine wave which is converted to formatIn. That then becomes our base data.
+    mal_uint32 sampleCount = 1000000;
+
+    float* pSourceData = (float*)mal_aligned_malloc(sampleCount*sizeof(*pSourceData), MAL_SIMD_ALIGNMENT);
+    if (pSourceData == NULL) {
+        printf("Out of memory.\n");
+        return -1;
+    }
+
+    mal_sine_wave sineWave;
+    mal_sine_wave_init(1.0, 400, 48000, &sineWave);
+    mal_sine_wave_read(&sineWave, sampleCount, pSourceData);
+
+    void* pBaseData = mal_aligned_malloc(sampleCount * mal_get_bytes_per_sample(formatIn), MAL_SIMD_ALIGNMENT);
+    if (pBaseData == NULL) {
+        printf("Out of memory.\n");
+        mal_aligned_free(pSourceData);
+        return -1;
+    }
+    mal_pcm_convert(pBaseData, formatIn, pSourceData, mal_format_f32, sampleCount, mal_dither_mode_none);
+
+
+    // Reference first so we can get a benchmark.
+    void* pReferenceData = mal_aligned_malloc(sampleCount * mal_get_bytes_per_sample(formatOut), MAL_SIMD_ALIGNMENT);
+    if (pReferenceData == NULL) {
+        printf("Out of memory.\n");
+        mal_aligned_free(pBaseData);
+        mal_aligned_free(pSourceData);
+        return -1;
+    }
+
+    mal_timer timer;
+    mal_timer_init(&timer);
+    double referenceTime = mal_timer_get_time_in_seconds(&timer);
+    {
+        pcm_convert__reference(pReferenceData, formatOut, pBaseData, formatIn, sampleCount, ditherMode);
+    }
+    referenceTime = mal_timer_get_time_in_seconds(&timer) - referenceTime;
+
+
+    // Here is where each optimized implementation is profiled. The scalar path always runs; each SIMD path
+    // runs only when the corresponding runtime capability check succeeds.
+    do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_scalar, pReferenceData, referenceTime);
+
+    if (mal_has_sse2()) {
+        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_sse2, pReferenceData, referenceTime);
+    }
+    if (mal_has_avx()) {
+        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx, pReferenceData, referenceTime);
+    }
+    if (mal_has_avx512f()) {
+        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_avx512, pReferenceData, referenceTime);
+    }
+    if (mal_has_neon()) {
+        do_profiling__format_conversion__profile_individual(formatIn, formatOut, ditherMode, pBaseData, sampleCount, simd_mode_neon, pReferenceData, referenceTime);
+    }
+
+
+    mal_aligned_free(pReferenceData);
+    mal_aligned_free(pBaseData);
+    mal_aligned_free(pSourceData);
+    return 0;
+}
+
+// Entry point for the format conversion benchmark.
+//
+// Currently profiles a single combination: f32 -> s16 with dithering disabled. The result of that profile
+// set is returned to the caller (0 on success, -1 if it ran out of memory) instead of being discarded.
+int do_profiling__format_conversion()
+{
+    return do_profiling__format_conversion__profile_set(mal_format_f32, mal_format_s16, mal_dither_mode_none);
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Channel Routing
+//
+///////////////////////////////////////////////////////////////////////////////
float g_ChannelRouterProfilingOutputBenchmark[8][48000];
float g_ChannelRouterProfilingOutput[8][48000];
@@ -416,6 +839,7 @@ int do_profiling__src__profile_set(src_data* pBaseData, mal_uint32 sampleRateIn,
// Now that we have the reference data to compare against we can go ahead and measure the SIMD optimizations.
+ do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_scalar, &referenceData);
if (mal_has_sse2()) {
do_profiling__src__profile_individual(pBaseData, sampleRateIn, sampleRateOut, algorithm, simd_mode_sse2, &referenceData);
}
@@ -446,7 +870,7 @@ int do_profiling__src()
src_data baseData;
mal_zero_object(&baseData);
baseData.channels = 8;
- baseData.frameCount = 10000;
+ baseData.frameCount = 100000;
for (mal_uint32 iChannel = 0; iChannel < baseData.channels; ++iChannel) {
baseData.pFrameData[iChannel] = (float*)mal_aligned_malloc((size_t)(baseData.frameCount * sizeof(float)), MAL_SIMD_ALIGNMENT);
if (baseData.pFrameData[iChannel] == NULL) {
@@ -475,16 +899,33 @@ int do_profiling__src()
}
+// Converts two 4xf32 vectors to one 8xi16 vector with signed saturation.
+static inline __m128i drmath_vf32_to_vi16__sse2(__m128 f32_0, __m128 f32_1)
+{
+ return _mm_packs_epi32(_mm_cvtps_epi32(f32_0), _mm_cvtps_epi32(f32_1));
+}
+
int main(int argc, char** argv)
{
(void)argc;
(void)argv;
+
+ {
+ //__m128 f0 = _mm_set_ps(32780, 2, 1, 0);
+ //__m128 f1 = _mm_set_ps(-32780, 6, 5, 4);
+ //__m128i r = drmath_vf32_to_vi16__sse2(f0, f1);
+
+ //int a = 5;
+ }
+
+
+
// Summary.
if (mal_has_sse2()) {
- printf("Has SSE: YES\n");
+ printf("Has SSE2: YES\n");
} else {
- printf("Has SSE: NO\n");
+ printf("Has SSE2: NO\n");
}
if (mal_has_avx()) {
printf("Has AVX: YES\n");
@@ -505,6 +946,10 @@ int main(int argc, char** argv)
printf("\n");
+ // Format conversion.
+ do_profiling__format_conversion();
+ printf("\n\n");
+
// Channel routing.
do_profiling__channel_routing();
printf("\n\n");
diff --git a/tests/mal_test_0.vcxproj b/tests/mal_test_0.vcxproj
index 3aee3154..173ab933 100644
--- a/tests/mal_test_0.vcxproj
+++ b/tests/mal_test_0.vcxproj
@@ -141,7 +141,7 @@
%(AdditionalIncludeDirectories)
MultiThreadedDebug
Default
- NoExtensions
+ NotSet
Console
@@ -162,7 +162,7 @@
%(AdditionalIncludeDirectories)
MultiThreadedDebug
Default
- NoExtensions
+ NotSet
Console
@@ -183,6 +183,7 @@
%(AdditionalIncludeDirectories)
MultiThreadedDebug
Default
+ NotSet
Console
@@ -202,7 +203,7 @@
true
%(AdditionalIncludeDirectories)
Default
- NoExtensions
+ NotSet
Console
@@ -226,7 +227,7 @@
true
%(AdditionalIncludeDirectories)
Default
- NoExtensions
+ NotSet
Console
@@ -250,6 +251,7 @@
true
%(AdditionalIncludeDirectories)
Default
+ NotSet
Console
@@ -269,21 +271,21 @@
true
- true
- true
- true
- true
- true
- true
-
-
false
- false
false
+ false
false
false
false
+
+ true
+ true
+ true
+ true
+ true
+ true
+
true
true