diff --git a/extras/dr_flac.h b/extras/dr_flac.h index f34a2f36..58ee1076 100644 --- a/extras/dr_flac.h +++ b/extras/dr_flac.h @@ -1,6 +1,6 @@ /* FLAC audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file. -dr_flac - v0.12.11 - TBD +dr_flac - v0.12.11 - 2020-04-19 David Reid - mackron@gmail.com @@ -151,10 +151,7 @@ samples as it can, up to the amount requested. Later on when you need the next b } ``` -You can seek to a specific sample with drflac_seek_to_sample(). The given sample is based on interleaving. So for example, if you were to seek to the sample at -index 0 in a stereo stream, you'll be seeking to the first sample of the left channel. The sample at index 1 will be the first sample of the right channel. The -sample at index 2 will be the second sample of the left channel, etc. - +You can seek to a specific PCM frame with `drflac_seek_to_pcm_frame()`. If you just want to quickly decode an entire FLAC file in one go you can do something like this: @@ -4124,6 +4121,12 @@ static DRFLAC_INLINE void drflac__vst2q_s32(drflac_int32* p, int32x4x2_t x) vst1q_s32(p+4, x.val[1]); } +static DRFLAC_INLINE void drflac__vst2q_u32(drflac_uint32* p, uint32x4x2_t x) +{ + vst1q_u32(p+0, x.val[0]); + vst1q_u32(p+4, x.val[1]); +} + static DRFLAC_INLINE void drflac__vst2q_f32(float* p, float32x4x2_t x) { vst1q_f32(p+0, x.val[0]); @@ -4135,6 +4138,11 @@ static DRFLAC_INLINE void drflac__vst2q_s16(drflac_int16* p, int16x4x2_t x) vst1q_s16(p, vcombine_s16(x.val[0], x.val[1])); } +static DRFLAC_INLINE void drflac__vst2q_u16(drflac_uint16* p, uint16x4x2_t x) +{ + vst1q_u16(p, vcombine_u16(x.val[0], x.val[1])); +} + static DRFLAC_INLINE int32x4_t drflac__vdupq_n_s32x4(drflac_int32 x3, drflac_int32 x2, drflac_int32 x1, drflac_int32 x0) { drflac_int32 x[4]; @@ -8716,15 +8724,15 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__neon(drf shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t left; - int32x4_t side; - int32x4_t right; + uint32x4_t left; + uint32x4_t side; + uint32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - right = vsubq_s32(left, side); + left = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + right = vsubq_u32(left, side); - drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); + drflac__vst2q_u32((drflac_uint32*)pOutputSamples + i*8, vzipq_u32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { @@ -8870,15 +8878,15 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__neon(dr shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t side; - int32x4_t right; - int32x4_t left; + uint32x4_t side; + uint32x4_t right; + uint32x4_t left; - side = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - left = vaddq_s32(right, side); + side = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + left = vaddq_u32(right, side); - drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); + drflac__vst2q_u32((drflac_uint32*)pOutputSamples + i*8, vzipq_u32(left, right)); } for (i = (frameCount4 << 2); i < frameCount; ++i) { @@ -9121,30 +9129,30 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drfl const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0; const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1; drflac_int32 shift = unusedBitsPerSample; - int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */ - int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */ - int32x4_t one4; + int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */ + int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */ + uint32x4_t one4; DRFLAC_ASSERT(pFlac->bitsPerSample <= 24); wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample); wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample); - one4 = vdupq_n_s32(1); + one4 = vdupq_n_u32(1); if (shift == 0) { for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, one4)); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4)); - left = vshrq_n_s32(vaddq_s32(mid, side), 1); - right = vshrq_n_s32(vsubq_s32(mid, side), 1); + left = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1); + right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1); drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); } @@ -9165,18 +9173,18 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drfl shift4 = vdupq_n_s32(shift); for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, one4)); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4)); - left = vshlq_s32(vaddq_s32(mid, side), shift4); - right = vshlq_s32(vsubq_s32(mid, side), shift4); + left = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4)); + right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4)); drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); } @@ -9304,8 +9312,8 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo_ int32x4_t left; int32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift4_0); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift4_1); + left = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift4_0)); + right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift4_1)); drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right)); } @@ -9552,18 +9560,18 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drf shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t left; - int32x4_t side; - int32x4_t right; + uint32x4_t left; + uint32x4_t side; + uint32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - right = vsubq_s32(left, side); + left = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + right = vsubq_u32(left, side); - left = vshrq_n_s32(left, 16); - right = vshrq_n_s32(right, 16); + left = vshrq_n_u32(left, 16); + right = vshrq_n_u32(right, 16); - drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right))); + drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right))); } for (i = (frameCount4 << 2); i < frameCount; ++i) { @@ -9733,18 +9741,18 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(dr shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t side; - int32x4_t right; - int32x4_t left; + uint32x4_t side; + uint32x4_t right; + uint32x4_t left; - side = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - left = vaddq_s32(right, side); + side = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + left = vaddq_u32(right, side); - left = vshrq_n_s32(left, 16); - right = vshrq_n_s32(right, 16); + left = vshrq_n_u32(left, 16); + right = vshrq_n_u32(right, 16); - drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right))); + drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right))); } for (i = (frameCount4 << 2); i < frameCount; ++i) { @@ -10024,18 +10032,18 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drfl if (shift == 0) { for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - left = vshrq_n_s32(vaddq_s32(mid, side), 1); - right = vshrq_n_s32(vsubq_s32(mid, side), 1); + left = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1); + right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1); left = vshrq_n_s32(left, 16); right = vshrq_n_s32(right, 16); @@ -10059,18 +10067,18 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drfl shift4 = vdupq_n_s32(shift); for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t left; int32x4_t right; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbpsShift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbpsShift1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - left = vshlq_s32(vaddq_s32(mid, side), shift4); - right = vshlq_s32(vsubq_s32(mid, side), shift4); + left = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4)); + right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4)); left = vshrq_n_s32(left, 16); right = vshrq_n_s32(right, 16); @@ -10214,8 +10222,8 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo_ int32x4_t left; int32x4_t right; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); + left = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4)); + right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4)); left = vshrq_n_s32(left, 16); right = vshrq_n_s32(right, 16); @@ -10453,17 +10461,17 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__neon(drf shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t left; - int32x4_t side; - int32x4_t right; + uint32x4_t left; + uint32x4_t side; + uint32x4_t right; float32x4_t leftf; float32x4_t rightf; - left = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - right = vsubq_s32(left, side); - leftf = vmulq_f32(vcvtq_f32_s32(left), factor4); - rightf = vmulq_f32(vcvtq_f32_s32(right), factor4); + left = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + right = vsubq_u32(left, side); + leftf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)), factor4); + rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4); drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf)); } @@ -10619,17 +10627,17 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__neon(dr shift1_4 = vdupq_n_s32(shift1); for (i = 0; i < frameCount4; ++i) { - int32x4_t side; - int32x4_t right; - int32x4_t left; + uint32x4_t side; + uint32x4_t right; + uint32x4_t left; float32x4_t leftf; float32x4_t rightf; - side = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - right = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); - left = vaddq_s32(right, side); - leftf = vmulq_f32(vcvtq_f32_s32(left), factor4); - rightf = vmulq_f32(vcvtq_f32_s32(right), factor4); + side = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4); + right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4); + left = vaddq_u32(right, side); + leftf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)), factor4); + rightf = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), factor4); drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf)); } @@ -10910,13 +10918,13 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drfl float32x4_t leftf; float32x4_t rightf; - int32x4_t mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbps0_4); - int32x4_t side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbps1_4); + uint32x4_t mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4); + uint32x4_t side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - lefti = vshrq_n_s32(vaddq_s32(mid, side), 1); - righti = vshrq_n_s32(vsubq_s32(mid, side), 1); + lefti = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1); + righti = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1); leftf = vmulq_f32(vcvtq_f32_s32(lefti), factor4); rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4); @@ -10937,20 +10945,20 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drfl shift -= 1; shift4 = vdupq_n_s32(shift); for (i = 0; i < frameCount4; ++i) { - int32x4_t mid; - int32x4_t side; + uint32x4_t mid; + uint32x4_t side; int32x4_t lefti; int32x4_t righti; float32x4_t leftf; float32x4_t rightf; - mid = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), wbps0_4); - side = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), wbps1_4); + mid = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4); + side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4); - mid = vorrq_s32(vshlq_n_s32(mid, 1), vandq_s32(side, vdupq_n_s32(1))); + mid = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1))); - lefti = vshlq_s32(vaddq_s32(mid, side), shift4); - righti = vshlq_s32(vsubq_s32(mid, side), shift4); + lefti = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4)); + righti = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4)); leftf = vmulq_f32(vcvtq_f32_s32(lefti), factor4); rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4); @@ -11096,8 +11104,8 @@ static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo_ float32x4_t leftf; float32x4_t rightf; - lefti = vshlq_s32(vld1q_s32(pInputSamples0 + i*4), shift0_4); - righti = vshlq_s32(vld1q_s32(pInputSamples1 + i*4), shift1_4); + lefti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4)); + righti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4)); leftf = vmulq_f32(vcvtq_f32_s32(lefti), factor4); rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4); @@ -11697,7 +11705,7 @@ DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterat /* REVISION HISTORY ================ -v0.12.11 - TBD +v0.12.11 - 2020-04-19 - Fix some pedantic warnings. - Fix some undefined behaviour warnings. diff --git a/extras/dr_mp3.h b/extras/dr_mp3.h index f12abba4..dc8a70fb 100644 --- a/extras/dr_mp3.h +++ b/extras/dr_mp3.h @@ -1,6 +1,6 @@ /* MP3 audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file. -dr_mp3 - v0.6.3 - 2020-04-13 +dr_mp3 - v0.6.4 - 2020-04-19 David Reid - mackron@gmail.com @@ -669,6 +669,17 @@ static int drmp3_have_simd(void) #endif +#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) +#define DRMP3_HAVE_ARMV6 1 +static __inline__ __attribute__((always_inline)) drmp32_int32 drmp3_clip_int16_arm(int32_t a) +{ + drmp3_int32 x = 0; + __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); + return x; +} +#endif + + typedef struct { const drmp3_uint8 *buf; @@ -1888,11 +1899,17 @@ typedef drmp3_int16 drmp3d_sample_t; static drmp3_int16 drmp3d_scale_pcm(float sample) { drmp3_int16 s; +#if DRMP3_HAVE_ARMV6 + drmp3_int32 s32 = (drmp3_int32)(sample + .5f); + s32 -= (s32 < 0); + s = (drmp3_int16)drmp3_clip_int16_arm(s32); +#else if (sample >= 32766.5) return (drmp3_int16) 32767; if (sample <= -32767.5) return (drmp3_int16)-32768; s = (drmp3_int16)(sample + .5f); s -= (s < 0); /* away from zero, to be compliant */ - return (drmp3_int16)s; +#endif + return s; } #else typedef float drmp3d_sample_t; @@ -4343,6 +4360,9 @@ counts rather than sample counts. /* REVISION HISTORY ================ +v0.6.4 - 2020-04-19 + - Bring up to date with changes to minimp3. + v0.6.3 - 2020-04-13 - Fix some pedantic warnings.