From 59041d5ee3c1989a7a68f6572030855bd0932f21 Mon Sep 17 00:00:00 2001
Message-ID: <59041d5ee3c1989a7a68f6572030855bd0932f21.1724144859.git.hari.lim...@arm.com>
In-Reply-To: <cover.1724144859.git.hari.lim...@arm.com>
References: <cover.1724144859.git.hari.lim...@arm.com>
From: Hari Limaye <hari.lim...@arm.com>
Date: Tue, 20 Feb 2024 17:55:07 +0000
Subject: [PATCH v2 04/18] AArch64: Refactor types and conversions in
 pixel-prim.cpp

Use correct types for all variables/operations and make all vector
conversions explicit in common/aarch64/pixel-prim.cpp.
---
 source/common/aarch64/pixel-prim.cpp | 524 ++++++++++++++-------------
 1 file changed, 270 insertions(+), 254 deletions(-)
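Note for reviewers (not part of the patch itself): a vreinterpretq_*
intrinsic only relabels the element type of a vector register, so the
added conversions should not change the generated code. A minimal
standalone sketch of the pattern, with a made-up helper name
(row_diff_s16) and hypothetical inputs:

    #include <arm_neon.h>

    // Subtract one row of uint16_t pixels from another and hand the
    // result to code that works on signed 16-bit lanes. vsubq_u16
    // returns uint16x8_t; without the explicit vreinterpretq_s16_u16
    // the assignment relies on lax vector conversions, which not every
    // compiler configuration accepts.
    static inline int16x8_t row_diff_s16(const uint16_t *a, const uint16_t *b)
    {
        uint16x8_t va = vld1q_u16(a);
        uint16x8_t vb = vld1q_u16(b);
        return vreinterpretq_s16_u16(vsubq_u16(va, vb));
    }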
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 9598f5498..dd3fd4637 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -36,15 +36,21 @@ static inline void transpose_8h_8h(int16x8_t &t1, int16x8_t &t2,
 static inline void transpose_4s_8h(int16x8_t &t1, int16x8_t &t2,
                                    const int16x8_t s1, const int16x8_t s2)
 {
-    t1 = vtrn1q_s32(s1, s2);
-    t2 = vtrn2q_s32(s1, s2);
+    int32x4_t tmp1 = vreinterpretq_s32_s16(s1);
+    int32x4_t tmp2 = vreinterpretq_s32_s16(s2);
+
+    t1 = vreinterpretq_s16_s32(vtrn1q_s32(tmp1, tmp2));
+    t2 = vreinterpretq_s16_s32(vtrn2q_s32(tmp1, tmp2));
 }
 
 static inline void transpose_2d_8h(int16x8_t &t1, int16x8_t &t2,
                                    const int16x8_t s1, const int16x8_t s2)
 {
-    t1 = vtrn1q_s64(s1, s2);
-    t2 = vtrn2q_s64(s1, s2);
+    int64x2_t tmp1 = vreinterpretq_s64_s16(s1);
+    int64x2_t tmp2 = vreinterpretq_s64_s16(s2);
+
+    t1 = vreinterpretq_s16_s64(vtrn1q_s64(tmp1, tmp2));
+    t2 = vreinterpretq_s16_s64(vtrn2q_s64(tmp1, tmp2));
 }
 
 static inline void SUMSUB_ABCD(int16x8_t &s1, int16x8_t &d1, int16x8_t &s2, int16x8_t &d2,
@@ -84,16 +90,16 @@ static int _satd_4x8_8x4_end_neon(int16x8_t v0, int16x8_t v1, int16x8_t v2, int1
     transpose_4s_8h(v0, v1, v16, v18);
     transpose_4s_8h(v2, v3, v17, v19);
 
-    v0 = vabsq_s16(v0);
-    v1 = vabsq_s16(v1);
-    v2 = vabsq_s16(v2);
-    v3 = vabsq_s16(v3);
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
 
-    v0 = vmaxq_u16(v0, v1);
-    v1 = vmaxq_u16(v2, v3);
+    uint16x8_t max0 = vmaxq_u16(abs0, abs1);
+    uint16x8_t max1 = vmaxq_u16(abs2, abs3);
 
-    v0 = vaddq_u16(v0, v1);
-    return vaddlvq_u16(v0);
+    uint16x8_t sum = vaddq_u16(max0, max1);
+    return vaddlvq_u16(sum);
 }
 
 static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
@@ -109,11 +115,11 @@ static inline int _satd_4x4_neon(int16x8_t v0, int16x8_t v1)
 
     transpose_4s_8h(v0, v1, v2, v3);
 
-    v0 = vabsq_s16(v0);
-    v1 = vabsq_s16(v1);
-    v0 = vmaxq_u16(v0, v1);
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    uint16x8_t max = vmaxq_u16(abs0, abs1);
 
-    return vaddlvq_s16(v0);
+    return vaddlvq_u16(max);
 }
 
 static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3, int16x8_t &v20,
@@ -141,20 +147,19 @@ static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, in
     transpose_4s_8h(v4, v6, v20, v22);
     transpose_4s_8h(v5, v7, v21, v23);
 
-    v0 = vabsq_s16(v0);
-    v1 = vabsq_s16(v1);
-    v2 = vabsq_s16(v2);
-    v3 = vabsq_s16(v3);
-    v4 = vabsq_s16(v4);
-    v5 = vabsq_s16(v5);
-    v6 = vabsq_s16(v6);
-    v7 = vabsq_s16(v7);
-
-    v0 = vmaxq_u16(v0, v2);
-    v1 = vmaxq_u16(v1, v3);
-    v2 = vmaxq_u16(v4, v6);
-    v3 = vmaxq_u16(v5, v7);
-
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v0));
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v1));
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v2));
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v3));
+    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v4));
+    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v5));
+    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v6));
+    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v7));
+
+    v0 = vreinterpretq_s16_u16(vmaxq_u16(abs0, abs2));
+    v1 = vreinterpretq_s16_u16(vmaxq_u16(abs1, abs3));
+    v2 = vreinterpretq_s16_u16(vmaxq_u16(abs4, abs6));
+    v3 = vreinterpretq_s16_u16(vmaxq_u16(abs5, abs7));
 }
 
 #if HIGH_BIT_DEPTH
@@ -163,8 +168,11 @@ static void _satd_8x4v_8x8h_neon(int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, in
 static inline void transpose_2d_4s(int32x4_t &t1, int32x4_t &t2,
                                    const int32x4_t s1, const int32x4_t s2)
 {
-    t1 = vtrn1q_s64(s1, s2);
-    t2 = vtrn2q_s64(s1, s2);
+    int64x2_t tmp1 = vreinterpretq_s64_s32(s1);
+    int64x2_t tmp2 = vreinterpretq_s64_s32(s2);
+
+    t1 = vreinterpretq_s32_s64(vtrn1q_s64(tmp1, tmp2));
+    t2 = vreinterpretq_s32_s64(vtrn2q_s64(tmp1, tmp2));
 }
 
 static inline void ISUMSUB_AB(int32x4_t &sum, int32x4_t &sub, const int32x4_t a, const int32x4_t b)
@@ -203,10 +211,10 @@ static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, cons
     t2 = vld1q_u16(pix2 + 2 * stride_pix2);
     t3 = vld1q_u16(pix2 + 3 * stride_pix2);
 
-    v16 = vsubq_u16(r0, t0);
-    v17 = vsubq_u16(r1, t1);
-    v18 = vsubq_u16(r2, t2);
-    v19 = vsubq_u16(r3, t3);
+    v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+    v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+    v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+    v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
 
     r0 = vld1q_u16(pix1 + 4 * stride_pix1);
     r1 = vld1q_u16(pix1 + 5 * stride_pix1);
@@ -218,10 +226,10 @@ static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, cons
     t2 = vld1q_u16(pix2 + 6 * stride_pix2);
     t3 = vld1q_u16(pix2 + 7 * stride_pix2);
 
-    v20 = vsubq_u16(r0, t0);
-    v21 = vsubq_u16(r1, t1);
-    v22 = vsubq_u16(r2, t2);
-    v23 = vsubq_u16(r3, t3);
+    v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+    v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+    v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+    v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
 
     SUMSUB_AB(v0, v1, v16, v17);
     SUMSUB_AB(v2, v3, v18, v19);
@@ -234,8 +242,8 @@ static inline void _sub_8x8_fly(const uint16_t *pix1, intptr_t stride_pix1, cons
 
 static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2,
                             int16x8_t &v0, int16x8_t &v1, int16x8_t &v2, int16x8_t &v3)
 {
-    uint8x16_t r0, r1, r2, r3;
-    uint8x16_t t0, t1, t2, t3;
+    uint16x8_t r0, r1, r2, r3;
+    uint16x8_t t0, t1, t2, t3;
     int16x8_t v16, v17, v20, v21;
     int16x8_t v18, v19, v22, v23;
@@ -249,11 +257,10 @@ static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const ui
     t2 = vld1q_u16(pix2 + 2 * stride_pix2);
     t3 = vld1q_u16(pix2 + 3 * stride_pix2);
 
-
-    v16 = vsubq_u16((r0), (t0));
-    v17 = vsubq_u16((r1), (t1));
-    v18 = vsubq_u16((r2), (t2));
-    v19 = vsubq_u16((r3), (t3));
+    v16 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+    v17 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+    v18 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+    v19 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
 
     r0 = vld1q_u16(pix1 + 0 * stride_pix1 + 8);
     r1 = vld1q_u16(pix1 + 1 * stride_pix1 + 8);
@@ -265,11 +272,10 @@ static void _satd_16x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const ui
     t2 = vld1q_u16(pix2 + 2 * stride_pix2 + 8);
     t3 = vld1q_u16(pix2 + 3 * stride_pix2 + 8);
 
-
-    v20 = vsubq_u16(r0, t0);
-    v21 = vsubq_u16(r1, t1);
-    v22 = vsubq_u16(r2, t2);
-    v23 = vsubq_u16(r3, t3);
+    v20 = vreinterpretq_s16_u16(vsubq_u16(r0, t0));
+    v21 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+    v22 = vreinterpretq_s16_u16(vsubq_u16(r2, t2));
+    v23 = vreinterpretq_s16_u16(vsubq_u16(r3, t3));
 
     SUMSUB_AB(v0, v1, v16, v17);
     SUMSUB_AB(v2, v3, v18, v19);
@@ -295,7 +301,10 @@ int pixel_satd_4x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16
 
     uint16x8_t r0 = vcombine_u16(r0_0, r0_1);
     uint16x8_t r1 = vcombine_u16(r1_0, r1_1);
 
-    return _satd_4x4_neon(vsubq_u16(t0, r0), vsubq_u16(r1, t1));
+    int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(t0, r0));
+    int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(r1, t1));
+
+    return _satd_4x4_neon(v0, v1);
 }
 
@@ -316,10 +325,10 @@ int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16
     i6 = vld1q_u16(pix1 + 3 * stride_pix1);
     i7 = vld1q_u16(pix2 + 3 * stride_pix2);
 
-    int16x8_t v0 = vsubq_u16(i0, i1);
-    int16x8_t v1 = vsubq_u16(i2, i3);
-    int16x8_t v2 = vsubq_u16(i4, i5);
-    int16x8_t v3 = vsubq_u16(i6, i7);
+    int16x8_t v0 = vreinterpretq_s16_u16(vsubq_u16(i0, i1));
+    int16x8_t v1 = vreinterpretq_s16_u16(vsubq_u16(i2, i3));
+    int16x8_t v2 = vreinterpretq_s16_u16(vsubq_u16(i4, i5));
+    int16x8_t v3 = vreinterpretq_s16_u16(vsubq_u16(i6, i7));
 
     return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
 }
@@ -327,17 +336,21 @@ int pixel_satd_8x4_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16
 
 int pixel_satd_16x16_neon(const uint16_t *pix1, intptr_t stride_pix1, const uint16_t *pix2, intptr_t stride_pix2)
 {
-    int32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
+    uint32x4_t v30 = vdupq_n_u32(0), v31 = vdupq_n_u32(0);
     int16x8_t v0, v1, v2, v3;
-    for (int offset = 0; offset <= 12; offset += 4) {
-        _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1, pix2 + offset * stride_pix2, stride_pix2, v0, v1, v2, v3);
-        v30 = vpadalq_u16(v30, v0);
-        v30 = vpadalq_u16(v30, v1);
-        v31 = vpadalq_u16(v31, v2);
-        v31 = vpadalq_u16(v31, v3);
+
+    for (int offset = 0; offset <= 12; offset += 4)
+    {
+        _satd_16x4_neon(pix1 + offset * stride_pix1, stride_pix1,
+                        pix2 + offset * stride_pix2,stride_pix2,
+                        v0, v1, v2, v3);
+        v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v0));
+        v30 = vpadalq_u16(v30, vreinterpretq_u16_s16(v1));
+        v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v2));
+        v31 = vpadalq_u16(v31, vreinterpretq_u16_s16(v3));
     }
-    return vaddvq_s32(vaddq_s32(v30, v31));
+    return vaddvq_u32(vaddq_u32(v30, v31));
 }
 
 
 #else //HIGH_BIT_DEPTH
@@ -360,16 +373,14 @@ static void _satd_16x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uin
     t2 = vld1q_u8(pix2 + 2 * stride_pix2);
     t3 = vld1q_u8(pix2 + 3 * stride_pix2);
 
-
-
-    v16 = vsubl_u8(vget_low_u8(r0), vget_low_u8(t0));
-    v20 = vsubl_high_u8(r0, t0);
-    v17 = vsubl_u8(vget_low_u8(r1), vget_low_u8(t1));
-    v21 = vsubl_high_u8(r1, t1);
-    v18 = vsubl_u8(vget_low_u8(r2), vget_low_u8(t2));
-    v22 = vsubl_high_u8(r2, t2);
-    v19 = vsubl_u8(vget_low_u8(r3), vget_low_u8(t3));
-    v23 = vsubl_high_u8(r3, t3);
+    v16 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r0), vget_low_u8(t0)));
+    v20 = vreinterpretq_s16_u16(vsubl_high_u8(r0, t0));
+    v17 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r1), vget_low_u8(t1)));
+    v21 = vreinterpretq_s16_u16(vsubl_high_u8(r1, t1));
+    v18 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r2), vget_low_u8(t2)));
+    v22 = vreinterpretq_s16_u16(vsubl_high_u8(r2, t2));
+    v19 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(r3), vget_low_u8(t3)));
+    v23 = vreinterpretq_s16_u16(vsubl_high_u8(r3, t3));
 
     SUMSUB_AB(v0, v1, v16, v17);
     SUMSUB_AB(v2, v3, v18, v19);
@@ -398,10 +409,10 @@ static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const
     t2 = vld1_u8(pix2 + 2 * stride_pix2);
     t3 = vld1_u8(pix2 + 3 * stride_pix2);
 
-    v16 = vsubl_u8(r0, t0);
-    v17 = vsubl_u8(r1, t1);
-    v18 = vsubl_u8(r2, t2);
-    v19 = vsubl_u8(r3, t3);
+    v16 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
+    v17 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
+    v18 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
+    v19 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
 
     r0 = vld1_u8(pix1 + 4 * stride_pix1);
     r1 = vld1_u8(pix1 + 5 * stride_pix1);
@@ -413,10 +424,10 @@ static inline void _sub_8x8_fly(const uint8_t *pix1, intptr_t stride_pix1, const
     t2 = vld1_u8(pix2 + 6 * stride_pix2);
     t3 = vld1_u8(pix2 + 7 * stride_pix2);
 
-    v20 = vsubl_u8(r0, t0);
-    v21 = vsubl_u8(r1, t1);
-    v22 = vsubl_u8(r2, t2);
-    v23 = vsubl_u8(r3, t3);
+    v20 = vreinterpretq_s16_u16(vsubl_u8(r0, t0));
+    v21 = vreinterpretq_s16_u16(vsubl_u8(r1, t1));
+    v22 = vreinterpretq_s16_u16(vsubl_u8(r2, t2));
+    v23 = vreinterpretq_s16_u16(vsubl_u8(r3, t3));
 
 
     SUMSUB_AB(v0, v1, v16, v17);
@@ -432,7 +443,8 @@ int pixel_satd_4x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t
     uint8x8_t r0 = load_u8x4x2(pix2, 2 * stride_pix2);
     uint8x8_t r1 = load_u8x4x2(pix2 + stride_pix2, 2 * stride_pix2);
 
-    return _satd_4x4_neon(vsubl_u8(t0, r0), vsubl_u8(r1, t1));
+    return _satd_4x4_neon(vreinterpretq_s16_u16(vsubl_u8(t0, r0)),
+                          vreinterpretq_s16_u16(vsubl_u8(r1, t1)));
 }
 
 
@@ -449,53 +461,53 @@ int pixel_satd_8x4_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t
     i6 = vld1_u8(pix1 + 3 * stride_pix1);
     i7 = vld1_u8(pix2 + 3 * stride_pix2);
 
-    int16x8_t v0 = vsubl_u8(i0, i1);
-    int16x8_t v1 = vsubl_u8(i2, i3);
-    int16x8_t v2 = vsubl_u8(i4, i5);
-    int16x8_t v3 = vsubl_u8(i6, i7);
+    int16x8_t v0 = vreinterpretq_s16_u16(vsubl_u8(i0, i1));
+    int16x8_t v1 = vreinterpretq_s16_u16(vsubl_u8(i2, i3));
+    int16x8_t v2 = vreinterpretq_s16_u16(vsubl_u8(i4, i5));
+    int16x8_t v3 = vreinterpretq_s16_u16(vsubl_u8(i6, i7));
 
     return _satd_4x8_8x4_end_neon(v0, v1, v2, v3);
 }
 
 
 int pixel_satd_16x16_neon(const uint8_t *pix1, intptr_t stride_pix1, const uint8_t *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v30, v31;
+    uint16x8_t v30, v31;
     int16x8_t v0, v1, v2, v3;
+    uint16x8_t t0, t1;
 
     _satd_16x4_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
-    v30 = vaddq_s16(v0, v1);
-    v31 = vaddq_s16(v2, v3);
+    v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+    v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
 
     _satd_16x4_neon(pix1 + 4 * stride_pix1, stride_pix1, pix2 + 4 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    v0 = vaddq_s16(v0, v1);
-    v1 = vaddq_s16(v2, v3);
-    v30 = vaddq_s16(v30, v0);
-    v31 = vaddq_s16(v31, v1);
+    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    v30 = vaddq_u16(v30, t0);
+    v31 = vaddq_u16(v31, t1);
 
     _satd_16x4_neon(pix1 + 8 * stride_pix1, stride_pix1, pix2 + 8 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    v0 = vaddq_s16(v0, v1);
-    v1 = vaddq_s16(v2, v3);
-    v30 = vaddq_s16(v30, v0);
-    v31 = vaddq_s16(v31, v1);
+    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    v30 = vaddq_u16(v30, t0);
+    v31 = vaddq_u16(v31, t1);
 
     _satd_16x4_neon(pix1 + 12 * stride_pix1, stride_pix1, pix2 + 12 * stride_pix2, stride_pix2, v0, v1, v2, v3);
-    v0 = vaddq_s16(v0, v1);
-    v1 = vaddq_s16(v2, v3);
-    v30 = vaddq_s16(v30, v0);
-    v31 = vaddq_s16(v31, v1);
-
-    int32x4_t sum0 = vpaddlq_u16(v30);
-    int32x4_t sum1 = vpaddlq_u16(v31);
-    sum0 = vaddq_s32(sum0, sum1);
-    return vaddvq_s32(sum0);
-
+    t0 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+    t1 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
+    v30 = vaddq_u16(v30, t0);
+    v31 = vaddq_u16(v31, t1);
+
+    uint32x4_t sum0 = vpaddlq_u16(v30);
+    uint32x4_t sum1 = vpaddlq_u16(v31);
+    sum0 = vaddq_u32(sum0, sum1);
+    return vaddvq_u32(sum0);
 }
 
 #endif //HIGH_BIT_DEPTH
 
 #if HIGH_BIT_DEPTH
-typedef int32x4_t sa8d_out_type;
+typedef uint32x4_t sa8d_out_type;
 #else
-typedef int16x8_t sa8d_out_type;
+typedef uint16x8_t sa8d_out_type;
 #endif
 
 static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
@@ -546,31 +558,30 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
     transpose_2d_8h(v18, v22, v2, v6);
     transpose_2d_8h(v19, v23, v3, v7);
 
+    uint16x8_t abs0 = vreinterpretq_u16_s16(vabsq_s16(v16));
+    uint16x8_t abs1 = vreinterpretq_u16_s16(vabsq_s16(v17));
+    uint16x8_t abs2 = vreinterpretq_u16_s16(vabsq_s16(v18));
+    uint16x8_t abs3 = vreinterpretq_u16_s16(vabsq_s16(v19));
+    uint16x8_t abs4 = vreinterpretq_u16_s16(vabsq_s16(v20));
+    uint16x8_t abs5 = vreinterpretq_u16_s16(vabsq_s16(v21));
+    uint16x8_t abs6 = vreinterpretq_u16_s16(vabsq_s16(v22));
+    uint16x8_t abs7 = vreinterpretq_u16_s16(vabsq_s16(v23));
 
-    v16 = vabsq_s16(v16);
-    v17 = vabsq_s16(v17);
-    v18 = vabsq_s16(v18);
-    v19 = vabsq_s16(v19);
-    v20 = vabsq_s16(v20);
-    v21 = vabsq_s16(v21);
-    v22 = vabsq_s16(v22);
-    v23 = vabsq_s16(v23);
-
-    v16 = vmaxq_u16(v16, v20);
-    v17 = vmaxq_u16(v17, v21);
-    v18 = vmaxq_u16(v18, v22);
-    v19 = vmaxq_u16(v19, v23);
+    uint16x8_t max0 = vmaxq_u16(abs0, abs4);
+    uint16x8_t max1 = vmaxq_u16(abs1, abs5);
+    uint16x8_t max2 = vmaxq_u16(abs2, abs6);
+    uint16x8_t max3 = vmaxq_u16(abs3, abs7);
 
 #if HIGH_BIT_DEPTH
-    out0 = vpaddlq_u16(v16);
-    out1 = vpaddlq_u16(v17);
-    out0 = vpadalq_u16(out0, v18);
-    out1 = vpadalq_u16(out1, v19);
+    out0 = vpaddlq_u16(max0);
+    out1 = vpaddlq_u16(max1);
+    out0 = vpadalq_u16(out0, max2);
+    out1 = vpadalq_u16(out1, max3);
 
 #else //HIGH_BIT_DEPTH
 
-    out0 = vaddq_u16(v16, v17);
-    out1 = vaddq_u16(v18, v19);
+    out0 = vaddq_u16(max0, max1);
+    out1 = vaddq_u16(max2, max3);
 
 #endif //HIGH_BIT_DEPTH
@@ -643,41 +654,41 @@ static inline void _sa8d_8x8_neon_end(int16x8_t v0, int16x8_t v1, int16x8_t v2,
     v23l = v3h;
     v23h = v7h;
 
-    v16l = vabsq_s32(v16l);
-    v17l = vabsq_s32(v17l);
-    v18l = vabsq_s32(v18l);
-    v19l = vabsq_s32(v19l);
-    v20l = vabsq_s32(v20l);
-    v21l = vabsq_s32(v21l);
-    v22l = vabsq_s32(v22l);
-    v23l = vabsq_s32(v23l);
-
-    v16h = vabsq_s32(v16h);
-    v17h = vabsq_s32(v17h);
-    v18h = vabsq_s32(v18h);
-    v19h = vabsq_s32(v19h);
-    v20h = vabsq_s32(v20h);
-    v21h = vabsq_s32(v21h);
-    v22h = vabsq_s32(v22h);
-    v23h = vabsq_s32(v23h);
-
-    v16l = vmaxq_u32(v16l, v20l);
-    v17l = vmaxq_u32(v17l, v21l);
-    v18l = vmaxq_u32(v18l, v22l);
-    v19l = vmaxq_u32(v19l, v23l);
-
-    v16h = vmaxq_u32(v16h, v20h);
-    v17h = vmaxq_u32(v17h, v21h);
-    v18h = vmaxq_u32(v18h, v22h);
-    v19h = vmaxq_u32(v19h, v23h);
-
-    v16l = vaddq_u32(v16l, v16h);
-    v17l = vaddq_u32(v17l, v17h);
-    v18l = vaddq_u32(v18l, v18h);
-    v19l = vaddq_u32(v19l, v19h);
-
-    out0 = vaddq_u32(v16l, v17l);
-    out1 = vaddq_u32(v18l, v19l);
+    uint32x4_t abs0_lo = vreinterpretq_u32_s32(vabsq_s32(v16l));
+    uint32x4_t abs1_lo = vreinterpretq_u32_s32(vabsq_s32(v17l));
+    uint32x4_t abs2_lo = vreinterpretq_u32_s32(vabsq_s32(v18l));
+    uint32x4_t abs3_lo = vreinterpretq_u32_s32(vabsq_s32(v19l));
+    uint32x4_t abs4_lo = vreinterpretq_u32_s32(vabsq_s32(v20l));
+    uint32x4_t abs5_lo = vreinterpretq_u32_s32(vabsq_s32(v21l));
+    uint32x4_t abs6_lo = vreinterpretq_u32_s32(vabsq_s32(v22l));
+    uint32x4_t abs7_lo = vreinterpretq_u32_s32(vabsq_s32(v23l));
+
+    uint32x4_t abs0_hi = vreinterpretq_u32_s32(vabsq_s32(v16h));
+    uint32x4_t abs1_hi = vreinterpretq_u32_s32(vabsq_s32(v17h));
+    uint32x4_t abs2_hi = vreinterpretq_u32_s32(vabsq_s32(v18h));
+    uint32x4_t abs3_hi = vreinterpretq_u32_s32(vabsq_s32(v19h));
+    uint32x4_t abs4_hi = vreinterpretq_u32_s32(vabsq_s32(v20h));
+    uint32x4_t abs5_hi = vreinterpretq_u32_s32(vabsq_s32(v21h));
+    uint32x4_t abs6_hi = vreinterpretq_u32_s32(vabsq_s32(v22h));
+    uint32x4_t abs7_hi = vreinterpretq_u32_s32(vabsq_s32(v23h));
+
+    uint32x4_t max0_lo = vmaxq_u32(abs0_lo, abs4_lo);
+    uint32x4_t max1_lo = vmaxq_u32(abs1_lo, abs5_lo);
+    uint32x4_t max2_lo = vmaxq_u32(abs2_lo, abs6_lo);
+    uint32x4_t max3_lo = vmaxq_u32(abs3_lo, abs7_lo);
+
+    uint32x4_t max0_hi = vmaxq_u32(abs0_hi, abs4_hi);
+    uint32x4_t max1_hi = vmaxq_u32(abs1_hi, abs5_hi);
+    uint32x4_t max2_hi = vmaxq_u32(abs2_hi, abs6_hi);
+    uint32x4_t max3_hi = vmaxq_u32(abs3_hi, abs7_hi);
+
+    uint32x4_t sum0 = vaddq_u32(max0_lo, max0_hi);
+    uint32x4_t sum1 = vaddq_u32(max1_lo, max1_hi);
+    uint32x4_t sum2 = vaddq_u32(max2_lo, max2_hi);
+    uint32x4_t sum3 = vaddq_u32(max3_lo, max3_hi);
+
+    out0 = vaddq_u32(sum0, sum1);
+    out1 = vaddq_u32(sum2, sum3);
 
 
 #endif
@@ -700,24 +711,19 @@ static inline void _satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const
 
 int pixel_satd_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
 {
-    int16x8_t v30, v31;
     int16x8_t v0, v1, v2, v3;
 
     _satd_8x8_neon(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3);
 
-#if !(HIGH_BIT_DEPTH)
-    v30 = vaddq_u16(v0, v1);
-    v31 = vaddq_u16(v2, v3);
+    uint16x8_t v30 = vaddq_u16(vreinterpretq_u16_s16(v0), vreinterpretq_u16_s16(v1));
+    uint16x8_t v31 = vaddq_u16(vreinterpretq_u16_s16(v2), vreinterpretq_u16_s16(v3));
 
+#if !(HIGH_BIT_DEPTH)
     uint16x8_t sum = vaddq_u16(v30, v31);
-    return vaddvq_s32(vpaddlq_u16(sum));
+    return vaddvq_u32(vpaddlq_u16(sum));
 #else
-
-    v30 = vaddq_u16(v0, v1);
-    v31 = vaddq_u16(v2, v3);
-
-    int32x4_t sum = vpaddlq_u16(v30);
+    uint32x4_t sum = vpaddlq_u16(v30);
     sum = vpadalq_u16(sum, v31);
-    return vaddvq_s32(sum);
+    return vaddvq_u32(sum);
 #endif
 }
@@ -732,10 +738,10 @@ int pixel_sa8d_8x8_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pi
     _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
 
 #if HIGH_BIT_DEPTH
-    int32x4_t s = vaddq_u32(res0, res1);
+    uint32x4_t s = vaddq_u32(res0, res1);
     return (vaddvq_u32(s) + 1) >> 1;
 #else
-    return (vaddlvq_s16(vaddq_u16(res0, res1)) + 1) >> 1;
+    return (vaddlvq_u16(vaddq_u16(res0, res1)) + 1) >> 1;
 #endif
 }
 
@@ -748,7 +754,7 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
     int16x8_t v0, v1, v2, v3;
     int16x8_t v20, v21, v22, v23;
     sa8d_out_type res0, res1;
-    int32x4_t v30, v31;
+    uint32x4_t v30, v31;
 
     _sub_8x8_fly(pix1, stride_pix1, pix2, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
     _sa8d_8x8_neon_end(v0, v1, v2, v3, v20, v21, v22, v23, res0, res1);
@@ -757,7 +763,7 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
     v30 = vpaddlq_u16(res0);
     v31 = vpaddlq_u16(res1);
 #else
-    v30 = vaddq_s32(res0, res1);
+    v30 = vaddq_u32(res0, res1);
 #endif
 
     _sub_8x8_fly(pix1 + 8, stride_pix1, pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21, v22, v23);
@@ -767,7 +773,7 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
     v30 = vpadalq_u16(v30, res0);
     v31 = vpadalq_u16(v31, res1);
 #else
-    v31 = vaddq_s32(res0, res1);
+    v31 = vaddq_u32(res0, res1);
 #endif
 
 
@@ -779,8 +785,8 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
     v30 = vpadalq_u16(v30, res0);
     v31 = vpadalq_u16(v31, res1);
 #else
-    v30 = vaddq_s32(v30, res0);
-    v31 = vaddq_s32(v31, res1);
+    v30 = vaddq_u32(v30, res0);
+    v31 = vaddq_u32(v31, res1);
 #endif
 
     _sub_8x8_fly(pix1 + 8 * stride_pix1 + 8, stride_pix1, pix2 + 8 * stride_pix2 + 8, stride_pix2, v0, v1, v2, v3, v20, v21,
@@ -791,8 +797,8 @@ int pixel_sa8d_16x16_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *
     v30 = vpadalq_u16(v30, res0);
     v31 = vpadalq_u16(v31, res1);
 #else
-    v30 = vaddq_s32(v30, res0);
-    v31 = vaddq_s32(v31, res1);
+    v30 = vaddq_u32(v30, res0);
+    v31 = vaddq_u32(v31, res1);
 #endif
 
     v30 = vaddq_u32(v30, v31);
@@ -840,19 +846,18 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
         {
             uint16x8_t p1 = vld1q_u16(pix1 + x);
             uint16x8_t p2 = vld1q_u16(pix2 + x);
-            vsum16_1 = vabaq_s16(vsum16_1, p1, p2);
-
+            vsum16_1 = vabaq_u16(vsum16_1, p1, p2);
         }
         if (lx & 4)
        {
             uint16x4_t p1 = vld1_u16(pix1 + x);
             uint16x4_t p2 = vld1_u16(pix2 + x);
-            sum += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
+            sum += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p2));
             x += 4;
         }
         if (lx >= 4)
         {
-            sum += vaddlvq_s16(vsum16_1);
+            sum += vaddlvq_u16(vsum16_1);
         }
 
 #else
@@ -924,10 +929,9 @@ void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
             uint16x8_t p2 = vld1q_u16(pix2 + x);
             uint16x8_t p3 = vld1q_u16(pix3 + x);
             uint16x8_t p4 = vld1q_u16(pix4 + x);
-            vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
-            vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
-            vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
-
+            vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
+            vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
+            vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
         }
         if (lx & 4)
         {
@@ -935,16 +939,16 @@ void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
             uint16x4_t p2 = vld1_u16(pix2 + x);
             uint16x4_t p3 = vld1_u16(pix3 + x);
             uint16x4_t p4 = vld1_u16(pix4 + x);
-            res[0] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p2));
-            res[1] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p3));
-            res[2] += vaddlv_s16(vaba_s16(vdup_n_s16(0), p1, p4));
+            res[0] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p2));
+            res[1] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p3));
+            res[2] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p4));
             x += 4;
         }
         if (lx >= 4)
         {
-            res[0] += vaddlvq_s16(vsum16_0);
-            res[1] += vaddlvq_s16(vsum16_1);
-            res[2] += vaddlvq_s16(vsum16_2);
+            res[0] += vaddlvq_u16(vsum16_0);
+            res[1] += vaddlvq_u16(vsum16_1);
+            res[2] += vaddlvq_u16(vsum16_2);
         }
 
 #else
@@ -1009,7 +1013,8 @@ template<int lx, int ly>
 void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
                  intptr_t frefstride, int32_t *res)
 {
-    int32x4_t result = {0};
+    uint32x4_t result = vdupq_n_u32(0);
+
     for (int y = 0; y < ly; y++)
     {
         int x = 0;
@@ -1025,14 +1030,14 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
             uint16x8x2_t p3 = vld1q_u16_x2(&pix3[x]);
             uint16x8x2_t p4 = vld1q_u16_x2(&pix4[x]);
             uint16x8x2_t p5 = vld1q_u16_x2(&pix5[x]);
-            vsum16_0 = vabaq_s16(vsum16_0, p1.val[0], p2.val[0]);
-            vsum16_1 = vabaq_s16(vsum16_1, p1.val[0], p3.val[0]);
-            vsum16_2 = vabaq_s16(vsum16_2, p1.val[0], p4.val[0]);
-            vsum16_3 = vabaq_s16(vsum16_3, p1.val[0], p5.val[0]);
-            vsum16_0 = vabaq_s16(vsum16_0, p1.val[1], p2.val[1]);
-            vsum16_1 = vabaq_s16(vsum16_1, p1.val[1], p3.val[1]);
-            vsum16_2 = vabaq_s16(vsum16_2, p1.val[1], p4.val[1]);
-            vsum16_3 = vabaq_s16(vsum16_3, p1.val[1], p5.val[1]);
+            vsum16_0 = vabaq_u16(vsum16_0, p1.val[0], p2.val[0]);
+            vsum16_1 = vabaq_u16(vsum16_1, p1.val[0], p3.val[0]);
+            vsum16_2 = vabaq_u16(vsum16_2, p1.val[0], p4.val[0]);
+            vsum16_3 = vabaq_u16(vsum16_3, p1.val[0], p5.val[0]);
+            vsum16_0 = vabaq_u16(vsum16_0, p1.val[1], p2.val[1]);
+            vsum16_1 = vabaq_u16(vsum16_1, p1.val[1], p3.val[1]);
+            vsum16_2 = vabaq_u16(vsum16_2, p1.val[1], p4.val[1]);
+            vsum16_3 = vabaq_u16(vsum16_3, p1.val[1], p5.val[1]);
         }
         if (lx & 8)
         {
@@ -1041,10 +1046,10 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
             uint16x8_t p2 = vld1q_u16(pix2 + x);
             uint16x8_t p3 = vld1q_u16(pix3 + x);
             uint16x8_t p4 = vld1q_u16(pix4 + x);
             uint16x8_t p5 = vld1q_u16(pix5 + x);
-            vsum16_0 = vabaq_s16(vsum16_0, p1, p2);
-            vsum16_1 = vabaq_s16(vsum16_1, p1, p3);
-            vsum16_2 = vabaq_s16(vsum16_2, p1, p4);
-            vsum16_3 = vabaq_s16(vsum16_3, p1, p5);
+            vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
+            vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
+            vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
+            vsum16_3 = vabaq_u16(vsum16_3, p1, p5);
             x += 8;
         }
         if (lx & 4)
@@ -1052,15 +1057,17 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
         {
            /* This is equivalent to getting the absolute difference of pix1[x] with each of
             * pix2 - pix5, then summing across the vector (4 values each) and adding the
            * result to result. */
-            uint16x8_t p1 = vreinterpretq_s16_u64(
-                vld1q_dup_u64((uint64_t *)&pix1[x]));
-            uint16x8_t p2_3 = vcombine_s16(vld1_u16(pix2 + x), vld1_u16(pix3 + x));
-            uint16x8_t p4_5 = vcombine_s16(vld1_u16(pix4 + x), vld1_u16(pix5 + x));
+            uint16x8_t p1 = vreinterpretq_u16_u64(
+                vld1q_dup_u64((uint64_t *)&pix1[x]));
+            uint16x8_t p2_3 = vcombine_u16(vld1_u16(pix2 + x),
+                                           vld1_u16(pix3 + x));
+            uint16x8_t p4_5 = vcombine_u16(vld1_u16(pix4 + x),
+                                           vld1_u16(pix5 + x));
 
             uint16x8_t a = vabdq_u16(p1, p2_3);
             uint16x8_t b = vabdq_u16(p1, p4_5);
-            result = vpadalq_s16(result, vpaddq_s16(a, b));
+            result = vpadalq_u16(result, vpaddq_u16(a, b));
             x += 4;
         }
         if (lx >= 4)
@@ -1105,7 +1112,7 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
         }
         if (lx & 4)
         {
-            uint8x16_t p1 = vreinterpretq_u32_u8(
+            uint8x16_t p1 = vreinterpretq_u8_u32(
                 vld1q_dup_u32((uint32_t *)&pix1[x]));
 
             uint32x4_t p_x4 = vdupq_n_u32(0);
@@ -1114,11 +1121,11 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
             p_x4 = vld1q_lane_u32((uint32_t *)&pix4[x], p_x4, 2);
             p_x4 = vld1q_lane_u32((uint32_t *)&pix5[x], p_x4, 3);
 
-            uint16x8_t sum = vabdl_u8(vget_low_u8(p1), vget_low_u8(p_x4));
-            uint16x8_t sum2 = vabdl_high_u8(p1, p_x4);
+            uint16x8_t sum = vabdl_u8(vget_low_u8(p1),
+                                      vget_low_u8(vreinterpretq_u8_u32(p_x4)));
+            uint16x8_t sum2 = vabdl_high_u8(p1, vreinterpretq_u8_u32(p_x4));
 
-            uint16x8_t a = vpaddq_u16(sum, sum2);
-            result = vpadalq_u16(result, a);
+            result = vpadalq_u16(result, vpaddq_u16(sum, sum2));
         }
         if (lx >= 4)
         {
@@ -1143,7 +1150,8 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
         pix4 += frefstride;
         pix5 += frefstride;
     }
-    vst1q_s32(res, result);
+
+    vst1q_s32(res, vreinterpretq_s32_u32(result));
 }
 
 
@@ -1164,17 +1172,17 @@ sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t st
             {
                 // We have to cast to the 'real' type so that this block
                 // will compile for both low and high bitdepth.
-                const uint16_t *p1_u16 = (const uint16_t*)pix1;
-                const uint16_t *p2_u16 = (const uint16_t*)pix2;
-                tmp = vsubq_u16(vld1q_u16(p1_u16 + x), vld1q_u16(p2_u16 + x));
+                uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
+                uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
+                tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
             }
             else if (sizeof(T1) == 1 && sizeof(T2) == 1)
            {
                 // We have to cast to the 'real' type so that this block
                 // will compile for both low and high bitdepth.
-                const uint8_t *p1_u8 = (const uint8_t*)pix1;
-                const uint8_t *p2_u8 = (const uint8_t*)pix2;
-                tmp = vsubl_u8(vld1_u8(p1_u8 + x), vld1_u8(p2_u8 + x));
+                uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
+                uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
+                tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
             }
             else
            {
@@ -1191,17 +1199,18 @@ sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t st
        }
 
         if (sizeof(T1) == 2 && sizeof(T2) == 2)
        {
-            int32x4_t vsum = vaddq_u32(vsum1, vsum2);;
-            sum += vaddvq_u32(vsum);
-            vsum1 = vsum2 = vdupq_n_u16(0);
+            int32x4_t vsum = vaddq_s32(vsum1, vsum2);
+            sum += vaddvq_s32(vsum);
+            vsum1 = vsum2 = vdupq_n_s32(0);
        }
         pix1 += stride_pix1;
         pix2 += stride_pix2;
    }
 
-    int32x4_t vsum = vaddq_u32(vsum1, vsum2);
-    return sum + vaddvq_u32(vsum);
+    int32x4_t vsum = vaddq_s32(vsum1, vsum2);
+
+    return sum + vaddvq_s32(vsum);
 }
 
@@ -1214,9 +1223,10 @@ void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t st
         for (; (x + 8) <= bx; x += 8)
        {
 #if HIGH_BIT_DEPTH
-            vst1q_s16(a + x, vld1q_u16(b + x));
+            vst1q_s16(a + x, vreinterpretq_s16_u16(vld1q_u16(b + x)));
 #else
-            vst1q_s16(a + x, vmovl_u8(vld1_u8(b + x)));
+            int16x8_t in = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b + x)));
+            vst1q_s16(a + x, in);
 #endif
        }
         for (; x < bx; x++)
@@ -1283,9 +1293,11 @@ void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixe
         for (; (x + 8) <= bx; x += 8)
        {
 #if HIGH_BIT_DEPTH
-            vst1q_s16(a + x, vsubq_u16(vld1q_u16(b0 + x), vld1q_u16(b1 + x)));
+            uint16x8_t diff = vsubq_u16(vld1q_u16(b0 + x), vld1q_u16(b1 + x));
+            vst1q_s16(a + x, vreinterpretq_s16_u16(diff));
 #else
-            vst1q_s16(a + x, vsubl_u8(vld1_u8(b0 + x), vld1_u8(b1 + x)));
+            uint16x8_t diff = vsubl_u8(vld1_u8(b0 + x), vld1_u8(b1 + x));
+            vst1q_s16(a + x, vreinterpretq_s16_u16(diff));
 #endif
        }
         for (; x < bx; x++)
@@ -1312,13 +1324,13 @@ void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_
             int16x8_t b1e = vld1q_s16(b1 + x);
             int16x8_t b0e;
 #if HIGH_BIT_DEPTH
-            b0e = vld1q_u16(b0 + x);
+            b0e = vreinterpretq_s16_u16(vld1q_u16(b0 + x));
             t = vaddq_s16(b0e, b1e);
             t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
             t = vmaxq_s16(t, vdupq_n_s16(0));
-            vst1q_u16(a + x, t);
+            vst1q_u16(a + x, vreinterpretq_u16_s16(t));
 #else
-            b0e = vmovl_u8(vld1_u8(b0 + x));
+            b0e = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b0 + x)));
             t = vaddq_s16(b0e, b1e);
             vst1_u8(a + x, vqmovun_s16(t));
 #endif
@@ -1357,11 +1369,12 @@ void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t
             t2 = vaddq_s32(t2, addon);
             t1 = vshrq_n_s32(t1, shiftNum);
             t2 = vshrq_n_s32(t2, shiftNum);
-            int16x8_t t = vuzp1q_s16(t1, t2);
+            int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(t1),
+                                     vreinterpretq_s16_s32(t2));
 #if HIGH_BIT_DEPTH
             t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
             t = vmaxq_s16(t, vdupq_n_s16(0));
-            vst1q_u16(dst + x, t);
+            vst1q_u16(dst + x, vreinterpretq_u16_s16(t));
 #else
             vst1_u8(dst + x, vqmovun_s16(t));
 #endif
@@ -1393,9 +1406,9 @@ void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t
             uint16x8_t t = vrhaddq_u16(in0, in1);
             vst1q_u16(dst + x, t);
 #else
-            int16x8_t in0 = vmovl_u8(vld1_u8(src0 + x));
-            int16x8_t in1 = vmovl_u8(vld1_u8(src1 + x));
-            int16x8_t t = vrhaddq_s16(in0, in1);
+            uint16x8_t in0 = vmovl_u8(vld1_u8(src0 + x));
+            uint16x8_t in1 = vmovl_u8(vld1_u8(src1 + x));
+            uint16x8_t t = vrhaddq_u16(in0, in1);
             vst1_u8(dst + x, vmovn_u16(t));
 #endif
        }
@@ -1440,25 +1453,27 @@ uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
 {
     uint32_t sum = 0, sqr = 0;
 
-    int32x4_t vsqr = vdupq_n_s32(0);
+    uint32x4_t vsqr = vdupq_n_u32(0);
+
     for (int y = 0; y < size; y++)
     {
         int x = 0;
-        int16x8_t vsum = vdupq_n_s16(0);
+        uint16x8_t vsum = vdupq_n_u16(0);
         for (; (x + 8) <= size; x += 8)
        {
-            int16x8_t in;
+            uint16x8_t in;
             in = vmovl_u8(vld1_u8(pix + x));
             vsum = vaddq_u16(vsum, in);
-            vsqr = vmlal_s16(vsqr, vget_low_s16(in), vget_low_s16(in));
-            vsqr = vmlal_high_s16(vsqr, in, in);
+            vsqr = vmlal_u16(vsqr, vget_low_u16(in), vget_low_u16(in));
+            vsqr = vmlal_high_u16(vsqr, in, in);
        }
         for (; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
-        sum += vaddvq_s16(vsum);
+
+        sum += vaddvq_u16(vsum);
        pix += i_stride;
    }
@@ -1474,7 +1489,7 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i
         int x = 0;
         for (; (x + 8) < blockSize; x += 8)
        {
-            int16x8_t vfenc, vpred;
+            uint16x8_t vfenc, vpred;
 #if HIGH_BIT_DEPTH
             vfenc = vld1q_u16(fenc + x);
             vpred = vld1q_u16(pred + x);
@@ -1482,7 +1497,8 @@
             vfenc = vmovl_u8(vld1_u8(fenc + x));
             vpred = vmovl_u8(vld1_u8(pred + x));
 #endif
-            vst1q_s16(residual + x, vsubq_s16(vfenc, vpred));
+            int16x8_t res = vreinterpretq_s16_u16(vsubq_u16(vfenc, vpred));
+            vst1q_s16(residual + x, res);
        }
         for (; x < blockSize; x++)
        {
-- 
2.42.1
- const uint8_t *p1_u8 = (const uint8_t*)pix1; - const uint8_t *p2_u8 = (const uint8_t*)pix2; - tmp = vsubl_u8(vld1_u8(p1_u8 + x), vld1_u8(p2_u8 + x)); + uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x); + uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x); + tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2)); } else { @@ -1191,17 +1199,18 @@ sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t st if (sizeof(T1) == 2 && sizeof(T2) == 2) { - int32x4_t vsum = vaddq_u32(vsum1, vsum2);; - sum += vaddvq_u32(vsum); - vsum1 = vsum2 = vdupq_n_u16(0); + int32x4_t vsum = vaddq_s32(vsum1, vsum2); + sum += vaddvq_s32(vsum); + vsum1 = vsum2 = vdupq_n_s32(0); } pix1 += stride_pix1; pix2 += stride_pix2; } - int32x4_t vsum = vaddq_u32(vsum1, vsum2); - return sum + vaddvq_u32(vsum); + int32x4_t vsum = vaddq_s32(vsum1, vsum2); + + return sum + vaddvq_s32(vsum); } @@ -1214,9 +1223,10 @@ void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t st for (; (x + 8) <= bx; x += 8) { #if HIGH_BIT_DEPTH - vst1q_s16(a + x, vld1q_u16(b + x)); + vst1q_s16(a + x, vreinterpretq_s16_u16(vld1q_u16(b + x))); #else - vst1q_s16(a + x, vmovl_u8(vld1_u8(b + x))); + int16x8_t in = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b + x))); + vst1q_s16(a + x, in); #endif } for (; x < bx; x++) @@ -1283,9 +1293,11 @@ void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixe for (; (x + 8) <= bx; x += 8) { #if HIGH_BIT_DEPTH - vst1q_s16(a + x, vsubq_u16(vld1q_u16(b0 + x), vld1q_u16(b1 + x))); + uint16x8_t diff = vsubq_u16(vld1q_u16(b0 + x), vld1q_u16(b1 + x)); + vst1q_s16(a + x, vreinterpretq_s16_u16(diff)); #else - vst1q_s16(a + x, vsubl_u8(vld1_u8(b0 + x), vld1_u8(b1 + x))); + uint16x8_t diff = vsubl_u8(vld1_u8(b0 + x), vld1_u8(b1 + x)); + vst1q_s16(a + x, vreinterpretq_s16_u16(diff)); #endif } for (; x < bx; x++) @@ -1312,13 +1324,13 @@ void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_ int16x8_t b1e = vld1q_s16(b1 + x); int16x8_t b0e; #if HIGH_BIT_DEPTH - b0e = vld1q_u16(b0 + x); + b0e = vreinterpretq_s16_u16(vld1q_u16(b0 + x)); t = vaddq_s16(b0e, b1e); t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1)); t = vmaxq_s16(t, vdupq_n_s16(0)); - vst1q_u16(a + x, t); + vst1q_u16(a + x, vreinterpretq_u16_s16(t)); #else - b0e = vmovl_u8(vld1_u8(b0 + x)); + b0e = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b0 + x))); t = vaddq_s16(b0e, b1e); vst1_u8(a + x, vqmovun_s16(t)); #endif @@ -1357,11 +1369,12 @@ void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t t2 = vaddq_s32(t2, addon); t1 = vshrq_n_s32(t1, shiftNum); t2 = vshrq_n_s32(t2, shiftNum); - int16x8_t t = vuzp1q_s16(t1, t2); + int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(t1), + vreinterpretq_s16_s32(t2)); #if HIGH_BIT_DEPTH t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1)); t = vmaxq_s16(t, vdupq_n_s16(0)); - vst1q_u16(dst + x, t); + vst1q_u16(dst + x, vreinterpretq_u16_s16(t)); #else vst1_u8(dst + x, vqmovun_s16(t)); #endif @@ -1393,9 +1406,9 @@ void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t uint16x8_t t = vrhaddq_u16(in0, in1); vst1q_u16(dst + x, t); #else - int16x8_t in0 = vmovl_u8(vld1_u8(src0 + x)); - int16x8_t in1 = vmovl_u8(vld1_u8(src1 + x)); - int16x8_t t = vrhaddq_s16(in0, in1); + uint16x8_t in0 = vmovl_u8(vld1_u8(src0 + x)); + uint16x8_t in1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t t = vrhaddq_u16(in0, in1); vst1_u8(dst + x, vmovn_u16(t)); #endif } @@ -1440,25 +1453,27 @@ uint64_t pixel_var_neon(const uint8_t *pix, 
intptr_t i_stride) { uint32_t sum = 0, sqr = 0; - int32x4_t vsqr = vdupq_n_s32(0); + uint32x4_t vsqr = vdupq_n_u32(0); + for (int y = 0; y < size; y++) { int x = 0; - int16x8_t vsum = vdupq_n_s16(0); + uint16x8_t vsum = vdupq_n_u16(0); for (; (x + 8) <= size; x += 8) { - int16x8_t in; + uint16x8_t in; in = vmovl_u8(vld1_u8(pix + x)); vsum = vaddq_u16(vsum, in); - vsqr = vmlal_s16(vsqr, vget_low_s16(in), vget_low_s16(in)); - vsqr = vmlal_high_s16(vsqr, in, in); + vsqr = vmlal_u16(vsqr, vget_low_u16(in), vget_low_u16(in)); + vsqr = vmlal_high_u16(vsqr, in, in); } for (; x < size; x++) { sum += pix[x]; sqr += pix[x] * pix[x]; } - sum += vaddvq_s16(vsum); + + sum += vaddvq_u16(vsum); pix += i_stride; } @@ -1474,7 +1489,7 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i int x = 0; for (; (x + 8) < blockSize; x += 8) { - int16x8_t vfenc, vpred; + uint16x8_t vfenc, vpred; #if HIGH_BIT_DEPTH vfenc = vld1q_u16(fenc + x); vpred = vld1q_u16(pred + x); @@ -1482,7 +1497,8 @@ void getResidual_neon(const pixel *fenc, const pixel *pred, int16_t *residual, i vfenc = vmovl_u8(vld1_u8(fenc + x)); vpred = vmovl_u8(vld1_u8(pred + x)); #endif - vst1q_s16(residual + x, vsubq_s16(vfenc, vpred)); + int16x8_t res = vreinterpretq_s16_u16(vsubq_u16(vfenc, vpred)); + vst1q_s16(residual + x, res); } for (; x < blockSize; x++) { -- 2.42.1
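A note for reviewers on the core pattern: the vector-conversion warnings fixed here come from handing a signed vector (for example the int16x8_t returned by vabsq_s16) straight to an unsigned intrinsic. vreinterpretq_* only relabels the lanes and compiles to no instruction, so making the conversion explicit costs nothing at runtime. A minimal standalone sketch of that shape (the helper name abs_max_sum is mine, not from the file):

    #include <arm_neon.h>

    /* The absolute values fit the unsigned lane range, so reinterpreting
     * the bits as u16 is safe; the cast exists only for the type checker. */
    static inline uint32_t abs_max_sum(int16x8_t a, int16x8_t b)
    {
        uint16x8_t abs_a = vreinterpretq_u16_s16(vabsq_s16(a));
        uint16x8_t abs_b = vreinterpretq_u16_s16(vabsq_s16(b));
        /* unsigned element-wise max, then widening horizontal add to u32 */
        return vaddlvq_u16(vmaxq_u16(abs_a, abs_b));
    }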
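The SAD kernels follow the same rule: the accumulator, the absolute-difference-accumulate step and the final reduction are now uniformly unsigned. A self-contained sketch of that shape, assuming a hypothetical fixed 8-wide, 16-bit kernel (the name and the fixed width are mine):

    #include <arm_neon.h>
    #include <stdint.h>

    /* u16 lane accumulator, like the vsum16_* variables in the patch; the
     * lanes cannot overflow for the small block heights these kernels use. */
    static uint32_t sad_8xh(const uint16_t *p1, intptr_t s1,
                            const uint16_t *p2, intptr_t s2, int rows)
    {
        uint16x8_t acc = vdupq_n_u16(0);
        for (int y = 0; y < rows; y++)
        {
            /* acc += |p1 - p2| per lane, unsigned throughout */
            acc = vabaq_u16(acc, vld1q_u16(p1), vld1q_u16(p2));
            p1 += s1;
            p2 += s2;
        }
        return vaddlvq_u16(acc); /* widening horizontal add -> uint32_t */
    }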
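Finally, the u16 -> u32 reductions in the sa8d paths: vpaddlq_u16 opens a u32 accumulator from pairwise sums and vpadalq_u16 folds further u16 vectors into it, which is why v30/v31 become uint32x4_t above. A sketch under the same assumptions (helper name is mine):

    #include <arm_neon.h>
    #include <stdint.h>

    static uint32_t reduce_two_u16x8(uint16x8_t r0, uint16x8_t r1)
    {
        uint32x4_t acc = vpaddlq_u16(r0); /* 8 x u16 -> 4 x u32, pairwise */
        acc = vpadalq_u16(acc, r1);       /* pairwise add-accumulate of r1 */
        return vaddvq_u32(acc);           /* across-lane sum -> scalar */
    }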