From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Mon, 20 Jan 2025 09:16:19 +0100
Subject: [PATCH 04/10] AArch64: Optimise HBD interp_horiz_ps_neon for 10-bit

Optimise the HBD 4-tap and 8-tap Neon implementations of
interp_horiz_ps_neon when X265_DEPTH is 10.
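
The gain at 10 bit comes from keeping the filter arithmetic in 16-bit lanes
instead of widening to 32 bits. As a rough scalar model of the 4-tap path
(not the patch code: the helper names are made up, and SHIFT_INTERP_PS == 2
and IF_INTERNAL_OFFS == 8192 are assumed from the usual x265 10-bit
definitions):

// Scalar sketch of the 4-tap pixel-to-short filtering at 10 bit.
// Assumed constants: SHIFT_INTERP_PS == 2, IF_INTERNAL_OFFS == 8192.
#include <cstdint>

static const int SHIFT_PS_10BIT = 2;
static const int INTERNAL_OFFS = 8192;

// Reference form of the pixel-to-short filter: accumulate the taps, add the
// (negative) offset, then shift down.
static int16_t ref_coeff4(const uint16_t s[4])
{
    int sum = -4 * s[0] + 36 * s[1] + 36 * s[2] - 4 * s[3];
    int offset = -(INTERNAL_OFFS << SHIFT_PS_10BIT);
    return (int16_t)((sum + offset) >> SHIFT_PS_10BIT);
}

// coeffIdx == 4 ({ -4, 36, 36, -4 }): every tap is a multiple of 4, so
// dividing the taps and the offset by 4 absorbs the >> 2 completely and the
// result (between -10238 and 10222 for 10-bit input) fits a 16-bit lane.
static int16_t opt_coeff4(const uint16_t s[4])
{
    return (int16_t)(9 * (s[1] + s[2]) - (s[0] + s[3]) - INTERNAL_OFFS);
}

// The other chroma filters only have even taps, so halving them (and the
// offset) leaves a single >> 1, again without widening past 16 bits.
static int16_t opt_even_taps(const uint16_t s[4], const int16_t c[4])
{
    int sum = (c[0] / 2) * s[0] + (c[1] / 2) * s[1]
            + (c[2] / 2) * s[2] + (c[3] / 2) * s[3];
    int offset = -(INTERNAL_OFFS << (SHIFT_PS_10BIT - 1));
    return (int16_t)((sum + offset) >> (SHIFT_PS_10BIT - 1));
}

The Neon kernels in the patch evaluate the same expressions in unsigned
16-bit lanes, folding the negative offset in via modulo-2^16 wrap-around and
reinterpreting the result as int16_t.
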
The 4-tap filter implementation is up to 45% faster when coeffIdx==4 and
up to 40% faster for the other filter values compared to the existing Neon
implementation. The 8-tap filter implementation is up to 25% faster when
coeffIdx is 1 or 3 compared to the existing Neon implementation. The
existing high bitdepth Neon implementation is retained for use with 12-bit
input.
---
 source/common/aarch64/filter-prim.cpp | 228 +++++++++++++++++++++++++-
 1 file changed, 224 insertions(+), 4 deletions(-)

diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 0ed3fb78c..945aa9845 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -2712,6 +2712,66 @@ void inline interp8_horiz_pp_neon(const pixel *src, intptr_t srcStride,
     }
 }
 
+#if X265_DEPTH == 10
+template<int coeff4>
+void inline filter4_ps_u16x4(const uint16x4_t *s, const uint16x4_t f,
+                             const uint16x8_t offset, int16x4_t &d)
+{
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x4_t sum03 = vadd_u16(s[0], s[3]);
+        uint16x4_t sum12 = vadd_u16(s[1], s[2]);
+
+        int16x4_t sum =
+            vreinterpret_s16_u16(vmla_n_u16(vget_low_u16(offset), sum12, 9));
+        d = vsub_s16(sum, vreinterpret_s16_u16(sum03));
+    }
+    else
+    {
+        uint16x4_t sum = vmls_lane_u16(vget_low_u16(offset), s[0], f, 0);
+        sum = vmla_lane_u16(sum, s[1], f, 1);
+        sum = vmla_lane_u16(sum, s[2], f, 2);
+        sum = vmls_lane_u16(sum, s[3], f, 3);
+
+        // We halved filter values so -1 from right shift.
+        d = vshr_n_s16(vreinterpret_s16_u16(sum), SHIFT_INTERP_PS - 1);
+    }
+}
+
+template<bool coeff4>
+void inline filter4_ps_u16x8(const uint16x8_t *s, const uint16x4_t f,
+                             const uint16x8_t offset, int16x8_t &d)
+{
+    if (coeff4)
+    {
+        // { -4, 36, 36, -4 }
+        // Filter values are divisible by 4, factor that out in order to only
+        // need a multiplication by 9 and a subtraction (which is a
+        // multiplication by -1).
+        uint16x8_t sum03 = vaddq_u16(s[0], s[3]);
+        uint16x8_t sum12 = vaddq_u16(s[1], s[2]);
+
+        int16x8_t sum =
+            vreinterpretq_s16_u16(vmlaq_n_u16(offset, sum12, 9));
+        d = vsubq_s16(sum, vreinterpretq_s16_u16(sum03));
+    }
+    else
+    {
+        uint16x8_t sum = vmlsq_lane_u16(offset, s[0], f, 0);
+        sum = vmlaq_lane_u16(sum, s[1], f, 1);
+        sum = vmlaq_lane_u16(sum, s[2], f, 2);
+        sum = vmlsq_lane_u16(sum, s[3], f, 3);
+
+        // We halved filter values so -1 from right shift.
+        d = vshrq_n_s16(vreinterpretq_s16_u16(sum), SHIFT_INTERP_PS - 1);
+    }
+}
+
+#else // X265_DEPTH == 12
 template<int coeff4>
 void inline filter4_ps_u16x4(const uint16x4_t *s, const uint16x4_t f,
                              const uint32x4_t offset, int16x4_t &d)
@@ -2787,25 +2847,40 @@ void inline filter4_ps_u16x8(const uint16x8_t *s, const uint16x4_t f,
     }
 }
 
+#endif // X265_DEPTH == 10
+
 template<int coeff4, int width, int height>
 void interp4_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
                            intptr_t dstStride, int coeffIdx, int isRowExt)
 {
     const int N_TAPS = 4;
     int blkheight = height;
-    const uint16x4_t filter = vreinterpret_u16_s16(
+    uint16x4_t filter = vreinterpret_u16_s16(
         vabs_s16(vld1_s16(x265::g_chromaFilter[coeffIdx])));
-    uint32x4_t offset;
+    uint32_t offset_u32;
 
     if (coeff4)
     {
         // The -2 is needed because we will divide the filter values by 4.
-        offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << (SHIFT_INTERP_PS - 2));
+        offset_u32 = (unsigned)-IF_INTERNAL_OFFS << (SHIFT_INTERP_PS - 2);
     }
     else
     {
-        offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
+        offset_u32 = (unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS;
     }
 
+#if X265_DEPTH == 10
+    if (!coeff4)
+    {
+        // All filter values are even, halve them to avoid needing to widen to
+        // 32-bit elements in filter kernels.
+        filter = vshr_n_u16(filter, 1);
+        offset_u32 >>= 1;
+    }
+
+    const uint16x8_t offset = vdupq_n_u16((uint16_t)offset_u32);
+#else
+    const uint32x4_t offset = vdupq_n_u32(offset_u32);
+#endif // X265_DEPTH == 10
 
     if (isRowExt)
     {
@@ -2881,6 +2956,149 @@ void interp4_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
     }
 }
 
+#if X265_DEPTH == 10
+template<int coeffIdx>
+void inline filter8_ps_u16x4(const uint16x4_t *s, int16x4_t &d,
+                             uint32x4_t offset, uint16x8_t filter)
+{
+    uint16x4_t offset_u16 = vdup_n_u16((uint16_t)vgetq_lane_u32(offset, 0));
+
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x4_t sum012456 = vsub_u16(s[6], s[0]);
+        sum012456 = vmla_laneq_u16(sum012456, s[1], filter, 1);
+        sum012456 = vmls_laneq_u16(sum012456, s[2], filter, 2);
+        sum012456 = vmla_laneq_u16(sum012456, s[4], filter, 4);
+        sum012456 = vmls_laneq_u16(sum012456, s[5], filter, 5);
+
+        uint16x4_t sum3 =
+            vmla_laneq_u16(offset_u16, s[3], filter, 3);
+
+        int32x4_t sum = vaddl_s16(vreinterpret_s16_u16(sum3),
+                                  vreinterpret_s16_u16(sum012456));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x4_t sum07 = vadd_u16(s[0], s[7]);
+        uint16x4_t sum16 = vadd_u16(s[1], s[6]);
+        uint16x4_t sum25 = vadd_u16(s[2], s[5]);
+        uint16x4_t sum34 = vadd_u16(s[3], s[4]);
+
+        uint16x4_t sum0167 = vshl_n_u16(sum16, 2);
+        sum0167 = vsub_u16(sum0167, sum07);
+
+        uint32x4_t sum2345 = vmlal_laneq_u16(offset, sum34, filter, 3);
+        sum2345 = vmlsl_laneq_u16(sum2345, sum25, filter, 2);
+
+        int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum2345),
+                                  vreinterpret_s16_u16(sum0167));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x4_t sum123567 = vsub_u16(s[1], s[7]);
+        sum123567 = vmls_laneq_u16(sum123567, s[2], filter, 2);
+        sum123567 = vmla_laneq_u16(sum123567, s[3], filter, 3);
+        sum123567 = vmla_laneq_u16(sum123567, s[6], filter, 6);
+        sum123567 = vmls_laneq_u16(sum123567, s[5], filter, 5);
+
+        uint16x4_t sum4 =
+            vmla_laneq_u16(offset_u16, s[4], filter, 4);
+
+        int32x4_t sum = vaddl_s16(vreinterpret_s16_u16(sum4),
+                                  vreinterpret_s16_u16(sum123567));
+
+        d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+    }
+}
+
+template<int coeffIdx>
+void inline filter8_ps_u16x8(const uint16x8_t *s, int16x8_t &d,
+                             uint32x4_t offset, uint16x8_t filter)
+{
+    uint16x8_t offset_u16 = vdupq_n_u16((uint16_t)vgetq_lane_u32(offset, 0));
+
+    if (coeffIdx == 1)
+    {
+        // { -1, 4, -10, 58, 17, -5, 1, 0 }
+        uint16x8_t sum012456 = vsubq_u16(s[6], s[0]);
+        sum012456 = vmlaq_laneq_u16(sum012456, s[1], filter, 1);
+        sum012456 = vmlsq_laneq_u16(sum012456, s[2], filter, 2);
+        sum012456 = vmlaq_laneq_u16(sum012456, s[4], filter, 4);
+        sum012456 = vmlsq_laneq_u16(sum012456, s[5], filter, 5);
+
+        uint16x8_t sum3 =
+            vmlaq_laneq_u16(offset_u16, s[3], filter, 3);
+
+        int32x4_t sum_lo = vaddl_s16(vget_low_s16(vreinterpretq_s16_u16(sum3)),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum012456)));
+        int32x4_t sum_hi = vaddl_s16(vget_high_s16(vreinterpretq_s16_u16(sum3)),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum012456)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+    else if (coeffIdx == 2)
+    {
+        // { -1, 4, -11, 40, 40, -11, 4, -1 }
+        uint16x8_t sum07 = vaddq_u16(s[0], s[7]);
+        uint16x8_t sum16 = vaddq_u16(s[1], s[6]);
+        uint16x8_t sum25 = vaddq_u16(s[2], s[5]);
+        uint16x8_t sum34 = vaddq_u16(s[3], s[4]);
+
+        uint16x8_t sum0167 = vshlq_n_u16(sum16, 2);
+        sum0167 = vsubq_u16(sum0167, sum07);
+
+        uint32x4_t sum2345_lo = vmlal_laneq_u16(offset, vget_low_u16(sum34),
+                                                filter, 3);
+        sum2345_lo = vmlsl_laneq_u16(sum2345_lo, vget_low_u16(sum25),
+                                     filter, 2);
+
+        uint32x4_t sum2345_hi = vmlal_laneq_u16(offset, vget_high_u16(sum34),
+                                                filter, 3);
+        sum2345_hi = vmlsl_laneq_u16(sum2345_hi, vget_high_u16(sum25),
+                                     filter, 2);
+
+        int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum2345_lo),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum0167)));
+        int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum2345_hi),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum0167)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+    else
+    {
+        // { 0, 1, -5, 17, 58, -10, 4, -1 }
+        uint16x8_t sum123567 = vsubq_u16(s[1], s[7]);
+        sum123567 = vmlsq_laneq_u16(sum123567, s[2], filter, 2);
+        sum123567 = vmlaq_laneq_u16(sum123567, s[3], filter, 3);
+        sum123567 = vmlaq_laneq_u16(sum123567, s[6], filter, 6);
+        sum123567 = vmlsq_laneq_u16(sum123567, s[5], filter, 5);
+
+        uint16x8_t sum4 =
+            vmlaq_laneq_u16(offset_u16, s[4], filter, 4);
+
+        int32x4_t sum_lo = vaddl_s16(vget_low_s16(vreinterpretq_s16_u16(sum4)),
+                                     vget_low_s16(vreinterpretq_s16_u16(sum123567)));
+        int32x4_t sum_hi = vaddl_s16(vget_high_s16(vreinterpretq_s16_u16(sum4)),
+                                     vget_high_s16(vreinterpretq_s16_u16(sum123567)));
+
+        int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+        int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+        d = vcombine_s16(d_lo, d_hi);
+    }
+}
+
+#else // X265_DEPTH == 12
 template<int coeffIdx>
 void inline filter8_ps_u16x4(const uint16x4_t *s, int16x4_t &d,
                              uint32x4_t offset, uint16x8_t filter)
@@ -3023,6 +3241,8 @@ void inline filter8_ps_u16x8(const uint16x8_t *s, int16x8_t &d,
     }
 }
 
+#endif // X265_DEPTH == 10
+
 template<int coeffIdx, int width, int height>
 void interp8_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
                            intptr_t dstStride, int isRowExt)
-- 
2.39.5 (Apple Git-154)
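
The 8-tap path keeps most of the accumulation in 16-bit lanes as well: the
taps are split into groups whose partial sums fit a signed 16-bit lane for
10-bit input, and only the remaining work (the large taps and the final
combine) is done at 32 bits before the narrowing shift. A scalar sketch of
the coeffIdx == 2 grouping (illustrative only, not the patch code; the same
constants as above are assumed, SHIFT_INTERP_PS == 2 and
IF_INTERNAL_OFFS == 8192):

// Scalar sketch of the 10-bit 8-tap half-pel path (coeffIdx == 2),
// filter { -1, 4, -11, 40, 40, -11, 4, -1 }.
#include <cstdint>

static int16_t sketch_luma_halfpel(const uint16_t s[8])
{
    // Small symmetric taps: 4 * (s1 + s6) - (s0 + s7) is at most 8184 for
    // 10-bit input, so it fits a 16-bit lane (sum0167 in the kernel).
    int sum0167 = 4 * (s[1] + s[6]) - (s[0] + s[7]);

    // Large symmetric taps plus the offset are accumulated at 32 bits, as
    // the widening vmlal/vmlsl pair does for sum2345 in the kernel.
    int32_t sum2345 = -(8192 << 2) + 40 * (s[3] + s[4]) - 11 * (s[2] + s[5]);

    // One widening add, then the narrowing shift down to int16_t.
    return (int16_t)((sum2345 + sum0167) >> 2);
}

For coeffIdx 1 and 3 the same idea applies, except that the offset is folded
into the 58-tap product inside a 16-bit lane and the two 16-bit partial sums
are combined with a single widening vaddl_s16.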