Subject: [PATCH 1/6] AArch64: Optimize interp8_vert_sp_neon impl
From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Tue, 3 Jun 2025 11:42:26 +0200

Optimize the interp8_vert_sp_neon function by replacing the existing right
shift by 12 and narrowing instruction with a table lookup instruction that
imitates a right shift by 8, followed by a narrowing right shift by 4. This
is possible because the maximum value produced by the filtering fits into
24 bits.
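As an illustration (not part of the patch), the sketch below is a minimal standalone check of why the table lookup can stand in for the wider shift: for an accumulator that fits in signed 24 bits, extracting bytes 1 and 2 of each little-endian 32-bit lane is an exact ">> 8" narrowed to 16 bits, and vqshrun_n_s16(..., 4) then supplies the remaining ">> 4" together with the unsigned saturation that vqmovun_s16 previously performed. The helper and byte table mirror the ones added by the patch; main() and its test values are illustrative assumptions only, and the snippet builds only for AArch64 targets.

// Standalone sketch: verify old path (>> 12, saturate) == new path (tbl + >> 4).
// Assumes AArch64 NEON; compile with e.g. g++ -O2 on an AArch64 host.
#include <arm_neon.h>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Byte indices copied from vert_shr_tbl in the patch: bytes 1 and 2 of each
// little-endian 32-bit lane, i.e. bits [23:8].
static const uint8_t shr_tbl_bytes[16] = {
    1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
};

// Same shape as the patch's vtbl2q_s32_s16 helper.
static inline int16x8_t tbl2_s32_s16(int32x4_t a, int32x4_t b, uint8x16_t idx)
{
    uint8x16x2_t ab;
    ab.val[0] = vreinterpretq_u8_s32(a);
    ab.val[1] = vreinterpretq_u8_s32(b);
    return vreinterpretq_s16_u8(vqtbl2q_u8(ab, idx));
}

int main()
{
    // Example accumulator values, all within the 24-bit range the commit
    // message relies on (illustrative only).
    const int32_t lo_vals[4] = { 0, 4095, 65536, (1 << 23) - 1 };
    const int32_t hi_vals[4] = { 4096, 8191, 123456, 1000000 };
    const int32x4_t lo = vld1q_s32(lo_vals);
    const int32x4_t hi = vld1q_s32(hi_vals);

    // Old path: narrowing shift right by 12, then saturate to u8.
    const int16x8_t old_s16 = vcombine_s16(vshrn_n_s32(lo, 12), vshrn_n_s32(hi, 12));
    const uint8x8_t old_u8 = vqmovun_s16(old_s16);

    // New path: table lookup acts as ">> 8" plus narrow, then a saturating ">> 4".
    const uint8x16_t shr_tbl = vld1q_u8(shr_tbl_bytes);
    const int16x8_t new_s16 = tbl2_s32_s16(lo, hi, shr_tbl);
    const uint8x8_t new_u8 = vqshrun_n_s16(new_s16, 4);

    uint8_t a[8], b[8];
    vst1_u8(a, old_u8);
    vst1_u8(b, new_u8);
    for (int i = 0; i < 8; i++)
    {
        printf("lane %d: old=%u new=%u\n", i, a[i], b[i]);
        assert(a[i] == b[i]);
    }
    return 0;
}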
This optimization gives a performance uplift of up to 9%.

---
 source/common/aarch64/filter-prim.cpp | 78 +++++++++++++--------------
 source/common/aarch64/filter-prim.h   | 11 ++++
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 5e069695e..e4f4e4773 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -32,6 +32,14 @@
 #include <arm_neon.h>

 namespace {
+#if !HIGH_BIT_DEPTH
+// This is to use with vtbl2q_s32_s16.
+// Extract the middle two bytes from each 32-bit element in a vector, using these byte
+// indices.
+static const uint8_t vert_shr_tbl[16] = {
+    1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+};
+#endif

 #if HIGH_BIT_DEPTH
 #define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
@@ -1901,14 +1909,16 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
     assert(X265_DEPTH == 8);
     const int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     const int shift = IF_FILTER_PREC + headRoom;
-    const int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS <<
-                       IF_FILTER_PREC);
+    // Subtract 8 from shift since we account for that in table lookups.
+    const int shift_offset = shift - 8;
+    const int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
     const int N_TAPS = 8;

     src -= (N_TAPS / 2 - 1) * srcStride;

     const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]);
     const int32x4_t c = vdupq_n_s32(offset);
+    const uint8x16_t shr_tbl = vld1q_u8(vert_shr_tbl);

     if (width % 8 != 0)
     {
@@ -1925,28 +1935,23 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
         {
             load_s16x8xn<4>(s, srcStride, in + 7);

-            int32x4_t sum_lo[4];
-            int32x4_t sum_hi[4];
+            int32x4_t sum_lo[4], sum_hi[4];
             filter8_s16x8<coeffIdx>(in + 0, filter, c, sum_lo[0], sum_hi[0]);
             filter8_s16x8<coeffIdx>(in + 1, filter, c, sum_lo[1], sum_hi[1]);
             filter8_s16x8<coeffIdx>(in + 2, filter, c, sum_lo[2], sum_hi[2]);
             filter8_s16x8<coeffIdx>(in + 3, filter, c, sum_lo[3], sum_hi[3]);

             int16x8_t sum[4];
-            sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], shift),
-                                  vshrn_n_s32(sum_hi[0], shift));
-            sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], shift),
-                                  vshrn_n_s32(sum_hi[1], shift));
-            sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], shift),
-                                  vshrn_n_s32(sum_hi[2], shift));
-            sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], shift),
-                                  vshrn_n_s32(sum_hi[3], shift));
+            sum[0] = vtbl2q_s32_s16(sum_lo[0], sum_hi[0], shr_tbl);
+            sum[1] = vtbl2q_s32_s16(sum_lo[1], sum_hi[1], shr_tbl);
+            sum[2] = vtbl2q_s32_s16(sum_lo[2], sum_hi[2], shr_tbl);
+            sum[3] = vtbl2q_s32_s16(sum_lo[3], sum_hi[3], shr_tbl);

             uint8x8_t sum_u8[4];
-            sum_u8[0] = vqmovun_s16(sum[0]);
-            sum_u8[1] = vqmovun_s16(sum[1]);
-            sum_u8[2] = vqmovun_s16(sum[2]);
-            sum_u8[3] = vqmovun_s16(sum[3]);
+            sum_u8[0] = vqshrun_n_s16(sum[0], shift_offset);
+            sum_u8[1] = vqshrun_n_s16(sum[1], shift_offset);
+            sum_u8[2] = vqshrun_n_s16(sum[2], shift_offset);
+            sum_u8[3] = vqshrun_n_s16(sum[3], shift_offset);

             store_u8x8xn<4>(d, dstStride, sum_u8);

@@ -1980,19 +1985,15 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
             filter8_s16x4<coeffIdx>(in + 2, filter, c, sum[2]);
             filter8_s16x4<coeffIdx>(in + 3, filter, c, sum[3]);

-            int16x4_t sum_s16[4];
-            sum_s16[0] = vshrn_n_s32(sum[0], shift);
-            sum_s16[1] = vshrn_n_s32(sum[1], shift);
-            sum_s16[2] = vshrn_n_s32(sum[2], shift);
-            sum_s16[3] = vshrn_n_s32(sum[3], shift);
+            int16x8_t sum_s16[2];
+            sum_s16[0] = vtbl2q_s32_s16(sum[0], sum[1], shr_tbl);
+            sum_s16[1] = vtbl2q_s32_s16(sum[2], sum[3], shr_tbl);

-            uint8x8_t sum_u8[4];
-            sum_u8[0] = vqmovun_s16(vcombine_s16(sum_s16[0], vdup_n_s16(0)));
-            sum_u8[1] = vqmovun_s16(vcombine_s16(sum_s16[1], vdup_n_s16(0)));
-            sum_u8[2] = vqmovun_s16(vcombine_s16(sum_s16[2], vdup_n_s16(0)));
-            sum_u8[3] = vqmovun_s16(vcombine_s16(sum_s16[3], vdup_n_s16(0)));
+            uint8x8_t sum_u8[2];
+            sum_u8[0] = vqshrun_n_s16(sum_s16[0], shift_offset);
+            sum_u8[1] = vqshrun_n_s16(sum_s16[1], shift_offset);

-            store_u8x4xn<4>(d, dstStride, sum_u8);
+            store_u8x4_strided_xN<4>(d, dstStride, sum_u8);

             in[0] = in[4];
             in[1] = in[5];
@@ -2021,28 +2022,23 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
         {
             load_s16x8xn<4>(s, srcStride, in + 7);

-            int32x4_t sum_lo[4];
-            int32x4_t sum_hi[4];
+            int32x4_t sum_lo[4], sum_hi[4];
             filter8_s16x8<coeffIdx>(in + 0, filter, c, sum_lo[0], sum_hi[0]);
             filter8_s16x8<coeffIdx>(in + 1, filter, c, sum_lo[1], sum_hi[1]);
             filter8_s16x8<coeffIdx>(in + 2, filter, c, sum_lo[2], sum_hi[2]);
             filter8_s16x8<coeffIdx>(in + 3, filter, c, sum_lo[3], sum_hi[3]);

             int16x8_t sum[4];
-            sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], shift),
-                                  vshrn_n_s32(sum_hi[0], shift));
-            sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], shift),
-                                  vshrn_n_s32(sum_hi[1], shift));
-            sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], shift),
-                                  vshrn_n_s32(sum_hi[2], shift));
-            sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], shift),
-                                  vshrn_n_s32(sum_hi[3], shift));
+            sum[0] = vtbl2q_s32_s16(sum_lo[0], sum_hi[0], shr_tbl);
+            sum[1] = vtbl2q_s32_s16(sum_lo[1], sum_hi[1], shr_tbl);
+            sum[2] = vtbl2q_s32_s16(sum_lo[2], sum_hi[2], shr_tbl);
+            sum[3] = vtbl2q_s32_s16(sum_lo[3], sum_hi[3], shr_tbl);

             uint8x8_t sum_u8[4];
-            sum_u8[0] = vqmovun_s16(sum[0]);
-            sum_u8[1] = vqmovun_s16(sum[1]);
-            sum_u8[2] = vqmovun_s16(sum[2]);
-            sum_u8[3] = vqmovun_s16(sum[3]);
+            sum_u8[0] = vqshrun_n_s16(sum[0], shift_offset);
+            sum_u8[1] = vqshrun_n_s16(sum[1], shift_offset);
+            sum_u8[2] = vqshrun_n_s16(sum[2], shift_offset);
+            sum_u8[3] = vqshrun_n_s16(sum[3], shift_offset);

             store_u8x8xn<4>(d, dstStride, sum_u8);

diff --git a/source/common/aarch64/filter-prim.h b/source/common/aarch64/filter-prim.h
index 6f0208cef..299e0367a 100644
--- a/source/common/aarch64/filter-prim.h
+++ b/source/common/aarch64/filter-prim.h
@@ -7,6 +7,17 @@
 #include "primitives.h"
 #include "x265.h"

+#include <arm_neon.h>
+
+static inline int16x8_t vtbl2q_s32_s16(int32x4_t a, int32x4_t b, uint8x16_t index)
+{
+    uint8x16x2_t ab;
+
+    ab.val[0] = vreinterpretq_u8_s32(a);
+    ab.val[1] = vreinterpretq_u8_s32(b);
+
+    return vreinterpretq_s16_u8(vqtbl2q_u8(ab, index));
+}

 namespace X265_NS {

--
2.39.5 (Apple Git-154)