From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Mon, 31 Mar 2025 00:23:31 +0200
Subject: [PATCH v3 3/4] AArch64: Add SVE implementation of HBD interp_vert_ss

Add SVE implementation of HBD interp_vert_ss for block sizes with width >= 8 for CHROMA filtering.
This implementation gives up to 16% uplift compared to the existing Neon implementation. --- source/common/aarch64/filter-prim-sve.cpp | 324 ++++++++++++++++++++++ 1 file changed, 324 insertions(+) diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp index 62df833bf..ee26bcc68 100644 --- a/source/common/aarch64/filter-prim-sve.cpp +++ b/source/common/aarch64/filter-prim-sve.cpp @@ -39,6 +39,13 @@ static const uint16_t dotprod_h_permute_tbl[32] = { // clang-format on }; +static const uint8_t dotprod_v_permute_tbl[80] = { + 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19, + 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23, + 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27, + 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31, +}; + template<bool coeff2> void inline filter8_u16x4(const uint16x8_t *s, uint16x4_t &d, int16x8_t filter, uint16x4_t maxVal) @@ -302,6 +309,253 @@ void inline interp8_hps_sve(const pixel *src, intptr_t srcStride, } } +void inline transpose_concat_s16_4x4(const int16x4_t s[4], int16x8_t res[2]) +{ + // Transpose 16-bit elements: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s[0], vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s[1], vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s[2], vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s[3], vdup_n_s16(0)); + + int16x8_t s02 = vzip1q_s16(s0q, s2q); + int16x8_t s13 = vzip1q_s16(s1q, s3q); + + int16x8x2_t s0123 = vzipq_s16(s02, s13); + + res[0] = s0123.val[0]; + res[1] = s0123.val[1]; +} + +void inline transpose_concat_s16_8x4(const int16x8_t s[4], int16x8_t res[4]) +{ + // Transpose 16-bit elements: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // res[2]: 04 14 24 34 05 15 25 35 + // res[3]: 06 16 26 36 07 17 27 37 + + int16x8x2_t s02 = vzipq_s16(s[0], s[2]); + int16x8x2_t s13 = vzipq_s16(s[1], s[3]); + + int16x8x2_t s0123_lo = vzipq_s16(s02.val[0], s13.val[0]); + int16x8x2_t s0123_hi = vzipq_s16(s02.val[1], s13.val[1]); + + res[0] = s0123_lo.val[0]; + res[1] = s0123_lo.val[1]; + res[2] = s0123_hi.val[0]; + res[3] = s0123_hi.val[1]; +} + +void inline insert_row_into_window_s16x8(int16x8_t *s, int16x8_t s_new, + uint8x16_t *merge_block_tbl) +{ + int8x16x2_t samples_tbl[4]; + + // Insert 8 new elements into a 8x4 source window represented by four uint16x8_t + // vectors using table lookups. 
Each lookup contains 16 byte indices: + // - 0–15 select bytes from the original source + // - 16–31 select bytes from the new row values + + // { 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19 } + samples_tbl[0].val[0] = vreinterpretq_s8_s16(s[0]); + samples_tbl[0].val[1] = vreinterpretq_s8_s16(s_new); + s[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[0], merge_block_tbl[0])); + + // { 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23 } + samples_tbl[1].val[0] = vreinterpretq_s8_s16(s[1]); + samples_tbl[1].val[1] = vreinterpretq_s8_s16(s_new); + s[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[1], merge_block_tbl[1])); + + // { 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27 } + samples_tbl[2].val[0] = vreinterpretq_s8_s16(s[2]); + samples_tbl[2].val[1] = vreinterpretq_s8_s16(s_new); + s[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[2], merge_block_tbl[2])); + + // { 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31 } + samples_tbl[3].val[0] = vreinterpretq_s8_s16(s[3]); + samples_tbl[3].val[1] = vreinterpretq_s8_s16(s_new); + s[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[3], merge_block_tbl[3])); +} + +void inline insert_row_into_window_s16x4(int16x8_t *s, int16x8_t s_new, + uint8x16_t *merge_block_tbl) +{ + int8x16x2_t samples_tbl[2]; + + // Insert 4 new elements into a 4x4 source window represented by two uint16x8_t + // vectors using table lookups. Each lookup contains 16 byte indices: + // - 0–15 select bytes from the original source + // - 16–31 select bytes from the new row values + + // { 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19 } + samples_tbl[0].val[0] = vreinterpretq_s8_s16(s[0]); + samples_tbl[0].val[1] = vreinterpretq_s8_s16(s_new); + s[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[0], merge_block_tbl[0])); + + // { 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23 } + samples_tbl[1].val[0] = vreinterpretq_s8_s16(s[1]); + samples_tbl[1].val[1] = vreinterpretq_s8_s16(s_new); + s[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[1], merge_block_tbl[1])); +} + +void inline filter4_s16x4(const int16x8_t *ss, const int16x8_t filter, + const int64x2_t offset, int16x4_t &d) +{ + int64x2_t sum0 = x265_sdotq_s16(offset, ss[0], filter); + int64x2_t sum1 = x265_sdotq_s16(offset, ss[1], filter); + int32x4_t sum = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + + d = vshrn_n_s32(sum, IF_FILTER_PREC); +} + +void inline filter4_s16x8(const int16x8_t *ss, const int16x8_t filter, + const int64x2_t offset, int16x8_t &d) +{ + int64x2_t sum0 = x265_sdotq_s16(offset, ss[0], filter); + int64x2_t sum1 = x265_sdotq_s16(offset, ss[1], filter); + int64x2_t sum2 = x265_sdotq_s16(offset, ss[2], filter); + int64x2_t sum3 = x265_sdotq_s16(offset, ss[3], filter); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3)); + + int16x4_t d0 = vshrn_n_s32(sum_lo, IF_FILTER_PREC); + int16x4_t d1 = vshrn_n_s32(sum_hi, IF_FILTER_PREC); + + d = vcombine_s16(d0, d1); +} + +template<int width, int height> +void inline interp4_vss_sve(const int16_t *src, intptr_t srcStride, int16_t *dst, + intptr_t dstStride, const int16_t coeffIdx) +{ + const int N_TAPS = 4; + int16x4_t f = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]); + int16x8_t filter = vcombine_s16(f, f); + int64x2_t offset = vdupq_n_s64(0); + uint8x16_t merge_block_tbl[4]; + + merge_block_tbl[0] = vld1q_u8(dotprod_v_permute_tbl + 0); + merge_block_tbl[1] = vld1q_u8(dotprod_v_permute_tbl + 16); + 
merge_block_tbl[2] = vld1q_u8(dotprod_v_permute_tbl + 32); + merge_block_tbl[3] = vld1q_u8(dotprod_v_permute_tbl + 48); + + src -= (N_TAPS / 2 - 1) * srcStride; + + if (width % 8 != 0) + { + if (width == 12) + { + const int n_store = 8; + const int16_t *s = src; + int16_t *d = dst; + + int16x8_t in[4]; + load_s16x8xn<4>(s, srcStride, in); + s += 4 * srcStride; + + int16x8_t ss[4]; + transpose_concat_s16_8x4(in, ss); + + for (int row = 0; row < height - 1; ++row) + { + int16x8_t res; + filter4_s16x8(ss, filter, offset, res); + + store_s16xnxm<n_store, 4>(&res, d, dstStride); + + int16x8_t new_r = vld1q_s16(s); + insert_row_into_window_s16x8(ss, new_r, merge_block_tbl); + + s += srcStride; + d += dstStride; + } + + int16x8_t res; + filter4_s16x8(ss, filter, offset, res); + store_s16xnxm<n_store, 4>(&res, d, dstStride); + + src += 8; + dst += 8; + } + const int n_store = width > 4 ? 4 : width; + + int16x4_t in[4]; + load_s16x4xn<4>(src, srcStride, in); + src += 4 * srcStride; + + int16x8_t ss[2]; + transpose_concat_s16_4x4(in, ss); + + for (int row = 0; row < height - 1; ++row) + { + int16x4_t res; + filter4_s16x4(ss, filter, offset, res); + + store_s16xnxm<n_store, 1>(&res, dst, dstStride); + + int16x8_t new_r = vld1q_s16(src); + insert_row_into_window_s16x4(ss, new_r, merge_block_tbl); + + src += srcStride; + dst += dstStride; + } + + int16x4_t res; + filter4_s16x4(ss, filter, offset, res); + store_s16xnxm<n_store, 1>(&res, dst, dstStride); + } + else + { + for (int col = 0; col < width; col += 8) + { + const int16_t *s = src; + int16_t *d = dst; + + int16x8_t in[4]; + load_s16x8xn<4>(s, srcStride, in); + s += 4 * srcStride; + + int16x8_t ss[4]; + transpose_concat_s16_8x4(in, ss); + + for (int row = 0; row < height - 1; ++row) + { + int16x8_t res; + filter4_s16x8(ss, filter, offset, res); + + vst1q_s16(d, res); + + int16x8_t new_r = vld1q_s16(s); + insert_row_into_window_s16x8(ss, new_r, merge_block_tbl); + + s += srcStride; + d += dstStride; + } + + int16x8_t res; + filter4_s16x8(ss, filter, offset, res); + vst1q_s16(d, res); + + src += 8; + dst += 8; + } + } +} + namespace X265_NS { // Declaration for use in interp8_horiz_pp_sve(). template<int N, int width, int height> @@ -365,6 +619,26 @@ void interp8_horiz_ps_sve(const pixel *src, intptr_t srcStride, int16_t *dst, } } +// Declaration for use in interp4_vert_ss_sve(). 
+template<int N, int width, int height> +void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, + intptr_t dstStride, int coeffIdx); + +template<int width, int height> +void interp4_vert_ss_sve(const int16_t *src, intptr_t srcStride, int16_t *dst, + intptr_t dstStride, int coeffIdx) +{ + switch (coeffIdx) + { + case 4: + return interp_vert_ss_neon<4, width, height>(src, srcStride, dst, dstStride, + coeffIdx); + default: + return interp4_vss_sve<width, height>(src, srcStride, dst, dstStride, + coeffIdx); + } +} + void setupFilterPrimitives_sve(EncoderPrimitives &p) { p.pu[LUMA_4x4].luma_hpp = interp8_horiz_pp_sve<4, 4>; @@ -393,6 +667,56 @@ void setupFilterPrimitives_sve(EncoderPrimitives &p) p.pu[LUMA_4x4].luma_hps = interp8_horiz_ps_sve<4, 4>; p.pu[LUMA_4x8].luma_hps = interp8_horiz_ps_sve<4, 8>; p.pu[LUMA_4x16].luma_hps = interp8_horiz_ps_sve<4, 16>; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = interp4_vert_ss_sve<8, 16>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = interp4_vert_ss_sve<8, 32>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vss = interp4_vert_ss_sve<12, 16>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = interp4_vert_ss_sve<16, 4>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = interp4_vert_ss_sve<16, 8>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = interp4_vert_ss_sve<16, 12>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = interp4_vert_ss_sve<16, 16>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = interp4_vert_ss_sve<16, 32>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = interp4_vert_ss_sve<24, 32>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = interp4_vert_ss_sve<32, 8>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = interp4_vert_ss_sve<32, 16>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = interp4_vert_ss_sve<32, 24>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = interp4_vert_ss_sve<32, 32>; + + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = interp4_vert_ss_sve<8, 16>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = interp4_vert_ss_sve<8, 32>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = interp4_vert_ss_sve<8, 64>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vss = interp4_vert_ss_sve<12, 32>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = interp4_vert_ss_sve<16, 8>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = interp4_vert_ss_sve<16, 16>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = interp4_vert_ss_sve<16, 24>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = interp4_vert_ss_sve<16, 32>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = interp4_vert_ss_sve<16, 64>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = interp4_vert_ss_sve<24, 64>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = interp4_vert_ss_sve<32, 16>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = interp4_vert_ss_sve<32, 32>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = interp4_vert_ss_sve<32, 48>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = interp4_vert_ss_sve<32, 64>; + + p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = interp4_vert_ss_sve<8, 16>; + p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = interp4_vert_ss_sve<8, 32>; + p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vss = interp4_vert_ss_sve<12, 16>; + 
p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = interp4_vert_ss_sve<16, 4>; + p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = interp4_vert_ss_sve<16, 8>; + p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = interp4_vert_ss_sve<16, 12>; + p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = interp4_vert_ss_sve<16, 16>; + p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = interp4_vert_ss_sve<16, 32>; + p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = interp4_vert_ss_sve<16, 64>; + p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = interp4_vert_ss_sve<24, 32>; + p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = interp4_vert_ss_sve<32, 8>; + p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = interp4_vert_ss_sve<32, 16>; + p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = interp4_vert_ss_sve<32, 24>; + p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = interp4_vert_ss_sve<32, 32>; + p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = interp4_vert_ss_sve<32, 64>; + p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = interp4_vert_ss_sve<48, 64>; + p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = interp4_vert_ss_sve<64, 16>; + p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = interp4_vert_ss_sve<64, 32>; + p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = interp4_vert_ss_sve<64, 48>; + p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = interp4_vert_ss_sve<64, 64>; } } // namespace X265_NS #else // !HIGH_BIT_DEPTH -- 2.39.5 (Apple Git-154)
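
For reference, here is a minimal scalar sketch (not part of the patch; the function name is illustrative) of the 4-tap vertical short-to-short chroma interpolation that interp4_vss_sve() vectorises, assuming IF_FILTER_PREC == 6 and coeff holding one row of g_chromaFilter:

#include <cstdint>

// Illustrative scalar reference for the 4-tap vertical ss filter.
static void interp4_vert_ss_ref(const int16_t *src, intptr_t srcStride,
                                int16_t *dst, intptr_t dstStride,
                                const int16_t coeff[4], int width, int height)
{
    src -= srcStride; // start (N_TAPS / 2 - 1) rows above the block

    for (int row = 0; row < height; row++)
    {
        for (int col = 0; col < width; col++)
        {
            int sum = src[col + 0 * srcStride] * coeff[0] +
                      src[col + 1 * srcStride] * coeff[1] +
                      src[col + 2 * srcStride] * coeff[2] +
                      src[col + 3 * srcStride] * coeff[3];

            dst[col] = (int16_t)(sum >> 6); // IF_FILTER_PREC
        }
        src += srcStride;
        dst += dstStride;
    }
}

The SVE path computes the same four-tap sums with x265_sdotq_s16 on transposed 4-row source windows, sliding the window down one row per iteration via the TBL-based merge tables instead of reloading and re-transposing all four rows.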