From 70271d66ae2ca26882367d895de612337cdab3eb Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.lim...@arm.com>
Date: Mon, 11 Mar 2024 00:26:26 +0000
Subject: [PATCH 07/12] AArch64: Optimise low bitdepth interp_vert_ss_neon

Optimise the Neon implementations of luma_vss and chroma_vss, and extend
these functions to support all LUMA and CHROMA block sizes respectively.
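For reference, the operation these kernels implement is an N-tap vertical convolution over 16-bit intermediate samples, with the 32-bit accumulator shifted right by IF_FILTER_PREC and narrowed back to 16 bits. A minimal scalar sketch of that operation follows; the name interp_vert_ss_ref and its signature are illustrative only and are not part of x265 or of this patch:

    #include <cstddef>
    #include <cstdint>

    // N = 8 for luma (g_lumaFilter), N = 4 for chroma (g_chromaFilter).
    template<int N>
    static void interp_vert_ss_ref(const int16_t *src, ptrdiff_t srcStride,
                                   int16_t *dst, ptrdiff_t dstStride,
                                   int width, int height,
                                   const int16_t *coeff,
                                   int shift /* IF_FILTER_PREC */)
    {
        // Centre the filter window on the output row.
        src -= (N / 2 - 1) * srcStride;

        for (int row = 0; row < height; row++)
        {
            for (int col = 0; col < width; col++)
            {
                int32_t sum = 0;
                for (int tap = 0; tap < N; tap++)
                    sum += coeff[tap] * src[col + tap * srcStride];

                // Truncating shift, no rounding, matching the Neon shift-narrow.
                dst[col] = (int16_t)(sum >> shift);
            }
            src += srcStride;
            dst += dstStride;
        }
    }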
Geomean uplift across all block sizes for luma filters: Neoverse-N2: 1.399x Neoverse-V1: 1.743x Neoverse-V2: 1.816x Geomean uplift across all block sizes for chroma filters: Neoverse-N2: 1.367x Neoverse-V1: 1.452x Neoverse-V2: 1.217x --- source/common/aarch64/filter-prim.cpp | 517 +++++++++++++++++++++++--- source/common/aarch64/mem-neon.h | 57 +++ 2 files changed, 517 insertions(+), 57 deletions(-) diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp index 561f5da9e..63521e94f 100644 --- a/source/common/aarch64/filter-prim.cpp +++ b/source/common/aarch64/filter-prim.cpp @@ -6,6 +6,450 @@ #include <arm_neon.h> namespace { +void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f, + const int32x4_t c, int32x4_t &d0, int32x4_t &d1) +{ + if (coeffIdx == 4) + { + // { -4, 36, 36, -4 } + int16x8_t t0 = vaddq_s16(s[1], s[2]); + int16x8_t t1 = vaddq_s16(s[0], s[3]); + d0 = vmlal_n_s16(c, vget_low_s16(t0), 36); + d0 = vmlsl_n_s16(d0, vget_low_s16(t1), 4); + + d1 = vmlal_n_s16(c, vget_high_s16(t0), 36); + d1 = vmlsl_n_s16(d1, vget_high_s16(t1), 4); + } + else + { + d0 = vmlal_lane_s16(c, vget_low_s16(s[0]), f, 0); + d0 = vmlal_lane_s16(d0, vget_low_s16(s[1]), f, 1); + d0 = vmlal_lane_s16(d0, vget_low_s16(s[2]), f, 2); + d0 = vmlal_lane_s16(d0, vget_low_s16(s[3]), f, 3); + + d1 = vmlal_lane_s16(c, vget_high_s16(s[0]), f, 0); + d1 = vmlal_lane_s16(d1, vget_high_s16(s[1]), f, 1); + d1 = vmlal_lane_s16(d1, vget_high_s16(s[2]), f, 2); + d1 = vmlal_lane_s16(d1, vget_high_s16(s[3]), f, 3); + } +} + +template<int coeffIdx> +void inline filter8_s16x4(const int16x4_t *s, const int32x4_t c, int32x4_t &d) +{ + if (coeffIdx == 1) + { + // { -1, 4, -10, 58, 17, -5, 1, 0 } + d = vsubl_s16(s[6], s[0]); + d = vaddq_s32(d, c); + d = vmlal_n_s16(d, s[1], 4); + d = vmlsl_n_s16(d, s[2], 10); + d = vmlal_n_s16(d, s[3], 58); + d = vmlal_n_s16(d, s[4], 17); + d = vmlsl_n_s16(d, s[5], 5); + } + else if (coeffIdx == 2) + { + // { -1, 4, -11, 40, 40, -11, 4, -1 } + int32x4_t t0 = vaddl_s16(s[3], s[4]); + int32x4_t t1 = vaddl_s16(s[2], s[5]); + int32x4_t t2 = vaddl_s16(s[1], s[6]); + int32x4_t t3 = vaddl_s16(s[0], s[7]); + + d = vmlaq_n_s32(c, t0, 40); + d = vmlaq_n_s32(d, t1, -11); + d = vmlaq_n_s32(d, t2, 4); + d = vmlaq_n_s32(d, t3, -1); + } + else + { + // { 0, 1, -5, 17, 58, -10, 4, -1 } + d = vsubl_s16(s[1], s[7]); + d = vaddq_s32(d, c); + d = vmlal_n_s16(d, s[6], 4); + d = vmlsl_n_s16(d, s[5], 10); + d = vmlal_n_s16(d, s[4], 58); + d = vmlal_n_s16(d, s[3], 17); + d = vmlsl_n_s16(d, s[2], 5); + } +} + +template<int coeffIdx> +void inline filter8_s16x8(const int16x8_t *s, const int32x4_t c, int32x4_t &d0, + int32x4_t &d1) +{ + if (coeffIdx == 1) + { + // { -1, 4, -10, 58, 17, -5, 1, 0 } + d0 = vsubl_s16(vget_low_s16(s[6]), vget_low_s16(s[0])); + d0 = vaddq_s32(d0, c); + d0 = vmlal_n_s16(d0, vget_low_s16(s[1]), 4); + d0 = vmlsl_n_s16(d0, vget_low_s16(s[2]), 10); + d0 = vmlal_n_s16(d0, vget_low_s16(s[3]), 58); + d0 = vmlal_n_s16(d0, vget_low_s16(s[4]), 17); + d0 = vmlsl_n_s16(d0, vget_low_s16(s[5]), 5); + + d1 = vsubl_s16(vget_high_s16(s[6]), vget_high_s16(s[0])); + d1 = vaddq_s32(d1, c); + d1 = vmlal_n_s16(d1, vget_high_s16(s[1]), 4); + d1 = vmlsl_n_s16(d1, vget_high_s16(s[2]), 10); + d1 = vmlal_n_s16(d1, vget_high_s16(s[3]), 58); + d1 = vmlal_n_s16(d1, vget_high_s16(s[4]), 17); + d1 = vmlsl_n_s16(d1, vget_high_s16(s[5]), 5); + } + else if (coeffIdx == 2) + { + // { -1, 4, -11, 40, 40, -11, 4, -1 } + int32x4_t t0 = vaddl_s16(vget_low_s16(s[3]), 
vget_low_s16(s[4])); + int32x4_t t1 = vaddl_s16(vget_low_s16(s[2]), vget_low_s16(s[5])); + int32x4_t t2 = vaddl_s16(vget_low_s16(s[1]), vget_low_s16(s[6])); + int32x4_t t3 = vaddl_s16(vget_low_s16(s[0]), vget_low_s16(s[7])); + + d0 = vmlaq_n_s32(c, t0, 40); + d0 = vmlaq_n_s32(d0, t1, -11); + d0 = vmlaq_n_s32(d0, t2, 4); + d0 = vmlaq_n_s32(d0, t3, -1); + + int32x4_t t4 = vaddl_s16(vget_high_s16(s[3]), vget_high_s16(s[4])); + int32x4_t t5 = vaddl_s16(vget_high_s16(s[2]), vget_high_s16(s[5])); + int32x4_t t6 = vaddl_s16(vget_high_s16(s[1]), vget_high_s16(s[6])); + int32x4_t t7 = vaddl_s16(vget_high_s16(s[0]), vget_high_s16(s[7])); + + d1 = vmlaq_n_s32(c, t4, 40); + d1 = vmlaq_n_s32(d1, t5, -11); + d1 = vmlaq_n_s32(d1, t6, 4); + d1 = vmlaq_n_s32(d1, t7, -1); + } + else + { + // { 0, 1, -5, 17, 58, -10, 4, -1 } + d0 = vsubl_s16(vget_low_s16(s[1]), vget_low_s16(s[7])); + d0 = vaddq_s32(d0, c); + d0 = vmlal_n_s16(d0, vget_low_s16(s[6]), 4); + d0 = vmlsl_n_s16(d0, vget_low_s16(s[5]), 10); + d0 = vmlal_n_s16(d0, vget_low_s16(s[4]), 58); + d0 = vmlal_n_s16(d0, vget_low_s16(s[3]), 17); + d0 = vmlsl_n_s16(d0, vget_low_s16(s[2]), 5); + + d1 = vsubl_s16(vget_high_s16(s[1]), vget_high_s16(s[7])); + d1 = vaddq_s32(d1, c); + d1 = vmlal_n_s16(d1, vget_high_s16(s[6]), 4); + d1 = vmlsl_n_s16(d1, vget_high_s16(s[5]), 10); + d1 = vmlal_n_s16(d1, vget_high_s16(s[4]), 58); + d1 = vmlal_n_s16(d1, vget_high_s16(s[3]), 17); + d1 = vmlsl_n_s16(d1, vget_high_s16(s[2]), 5); + } +} + +template<int width, int height> +void interp4_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, + intptr_t dstStride, int coeffIdx) +{ + const int N_TAPS = 4; + src -= (N_TAPS / 2 - 1) * srcStride; + + const int16x4_t filter = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]); + + // Zero constant in order to use filter helper functions (optimised away). 
+ const int32x4_t c = vdupq_n_s32(0); + + if (width == 12) + { + const int16_t *s = src; + int16_t *d = dst; + + int16x8_t in[7]; + load_s16x8xn<3>(s, srcStride, in); + s += 3 * srcStride; + + for (int row = 0; (row + 4) <= height; row += 4) + { + load_s16x8xn<4>(s, srcStride, in + 3); + + int32x4_t sum_lo[4]; + int32x4_t sum_hi[4]; + filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], sum_hi[0]); + filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], sum_hi[1]); + filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo[2], sum_hi[2]); + filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo[3], sum_hi[3]); + + int16x8_t sum[4]; + sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[0], IF_FILTER_PREC)); + sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[1], IF_FILTER_PREC)); + sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[2], IF_FILTER_PREC)); + sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[3], IF_FILTER_PREC)); + + store_s16x8xn<4>(d, dstStride, sum); + + in[0] = in[4]; + in[1] = in[5]; + in[2] = in[6]; + + s += 4 * srcStride; + d += 4 * dstStride; + } + + src += 8; + dst += 8; + s = src; + d = dst; + + load_s16x8xn<3>(s, srcStride, in); + s += 3 * srcStride; + + for (int row = 0; (row + 4) <= height; row += 4) + { + load_s16x8xn<4>(s, srcStride, in + 3); + + int32x4_t sum_lo[4]; + int32x4_t sum_hi[4]; + filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], sum_hi[0]); + filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], sum_hi[1]); + filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo[2], sum_hi[2]); + filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo[3], sum_hi[3]); + + int16x8_t sum[4]; + sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[0], IF_FILTER_PREC)); + sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[1], IF_FILTER_PREC)); + sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[2], IF_FILTER_PREC)); + sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[3], IF_FILTER_PREC)); + + store_s16x4xn<4>(d, dstStride, sum); + + in[0] = in[4]; + in[1] = in[5]; + in[2] = in[6]; + + s += 4 * srcStride; + d += 4 * dstStride; + } + } + else + { + const int n_store = (width < 8) ? 
width : 8; + for (int col = 0; col < width; col += 8) + { + const int16_t *s = src; + int16_t *d = dst; + + int16x8_t in[7]; + load_s16x8xn<3>(s, srcStride, in); + s += 3 * srcStride; + + for (int row = 0; (row + 4) <= height; row += 4) + { + load_s16x8xn<4>(s, srcStride, in + 3); + + int32x4_t sum_lo[4]; + int32x4_t sum_hi[4]; + filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], + sum_hi[0]); + filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], + sum_hi[1]); + filter4_s16x8(coeffIdx, in + 2, filter, c, sum_lo[2], + sum_hi[2]); + filter4_s16x8(coeffIdx, in + 3, filter, c, sum_lo[3], + sum_hi[3]); + + int16x8_t sum[4]; + sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[0], IF_FILTER_PREC)); + sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[1], IF_FILTER_PREC)); + sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[2], IF_FILTER_PREC)); + sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[3], IF_FILTER_PREC)); + + store_s16xnxm<n_store, 4>(sum, d, dstStride); + + in[0] = in[4]; + in[1] = in[5]; + in[2] = in[6]; + + s += 4 * srcStride; + d += 4 * dstStride; + } + + if (height & 2) + { + load_s16x8xn<2>(s, srcStride, in + 3); + + int32x4_t sum_lo[2]; + int32x4_t sum_hi[2]; + filter4_s16x8(coeffIdx, in + 0, filter, c, sum_lo[0], + sum_hi[0]); + filter4_s16x8(coeffIdx, in + 1, filter, c, sum_lo[1], + sum_hi[1]); + + int16x8_t sum[2]; + sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[0], IF_FILTER_PREC)); + sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[1], IF_FILTER_PREC)); + + store_s16xnxm<n_store, 2>(sum, d, dstStride); + } + + src += 8; + dst += 8; + } + } +} + +template<int coeffIdx, int width, int height> +void interp8_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, + intptr_t dstStride) +{ + const int N_TAPS = 8; + src -= (N_TAPS / 2 - 1) * srcStride; + + // Zero constant in order to use filter helper functions (optimised away). 
+ const int32x4_t c = vdupq_n_s32(0); + + if (width % 8 != 0) + { + const int16_t *s = src; + int16_t *d = dst; + if (width == 12) + { + int16x8_t in[11]; + load_s16x8xn<7>(s, srcStride, in); + s += 7 * srcStride; + + for (int row = 0; row < height; row += 4) + { + load_s16x8xn<4>(s, srcStride, in + 7); + + int32x4_t sum_lo[4]; + int32x4_t sum_hi[4]; + filter8_s16x8<coeffIdx>(in + 0, c, sum_lo[0], sum_hi[0]); + filter8_s16x8<coeffIdx>(in + 1, c, sum_lo[1], sum_hi[1]); + filter8_s16x8<coeffIdx>(in + 2, c, sum_lo[2], sum_hi[2]); + filter8_s16x8<coeffIdx>(in + 3, c, sum_lo[3], sum_hi[3]); + + int16x8_t sum[4]; + sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[0], IF_FILTER_PREC)); + sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[1], IF_FILTER_PREC)); + sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[2], IF_FILTER_PREC)); + sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[3], IF_FILTER_PREC)); + + store_s16x8xn<4>(d, dstStride, sum); + + in[0] = in[4]; + in[1] = in[5]; + in[2] = in[6]; + in[3] = in[7]; + in[4] = in[8]; + in[5] = in[9]; + in[6] = in[10]; + + s += 4 * srcStride; + d += 4 * dstStride; + } + + s = src + 8; + d = dst + 8; + } + + int16x4_t in[11]; + load_s16x4xn<7>(s, srcStride, in); + s += 7 * srcStride; + + for (int row = 0; row < height; row += 4) + { + load_s16x4xn<4>(s, srcStride, in + 7); + + int32x4_t sum[4]; + filter8_s16x4<coeffIdx>(in + 0, c, sum[0]); + filter8_s16x4<coeffIdx>(in + 1, c, sum[1]); + filter8_s16x4<coeffIdx>(in + 2, c, sum[2]); + filter8_s16x4<coeffIdx>(in + 3, c, sum[3]); + + int16x4_t sum_s16[4]; + sum_s16[0] = vshrn_n_s32(sum[0], IF_FILTER_PREC); + sum_s16[1] = vshrn_n_s32(sum[1], IF_FILTER_PREC); + sum_s16[2] = vshrn_n_s32(sum[2], IF_FILTER_PREC); + sum_s16[3] = vshrn_n_s32(sum[3], IF_FILTER_PREC); + + store_s16x4xn<4>(d, dstStride, sum_s16); + + in[0] = in[4]; + in[1] = in[5]; + in[2] = in[6]; + in[3] = in[7]; + in[4] = in[8]; + in[5] = in[9]; + in[6] = in[10]; + + s += 4 * srcStride; + d += 4 * dstStride; + } + } + else + { + for (int col = 0; col < width; col += 8) + { + const int16_t *s = src; + int16_t *d = dst; + + int16x8_t in[11]; + load_s16x8xn<7>(s, srcStride, in); + s += 7 * srcStride; + + for (int row = 0; row < height; row += 4) + { + load_s16x8xn<4>(s, srcStride, in + 7); + + int32x4_t sum_lo[4]; + int32x4_t sum_hi[4]; + filter8_s16x8<coeffIdx>(in + 0, c, sum_lo[0], sum_hi[0]); + filter8_s16x8<coeffIdx>(in + 1, c, sum_lo[1], sum_hi[1]); + filter8_s16x8<coeffIdx>(in + 2, c, sum_lo[2], sum_hi[2]); + filter8_s16x8<coeffIdx>(in + 3, c, sum_lo[3], sum_hi[3]); + + int16x8_t sum[4]; + sum[0] = vcombine_s16(vshrn_n_s32(sum_lo[0], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[0], IF_FILTER_PREC)); + sum[1] = vcombine_s16(vshrn_n_s32(sum_lo[1], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[1], IF_FILTER_PREC)); + sum[2] = vcombine_s16(vshrn_n_s32(sum_lo[2], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[2], IF_FILTER_PREC)); + sum[3] = vcombine_s16(vshrn_n_s32(sum_lo[3], IF_FILTER_PREC), + vshrn_n_s32(sum_hi[3], IF_FILTER_PREC)); + + store_s16x8xn<4>(d, dstStride, sum); + + in[0] = in[4]; + in[1] = in[5]; + in[2] = in[6]; + in[3] = in[7]; + in[4] = in[8]; + in[5] = in[9]; + in[6] = in[10]; + + s += 4 * srcStride; + d += 4 * dstStride; + } + + src += 8; + dst += 8; + } + } +} + #if !HIGH_BIT_DEPTH // Element-wise ABS of g_chromaFilter const uint8_t g_chromaFilterAbs8[8][NTAPS_CHROMA] = @@ -940,69 +1384,28 @@ void 
interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, template<int N, int width, int height> void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) { - const int16_t *c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]); - int shift = IF_FILTER_PREC; - src -= (N / 2 - 1) * srcStride; - int16x8_t vc = vld1q_s16(c); - int16x4_t low_vc = vget_low_s16(vc); - int16x4_t high_vc = vget_high_s16(vc); - - const int32x4_t vhr = vdupq_n_s32(-shift); - - int row, col; - for (row = 0; row < height; row++) + if (N == 8) { - for (col = 0; col < width; col += 8) + switch (coeffIdx) { - int32x4_t vsum1, vsum2; - - int16x8_t input[N]; - - for (int i = 0; i < N; i++) - { - input[i] = vld1q_s16(src + col + i * srcStride); - } - - vsum1 = vmull_lane_s16(vget_low_s16(input[0]), low_vc, 0); - vsum2 = vmull_high_lane_s16(input[0], low_vc, 0); - - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1); - vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1); - - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2); - vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2); - - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3); - vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3); - - if (N == 8) - { - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0); - vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0); - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1); - vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1); - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2); - vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2); - vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3); - vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3); - - } - - vsum1 = vshlq_s32(vsum1, vhr); - vsum2 = vshlq_s32(vsum2, vhr); - - int16x8_t vsum = vuzp1q_s16(vreinterpretq_s16_s32(vsum1), - vreinterpretq_s16_s32(vsum2)); - vst1q_s16(dst + col, vsum); + case 1: + return interp8_vert_ss_neon<1, width, height>(src, srcStride, dst, + dstStride); + case 2: + return interp8_vert_ss_neon<2, width, height>(src, srcStride, dst, + dstStride); + case 3: + return interp8_vert_ss_neon<3, width, height>(src, srcStride, dst, + dstStride); } - - src += srcStride; - dst += dstStride; } - + else + { + return interp4_vert_ss_neon<width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } } - #if HIGH_BIT_DEPTH template<int N, int width, int height> diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h index 3f68b885b..34ace3d60 100644 --- a/source/common/aarch64/mem-neon.h +++ b/source/common/aarch64/mem-neon.h @@ -112,6 +112,28 @@ static void inline store_u8x6xn(uint8_t *dst, intptr_t dst_stride, } } +template<int N> +static void inline load_s16x4xn(const int16_t *src, const intptr_t stride, + int16x4_t *dst) +{ + for (int i = 0; i < N; ++i) + { + dst[i] = vld1_s16(src); + src += stride; + } +} + +template<int N> +static void inline load_s16x8xn(const int16_t *src, const intptr_t stride, + int16x8_t *dst) +{ + for (int i = 0; i < N; ++i) + { + dst[i] = vld1q_s16(src); + src += stride; + } +} + template<int N> static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride, const int16x4_t *src) @@ -134,6 +156,17 @@ static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride, } } +template<int N> +static void inline store_s16x4xn(int16_t *dst, intptr_t dst_stride, + const int16x4_t *src) +{ + for (int i = 0; 
i < N; ++i) + { + vst1_s16(dst, src[i]); + dst += dst_stride; + } +} + template<int N> static void inline store_s16x4xn(int16_t *dst, intptr_t dst_stride, const int16x8_t *src) @@ -157,4 +190,28 @@ static void inline store_s16x6xn(int16_t *dst, intptr_t dst_stride, } } +template<int N> +static void inline store_s16x8xn(int16_t *dst, intptr_t dst_stride, + const int16x8_t *src) +{ + for (int i = 0; i < N; ++i) + { + vst1q_s16(dst, src[i]); + dst += dst_stride; + } +} + +template<int N, int M> +static void inline store_s16xnxm(const int16x8_t *src, int16_t *dst, + intptr_t dst_stride) +{ + switch (N) + { + case 2: return store_s16x2xn<M>(dst, dst_stride, src); + case 4: return store_s16x4xn<M>(dst, dst_stride, src); + case 6: return store_s16x6xn<M>(dst, dst_stride, src); + case 8: return store_s16x8xn<M>(dst, dst_stride, src); + } +} + #endif // X265_COMMON_AARCH64_MEM_NEON_H -- 2.42.1
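A note on the coefficient-specialised helpers: filter8_s16x4 and filter8_s16x8 bake the luma tap values into multiply-by-constant instructions instead of lane-indexed multiplies, and for coeffIdx == 2 the symmetric taps { -1, 4, -11, 40, 40, -11, 4, -1 } let pairs of samples be summed before multiplying, halving the multiply count. A scalar sketch of that arithmetic (illustrative only, not code from the patch):

    #include <cstdint>

    // Half-pel luma tap: symmetric filter, so four multiplies instead of eight.
    static inline int32_t luma_halfpel_ref(const int16_t s[8])
    {
        int32_t t0 = s[3] + s[4];
        int32_t t1 = s[2] + s[5];
        int32_t t2 = s[1] + s[6];
        int32_t t3 = s[0] + s[7];
        return 40 * t0 - 11 * t1 + 4 * t2 - t3;
    }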
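The reduction back to int16 also changes: the previous code shifted each 32-bit accumulator with vshlq_s32 by -IF_FILTER_PREC and then extracted the even 16-bit lanes with vuzp1q_s16, whereas the new code uses one shift-right-narrow per half. Both forms compute (int16_t)(sum >> IF_FILTER_PREC) per lane; a side-by-side sketch, assuming <arm_neon.h> and IF_FILTER_PREC are in scope:

    static inline int16x8_t narrow_old(int32x4_t lo, int32x4_t hi)
    {
        const int32x4_t vhr = vdupq_n_s32(-IF_FILTER_PREC);
        lo = vshlq_s32(lo, vhr);                      // arithmetic shift right
        hi = vshlq_s32(hi, vhr);
        return vuzp1q_s16(vreinterpretq_s16_s32(lo),  // keep low 16 bits of each lane
                          vreinterpretq_s16_s32(hi));
    }

    static inline int16x8_t narrow_new(int32x4_t lo, int32x4_t hi)
    {
        return vcombine_s16(vshrn_n_s32(lo, IF_FILTER_PREC),
                            vshrn_n_s32(hi, IF_FILTER_PREC));
    }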
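The new mem-neon.h helpers keep block-size handling out of the filter loops: load_s16x4xn/load_s16x8xn load N rows at a stride, and store_s16xnxm<N, M> dispatches at compile time to the store that writes only the first N lanes of each of M rows, which is how the narrow (width < 8) blocks reuse the 8-lane path. A hypothetical usage sketch, assuming the templates from this patch are in scope:

    #include <arm_neon.h>
    #include <cstdint>
    // plus the load/store templates added to mem-neon.h by this patch

    // Copy a 6x4 block of int16 samples: load four full 8-lane rows, then
    // store only the first six lanes of each row.
    static inline void copy_s16_6x4(const int16_t *src, intptr_t srcStride,
                                    int16_t *dst, intptr_t dstStride)
    {
        int16x8_t rows[4];
        load_s16x8xn<4>(src, srcStride, rows);
        store_s16xnxm<6, 4>(rows, dst, dstStride);
    }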