>From 43f7d94e8e33244aaff38f8ae81550c3cf1ce699 Mon Sep 17 00:00:00 2001
Message-ID: <43f7d94e8e33244aaff38f8ae81550c3cf1ce699.1724771133.git.hari.lim...@arm.com>
In-Reply-To: <cover.1724771133.git.hari.lim...@arm.com>
References: <cover.1724771133.git.hari.lim...@arm.com>
From: Jonathan Wright <jonathan.wri...@arm.com>
Date: Tue, 20 Aug 2024 23:07:35 +0100
Subject: [PATCH v2 8/9] AArch64: Add SVE implementation of 16x16 DCT

The widening 16-bit multiply + pairwise add pattern in the Neon DCT
paths is a good fit for the SVE 16-bit dot-product instructions. This
patch adds an SVE implementation of the 16x16 DCT path.

Relative performance compared to the Neon implementation:

  Neoverse-V1: 1.04x
  Neoverse-V2: 1.35x
  Neoverse-N2: 1.42x
---
 source/common/aarch64/dct-prim-sve.cpp | 122 +++++++++++++++++++++++++
 source/common/aarch64/dct-prim.cpp     |  16 ----
 source/common/aarch64/dct-prim.h       |  27 ++++++
 3 files changed, 149 insertions(+), 16 deletions(-)

diff --git a/source/common/aarch64/dct-prim-sve.cpp b/source/common/aarch64/dct-prim-sve.cpp
index a2118c174..3f6de3bff 100644
--- a/source/common/aarch64/dct-prim-sve.cpp
+++ b/source/common/aarch64/dct-prim-sve.cpp
@@ -135,6 +135,110 @@ static inline void partialButterfly8_sve(const int16_t *src, int16_t *dst)
     }
 }
 
+template<int shift>
+static inline void partialButterfly16_sve(const int16_t *src, int16_t *dst)
+{
+    const int line = 16;
+
+    int16x8_t O[line];
+    int16x8_t EO[line / 2];
+    int32x4_t EEE[line];
+    int32x4_t EEO[line];
+
+    for (int i = 0; i < line; i += 2)
+    {
+        int16x8_t s0_lo = vld1q_s16(src + i * line);
+        int16x8_t s0_hi = rev16(vld1q_s16(src + i * line + 8));
+
+        int16x8_t s1_lo = vld1q_s16(src + (i + 1) * line);
+        int16x8_t s1_hi = rev16(vld1q_s16(src + (i + 1) * line + 8));
+
+        int32x4_t E0[2];
+        E0[0] = vaddl_s16(vget_low_s16(s0_lo), vget_low_s16(s0_hi));
+        E0[1] = vaddl_s16(vget_high_s16(s0_lo), vget_high_s16(s0_hi));
+
+        int32x4_t E1[2];
+        E1[0] = vaddl_s16(vget_low_s16(s1_lo), vget_low_s16(s1_hi));
+        E1[1] = vaddl_s16(vget_high_s16(s1_lo), vget_high_s16(s1_hi));
+
+        O[i + 0] = vsubq_s16(s0_lo, s0_hi);
+        O[i + 1] = vsubq_s16(s1_lo, s1_hi);
+
+        int16x4_t EO_lo = vmovn_s32(vsubq_s32(E0[0], rev32(E0[1])));
+        int16x4_t EO_hi = vmovn_s32(vsubq_s32(E1[0], rev32(E1[1])));
+        EO[i / 2] = vcombine_s16(EO_lo, EO_hi);
+
+        int32x4_t EE0 = vaddq_s32(E0[0], rev32(E0[1]));
+        int32x4_t EE1 = vaddq_s32(E1[0], rev32(E1[1]));
+
+        int32x4_t t0 = vreinterpretq_s32_s64(
+            vzip1q_s64(vreinterpretq_s64_s32(EE0), vreinterpretq_s64_s32(EE1)));
+        int32x4_t t1 = vrev64q_s32(vreinterpretq_s32_s64(
+            vzip2q_s64(vreinterpretq_s64_s32(EE0),
+                       vreinterpretq_s64_s32(EE1))));
+
+        EEE[i / 2] = vaddq_s32(t0, t1);
+        EEO[i / 2] = vsubq_s32(t0, t1);
+    }
+
+    for (int i = 0; i < line; i += 4)
+    {
+        for (int k = 1; k < 16; k += 2)
+        {
+            int16x8_t c0_c4 = vld1q_s16(&g_t16[k][0]);
+
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 0]);
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 1]);
+            int64x2_t t2 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 2]);
+            int64x2_t t3 = x265_sdotq_s16(vdupq_n_s64(0), c0_c4, O[i + 3]);
+
+            int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
+            int32x4_t t23 = vcombine_s32(vmovn_s64(t2), vmovn_s64(t3));
+            int16x4_t res = vrshrn_n_s32(vpaddq_s32(t01, t23), shift);
+            vst1_s16(dst + k * line, res);
+        }
+
+        for (int k = 2; k < 16; k += 4)
+        {
+            int16x8_t c0 = vld1q_s16(t8_odd[(k - 2) / 4]);
+
+            int64x2_t t0 = x265_sdotq_s16(vdupq_n_s64(0), c0, EO[i / 2 + 0]);
+            int64x2_t t1 = x265_sdotq_s16(vdupq_n_s64(0), c0, EO[i / 2 + 1]);
+
+            int32x4_t t01 = vcombine_s32(vmovn_s64(t0), vmovn_s64(t1));
+            int16x4_t res = vrshrn_n_s32(t01, shift);
+            vst1_s16(dst + k * line, res);
+        }
+
+        int32x4_t c0 = vld1q_s32(t8_even[0]);
+        int32x4_t c4 = vld1q_s32(t8_even[1]);
+        int32x4_t c8 = vld1q_s32(t8_even[2]);
+        int32x4_t c12 = vld1q_s32(t8_even[3]);
+
+        int32x4_t t0 = vpaddq_s32(EEE[i / 2 + 0], EEE[i / 2 + 1]);
+        int32x4_t t1 = vmulq_s32(c0, t0);
+        int16x4_t res0 = vrshrn_n_s32(t1, shift);
+        vst1_s16(dst + 0 * line, res0);
+
+        int32x4_t t2 = vmulq_s32(c4, EEO[i / 2 + 0]);
+        int32x4_t t3 = vmulq_s32(c4, EEO[i / 2 + 1]);
+        int16x4_t res4 = vrshrn_n_s32(vpaddq_s32(t2, t3), shift);
+        vst1_s16(dst + 4 * line, res4);
+
+        int32x4_t t4 = vmulq_s32(c8, EEE[i / 2 + 0]);
+        int32x4_t t5 = vmulq_s32(c8, EEE[i / 2 + 1]);
+        int16x4_t res8 = vrshrn_n_s32(vpaddq_s32(t4, t5), shift);
+        vst1_s16(dst + 8 * line, res8);
+
+        int32x4_t t6 = vmulq_s32(c12, EEO[i / 2 + 0]);
+        int32x4_t t7 = vmulq_s32(c12, EEO[i / 2 + 1]);
+        int16x4_t res12 = vrshrn_n_s32(vpaddq_s32(t6, t7), shift);
+        vst1_s16(dst + 12 * line, res12);
+
+        dst += 4;
+    }
+}
+
 }
 
@@ -158,9 +262,27 @@ void dct8_sve(const int16_t *src, int16_t *dst, intptr_t srcStride)
     partialButterfly8_sve<shift_pass2>(coef, dst);
 }
 
+void dct16_sve(const int16_t *src, int16_t *dst, intptr_t srcStride)
+{
+    const int shift_pass1 = 3 + X265_DEPTH - 8;
+    const int shift_pass2 = 10;
+
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
+    ALIGN_VAR_32(int16_t, block[16 * 16]);
+
+    for (int i = 0; i < 16; i++)
+    {
+        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
+    }
+
+    partialButterfly16_sve<shift_pass1>(block, coef);
+    partialButterfly16_sve<shift_pass2>(coef, dst);
+}
+
 void setupDCTPrimitives_sve(EncoderPrimitives &p)
 {
     p.cu[BLOCK_8x8].dct = dct8_sve;
+    p.cu[BLOCK_16x16].dct = dct16_sve;
 }
 
 };
diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index 9122a479d..96dd9a4b0 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -18,22 +18,6 @@ namespace
 {
 using namespace X265_NS;
 
-static int16x8_t rev16(const int16x8_t a)
-{
-    static const uint8x16_t tbl = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
-    const int8x16_t a_s8 = vreinterpretq_s8_s16(a);
-
-    return vreinterpretq_s16_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
-}
-
-static int32x4_t rev32(const int32x4_t a)
-{
-    static const uint8x16_t tbl = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
-    const int8x16_t a_s8 = vreinterpretq_s8_s32(a);
-
-    return vreinterpretq_s32_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
-}
-
 static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
 {
     int32x2_t s0, s1, s2, s3;
diff --git a/source/common/aarch64/dct-prim.h b/source/common/aarch64/dct-prim.h
index 602e6ac73..dc296962b 100644
--- a/source/common/aarch64/dct-prim.h
+++ b/source/common/aarch64/dct-prim.h
@@ -6,6 +6,7 @@
 #include "primitives.h"
 #include "contexts.h"   // costCoeffNxN_c
 #include "threading.h"  // CLZ
+#include <arm_neon.h>
 
 namespace X265_NS
 {
@@ -19,6 +20,32 @@ const int32_t t8_even[4][4] =
     { 36, -83,  36, -83 },
 };
 
+const uint8_t rev16_tbl[16] =
+{
+    14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+};
+
+const uint8_t rev32_tbl[16] =
+{
+    12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+};
+
+static inline int16x8_t rev16(const int16x8_t a)
+{
+    const uint8x16_t tbl = vld1q_u8(rev16_tbl);
+    const int8x16_t a_s8 = vreinterpretq_s8_s16(a);
+
+    return vreinterpretq_s16_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
+}
+
+static inline int32x4_t rev32(const int32x4_t a)
+{
+    const uint8x16_t tbl = vld1q_u8(rev32_tbl);
+    const int8x16_t a_s8 = vreinterpretq_s8_s32(a);
+
+    return vreinterpretq_s32_s8(vqtbx1q_s8(a_s8, a_s8, tbl));
+}
+
 // x265 private namespace
 void setupDCTPrimitives_neon(EncoderPrimitives &p);
 #if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
-- 
2.42.1
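For readers who want to see the pattern the commit message describes, here is a minimal sketch of the Neon-to-SVE mapping. The definition of x265_sdotq_s16 is not part of this diff, so both helpers below are illustrative assumptions rather than code taken from x265: the first shows the widening multiply + pairwise add sequence used in the Neon paths, the second shows how a single SVE SDOT (64-bit accumulator form) could replace it through the arm_neon_sve_bridge types.

// Illustrative sketch only -- not taken from the x265 sources. Assumes an
// AArch64 compiler targeting SVE (e.g. -march=armv8.2-a+sve).
#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

// Neon pattern: widening 16-bit multiplies, then pairwise adds, yielding one
// 64-bit sum of four adjacent 16-bit products per lane.
static inline int64x2_t sdotq_s16_neon_sketch(int64x2_t acc, int16x8_t x, int16x8_t y)
{
    int32x4_t lo = vmull_s16(vget_low_s16(x), vget_low_s16(y));
    int32x4_t hi = vmull_s16(vget_high_s16(x), vget_high_s16(y));
    // [x0*y0 + x1*y1, x2*y2 + x3*y3, x4*y4 + x5*y5, x6*y6 + x7*y7]
    int32x4_t pairs = vpaddq_s32(lo, hi);
    // Widen and pair again: one 64-bit sum of four products per lane.
    return vaddq_s64(acc, vpaddlq_s32(pairs));
}

// Hypothetical SVE equivalent (what a helper like x265_sdotq_s16 could look
// like): a single SDOT performs the four multiplies and the per-lane reduction.
static inline int64x2_t sdotq_s16_sve_sketch(int64x2_t acc, int16x8_t x, int16x8_t y)
{
    svint64_t sum = svdot_s64(svset_neonq_s64(svundef_s64(), acc),
                              svset_neonq_s16(svundef_s16(), x),
                              svset_neonq_s16(svundef_s16(), y));
    return svget_neonq_s64(sum);
}

Collapsing the multiply/reduce sequence into one instruction is the intent described in the commit message; the quoted uplifts are the measured result on the listed Neoverse cores.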
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel