>From b5f8ab7cc0f9a4af0865ee87d2c129b8169ddadb Mon Sep 17 00:00:00 2001
Message-Id: <b5f8ab7cc0f9a4af0865ee87d2c129b8169ddadb.1744124029.git.microdaryl.rob...@arm.com>
In-Reply-To: <cover.1744124029.git.microdaryl.rob...@arm.com>
References: <cover.1744124029.git.microdaryl.rob...@arm.com>
From: Micro Daryl Robles <microdaryl.rob...@arm.com>
Date: Fri, 7 Mar 2025 17:27:58 +0000
Subject: [PATCH 1/2] AArch64: Add Neon implementation of findPosFirstLast

Relative performance compared to scalar C:

  Neoverse N1: 4.97-5.87x
  Neoverse N2: 3.80-4.87x
  Neoverse V1: 4.70-5.41x
  Neoverse V2: 3.79-4.91x
---
 source/common/aarch64/dct-prim.cpp | 55 +++++++++++++++++++++++++++++-
 source/common/threading.h          |  6 ++++
 source/test/pixelharness.cpp       | 12 +++++--
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index dea20e522..6a3d95e91 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -1862,6 +1862,59 @@ void idct32_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
     partialButterflyInverse32_neon<shift_pass2>(coef, dst, dstStride);
 }
 
+uint32_t findPosFirstLast_neon(const int16_t *coeff, const intptr_t trSize,
+                               const uint16_t scanTbl[16])
+{
+    X265_CHECK(SCAN_SET_SIZE == 16, "SCAN_SET_SIZE must be 16\n");
+    X265_CHECK(MLS_CG_SIZE == 4, "MLS_CG_SIZE must be 4\n");
+    X265_CHECK(scanTbl[2] == 1 || scanTbl[2] == 2 || scanTbl[2] == 8,
+               "scanTbl is invalid\n");
+
+    int16x4_t c0 = vld1_s16(&coeff[0 * trSize]);
+    int16x4_t c1 = vld1_s16(&coeff[1 * trSize]);
+    int16x4_t c2 = vld1_s16(&coeff[2 * trSize]);
+    int16x4_t c3 = vld1_s16(&coeff[3 * trSize]);
+    int16x8_t coeff01 = vcombine_s16(c0, c1);
+    int16x8_t coeff23 = vcombine_s16(c2, c3);
+
+    // Set all cmp bits in a lane if coeff[x] != 0.
+    uint16x8_t cmp01 = vtstq_s16(coeff01, coeff01);
+    uint16x8_t cmp23 = vtstq_s16(coeff23, coeff23);
+    uint8x16_t cmp_8bit = vcombine_u8(vmovn_u16(cmp01), vmovn_u16(cmp23));
+
+    if (scanTbl[2] != 2) // Skip if SCAN_HOR.
+    {
+        // Load scanTbl.
+        uint16x8_t t0 = vld1q_u16(scanTbl + 0);
+        uint16x8_t t1 = vld1q_u16(scanTbl + 8);
+        uint8x16_t scan_tbl = vcombine_u8(vmovn_u16(t0), vmovn_u16(t1));
+
+        cmp_8bit = vqtbl1q_u8(cmp_8bit, scan_tbl);
+    }
+
+    // Narrow cmp_8bit (8 bits per lane) to cmp_4bit (4 bits per lane).
+    uint64_t cmp_4bit = vget_lane_u64(
+        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_8bit), 4)), 0);
+
+    // NOTE: If the coeff block is all zeros, lastNZPosInCG is undefined and
+    // firstNZPosInCG is 16.
+    if (cmp_4bit == 0)
+    {
+        return (uint32_t)-1 << 8 | SCAN_SET_SIZE;
+    }
+
+    unsigned long id_first, id_last;
+    CTZ64(id_first, cmp_4bit);
+    uint32_t firstNZPosInCG = (uint32_t)id_first >> 2;
+    CLZ64(id_last, cmp_4bit);
+    uint32_t lastNZPosInCG = (uint32_t)id_last >> 2;
+
+    // A widening (long) add is not needed; we only use the LSB of the sum.
+    uint32_t absSumSign = (uint32_t)vaddvq_s16(vaddq_s16(coeff01, coeff23));
+
+    return (absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG;
+}
+
 void setupDCTPrimitives_neon(EncoderPrimitives &p)
 {
     p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_neon<2>;
@@ -1901,7 +1954,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
 
     p.scanPosLast = scanPosLast_opt;
-
+    p.findPosFirstLast = findPosFirstLast_neon;
 }
 
 };
diff --git a/source/common/threading.h b/source/common/threading.h
index 8a5c39cf0..2a1743738 100644
--- a/source/common/threading.h
+++ b/source/common/threading.h
@@ -60,6 +60,8 @@ int no_atomic_add(int* ptr, int val);
 
 #define CLZ(id, x)   id = (unsigned long)__builtin_clz(x) ^ 31
 #define CTZ(id, x)   id = (unsigned long)__builtin_ctz(x)
+#define CLZ64(id, x) id = (unsigned long)__builtin_clzll(x) ^ 63
+#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
 #define ATOMIC_OR(ptr, mask)  no_atomic_or((int*)ptr, mask)
 #define ATOMIC_AND(ptr, mask) no_atomic_and((int*)ptr, mask)
 #define ATOMIC_INC(ptr)       no_atomic_inc((int*)ptr)
@@ -74,6 +76,8 @@ int no_atomic_add(int* ptr, int val);
 
 #define CLZ(id, x)   id = (unsigned long)__builtin_clz(x) ^ 31
 #define CTZ(id, x)   id = (unsigned long)__builtin_ctz(x)
+#define CLZ64(id, x) id = (unsigned long)__builtin_clzll(x) ^ 63
+#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
 #define ATOMIC_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
 #define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
 #define ATOMIC_INC(ptr)       __sync_add_and_fetch((volatile int32_t*)ptr, 1)
@@ -87,6 +91,8 @@ int no_atomic_add(int* ptr, int val);
 
 #define CLZ(id, x)   _BitScanReverse(&id, x)
 #define CTZ(id, x)   _BitScanForward(&id, x)
+#define CLZ64(id, x) _BitScanReverse64(&id, x)
+#define CTZ64(id, x) _BitScanForward64(&id, x)
 #define ATOMIC_INC(ptr)      InterlockedIncrement((volatile LONG*)ptr)
 #define ATOMIC_DEC(ptr)      InterlockedDecrement((volatile LONG*)ptr)
 #define ATOMIC_ADD(ptr, val) InterlockedExchangeAdd((volatile LONG*)ptr, val)
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 10f66cda1..380390e1a 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -3697,7 +3697,6 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 
     if (opt.findPosFirstLast)
     {
-        HEADER0("findPosFirstLast");
         coeff_t coefBuf[32 * MLS_CG_SIZE];
         memset(coefBuf, 0, sizeof(coefBuf));
         // every CG can't be all zeros!
@@ -3705,7 +3704,16 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
         coefBuf[3 + 1 * 32] = 0x0BAD;
         coefBuf[3 + 2 * 32] = 0x0BAD;
         coefBuf[3 + 3 * 32] = 0x0BAD;
-        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, 32, g_scan4x4[SCAN_DIAG]);
+        const intptr_t trSize = 32;
+        HEADER0("findPosFirstLast[SCAN_DIAG]");
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+                       g_scan4x4[SCAN_DIAG]);
+        HEADER0("findPosFirstLast[SCAN_HOR]");
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+                       g_scan4x4[SCAN_HOR]);
+        HEADER0("findPosFirstLast[SCAN_VER]");
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+                       g_scan4x4[SCAN_VER]);
     }
 
     if (opt.costCoeffNxN)
-- 
2.34.1
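
The core of the routine is the vshrn_n_u16(..., 4) narrowing shift: it
compresses the 16x8-bit vtstq mask (each byte 0x00 or 0xFF, one byte per
coefficient in scan order) into a single 64-bit word holding a 4-bit nibble
per coefficient, so CTZ64 and CLZ64 of that word, divided by 4, yield the
first and last non-zero scan positions. A minimal standalone scalar model of
that step, for reviewers who want to verify it; the helper name is made up
for illustration and is not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Scalar model of vshrn_n_u16(mask, 4) on a 16-byte 0x00/0xFF mask:
    // each byte is narrowed to a 4-bit nibble so all 16 lanes fit in one
    // 64-bit word.
    static uint64_t narrowMaskBy4(const uint8_t mask[16])
    {
        uint64_t out = 0;
        for (int i = 0; i < 8; i++)
        {
            // Little-endian byte pair -> u16 lane; shift right by 4 and
            // keep the low byte: one nibble per source byte.
            uint16_t lane = (uint16_t)(mask[2 * i] | (mask[2 * i + 1] << 8));
            out |= (uint64_t)((lane >> 4) & 0xFF) << (8 * i);
        }
        return out;
    }

    int main()
    {
        uint8_t mask[16] = { 0 };
        mask[3] = 0xFF; // pretend scan positions 3 and 9 are non-zero
        mask[9] = 0xFF;
        uint64_t m = narrowMaskBy4(mask);
        assert(__builtin_ctzll(m) / 4 == 3);        // first non-zero position
        assert((63 - __builtin_clzll(m)) / 4 == 9); // last non-zero position
        return 0;
    }

And a sketch of how the packed return value decodes, inferred purely from
the packing expression (absSumSign << 31) | (lastNZPosInCG << 8) |
firstNZPosInCG in the patch; the field names mirror the patch, but this
decoder is illustrative and not taken from the x265 callers:

    #include <cstdint>

    struct PosFirstLast
    {
        uint32_t firstNZPos; // bits 7:0, 16 if the CG is all zeros
        uint32_t lastNZPos;  // bits 30:8, undefined if the CG is all zeros
        uint32_t sumIsOdd;   // bit 31, LSB of the coefficient sum
    };

    static PosFirstLast decodePosFirstLast(uint32_t packed)
    {
        PosFirstLast r;
        r.firstNZPos = packed & 0xFF;
        r.lastNZPos  = (packed >> 8) & 0x7FFFFF;
        r.sumIsOdd   = packed >> 31;
        return r;
    }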