>From b5f8ab7cc0f9a4af0865ee87d2c129b8169ddadb Mon Sep 17 00:00:00 2001
Message-Id: <b5f8ab7cc0f9a4af0865ee87d2c129b8169ddadb.1744124029.git.microdaryl.rob...@arm.com>
In-Reply-To: <cover.1744124029.git.microdaryl.rob...@arm.com>
References: <cover.1744124029.git.microdaryl.rob...@arm.com>
From: Micro Daryl Robles <microdaryl.rob...@arm.com>
Date: Fri, 7 Mar 2025 17:27:58 +0000
Subject: [PATCH 1/2] AArch64: Add Neon implementation of findPosFirstLast

Relative performance compared to scalar C:

  Neoverse N1: 4.97-5.87x
  Neoverse N2: 3.80-4.87x
  Neoverse V1: 4.70-5.41x
  Neoverse V2: 3.79-4.91x
---
 source/common/aarch64/dct-prim.cpp | 55 +++++++++++++++++++++++++++++-
 source/common/threading.h          |  6 ++++
 source/test/pixelharness.cpp       | 12 +++++--
 3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index dea20e522..6a3d95e91 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -1862,6 +1862,59 @@ void idct32_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
     partialButterflyInverse32_neon<shift_pass2>(coef, dst, dstStride);
 }
 
+uint32_t findPosFirstLast_neon(const int16_t *coeff, const intptr_t trSize,
+                               const uint16_t scanTbl[16])
+{
+    X265_CHECK(SCAN_SET_SIZE == 16, "SCAN_SET_SIZE must be 16\n");
+    X265_CHECK(MLS_CG_SIZE == 4, "MLS_CG_SIZE must be 4\n");
+    X265_CHECK(scanTbl[2] == 1 || scanTbl[2] == 2 || scanTbl[2] == 8,
+               "scanTbl is invalid\n");
+
+    int16x4_t c0 = vld1_s16(&coeff[0 * trSize]);
+    int16x4_t c1 = vld1_s16(&coeff[1 * trSize]);
+    int16x4_t c2 = vld1_s16(&coeff[2 * trSize]);
+    int16x4_t c3 = vld1_s16(&coeff[3 * trSize]);
+    int16x8_t coeff01 = vcombine_s16(c0, c1);
+    int16x8_t coeff23 = vcombine_s16(c2, c3);
+
+    // Set all cmp bits in a lane if coeff[x] != 0.
+    uint16x8_t cmp01 = vtstq_s16(coeff01, coeff01);
+    uint16x8_t cmp23 = vtstq_s16(coeff23, coeff23);
+    uint8x16_t cmp_8bit = vcombine_u8(vmovn_u16(cmp01), vmovn_u16(cmp23));
+
+    if (scanTbl[2] != 2) // Skip if SCAN_HOR.
+    {
+        // Load scanTbl.
+        uint16x8_t t0 = vld1q_u16(scanTbl + 0);
+        uint16x8_t t1 = vld1q_u16(scanTbl + 8);
+        uint8x16_t scan_tbl = vcombine_u8(vmovn_u16(t0), vmovn_u16(t1));
+
+        cmp_8bit = vqtbl1q_u8(cmp_8bit, scan_tbl);
+    }
+
+    // Narrow cmp_8bit (8 bits per lane) to cmp_4bit (4 bits per lane).
+    uint64_t cmp_4bit = vget_lane_u64(
+        vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(cmp_8bit), 4)), 0);
+
+    // NOTE: If the coeff block is all zeros, lastNZPosInCG is undefined and
+    // firstNZPosInCG is 16.
+    if (cmp_4bit == 0)
+    {
+        return (uint32_t)-1 << 8 | SCAN_SET_SIZE;
+    }
+
+    unsigned long id_first, id_last;
+    CTZ64(id_first, cmp_4bit);
+    uint32_t firstNZPosInCG = (uint32_t)id_first >> 2;
+    CLZ64(id_last, cmp_4bit);
+    uint32_t lastNZPosInCG = (uint32_t)id_last >> 2;
+
+    // A widening (long) add is not needed; we only use the LSB of the sum.
+    uint32_t absSumSign = (uint32_t)vaddvq_s16(vaddq_s16(coeff01, coeff23));
+
+    return (absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG;
+}
+
 void setupDCTPrimitives_neon(EncoderPrimitives &p)
 {
     p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_neon<2>;
@@ -1901,7 +1954,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_neon<5>;
 
     p.scanPosLast = scanPosLast_opt;
-
+    p.findPosFirstLast = findPosFirstLast_neon;
 }
 
 };
diff --git a/source/common/threading.h b/source/common/threading.h
index 8a5c39cf0..2a1743738 100644
--- a/source/common/threading.h
+++ b/source/common/threading.h
@@ -60,6 +60,8 @@ int no_atomic_add(int* ptr, int val);
 
 #define CLZ(id, x)   id = (unsigned long)__builtin_clz(x) ^ 31
 #define CTZ(id, x)   id = (unsigned long)__builtin_ctz(x)
+#define CLZ64(id, x) id = (unsigned long)__builtin_clzll(x) ^ 63
+#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
 #define ATOMIC_OR(ptr, mask)  no_atomic_or((int*)ptr, mask)
 #define ATOMIC_AND(ptr, mask) no_atomic_and((int*)ptr, mask)
 #define ATOMIC_INC(ptr)       no_atomic_inc((int*)ptr)
@@ -74,6 +76,8 @@ int no_atomic_add(int* ptr, int val);
 
 #define CLZ(id, x)   id = (unsigned long)__builtin_clz(x) ^ 31
 #define CTZ(id, x)   id = (unsigned long)__builtin_ctz(x)
+#define CLZ64(id, x) id = (unsigned long)__builtin_clzll(x) ^ 63
+#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
 #define ATOMIC_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
 #define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
 #define ATOMIC_INC(ptr)       __sync_add_and_fetch((volatile int32_t*)ptr, 1)
@@ -87,6 +91,8 @@ int no_atomic_add(int* ptr, int val);
 
 #define CLZ(id, x)   _BitScanReverse(&id, x)
 #define CTZ(id, x)   _BitScanForward(&id, x)
+#define CLZ64(id, x) _BitScanReverse64(&id, x)
+#define CTZ64(id, x) _BitScanForward64(&id, x)
 #define ATOMIC_INC(ptr)      InterlockedIncrement((volatile LONG*)ptr)
 #define ATOMIC_DEC(ptr)      InterlockedDecrement((volatile LONG*)ptr)
 #define ATOMIC_ADD(ptr, val) InterlockedExchangeAdd((volatile LONG*)ptr, val)
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 10f66cda1..380390e1a 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -3697,7 +3697,6 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 
     if (opt.findPosFirstLast)
     {
-        HEADER0("findPosFirstLast");
         coeff_t coefBuf[32 * MLS_CG_SIZE];
         memset(coefBuf, 0, sizeof(coefBuf));
         // every CG can't be all zeros!
@@ -3705,7 +3704,16 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
         coefBuf[3 + 1 * 32] = 0x0BAD;
         coefBuf[3 + 2 * 32] = 0x0BAD;
         coefBuf[3 + 3 * 32] = 0x0BAD;
-        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, 32, g_scan4x4[SCAN_DIAG]);
+        const intptr_t trSize = 32;
+        HEADER0("findPosFirstLast[SCAN_DIAG]");
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+                       g_scan4x4[SCAN_DIAG]);
+        HEADER0("findPosFirstLast[SCAN_HOR]");
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+                       g_scan4x4[SCAN_HOR]);
+        HEADER0("findPosFirstLast[SCAN_VER]");
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, trSize,
+                       g_scan4x4[SCAN_VER]);
     }
 
     if (opt.costCoeffNxN)
-- 
2.34.1
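
The core of the routine is the vshrn_n_u16(..., 4) narrowing shift: it
compresses the 16x8-bit vtstq mask (each byte 0x00 or 0xFF, one byte per
coefficient in scan order) into a single 64-bit word holding a 4-bit nibble
per coefficient, so CTZ64 and CLZ64 of that word, divided by 4, yield the
first and last non-zero scan positions. A minimal standalone scalar model of
that step, for reviewers who want to verify it; the helper name is made up
for illustration and is not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Scalar model of vshrn_n_u16(mask, 4) on a 16-byte 0x00/0xFF mask:
    // each byte is narrowed to a 4-bit nibble so all 16 lanes fit in one
    // 64-bit word.
    static uint64_t narrowMaskBy4(const uint8_t mask[16])
    {
        uint64_t out = 0;
        for (int i = 0; i < 8; i++)
        {
            // Little-endian byte pair -> u16 lane; shift right by 4 and
            // keep the low byte: one nibble per source byte.
            uint16_t lane = (uint16_t)(mask[2 * i] | (mask[2 * i + 1] << 8));
            out |= (uint64_t)((lane >> 4) & 0xFF) << (8 * i);
        }
        return out;
    }

    int main()
    {
        uint8_t mask[16] = { 0 };
        mask[3] = 0xFF; // pretend scan positions 3 and 9 are non-zero
        mask[9] = 0xFF;
        uint64_t m = narrowMaskBy4(mask);
        assert(__builtin_ctzll(m) / 4 == 3);        // first non-zero position
        assert((63 - __builtin_clzll(m)) / 4 == 9); // last non-zero position
        return 0;
    }

And a sketch of how the packed return value decodes, inferred purely from
the packing expression (absSumSign << 31) | (lastNZPosInCG << 8) |
firstNZPosInCG in the patch; the field names mirror the patch, but this
decoder is illustrative and not taken from the x265 callers:

    #include <cstdint>

    struct PosFirstLast
    {
        uint32_t firstNZPos; // bits 7:0, 16 if the CG is all zeros
        uint32_t lastNZPos;  // bits 30:8, undefined if the CG is all zeros
        uint32_t sumIsOdd;   // bit 31, LSB of the coefficient sum
    };

    static PosFirstLast decodePosFirstLast(uint32_t packed)
    {
        PosFirstLast r;
        r.firstNZPos = packed & 0xFF;
        r.lastNZPos  = (packed >> 8) & 0x7FFFFF;
        r.sumIsOdd   = packed >> 31;
        return r;
    }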