The existing Neon assembly implementation of dequant_normal only works for standard bit-depth (SBD) builds, so add the missing code to make it work for high bit-depth (HBD) builds as well.
The parameter values used in REPORT_SPEEDUP are updated to fully test the high bit-depth version. --- source/common/aarch64/asm-primitives.cpp | 2 +- source/common/aarch64/pixel-util.S | 13 +++++++++++-- source/test/mbdstharness.cpp | 6 ++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index c1317eb74..a8560d269 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p) // dequant_scaling p.dequant_scaling = PFX(dequant_scaling_neon); - p.dequant_normal = PFX(dequant_normal_neon); // ssim_4x4x2_core p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon); @@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p) #endif // quant + p.dequant_normal = PFX(dequant_normal_neon); p.quant = PFX(quant_neon); p.nquant = PFX(nquant_neon); } diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S index 1825466ea..495bac1fa 100644 --- a/source/common/aarch64/pixel-util.S +++ b/source/common/aarch64/pixel-util.S @@ -1626,7 +1626,16 @@ endfunc // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) function PFX(dequant_normal_neon) - lsr w2, w2, #4 // num / 16 +// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num); +// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); +// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num); +#if HIGH_BIT_DEPTH + cmp w3, #32768 + blt .dqn_skip + lsr w3, w3, #(BIT_DEPTH - 8) + sub w4, w4, #(BIT_DEPTH - 8) +.dqn_skip: +#endif neg w4, w4 dup v0.8h, w3 dup v1.4s, w4 @@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon) sqxtn v3.4h, v18.4s sqxtn2 v3.8h, v19.4s - sub w2, w2, #1 + sub w2, w2, #16 st1 {v2.8h, v3.8h}, [x1], #32 cbnz w2, .dqn_loop1 ret diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp index 
cceadd833..05027d109 100644 --- a/source/test/mbdstharness.cpp +++ b/source/test/mbdstharness.cpp @@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr { if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal)) { - printf("dequant: Failed!\n"); + printf("dequant_normal: Failed!\n"); return false; } } @@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi if (opt.dequant_normal) { + int scale = 72 << X265_DEPTH; + int shift = X265_DEPTH - 4; printf("dequant_normal\t"); - REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1); + REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift); } if (opt.dequant_scaling) -- 2.34.1
>From ca57ccc3f57b6e6e75cb3683d2c6c5ce61089595 Mon Sep 17 00:00:00 2001 Message-Id: <ca57ccc3f57b6e6e75cb3683d2c6c5ce61089595.1744030934.git.microdaryl.rob...@arm.com> In-Reply-To: <cover.1744030934.git.microdaryl.rob...@arm.com> References: <cover.1744030934.git.microdaryl.rob...@arm.com> From: Micro Daryl Robles <microdaryl.rob...@arm.com> Date: Fri, 7 Mar 2025 12:07:13 +0000 Subject: [PATCH 1/2] AArch64: Fix Neon asm implementation of dequant_normal for HBD The existing Neon assembly of dequant_normal only works for SBD, so add the missing code to make it work for HBD. The parameter values used in REPORT_SPEEDUP are updated to fully test the high bit-depth version. --- source/common/aarch64/asm-primitives.cpp | 2 +- source/common/aarch64/pixel-util.S | 13 +++++++++++-- source/test/mbdstharness.cpp | 6 ++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index c1317eb74..a8560d269 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -714,7 +714,6 @@ void setupNeonPrimitives(EncoderPrimitives &p) // dequant_scaling p.dequant_scaling = PFX(dequant_scaling_neon); - p.dequant_normal = PFX(dequant_normal_neon); // ssim_4x4x2_core p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon); @@ -743,6 +742,7 @@ void setupNeonPrimitives(EncoderPrimitives &p) #endif // quant + p.dequant_normal = PFX(dequant_normal_neon); p.quant = PFX(quant_neon); p.nquant = PFX(nquant_neon); } diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S index 1825466ea..495bac1fa 100644 --- a/source/common/aarch64/pixel-util.S +++ b/source/common/aarch64/pixel-util.S @@ -1626,7 +1626,16 @@ endfunc // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) function PFX(dequant_normal_neon) - lsr w2, w2, #4 // num / 16 +// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num); +// 
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); +// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num); +#if HIGH_BIT_DEPTH + cmp w3, #32768 + blt .dqn_skip + lsr w3, w3, #(BIT_DEPTH - 8) + sub w4, w4, #(BIT_DEPTH - 8) +.dqn_skip: +#endif neg w4, w4 dup v0.8h, w3 dup v1.4s, w4 @@ -1648,7 +1657,7 @@ function PFX(dequant_normal_neon) sqxtn v3.4h, v18.4s sqxtn2 v3.8h, v19.4s - sub w2, w2, #1 + sub w2, w2, #16 st1 {v2.8h, v3.8h}, [x1], #32 cbnz w2, .dqn_loop1 ret diff --git a/source/test/mbdstharness.cpp b/source/test/mbdstharness.cpp index cceadd833..05027d109 100644 --- a/source/test/mbdstharness.cpp +++ b/source/test/mbdstharness.cpp @@ -524,7 +524,7 @@ bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPr { if (!check_dequant_primitive(ref.dequant_normal, opt.dequant_normal)) { - printf("dequant: Failed!\n"); + printf("dequant_normal: Failed!\n"); return false; } } @@ -655,8 +655,10 @@ void MBDstHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi if (opt.dequant_normal) { + int scale = 72 << X265_DEPTH; + int shift = X265_DEPTH - 4; printf("dequant_normal\t"); - REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, 70, 1); + REPORT_SPEEDUP(opt.dequant_normal, ref.dequant_normal, short_test_buff[0], mshortbuf2, 32 * 32, scale, shift); } if (opt.dequant_scaling) -- 2.34.1
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel