The existing SVE2 assembly of dequant_normal only works for SBD (standard
bit depth) builds, so add the missing code to make it work for HBD (high
bit depth) builds as well.
---
 source/common/aarch64/asm-primitives.cpp | 13 ++++---------
 source/common/aarch64/pixel-util-sve2.S  | 17 +++++++++++++----
 2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index a8560d269..f9f03f423 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -899,9 +899,9 @@ void setupSvePrimitives(EncoderPrimitives &p) #endif // defined(HAVE_SVE2) || defined(HAVE_SVE) #if defined(HAVE_SVE2) -#if !HIGH_BIT_DEPTH void setupSve2Primitives(EncoderPrimitives &p) { +#if !HIGH_BIT_DEPTH // pixel_avg_pp LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2); LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2); @@ -971,9 +971,6 @@ void setupSve2Primitives(EncoderPrimitives &p) p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2); p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2); - // dequant_normal - p.dequant_normal = PFX(dequant_normal_sve2); - // ssim_4x4x2_core p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_sve2); @@ -989,12 +986,10 @@ void setupSve2Primitives(EncoderPrimitives &p) p.cu[BLOCK_16x16].normFact = PFX(normFact16_sve2); p.cu[BLOCK_32x32].normFact = PFX(normFact32_sve2); p.cu[BLOCK_64x64].normFact = PFX(normFact64_sve2); -} -#else // !HIGH_BIT_DEPTH -void setupSve2Primitives(EncoderPrimitives &) -{ -} #endif // !HIGH_BIT_DEPTH + + p.dequant_normal = PFX(dequant_normal_sve2); +} #endif // defined(HAVE_SVE2) #ifdef HAVE_NEON_DOTPROD diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S index c7ff0b35e..56a2253ea 100644 --- a/source/common/aarch64/pixel-util-sve2.S +++ b/source/common/aarch64/pixel-util-sve2.S @@ -869,7 +869,16 @@ endfunc // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) function PFX(dequant_normal_sve2) - lsr w2, w2, #4 // num / 16 +// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num); +// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); +// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num); +#if 
HIGH_BIT_DEPTH + cmp w3, #32768 + blt .dqn_skip + lsr w3, w3, #(BIT_DEPTH - 8) + sub w4, w4, #(BIT_DEPTH - 8) +.dqn_skip: +#endif neg w4, w4 mov z0.h, w3 mov z1.s, w4 @@ -893,7 +902,7 @@ function PFX(dequant_normal_sve2) sqxtn v3.4h, v18.4s sqxtn2 v3.8h, v19.4s - sub w2, w2, #1 + sub w2, w2, #16 st1 {v2.8h, v3.8h}, [x1], #32 cbnz w2, .dqn_loop1_sve2 ret @@ -910,8 +919,8 @@ function PFX(dequant_normal_sve2) sqxtnb z2.h, z16.s sqxtnt z2.h, z17.s - - sub w2, w2, #1 + + sub w2, w2, #16 st1h {z2.h}, p0, [x1] add x1, x1, #32 cbnz w2, .gt_16_dqn_loop1_sve2 -- 2.34.1
>From 3684bfad7dcf9f8845e6a84a89ba0335576dabdf Mon Sep 17 00:00:00 2001 Message-Id: <3684bfad7dcf9f8845e6a84a89ba0335576dabdf.1744030934.git.microdaryl.rob...@arm.com> In-Reply-To: <cover.1744030934.git.microdaryl.rob...@arm.com> References: <cover.1744030934.git.microdaryl.rob...@arm.com> From: Micro Daryl Robles <microdaryl.rob...@arm.com> Date: Wed, 2 Apr 2025 20:07:09 +0100 Subject: [PATCH 2/2] AArch64: Fix SVE2 asm implementation of dequant_normal for HBD The existing SVE2 assembly of dequant_normal only works for SBD, so add the missing code to make it work for HBD. --- source/common/aarch64/asm-primitives.cpp | 13 ++++--------- source/common/aarch64/pixel-util-sve2.S | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index a8560d269..f9f03f423 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -899,9 +899,9 @@ void setupSvePrimitives(EncoderPrimitives &p) #endif // defined(HAVE_SVE2) || defined(HAVE_SVE) #if defined(HAVE_SVE2) -#if !HIGH_BIT_DEPTH void setupSve2Primitives(EncoderPrimitives &p) { +#if !HIGH_BIT_DEPTH // pixel_avg_pp LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2); LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2); @@ -971,9 +971,6 @@ void setupSve2Primitives(EncoderPrimitives &p) p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2); p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2); - // dequant_normal - p.dequant_normal = PFX(dequant_normal_sve2); - // ssim_4x4x2_core p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_sve2); @@ -989,12 +986,10 @@ void setupSve2Primitives(EncoderPrimitives &p) p.cu[BLOCK_16x16].normFact = PFX(normFact16_sve2); p.cu[BLOCK_32x32].normFact = PFX(normFact32_sve2); p.cu[BLOCK_64x64].normFact = PFX(normFact64_sve2); -} -#else // !HIGH_BIT_DEPTH -void setupSve2Primitives(EncoderPrimitives &) -{ -} #endif // 
!HIGH_BIT_DEPTH + + p.dequant_normal = PFX(dequant_normal_sve2); +} #endif // defined(HAVE_SVE2) #ifdef HAVE_NEON_DOTPROD diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S index c7ff0b35e..56a2253ea 100644 --- a/source/common/aarch64/pixel-util-sve2.S +++ b/source/common/aarch64/pixel-util-sve2.S @@ -869,7 +869,16 @@ endfunc // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) function PFX(dequant_normal_sve2) - lsr w2, w2, #4 // num / 16 +// X265_CHECK(num >= 4 * 4, "dequant num %d too small\n", num); +// X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num); +// X265_CHECK((num % 16) == 0, "dequant num %d not multiple of 16\n", num); +#if HIGH_BIT_DEPTH + cmp w3, #32768 + blt .dqn_skip + lsr w3, w3, #(BIT_DEPTH - 8) + sub w4, w4, #(BIT_DEPTH - 8) +.dqn_skip: +#endif neg w4, w4 mov z0.h, w3 mov z1.s, w4 @@ -893,7 +902,7 @@ function PFX(dequant_normal_sve2) sqxtn v3.4h, v18.4s sqxtn2 v3.8h, v19.4s - sub w2, w2, #1 + sub w2, w2, #16 st1 {v2.8h, v3.8h}, [x1], #32 cbnz w2, .dqn_loop1_sve2 ret @@ -910,8 +919,8 @@ function PFX(dequant_normal_sve2) sqxtnb z2.h, z16.s sqxtnt z2.h, z17.s - - sub w2, w2, #1 + + sub w2, w2, #16 st1h {z2.h}, p0, [x1] add x1, x1, #32 cbnz w2, .gt_16_dqn_loop1_sve2 -- 2.34.1
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel