Relative performance compared to scalar C:
Neoverse N1: 4.53x
Neoverse N2: 4.19x
Neoverse V1: 4.26x
Neoverse V2: 3.93x
---
 source/common/aarch64/intrapred-prim.cpp | 86 ++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
index c5d47fe0d..3d4b4769f 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -445,6 +445,91 @@ void intra_pred_planar4_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix
 }
 #endif
 
+#if !HIGH_BIT_DEPTH
+void intra_pred_planar32_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix,
+                              int /*dirMode*/, int /*bFilter*/)
+{
+    const int log2Size = 5;
+    const int blkSize = 1 << log2Size;
+
+    const pixel *src0 = srcPix + 1;
+    const pixel *src1 = srcPix + 2 * blkSize + 1;
+
+    uint8x8_t above0 = vld1_u8(src0 + 0 * 8);
+    uint8x8_t above1 = vld1_u8(src0 + 1 * 8);
+    uint8x8_t above2 = vld1_u8(src0 + 2 * 8);
+    uint8x8_t above3 = vld1_u8(src0 + 3 * 8);
+
+    uint8x8_t topRight = vdup_n_u8(src0[blkSize]);
+    uint8x8_t bottomLeft = vdup_n_u8(src1[blkSize]);
+
+    const uint8_t c[2][32] =
+    {
+        {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+         15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0},
+        { 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+    };
+
+    // left constant
+    const uint8x8_t l0 = vld1_u8(c[0] + 0 * 8);
+    const uint8x8_t l1 = vld1_u8(c[0] + 1 * 8);
+    const uint8x8_t l2 = vld1_u8(c[0] + 2 * 8);
+    const uint8x8_t l3 = vld1_u8(c[0] + 3 * 8);
+
+    // topRight constant
+    const uint8x8_t tR0 = vld1_u8(c[1] + 0 * 8);
+    const uint8x8_t tR1 = vld1_u8(c[1] + 1 * 8);
+    const uint8x8_t tR2 = vld1_u8(c[1] + 2 * 8);
+    const uint8x8_t tR3 = vld1_u8(c[1] + 3 * 8);
+
+    const uint16x8_t offset = vdupq_n_u16(blkSize);
+    const uint16x8_t offset_bottomLeft = vaddw_u8(offset, bottomLeft);
+
+    const uint8x8_t c31 = vdup_n_u8(31);
+
+    uint16x8_t t0 = vmlal_u8(offset_bottomLeft, topRight, tR0);
+    t0 = vmlal_u8(t0, above0, c31);
+
+    uint16x8_t t1 = vmlal_u8(offset_bottomLeft, topRight, tR1);
+    t1 = vmlal_u8(t1, above1, c31);
+
+    uint16x8_t t2 = vmlal_u8(offset_bottomLeft, topRight, tR2);
+    t2 = vmlal_u8(t2, above2, c31);
+
+    uint16x8_t t3 = vmlal_u8(offset_bottomLeft, topRight, tR3);
+    t3 = vmlal_u8(t3, above3, c31);
+
+    uint16x8_t sub_bottomLeft_above0 = vsubl_u8(bottomLeft, above0);
+    uint16x8_t sub_bottomLeft_above1 = vsubl_u8(bottomLeft, above1);
+    uint16x8_t sub_bottomLeft_above2 = vsubl_u8(bottomLeft, above2);
+    uint16x8_t sub_bottomLeft_above3 = vsubl_u8(bottomLeft, above3);
+
+    for (int y = 0; y < 32; y++)
+    {
+        uint8x8_t left = vdup_n_u8(src1[y]);
+
+        uint16x8_t r0 = vmlal_u8(t0, left, l0);
+        uint16x8_t r1 = vmlal_u8(t1, left, l1);
+        uint16x8_t r2 = vmlal_u8(t2, left, l2);
+        uint16x8_t r3 = vmlal_u8(t3, left, l3);
+
+        uint8x8_t d[4];
+        d[0] = vshrn_n_u16(r0, log2Size + 1);
+        d[1] = vshrn_n_u16(r1, log2Size + 1);
+        d[2] = vshrn_n_u16(r2, log2Size + 1);
+        d[3] = vshrn_n_u16(r3, log2Size + 1);
+
+        store_u8x8xn<4>(dst + y * dstStride, 8, d);
+
+        t0 = vaddq_u16(t0, sub_bottomLeft_above0);
+        t1 = vaddq_u16(t1, sub_bottomLeft_above1);
+        t2 = vaddq_u16(t2, sub_bottomLeft_above2);
+        t3 = vaddq_u16(t3, sub_bottomLeft_above3);
+    }
+}
+#endif
+
 static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
 {
     // boundary pixels processing
@@ -625,6 +710,7 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = intra_pred_planar4_neon;
     p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);
     p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_neon);
+    p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = intra_pred_planar32_neon;
 #endif
 
     p.cu[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_neon<4>;
-- 
2.34.1
From a0a965497613e96701f56104acc279295cfdf320 Mon Sep 17 00:00:00 2001
Message-Id: <a0a965497613e96701f56104acc279295cfdf320.1737641283.git.microdaryl.rob...@arm.com>
In-Reply-To: <cover.1737641283.git.microdaryl.rob...@arm.com>
References: <cover.1737641283.git.microdaryl.rob...@arm.com>
From: Micro Daryl Robles <microdaryl.rob...@arm.com>
Date: Thu, 9 Jan 2025 15:56:29 +0000
Subject: [PATCH 2/2] AArch64: Add Neon implementation of 32x32 intra_pred_planar

Relative performance compared to scalar C:
Neoverse N1: 4.53x
Neoverse N2: 4.19x
Neoverse V1: 4.26x
Neoverse V2: 3.93x
---
 source/common/aarch64/intrapred-prim.cpp | 86 ++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
index c5d47fe0d..3d4b4769f 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -445,6 +445,91 @@ void intra_pred_planar4_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix
 }
 #endif
 
+#if !HIGH_BIT_DEPTH
+void intra_pred_planar32_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix,
+                              int /*dirMode*/, int /*bFilter*/)
+{
+    const int log2Size = 5;
+    const int blkSize = 1 << log2Size;
+
+    const pixel *src0 = srcPix + 1;
+    const pixel *src1 = srcPix + 2 * blkSize + 1;
+
+    uint8x8_t above0 = vld1_u8(src0 + 0 * 8);
+    uint8x8_t above1 = vld1_u8(src0 + 1 * 8);
+    uint8x8_t above2 = vld1_u8(src0 + 2 * 8);
+    uint8x8_t above3 = vld1_u8(src0 + 3 * 8);
+
+    uint8x8_t topRight = vdup_n_u8(src0[blkSize]);
+    uint8x8_t bottomLeft = vdup_n_u8(src1[blkSize]);
+
+    const uint8_t c[2][32] =
+    {
+        {31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+         15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0},
+        { 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+    };
+
+    // left constant
+    const uint8x8_t l0 = vld1_u8(c[0] + 0 * 8);
+    const uint8x8_t l1 = vld1_u8(c[0] + 1 * 8);
+    const uint8x8_t l2 = vld1_u8(c[0] + 2 * 8);
+    const uint8x8_t l3 = vld1_u8(c[0] + 3 * 8);
+
+    // topRight constant
+    const uint8x8_t tR0 = vld1_u8(c[1] + 0 * 8);
+    const uint8x8_t tR1 = vld1_u8(c[1] + 1 * 8);
+    const uint8x8_t tR2 = vld1_u8(c[1] + 2 * 8);
+    const uint8x8_t tR3 = vld1_u8(c[1] + 3 * 8);
+
+    const uint16x8_t offset = vdupq_n_u16(blkSize);
+    const uint16x8_t offset_bottomLeft = vaddw_u8(offset, bottomLeft);
+
+    const uint8x8_t c31 = vdup_n_u8(31);
+
+    uint16x8_t t0 = vmlal_u8(offset_bottomLeft, topRight, tR0);
+    t0 = vmlal_u8(t0, above0, c31);
+
+    uint16x8_t t1 = vmlal_u8(offset_bottomLeft, topRight, tR1);
+    t1 = vmlal_u8(t1, above1, c31);
+
+    uint16x8_t t2 = vmlal_u8(offset_bottomLeft, topRight, tR2);
+    t2 = vmlal_u8(t2, above2, c31);
+
+    uint16x8_t t3 = vmlal_u8(offset_bottomLeft, topRight, tR3);
+    t3 = vmlal_u8(t3, above3, c31);
+
+    uint16x8_t sub_bottomLeft_above0 = vsubl_u8(bottomLeft, above0);
+    uint16x8_t sub_bottomLeft_above1 = vsubl_u8(bottomLeft, above1);
+    uint16x8_t sub_bottomLeft_above2 = vsubl_u8(bottomLeft, above2);
+    uint16x8_t sub_bottomLeft_above3 = vsubl_u8(bottomLeft, above3);
+
+    for (int y = 0; y < 32; y++)
+    {
+        uint8x8_t left = vdup_n_u8(src1[y]);
+
+        uint16x8_t r0 = vmlal_u8(t0, left, l0);
+        uint16x8_t r1 = vmlal_u8(t1, left, l1);
+        uint16x8_t r2 = vmlal_u8(t2, left, l2);
+        uint16x8_t r3 = vmlal_u8(t3, left, l3);
+
+        uint8x8_t d[4];
+        d[0] = vshrn_n_u16(r0, log2Size + 1);
+        d[1] = vshrn_n_u16(r1, log2Size + 1);
+        d[2] = vshrn_n_u16(r2, log2Size + 1);
+        d[3] = vshrn_n_u16(r3, log2Size + 1);
+
+        store_u8x8xn<4>(dst + y * dstStride, 8, d);
+
+        t0 = vaddq_u16(t0, sub_bottomLeft_above0);
+        t1 = vaddq_u16(t1, sub_bottomLeft_above1);
+        t2 = vaddq_u16(t2, sub_bottomLeft_above2);
+        t3 = vaddq_u16(t3, sub_bottomLeft_above3);
+    }
+}
+#endif
+
 static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
 {
     // boundary pixels processing
@@ -625,6 +710,7 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = intra_pred_planar4_neon;
     p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);
     p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_neon);
+    p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = intra_pred_planar32_neon;
 #endif
 
     p.cu[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_neon<4>;
-- 
2.34.1
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel