>From a97b929414970bbd2e732b2c3361d3c76f70cc3c Mon Sep 17 00:00:00 2001
Message-Id: <a97b929414970bbd2e732b2c3361d3c76f70cc3c.1736263010.git.jonathan.wri...@arm.com>
In-Reply-To: <cover.1736263010.git.jonathan.wri...@arm.com>
References: <cover.1736263010.git.jonathan.wri...@arm.com>
From: Jonathan Wright <jonathan.wri...@arm.com>
Date: Mon, 9 Dec 2024 11:43:52 +0000
Subject: [PATCH 2/3] AArch64: Delete redundant SADx3 Neon intrinsics primitives

Delete the SADx3 Neon intrinsics primitives since we now have optimized
Neon assembly implementations for all block sizes and bitdepths.
---
 source/common/aarch64/pixel-prim.cpp | 105 ---------------------------
 1 file changed, 105 deletions(-)

diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index c57057f5d..947e2b132 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -909,106 +909,6 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
     return sum;
 }
 
-template<int lx, int ly>
-void sad_x3_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, intptr_t frefstride,
-                 int32_t *res)
-{
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (int y = 0; y < ly; y++)
-    {
-        int x = 0;
-        uint16x8_t vsum16_0 = vdupq_n_u16(0);
-        uint16x8_t vsum16_1 = vdupq_n_u16(0);
-        uint16x8_t vsum16_2 = vdupq_n_u16(0);
-#if HIGH_BIT_DEPTH
-        for (; (x + 8) <= lx; x += 8)
-        {
-            uint16x8_t p1 = vld1q_u16(pix1 + x);
-            uint16x8_t p2 = vld1q_u16(pix2 + x);
-            uint16x8_t p3 = vld1q_u16(pix3 + x);
-            uint16x8_t p4 = vld1q_u16(pix4 + x);
-            vsum16_0 = vabaq_u16(vsum16_0, p1, p2);
-            vsum16_1 = vabaq_u16(vsum16_1, p1, p3);
-            vsum16_2 = vabaq_u16(vsum16_2, p1, p4);
-        }
-        if (lx & 4)
-        {
-            uint16x4_t p1 = vld1_u16(pix1 + x);
-            uint16x4_t p2 = vld1_u16(pix2 + x);
-            uint16x4_t p3 = vld1_u16(pix3 + x);
-            uint16x4_t p4 = vld1_u16(pix4 + x);
-            res[0] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p2));
-            res[1] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p3));
-            res[2] += vaddlv_u16(vaba_u16(vdup_n_u16(0), p1, p4));
-            x += 4;
-        }
-        if (lx >= 4)
-        {
-            res[0] += vaddlvq_u16(vsum16_0);
-            res[1] += vaddlvq_u16(vsum16_1);
-            res[2] += vaddlvq_u16(vsum16_2);
-        }
-#else
-
-        for (; (x + 16) <= lx; x += 16)
-        {
-            uint8x16_t p1 = vld1q_u8(pix1 + x);
-            uint8x16_t p2 = vld1q_u8(pix2 + x);
-            uint8x16_t p3 = vld1q_u8(pix3 + x);
-            uint8x16_t p4 = vld1q_u8(pix4 + x);
-            vsum16_0 = vabal_u8(vsum16_0, vget_low_u8(p1), vget_low_u8(p2));
-            vsum16_0 = vabal_high_u8(vsum16_0, p1, p2);
-            vsum16_1 = vabal_u8(vsum16_1, vget_low_u8(p1), vget_low_u8(p3));
-            vsum16_1 = vabal_high_u8(vsum16_1, p1, p3);
-            vsum16_2 = vabal_u8(vsum16_2, vget_low_u8(p1), vget_low_u8(p4));
-            vsum16_2 = vabal_high_u8(vsum16_2, p1, p4);
-        }
-        if (lx & 8)
-        {
-            uint8x8_t p1 = vld1_u8(pix1 + x);
-            uint8x8_t p2 = vld1_u8(pix2 + x);
-            uint8x8_t p3 = vld1_u8(pix3 + x);
-            uint8x8_t p4 = vld1_u8(pix4 + x);
-            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
-            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
-            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
-            x += 8;
-        }
-        if (lx & 4)
-        {
-            uint8x8_t p1 = load_u8x4x1(pix1 + x);
-            uint8x8_t p2 = load_u8x4x1(pix2 + x);
-            uint8x8_t p3 = load_u8x4x1(pix3 + x);
-            uint8x8_t p4 = load_u8x4x1(pix4 + x);
-            vsum16_0 = vabal_u8(vsum16_0, p1, p2);
-            vsum16_1 = vabal_u8(vsum16_1, p1, p3);
-            vsum16_2 = vabal_u8(vsum16_2, p1, p4);
-            x += 4;
-        }
-        if (lx >= 4)
-        {
-            res[0] += vaddvq_u16(vsum16_0);
-            res[1] += vaddvq_u16(vsum16_1);
-            res[2] += vaddvq_u16(vsum16_2);
-        }
-
-#endif
-        if (lx & 3) for (; x < lx; x++)
-        {
-            res[0] += abs(pix1[x] - pix2[x]);
-            res[1] += abs(pix1[x] - pix3[x]);
-            res[2] += abs(pix1[x] - pix4[x]);
-        }
-
-        pix1 += FENC_STRIDE;
-        pix2 += frefstride;
-        pix3 += frefstride;
-        pix4 += frefstride;
-    }
-}
-
 template<int lx, int ly>
 void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const pixel *pix4, const pixel *pix5,
                  intptr_t frefstride, int32_t *res)
@@ -1641,7 +1541,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
-    p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
@@ -1656,7 +1555,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg_neon<W, H>; \
-    p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp_neon<W, H>; \
     p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
@@ -1703,11 +1601,8 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     LUMA_PU(16, 64);
 
 #if !(HIGH_BIT_DEPTH)
-    p.pu[LUMA_4x4].sad_x3 = sad_x3_neon<4, 4>;
     p.pu[LUMA_4x4].sad_x4 = sad_x4_neon<4, 4>;
-    p.pu[LUMA_4x8].sad_x3 = sad_x3_neon<4, 8>;
     p.pu[LUMA_4x8].sad_x4 = sad_x4_neon<4, 8>;
-    p.pu[LUMA_4x16].sad_x3 = sad_x3_neon<4, 16>;
     p.pu[LUMA_4x16].sad_x4 = sad_x4_neon<4, 16>;
 #endif // !(HIGH_BIT_DEPTH)
 
-- 
2.39.3 (Apple Git-146)
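For review context, the primitive being dropped computes three SADs at once: one fenc block (fixed FENC_STRIDE) against three reference candidates that share frefstride, writing one sum per reference into res[0..2]. That is exactly what the scalar tail loop of the removed function spelled out, and the sketch below restates it as a standalone scalar reference. Note that the pixel typedef, the FENC_STRIDE value and the sad_x3_ref name are illustrative stand-ins for this sketch only; in the tree, pixel and FENC_STRIDE come from the common headers and depend on HIGH_BIT_DEPTH.

#include <cstdint>
#include <cstdlib>

// Illustrative stand-ins: in x265, `pixel` is uint8_t or uint16_t depending
// on HIGH_BIT_DEPTH, and FENC_STRIDE is defined in the common headers.
typedef uint8_t pixel;
const intptr_t FENC_STRIDE = 64;

// Scalar reference for the sad_x3 behaviour: SAD of one fenc block (fixed
// FENC_STRIDE) against three reference blocks sharing frefstride.
template<int lx, int ly>
void sad_x3_ref(const pixel *pix1, const pixel *pix2, const pixel *pix3,
                const pixel *pix4, intptr_t frefstride, int32_t *res)
{
    res[0] = res[1] = res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }
        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}

With the intrinsics template gone, the sad_x3 function pointers are expected to be filled in by the Neon assembly kernels referred to in the commit message rather than by pixel-prim.cpp.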
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel