From 4db045e36c00550dae8eced28bf7daf5c01e99d1 Mon Sep 17 00:00:00 2001
Message-Id: <4db045e36c00550dae8eced28bf7daf5c01e99d1.1733846134.git.gerdazsejke.m...@arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.m...@arm.com>
References: <cover.1733846134.git.gerdazsejke.m...@arm.com>
From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Sat, 7 Dec 2024 16:20:38 +0100
Subject: [PATCH 07/11] AArch64: Delete sse_neon implementation

The Neon intrinsics implementation of sse is not used anymore given
that a faster asm implementation exists for both standard and high
bit-depth. Delete the sse_neon function.

Change-Id: I0ff88a5153764f61517f50ffe3b93f2ba2856238
---
 source/common/aarch64/pixel-prim.cpp | 61 ----------------------------
 1 file changed, 61 deletions(-)

diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index dd3fd4637..63b30604c 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1155,65 +1155,6 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
 }
 
-template<int lx, int ly, class T1, class T2>
-sse_t sse_neon(const T1 *pix1, intptr_t stride_pix1, const T2 *pix2, intptr_t stride_pix2)
-{
-    sse_t sum = 0;
-
-    int32x4_t vsum1 = vdupq_n_s32(0);
-    int32x4_t vsum2 = vdupq_n_s32(0);
-    for (int y = 0; y < ly; y++)
-    {
-        int x = 0;
-        for (; (x + 8) <= lx; x += 8)
-        {
-            int16x8_t tmp;
-            if (sizeof(T1) == 2 && sizeof(T2) == 2)
-            {
-                // We have to cast to the 'real' type so that this block
-                // will compile for both low and high bitdepth.
-                uint16x8_t vpix1 = vld1q_u16((const uint16_t*)pix1 + x);
-                uint16x8_t vpix2 = vld1q_u16((const uint16_t*)pix2 + x);
-                tmp = vreinterpretq_s16_u16(vsubq_u16(vpix1, vpix2));
-            }
-            else if (sizeof(T1) == 1 && sizeof(T2) == 1)
-            {
-                // We have to cast to the 'real' type so that this block
-                // will compile for both low and high bitdepth.
-                uint8x8_t vpix1 = vld1_u8((const uint8_t*)pix1 + x);
-                uint8x8_t vpix2 = vld1_u8((const uint8_t*)pix2 + x);
-                tmp = vreinterpretq_s16_u16(vsubl_u8(vpix1, vpix2));
-            }
-            else
-            {
-                X265_CHECK(false, "unsupported sse");
-            }
-            vsum1 = vmlal_s16(vsum1, vget_low_s16(tmp), vget_low_s16(tmp));
-            vsum2 = vmlal_high_s16(vsum2, tmp, tmp);
-        }
-        for (; x < lx; x++)
-        {
-            int tmp = pix1[x] - pix2[x];
-            sum += (tmp * tmp);
-        }
-
-        if (sizeof(T1) == 2 && sizeof(T2) == 2)
-        {
-            int32x4_t vsum = vaddq_s32(vsum1, vsum2);
-            sum += vaddvq_s32(vsum);
-            vsum1 = vsum2 = vdupq_n_s32(0);
-        }
-
-        pix1 += stride_pix1;
-        pix2 += stride_pix2;
-    }
-
-    int32x4_t vsum = vaddq_s32(vsum1, vsum2);
-
-    return sum + vaddvq_s32(vsum);
-}
-
-
 template<int bx, int by>
 void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
 {
@@ -1953,7 +1894,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 #define CHROMA_CU_420(W, H) \
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
@@ -2044,7 +1984,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
 
 #define CHROMA_CU_422(W, H) \
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sse_pp = sse_neon<W, H, pixel, pixel>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
--
2.39.5 (Apple Git-154)
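
For reference, the deleted sse_neon computed the sum of squared errors
(SSE) between two pixel blocks. A minimal scalar sketch of the same
computation follows, for illustration only: sse_scalar is a hypothetical
name, and uint64_t stands in for x265's bit-depth-dependent sse_t typedef.

#include <cstdint>

// Sum of squared errors over an lx-by-ly block; strides are in pixels,
// not bytes, matching the convention of the deleted function.
template<int lx, int ly, typename T1, typename T2>
uint64_t sse_scalar(const T1 *pix1, intptr_t stride_pix1,
                    const T2 *pix2, intptr_t stride_pix2)
{
    uint64_t sum = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            int diff = pix1[x] - pix2[x];    // per-pixel difference
            sum += (uint64_t)(diff * diff);  // accumulate squared error
        }
        pix1 += stride_pix1;                 // advance one row in each block
        pix2 += stride_pix2;
    }
    return sum;
}

// Example: two 8x8 blocks of 8-bit pixels stored with a stride of 64.
// uint64_t e = sse_scalar<8, 8>(blockA, 64, blockB, 64);

One design detail worth noting in the removed code: the high bit-depth path
(sizeof(T1) == 2) folded the two int32x4_t accumulators into the scalar sum
after every row, presumably so the 32-bit lanes cannot overflow when squared
10- or 12-bit differences accumulate over tall blocks, whereas the 8-bit
path can safely defer the reduction to the end.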