From d465ff74e10956c1e1cf5cfe5bca1e0b59235bb8 Mon Sep 17 00:00:00 2001
Message-Id: <d465ff74e10956c1e1cf5cfe5bca1e0b59235bb8.1733846134.git.gerdazsejke.m...@arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.m...@arm.com>
References: <cover.1733846134.git.gerdazsejke.m...@arm.com>
From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Mon, 9 Dec 2024 16:18:22 +0100
Subject: [PATCH 10/11] AArch64: Enable existing SSD_S SVE impl for SBD

The existing HBD SSD_S SVE implementation is suitable for SBD as well,
so enable it for the SBD build.

Delete the existing SBD SSD_S SVE2 implementation so that the SVE
implementation becomes the default on SVE2-supported platforms. The
SVE implementation is up to 55% faster than the SVE2 implementation.

Change-Id: I0fc349d1c875f74743a956869725ceecc5825ee1
---
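For reviewers, a minimal scalar model of what ssd_s computes, assuming
the stride-in-int16_t-elements convention implied by the
"add x1, x1, x1" byte-stride conversion in the assembly; the name
ssd_s_model, the uint64_t return type and the loop shape here are
illustrative, not the exact x265 C reference:

    #include <cstdint> // int16_t, uint64_t, intptr_t

    // Sum of squared 16-bit residuals over a size x size block.
    // 'stride' is counted in int16_t elements; the assembly doubles
    // it once (add x1, x1, x1) to turn it into a byte offset.
    static uint64_t ssd_s_model(const int16_t* buf, intptr_t stride, int size)
    {
        uint64_t sum = 0;
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
                sum += (int64_t)buf[x] * buf[x];
            buf += stride;
        }
        return sum;
    }

The residual buffer is int16_t in both the SBD and HBD builds, and the
SVE functions square and accumulate with the 16-bit-to-64-bit sdot
form, so the same function bodies serve both bit depths.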
 source/common/CMakeLists.txt             |   2 +-
 source/common/aarch64/asm-primitives.cpp |  38 ++--
 source/common/aarch64/ssd-a-sve.S        | 224 +++++++++++------------
 source/common/aarch64/ssd-a-sve2.S       | 177 ------------------
 4 files changed, 126 insertions(+), 315 deletions(-)
 delete mode 100644 source/common/aarch64/ssd-a-sve2.S

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index ca4282c6d..744fc21de 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -115,7 +115,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
     set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
-    set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
+    set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
     set(VEC_PRIMITIVES)
 
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index f88fdc000..478b6943a 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -843,6 +843,19 @@ void setupSvePrimitives(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_sve);
     p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_sve);
 
+    // ssd_s
+    p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+    p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+    p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+    p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+    p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
+    p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+    p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+    p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+    p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+    p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
 #if !HIGH_BIT_DEPTH
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
 
@@ -879,20 +892,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_sve);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_sve);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve);
-
-    // ssd_s
-    p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve);
-    p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve);
-    p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve);
-    p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve);
-    p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_sve);
-
-    p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve);
-    p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve);
-    p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve);
-    p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve);
-    p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_sve);
-
 #endif // !HIGH_BIT_DEPTH
 }
 #endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
@@ -913,17 +912,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
     CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
     CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
 
-    // ssd_s
-    p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve2);
-    p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve2);
-    p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve2);
-    p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve2);
-
-    p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve2);
-    p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve2);
-    p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve2);
-    p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve2);
-
     // pixel_var
     p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_sve2);
     p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_sve2);
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
index dbb750e17..94cba7d79 100644
--- a/source/common/aarch64/ssd-a-sve.S
+++ b/source/common/aarch64/ssd-a-sve.S
@@ -213,118 +213,6 @@ function PFX(pixel_sse_pp_64x64_sve)
     ret
 endfunc
 
-function PFX(pixel_ssd_s_4x4_sve)
-    movi            v0.4s, #0
-    add             x1, x1, x1
-
-    ldr             d16, [x0]
-    ldr             d17, [x0, x1]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z0.d, z17.h, z17.h
-    add             x0, x0, x1, lsl #1
-    ldr             d16, [x0]
-    ldr             d17, [x0, x1]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z0.d, z17.h, z17.h
-
-    fmov            w0, s0
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_8x8_sve)
-    movi            v0.4s, #0
-    movi            v1.4s, #0
-    add             x1, x1, x1
-
-.rept 4
-    ld1             {v16.8h}, [x0], x1
-    sdot            z0.d, z16.h, z16.h
-    ld1             {v17.8h}, [x0], x1
-    sdot            z1.d, z17.h, z17.h
-.endr
-
-    add             v0.2d, v0.2d, v1.2d
-    addp            d0, v0.2d
-    fmov            w0, s0
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_16x16_sve)
-    movi            v0.4s, #0
-    movi            v1.4s, #0
-    add             x1, x1, x1
-
-    mov             w12, #16
-.Loop_ssd_s_16:
-    sub             w12, w12, #1
-
-    ld1             {v16.8h-v17.8h}, [x0], x1
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-    cbnz            w12, .Loop_ssd_s_16
-
-    add             v0.2d, v0.2d, v1.2d
-    addp            d0, v0.2d
-    fmov            x0, d0
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_32x32_sve)
-    movi            v0.4s, #0
-    movi            v1.4s, #0
-    add             x1, x1, x1
-
-    mov             w12, #32
-.Loop_ssd_s_32:
-    sub             w12, w12, #1
-
-    ldp             q16, q17, [x0]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-    ldp             q16, q17, [x0, #32]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-
-    add             x0, x0, x1
-    cbnz            w12, .Loop_ssd_s_32
-
-    add             v0.2d, v0.2d, v1.2d
-    addp            d0, v0.2d
-    fmov            x0, d0
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_64x64_sve)
-    movi            v0.4s, #0
-    movi            v1.4s, #0
-    add             x1, x1, x1
-
-    mov             w12, #64
-.Loop_ssd_s_64:
-    sub             w12, w12, #1
-
-    ldp             q16, q17, [x0]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-    ldp             q16, q17, [x0, #32]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-    ldp             q16, q17, [x0, #64]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-    ldp             q16, q17, [x0, #96]
-    sdot            z0.d, z16.h, z16.h
-    sdot            z1.d, z17.h, z17.h
-
-    add             x0, x0, x1
-    cbnz            w12, .Loop_ssd_s_64
-
-    add             v0.2d, v0.2d, v1.2d
-    addp            d0, v0.2d
-    fmov            x0, d0
-    ret
-endfunc
-
 #endif // HIGH_BIT_DEPTH
 
 .macro SSE_SS_4x2
@@ -481,3 +369,115 @@ function PFX(pixel_sse_ss_64x64_sve)
     fmov            x0, d0
     ret
 endfunc
+
+function PFX(pixel_ssd_s_4x4_sve)
+    movi            v0.4s, #0
+    add             x1, x1, x1
+
+    ldr             d16, [x0]
+    ldr             d17, [x0, x1]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z0.d, z17.h, z17.h
+    add             x0, x0, x1, lsl #1
+    ldr             d16, [x0]
+    ldr             d17, [x0, x1]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z0.d, z17.h, z17.h
+
+    fmov            w0, s0
+    ret
+endfunc
+
+function PFX(pixel_ssd_s_8x8_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1
+
+.rept 4
+    ld1             {v16.8h}, [x0], x1
+    sdot            z0.d, z16.h, z16.h
+    ld1             {v17.8h}, [x0], x1
+    sdot            z1.d, z17.h, z17.h
+.endr
+
+    add             v0.2d, v0.2d, v1.2d
+    addp            d0, v0.2d
+    fmov            w0, s0
+    ret
+endfunc
+
+function PFX(pixel_ssd_s_16x16_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1
+
+    mov             w12, #16
+.Loop_ssd_s_16:
+    sub             w12, w12, #1
+
+    ld1             {v16.8h-v17.8h}, [x0], x1
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    cbnz            w12, .Loop_ssd_s_16
+
+    add             v0.2d, v0.2d, v1.2d
+    addp            d0, v0.2d
+    fmov            x0, d0
+    ret
+endfunc
+
+function PFX(pixel_ssd_s_32x32_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1
+
+    mov             w12, #32
+.Loop_ssd_s_32:
+    sub             w12, w12, #1
+
+    ldp             q16, q17, [x0]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #32]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+
+    add             x0, x0, x1
+    cbnz            w12, .Loop_ssd_s_32
+
+    add             v0.2d, v0.2d, v1.2d
+    addp            d0, v0.2d
+    fmov            x0, d0
+    ret
+endfunc
+
+function PFX(pixel_ssd_s_64x64_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1
+
+    mov             w12, #64
+.Loop_ssd_s_64:
+    sub             w12, w12, #1
+
+    ldp             q16, q17, [x0]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #32]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #64]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #96]
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+
+    add             x0, x0, x1
+    cbnz            w12, .Loop_ssd_s_64
+
+    add             v0.2d, v0.2d, v1.2d
+    addp            d0, v0.2d
+    fmov            x0, d0
+    ret
+endfunc
diff --git a/source/common/aarch64/ssd-a-sve2.S b/source/common/aarch64/ssd-a-sve2.S
deleted file mode 100644
index fe3c0d893..000000000
--- a/source/common/aarch64/ssd-a-sve2.S
+++ /dev/null
@@ -1,177 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
- *
- * Authors: David Chen <david.c...@myais.com.cn>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm-sve.S"
-#include "ssd-a-common.S"
-
-.arch armv8-a+sve2
-
-#ifdef __APPLE__
-.section __RODATA,__rodata
-#else
-.section .rodata
-#endif
-
-.align 4
-
-.text
-
-function PFX(pixel_ssd_s_4x4_sve2)
-    ptrue           p0.b, vl8
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smullb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.rept 3
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smlalb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.endr
-    uaddv           d3, p0, z0.s
-    fmov            w0, s3
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_8x8_sve2)
-    ptrue           p0.b, vl16
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smullb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.rept 7
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smlalb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.endr
-    uaddv           d3, p0, z0.s
-    fmov            w0, s3
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_16x16_sve2)
-    rdvl            x9, #1
-    cmp             x9, #16
-    bgt             .vl_gt_16_pixel_ssd_s_16x16
-    add             x1, x1, x1
-    mov             w12, #4
-    movi            v0.16b, #0
-    movi            v1.16b, #0
-.Loop_ssd_s_16_sve2:
-    sub             w12, w12, #1
-.rept 2
-    ld1             {v4.16b,v5.16b}, [x0], x1
-    ld1             {v6.16b,v7.16b}, [x0], x1
-    smlal           v0.4s, v4.4h, v4.4h
-    smlal2          v1.4s, v4.8h, v4.8h
-    smlal           v0.4s, v5.4h, v5.4h
-    smlal2          v1.4s, v5.8h, v5.8h
-    smlal           v0.4s, v6.4h, v6.4h
-    smlal2          v1.4s, v6.8h, v6.8h
-    smlal           v0.4s, v7.4h, v7.4h
-    smlal2          v1.4s, v7.8h, v7.8h
-.endr
-    cbnz            w12, .Loop_ssd_s_16_sve2
-    add             v0.4s, v0.4s, v1.4s
-    ret_v0_w0
-.vl_gt_16_pixel_ssd_s_16x16:
-    ptrue           p0.b, vl32
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smullb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.rept 15
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smlalb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.endr
-    uaddv           d3, p0, z0.s
-    fmov            w0, s3
-    ret
-endfunc
-
-function PFX(pixel_ssd_s_32x32_sve2)
-    rdvl            x9, #1
-    cmp             x9, #16
-    bgt             .vl_gt_16_pixel_ssd_s_32x32
-    add             x1, x1, x1
-    mov             w12, #8
-    movi            v0.16b, #0
-    movi            v1.16b, #0
-.Loop_ssd_s_32:
-    sub             w12, w12, #1
-.rept 4
-    ld1             {v4.16b-v7.16b}, [x0], x1
-    smlal           v0.4s, v4.4h, v4.4h
-    smlal2          v1.4s, v4.8h, v4.8h
-    smlal           v0.4s, v5.4h, v5.4h
-    smlal2          v1.4s, v5.8h, v5.8h
-    smlal           v0.4s, v6.4h, v6.4h
-    smlal2          v1.4s, v6.8h, v6.8h
-    smlal           v0.4s, v7.4h, v7.4h
-    smlal2          v1.4s, v7.8h, v7.8h
-.endr
-    cbnz            w12, .Loop_ssd_s_32
-    add             v0.4s, v0.4s, v1.4s
-    ret_v0_w0
-.vl_gt_16_pixel_ssd_s_32x32:
-    cmp             x9, #48
-    bgt             .vl_gt_48_pixel_ssd_s_32x32
-    ptrue           p0.b, vl32
-    ld1b            {z16.b}, p0/z, [x0]
-    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
-    add             x0, x0, x1, lsl #1
-    smullb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-    smlalb          z0.s, z17.h, z17.h
-    smlalt          z0.s, z17.h, z17.h
-.rept 31
-    ld1b            {z16.b}, p0/z, [x0]
-    ld1b            {z17.b}, p0/z, [x0, #1, mul vl]
-    add             x0, x0, x1, lsl #1
-    smlalb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-    smlalb          z0.s, z17.h, z17.h
-    smlalt          z0.s, z17.h, z17.h
-.endr
-    uaddv           d3, p0, z0.s
-    fmov            w0, s3
-    ret
-.vl_gt_48_pixel_ssd_s_32x32:
-    ptrue           p0.b, vl64
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smullb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.rept 31
-    ld1b            {z16.b}, p0/z, [x0]
-    add             x0, x0, x1, lsl #1
-    smlalb          z0.s, z16.h, z16.h
-    smlalt          z0.s, z16.h, z16.h
-.endr
-    uaddv           d3, p0, z0.s
-    fmov            w0, s3
-    ret
-endfunc
-- 
2.39.5 (Apple Git-154)
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel