>From e58fa67ac16e6125502c1e903b470fd0a6a2c981 Mon Sep 17 00:00:00 2001
Message-Id: <e58fa67ac16e6125502c1e903b470fd0a6a2c981.1747668338.git.li.zha...@arm.com>
In-Reply-To: <cover.1747668338.git.li.zha...@arm.com>
References: <cover.1747668338.git.li.zha...@arm.com>
From: Li Zhang <li.zha...@arm.com>
Date: Mon, 19 May 2025 11:13:51 +0200
Subject: [PATCH 8/8] AArch64: Implement cpy1Dto2D_shr using Neon intrinsics

Delete the Neon and SVE assembly implementations of these kernels as
they are slower than the new Neon intrinsics implementation.
---
 source/common/CMakeLists.txt              |   2 +-
 source/common/aarch64/asm-primitives.cpp  |  12 --
 source/common/aarch64/blockcopy8-common.S |  39 ----
 source/common/aarch64/blockcopy8-sve.S    | 206 ----------------------
 source/common/aarch64/blockcopy8.S        |  99 -----------
 source/common/aarch64/pixel-prim.cpp      |  36 ++++
 6 files changed, 37 insertions(+), 357 deletions(-)
 delete mode 100644 source/common/aarch64/blockcopy8-common.S

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index a6f56c8c8..7eb40fb05 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -112,7 +112,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     enable_language(ASM)
 
     # Add Arm assembly files here.
-    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
     set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 6d4e0b67a..536af1d5d 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -420,13 +420,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
     p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
 
-    // cpy1Dto2D_shr
-    p.cu[BLOCK_4x4].cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_4x4_neon);
-    p.cu[BLOCK_8x8].cpy1Dto2D_shr   = PFX(cpy1Dto2D_shr_8x8_neon);
-    p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon);
-    p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
-    p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
-
     // sad
     ALL_LUMA_PU(sad, pixel_sad, neon);
     ALL_LUMA_PU(sad_x3, sad_x3, neon);
@@ -600,11 +593,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
     LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
     LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
 
-    // cpy1Dto2D_shr
-    p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
-    p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
-    p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
-
     // sse_ss
     p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve);
     p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-common.S b/source/common/aarch64/blockcopy8-common.S
deleted file mode 100644
index 6d92756fc..000000000
--- a/source/common/aarch64/blockcopy8-common.S
+++ /dev/null
@@ -1,39 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
- *
- * Authors: David Chen <david.c...@myais.com.cn>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-// This file contains the macros written using NEON instruction set
-// that are also used by the SVE2 functions
-
-#include "asm.S"
-
-.arch armv8-a
-
-// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
-.macro cpy1Dto2D_shr_start
-    add x2, x2, x2
-    dup v0.8h, w3
-    cmeq v1.8h, v1.8h, v1.8h
-    sshl v1.8h, v1.8h, v0.8h
-    sri v1.8h, v1.8h, #1
-    neg v0.8h, v0.8h
-.endm
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index e2154414c..401167038 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -22,7 +22,6 @@
  *****************************************************************************/
 
 #include "asm-sve.S"
-#include "blockcopy8-common.S"
 
 .arch armv8-a+sve
 
@@ -69,208 +68,3 @@ function PFX(blockfill_s_32x32_sve)
 .endr
     ret
 endfunc
-
-// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
-
-function PFX(cpy1Dto2D_shr_16x16_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_cpy1Dto2D_shr_16x16
-    cpy1Dto2D_shr_start
-    mov w12, #4
-.Loop_cpy1Dto2D_shr_16:
-    sub w12, w12, #1
-.rept 4
-    ld1 {v2.8h-v3.8h}, [x1], #32
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    st1 {v2.8h-v3.8h}, [x0], x2
-.endr
-    cbnz w12, .Loop_cpy1Dto2D_shr_16
-    ret
-.vl_gt_16_cpy1Dto2D_shr_16x16:
-    dup z0.h, w3
-    sub w4, w3, #1
-    dup z1.h, w4
-    ptrue p0.h, vl16
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 16
-    ld1d {z5.d}, p0/z, [x1]
-    add x1, x1, #32
-    add z5.h, p0/m, z5.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_32x32_sve)
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_cpy1Dto2D_shr_32x32
-    cpy1Dto2D_shr_start
-    mov w12, #16
-.Loop_cpy1Dto2D_shr_32_sve:
-    sub w12, w12, #1
-.rept 2
-    ld1 {v2.16b-v5.16b}, [x1], #64
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sub v4.8h, v4.8h, v1.8h
-    sub v5.8h, v5.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    sshl v4.8h, v4.8h, v0.8h
-    sshl v5.8h, v5.8h, v0.8h
-    st1 {v2.16b-v5.16b}, [x0], x2
-.endr
-    cbnz w12, .Loop_cpy1Dto2D_shr_32_sve
-    ret
-.vl_gt_16_cpy1Dto2D_shr_32x32:
-    dup z0.h, w3
-    sub w4, w3, #1
-    dup z1.h, w4
-    cmp x9, #48
-    bgt .vl_gt_48_cpy1Dto2D_shr_32x32
-    ptrue p0.h, vl16
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 32
-    ld1d {z5.d}, p0/z, [x1]
-    ld1d {z6.d}, p0/z, [x1, #1, mul vl]
-    add x1, x1, #64
-    add z5.h, p0/m, z5.h, z2.h
-    add z6.h, p0/m, z6.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    asr z6.h, p0/m, z6.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    st1d {z6.d}, p0, [x0, #1, mul vl]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-.vl_gt_48_cpy1Dto2D_shr_32x32:
-    ptrue p0.h, vl32
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 32
-    ld1d {z5.d}, p0/z, [x1]
-    add x1, x1, #64
-    add z5.h, p0/m, z5.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_64x64_sve)
-    dup z0.h, w3
-    sub w4, w3, #1
-    dup z1.h, w4
-    rdvl x9, #1
-    cmp x9, #16
-    bgt .vl_gt_16_cpy1Dto2D_shr_64x64
-    ptrue p0.h, vl8
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
-    ld1d {z5.d}, p0/z, [x1]
-    ld1d {z6.d}, p0/z, [x1, #1, mul vl]
-    ld1d {z7.d}, p0/z, [x1, #2, mul vl]
-    ld1d {z8.d}, p0/z, [x1, #3, mul vl]
-    ld1d {z9.d}, p0/z, [x1, #4, mul vl]
-    ld1d {z10.d}, p0/z, [x1, #5, mul vl]
-    ld1d {z11.d}, p0/z, [x1, #6, mul vl]
-    ld1d {z12.d}, p0/z, [x1, #7, mul vl]
-    add x1, x1, #128
-    add z5.h, p0/m, z5.h, z2.h
-    add z6.h, p0/m, z6.h, z2.h
-    add z7.h, p0/m, z7.h, z2.h
-    add z8.h, p0/m, z8.h, z2.h
-    add z9.h, p0/m, z9.h, z2.h
-    add z10.h, p0/m, z10.h, z2.h
-    add z11.h, p0/m, z11.h, z2.h
-    add z12.h, p0/m, z12.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    asr z6.h, p0/m, z6.h, z0.h
-    asr z7.h, p0/m, z7.h, z0.h
-    asr z8.h, p0/m, z8.h, z0.h
-    asr z9.h, p0/m, z9.h, z0.h
-    asr z10.h, p0/m, z10.h, z0.h
-    asr z11.h, p0/m, z11.h, z0.h
-    asr z12.h, p0/m, z12.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    st1d {z6.d}, p0, [x0, #1, mul vl]
-    st1d {z7.d}, p0, [x0, #2, mul vl]
-    st1d {z8.d}, p0, [x0, #3, mul vl]
-    st1d {z9.d}, p0, [x0, #4, mul vl]
-    st1d {z10.d}, p0, [x0, #5, mul vl]
-    st1d {z11.d}, p0, [x0, #6, mul vl]
-    st1d {z12.d}, p0, [x0, #7, mul vl]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-.vl_gt_16_cpy1Dto2D_shr_64x64:
-    cmp x9, #48
-    bgt .vl_gt_48_cpy1Dto2D_shr_64x64
-    ptrue p0.h, vl16
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
-    ld1d {z5.d}, p0/z, [x1]
-    ld1d {z6.d}, p0/z, [x1, #1, mul vl]
-    ld1d {z7.d}, p0/z, [x1, #2, mul vl]
-    ld1d {z8.d}, p0/z, [x1, #3, mul vl]
-    add x1, x1, #128
-    add z5.h, p0/m, z5.h, z2.h
-    add z6.h, p0/m, z6.h, z2.h
-    add z7.h, p0/m, z7.h, z2.h
-    add z8.h, p0/m, z8.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    asr z6.h, p0/m, z6.h, z0.h
-    asr z7.h, p0/m, z7.h, z0.h
-    asr z8.h, p0/m, z8.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    st1d {z6.d}, p0, [x0, #1, mul vl]
-    st1d {z7.d}, p0, [x0, #2, mul vl]
-    st1d {z8.d}, p0, [x0, #3, mul vl]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-.vl_gt_48_cpy1Dto2D_shr_64x64:
-    cmp x9, #112
-    bgt .vl_gt_112_cpy1Dto2D_shr_64x64
-    ptrue p0.h, vl32
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
-    ld1d {z5.d}, p0/z, [x1]
-    ld1d {z6.d}, p0/z, [x1, #1, mul vl]
-    add x1, x1, #128
-    add z5.h, p0/m, z5.h, z2.h
-    add z6.h, p0/m, z6.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    asr z6.h, p0/m, z6.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    st1d {z6.d}, p0, [x0, #1, mul vl]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-.vl_gt_112_cpy1Dto2D_shr_64x64:
-    ptrue p0.h, vl64
-    mov z2.h, #1
-    lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
-    ld1d {z5.d}, p0/z, [x1]
-    add x1, x1, #128
-    add z5.h, p0/m, z5.h, z2.h
-    asr z5.h, p0/m, z5.h, z0.h
-    st1d {z5.d}, p0, [x0]
-    add x0, x0, x2, lsl #1
-.endr
-    ret
-endfunc
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 5118b3ede..00b49df4d 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -22,7 +22,6 @@
  *****************************************************************************/
 
 #include "asm.S"
-#include "blockcopy8-common.S"
 
 #ifdef __APPLE__
 .section __RODATA,__rodata
@@ -249,101 +248,3 @@ function PFX(count_nonzero_32_neon)
     fmov w0, s0
     ret
 endfunc
-
-function PFX(cpy1Dto2D_shr_4x4_neon)
-    cpy1Dto2D_shr_start
-    ld1 {v2.16b-v3.16b}, [x1]
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    st1 {v2.d}[0], [x0], x2
-    st1 {v2.d}[1], [x0], x2
-    st1 {v3.d}[0], [x0], x2
-    st1 {v3.d}[1], [x0], x2
-    ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_8x8_neon)
-    cpy1Dto2D_shr_start
-.rept 4
-    ld1 {v2.16b-v3.16b}, [x1], #32
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    st1 {v2.16b}, [x0], x2
-    st1 {v3.16b}, [x0], x2
-.endr
-    ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_16x16_neon)
-    cpy1Dto2D_shr_start
-    mov w12, #4
-.Loop_cpy1Dto2D_shr_16:
-    sub w12, w12, #1
-.rept 4
-    ld1 {v2.8h-v3.8h}, [x1], #32
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    st1 {v2.8h-v3.8h}, [x0], x2
-.endr
-    cbnz w12, .Loop_cpy1Dto2D_shr_16
-    ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_32x32_neon)
-    cpy1Dto2D_shr_start
-    mov w12, #16
-.Loop_cpy1Dto2D_shr_32:
-    sub w12, w12, #1
-.rept 2
-    ld1 {v2.16b-v5.16b}, [x1], #64
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sub v4.8h, v4.8h, v1.8h
-    sub v5.8h, v5.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    sshl v4.8h, v4.8h, v0.8h
-    sshl v5.8h, v5.8h, v0.8h
-    st1 {v2.16b-v5.16b}, [x0], x2
-.endr
-    cbnz w12, .Loop_cpy1Dto2D_shr_32
-    ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_64x64_neon)
-    cpy1Dto2D_shr_start
-    mov w12, #32
-    sub x2, x2, #64
-.Loop_cpy1Dto2D_shr_64:
-    sub w12, w12, #1
-.rept 2
-    ld1 {v2.16b-v5.16b}, [x1], #64
-    ld1 {v16.16b-v19.16b}, [x1], #64
-    sub v2.8h, v2.8h, v1.8h
-    sub v3.8h, v3.8h, v1.8h
-    sub v4.8h, v4.8h, v1.8h
-    sub v5.8h, v5.8h, v1.8h
-    sub v16.8h, v16.8h, v1.8h
-    sub v17.8h, v17.8h, v1.8h
-    sub v18.8h, v18.8h, v1.8h
-    sub v19.8h, v19.8h, v1.8h
-    sshl v2.8h, v2.8h, v0.8h
-    sshl v3.8h, v3.8h, v0.8h
-    sshl v4.8h, v4.8h, v0.8h
-    sshl v5.8h, v5.8h, v0.8h
-    sshl v16.8h, v16.8h, v0.8h
-    sshl v17.8h, v17.8h, v0.8h
-    sshl v18.8h, v18.8h, v0.8h
-    sshl v19.8h, v19.8h, v0.8h
-    st1 {v2.16b-v5.16b}, [x0], #64
-    st1 {v16.16b-v19.16b}, [x0], x2
-.endr
-    cbnz w12, .Loop_cpy1Dto2D_shr_64
-    ret
-endfunc
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index b3d657961..575c9cab8 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1438,6 +1438,41 @@ void cpy1Dto2D_shl_neon(int16_t *dst, const int16_t *src, intptr_t dstStride, in
     }
 }
 
+template<int size>
+void cpy1Dto2D_shr_neon(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+{
+    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
+
+    for (int h = 0; h < size; h++)
+    {
+        for (int w = 0; w + 16 <= size; w += 16)
+        {
+            int16x8_t s0_lo = vld1q_s16(src + w);
+            int16x8_t s0_hi = vld1q_s16(src + w + 8);
+            int16x8_t d0_lo = vrshlq_s16(s0_lo, vdupq_n_s16(-shift));
+            int16x8_t d0_hi = vrshlq_s16(s0_hi, vdupq_n_s16(-shift));
+            vst1q_s16(dst + w, d0_lo);
+            vst1q_s16(dst + w + 8, d0_hi);
+        }
+        if (size == 8)
+        {
+            int16x8_t s0 = vld1q_s16(src);
+            int16x8_t d0 = vrshlq_s16(s0, vdupq_n_s16(-shift));
+            vst1q_s16(dst, d0);
+        }
+        if (size == 4)
+        {
+            int16x4_t s0 = vld1_s16(src);
+            int16x4_t d0 = vrshl_s16(s0, vdup_n_s16(-shift));
+            vst1_s16(dst, d0);
+        }
+
+        src += size;
+        dst += dstStride;
+    }
+}
+
 template<int size>
 uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
@@ -1922,6 +1957,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr_neon<W>; \
    p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
     p.cu[BLOCK_ ## W ## x ## H].transpose = transpose_neon<W>;
 #endif // HIGH_BIT_DEPTH
-- 
2.39.5 (Apple Git-154)
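
For reference, cpy1Dto2D_shr copies a contiguous size x size block of int16_t
coefficients into a strided 2-D destination while applying a rounding
arithmetic right shift: the removed assembly adds 1 << (shift - 1) and then
shifts right by shift, and vrshlq_s16 with a negated shift amount performs the
same rounding right shift per lane. A minimal scalar sketch of that semantics
follows; the function and variable names here are illustrative only and do not
appear in the patch.

    #include <stdint.h>
    #include <stddef.h>

    // Scalar reference for the cpy1Dto2D_shr semantics: read a flat 1-D buffer
    // of int16_t coefficients and write it as a size x size block whose rows
    // are dstStride elements apart, rounding-right-shifting each value.
    static void cpy1Dto2D_shr_ref(int16_t *dst, const int16_t *src,
                                  intptr_t dstStride, int shift, int size)
    {
        const int32_t round = 1 << (shift - 1); // rounding bias added before the shift
        for (int h = 0; h < size; h++)
        {
            for (int w = 0; w < size; w++)
                dst[w] = (int16_t)((src[w] + round) >> shift);
            src += size;      // source rows are packed back to back
            dst += dstStride; // destination rows are dstStride elements apart
        }
    }

Because vrshlq_s16(v, vdupq_n_s16(-shift)) already rounds, the intrinsics path
does not need to materialise the rounding constant by hand the way the deleted
cpy1Dto2D_shr_start macro did.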