>From 8b367c739ac10b4df2042ffd5ffa078cfca53e77 Mon Sep 17 00:00:00 2001
Message-Id: <8b367c739ac10b4df2042ffd5ffa078cfca53e77.1747668338.git.li.zha...@arm.com>
In-Reply-To: <cover.1747668338.git.li.zha...@arm.com>
References: <cover.1747668338.git.li.zha...@arm.com>
From: Li Zhang <li.zha...@arm.com>
Date: Tue, 13 May 2025 13:10:00 +0200
Subject: [PATCH 6/8] AArch64: Optimize cpy2Dto1D_shl Neon intrinsics implementation

The original cpy2Dto1D_shl_neon intrinsics implementation is scalar;
change it to use SIMD instructions.

Delete the Neon and SVE assembly implementations of these kernels, as
they are no faster and only serve to increase binary size.
---
 source/common/aarch64/asm-primitives.cpp |  12 ---
 source/common/aarch64/blockcopy8-sve.S   | 127 -----------------------
 source/common/aarch64/blockcopy8.S       |  86 ---------------
 source/common/aarch64/pixel-prim.cpp     |  24 ++++-
 4 files changed, 21 insertions(+), 228 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 642468124..e0d8500ef 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -420,13 +420,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
     p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
 
-    // cpy2Dto1D_shl
-    p.cu[BLOCK_4x4].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
-    p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
-    p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon);
-    p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon);
-    p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon);
-
     // cpy2Dto1D_shr
     p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
     p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
@@ -613,11 +606,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
     LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
     LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
 
-    // cpy2Dto1D_shl
-    p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
-    p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
-    p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
-
     // cpy2Dto1D_shr
     p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
     p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 98dfc7584..0e737271f 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -70,133 +70,6 @@ function PFX(blockfill_s_32x32_sve)
     ret
 endfunc
 
-// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
-.macro cpy2Dto1D_shl_start_sve
-    add             x2, x2, x2
-    mov             z0.h, w3
-.endm
-
-function PFX(cpy2Dto1D_shl_16x16_sve)
-    dup             z0.h, w3
-    rdvl            x9, #1
-    cmp             x9, #16
-    bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
-    cpy2Dto1D_shl_start_sve
-    mov             w12, #4
-.Loop_cpy2Dto1D_shl_16_sve:
-    sub             w12, w12, #1
-.rept 4
-    ld1             {v2.16b-v3.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    st1             {v2.16b-v3.16b}, [x0], #32
-.endr
-    cbnz            w12, .Loop_cpy2Dto1D_shl_16_sve
-    ret
-.vl_gt_16_cpy2Dto1D_shl_16x16:
-    ptrue           p0.h, vl16
-.rept 16
-    ld1h            {z1.h}, p0/z, [x1]
-    lsl             z1.h, p0/m, z1.h, z0.h
-    st1h            {z1.h}, p0, [x0]
-    add             x1, x1, x2, lsl #1
-    add             x0, x0, #32
-.endr
-    ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_32x32_sve)
-    dup             z0.h, w3
-    rdvl            x9, #1
-    cmp             x9, #16
-    bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
-    cpy2Dto1D_shl_start_sve
-    mov             w12, #16
-.Loop_cpy2Dto1D_shl_32_sve:
-    sub             w12, w12, #1
-.rept 2
-    ld1             {v2.16b-v5.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    sshl            v4.8h, v4.8h, v0.8h
-    sshl            v5.8h, v5.8h, v0.8h
-    st1             {v2.16b-v5.16b}, [x0], #64
-.endr
-    cbnz            w12, .Loop_cpy2Dto1D_shl_32_sve
-    ret
-.vl_gt_16_cpy2Dto1D_shl_32x32:
-    cmp             x9, #48
-    bgt             .vl_gt_48_cpy2Dto1D_shl_32x32
-    ptrue           p0.h, vl16
-.rept 32
-    ld1h            {z1.h}, p0/z, [x1]
-    ld1h            {z2.h}, p0/z, [x1, #1, mul vl]
-    lsl             z1.h, p0/m, z1.h, z0.h
-    lsl             z2.h, p0/m, z2.h, z0.h
-    st1h            {z1.h}, p0, [x0]
-    st1h            {z2.h}, p0, [x0, #1, mul vl]
-    add             x1, x1, x2, lsl #1
-    add             x0, x0, #64
-.endr
-    ret
-.vl_gt_48_cpy2Dto1D_shl_32x32:
-    ptrue           p0.h, vl32
-.rept 32
-    ld1h            {z1.h}, p0/z, [x1]
-    lsl             z1.h, p0/m, z1.h, z0.h
-    st1h            {z1.h}, p0, [x0]
-    add             x1, x1, x2, lsl #1
-    add             x0, x0, #64
-.endr
-    ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_64x64_sve)
-    rdvl            x9, #1
-    cmp             x9, #16
-    bgt             .vl_gt_16_cpy2Dto1D_shl_64x64
-    cpy2Dto1D_shl_start_sve
-    mov             w12, #32
-    sub             x2, x2, #64
-.Loop_cpy2Dto1D_shl_64_sve:
-    sub             w12, w12, #1
-.rept 2
-    ld1             {v2.16b-v5.16b}, [x1], #64
-    ld1             {v16.16b-v19.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    sshl            v4.8h, v4.8h, v0.8h
-    sshl            v5.8h, v5.8h, v0.8h
-    sshl            v16.8h, v16.8h, v0.8h
-    sshl            v17.8h, v17.8h, v0.8h
-    sshl            v18.8h, v18.8h, v0.8h
-    sshl            v19.8h, v19.8h, v0.8h
-    st1             {v2.16b-v5.16b}, [x0], #64
-    st1             {v16.16b-v19.16b}, [x0], #64
-.endr
-    cbnz            w12, .Loop_cpy2Dto1D_shl_64_sve
-    ret
-.vl_gt_16_cpy2Dto1D_shl_64x64:
-    dup             z0.h, w3
-    mov             x8, #64
-    mov             w12, #64
-.L_init_cpy2Dto1D_shl_64x64:
-    sub             w12, w12, 1
-    mov             x9, #0
-    whilelt         p0.h, x9, x8
-.L_cpy2Dto1D_shl_64x64:
-    ld1h            {z1.h}, p0/z, [x1, x9, lsl #1]
-    lsl             z1.h, p0/m, z1.h, z0.h
-    st1h            {z1.h}, p0, [x0, x9, lsl #1]
-    inch            x9
-    whilelt         p0.h, x9, x8
-    b.first         .L_cpy2Dto1D_shl_64x64
-    add             x1, x1, x2, lsl #1
-    add             x0, x0, #128
-    cbnz            w12, .L_init_cpy2Dto1D_shl_64x64
-    ret
-endfunc
-
 // void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
 function PFX(cpy2Dto1D_shr_4x4_sve)
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index f2ca35215..fef698cab 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -250,92 +250,6 @@ function PFX(count_nonzero_32_neon)
     ret
 endfunc
 
-// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
-.macro cpy2Dto1D_shl_start
-    add             x2, x2, x2
-    dup             v0.8h, w3
-.endm
-
-function PFX(cpy2Dto1D_shl_4x4_neon)
-    cpy2Dto1D_shl_start
-    ld1             {v2.d}[0], [x1], x2
-    ld1             {v2.d}[1], [x1], x2
-    ld1             {v3.d}[0], [x1], x2
-    ld1             {v3.d}[1], [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    st1             {v2.16b-v3.16b}, [x0]
-    ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_8x8_neon)
-    cpy2Dto1D_shl_start
-.rept 4
-    ld1             {v2.16b}, [x1], x2
-    ld1             {v3.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    st1             {v2.16b-v3.16b}, [x0], #32
-.endr
-    ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_16x16_neon)
-    cpy2Dto1D_shl_start
-    mov             w12, #4
-.Loop_cpy2Dto1D_shl_16:
-    sub             w12, w12, #1
-.rept 4
-    ld1             {v2.16b-v3.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    st1             {v2.16b-v3.16b}, [x0], #32
-.endr
-    cbnz            w12, .Loop_cpy2Dto1D_shl_16
-    ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_32x32_neon)
-    cpy2Dto1D_shl_start
-    mov             w12, #16
-.Loop_cpy2Dto1D_shl_32:
-    sub             w12, w12, #1
-.rept 2
-    ld1             {v2.16b-v5.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    sshl            v4.8h, v4.8h, v0.8h
-    sshl            v5.8h, v5.8h, v0.8h
-    st1             {v2.16b-v5.16b}, [x0], #64
-.endr
-    cbnz            w12, .Loop_cpy2Dto1D_shl_32
-    ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_64x64_neon)
-    cpy2Dto1D_shl_start
-    mov             w12, #32
-    sub             x2, x2, #64
-.Loop_cpy2Dto1D_shl_64:
-    sub             w12, w12, #1
-.rept 2
-    ld1             {v2.16b-v5.16b}, [x1], #64
-    ld1             {v16.16b-v19.16b}, [x1], x2
-    sshl            v2.8h, v2.8h, v0.8h
-    sshl            v3.8h, v3.8h, v0.8h
-    sshl            v4.8h, v4.8h, v0.8h
-    sshl            v5.8h, v5.8h, v0.8h
-    sshl            v16.8h, v16.8h, v0.8h
-    sshl            v17.8h, v17.8h, v0.8h
-    sshl            v18.8h, v18.8h, v0.8h
-    sshl            v19.8h, v19.8h, v0.8h
-    st1             {v2.16b-v5.16b}, [x0], #64
-    st1             {v16.16b-v19.16b}, [x0], #64
-.endr
-    cbnz            w12, .Loop_cpy2Dto1D_shl_64
-    ret
-endfunc
-
 // void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
 function PFX(cpy2Dto1D_shr_4x4_neon)
     cpy2Dto1D_shr_start
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index aa91ff407..a8aa6f420 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1689,11 +1689,29 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
     X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
     X265_CHECK(shift >= 0, "invalid shift\n");
 
-    for (int i = 0; i < size; i++)
+    for (int h = 0; h < size; h++)
     {
-        for (int j = 0; j < size; j++)
+        int w = 0;
+        for (; w + 16 <= size; w += 16)
+        {
+            int16x8_t a0_lo = vld1q_s16(src + w);
+            int16x8_t a0_hi = vld1q_s16(src + w + 8);
+            int16x8_t d0_lo = vshlq_s16(a0_lo, vdupq_n_s16(shift));
+            int16x8_t d0_hi = vshlq_s16(a0_hi, vdupq_n_s16(shift));
+            vst1q_s16(dst + w, d0_lo);
+            vst1q_s16(dst + w + 8, d0_hi);
+        }
+        if (size == 8)
+        {
+            int16x8_t a0 = vld1q_s16(src + w);
+            int16x8_t d0 = vshlq_s16(a0, vdupq_n_s16(shift));
+            vst1q_s16(dst + w, d0);
+        }
+        if (size == 4)
         {
-            dst[j] = src[j] << shift;
+            int16x4_t a0 = vld1_s16(src + w);
+            int16x4_t d0 = vshl_s16(a0, vdup_n_s16(shift));
+            vst1_s16(dst + w, d0);
         }
 
         src += srcStride;
-- 
2.39.5 (Apple Git-154)
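
Not part of the patch: for anyone who wants to exercise the new Neon path outside the encoder, here is a minimal standalone harness, an untested sketch, that compares the scalar reference removed above against the vectorized loop added to pixel-prim.cpp. The helper names (cpy2Dto1D_shl_ref, cpy2Dto1D_shl_neon_sketch) are hypothetical, and the assumption that dst advances by `size` coefficients per row is inferred from the existing 2D-to-1D kernel contract; it builds with a plain AArch64 C compiler and needs no x265 headers.

/* check_cpy2Dto1D_shl.c: hypothetical standalone check, not part of x265. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Scalar reference, mirroring the loop removed from pixel-prim.cpp. */
static void cpy2Dto1D_shl_ref(int16_t *dst, const int16_t *src,
                              intptr_t srcStride, int shift, int size)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (int16_t)(src[j] << shift);
        src += srcStride;
        dst += size; /* assumption: dst is a contiguous size*size block */
    }
}

/* Vectorized version, mirroring the Neon loop added by this patch. */
static void cpy2Dto1D_shl_neon_sketch(int16_t *dst, const int16_t *src,
                                      intptr_t srcStride, int shift, int size)
{
    for (int h = 0; h < size; h++)
    {
        int w = 0;
        /* Main path: 16 coefficients per iteration (16x16 blocks and larger). */
        for (; w + 16 <= size; w += 16)
        {
            int16x8_t a0_lo = vld1q_s16(src + w);
            int16x8_t a0_hi = vld1q_s16(src + w + 8);
            vst1q_s16(dst + w, vshlq_s16(a0_lo, vdupq_n_s16(shift)));
            vst1q_s16(dst + w + 8, vshlq_s16(a0_hi, vdupq_n_s16(shift)));
        }
        /* Tails for the 8x8 and 4x4 block sizes. */
        if (size == 8)
            vst1q_s16(dst + w, vshlq_s16(vld1q_s16(src + w), vdupq_n_s16(shift)));
        if (size == 4)
            vst1_s16(dst + w, vshl_s16(vld1_s16(src + w), vdup_n_s16(shift)));
        src += srcStride;
        dst += size;
    }
}

int main(void)
{
    enum { SIZE = 16, STRIDE = 24 }; /* stride in int16_t units; 48 bytes, 16-byte aligned */
    int16_t src[SIZE * STRIDE], ref[SIZE * SIZE], out[SIZE * SIZE];

    for (int i = 0; i < SIZE * STRIDE; i++)
        src[i] = (int16_t)(i * 7 - 1000);

    cpy2Dto1D_shl_ref(ref, src, STRIDE, 2, SIZE);
    cpy2Dto1D_shl_neon_sketch(out, src, STRIDE, 2, SIZE);

    printf("%s\n", memcmp(ref, out, sizeof(ref)) ? "MISMATCH" : "OK");
    return 0;
}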