From: Li Zhang <li.zha...@arm.com>
Date: Thu, 8 May 2025 19:17:55 +0200
Subject: [PATCH 4/8] AArch64: Implement blockcopy_sp primitives using Neon intrinsics

Delete the Neon and SVE assembly implementations of these kernels as they are no faster, and only serve to increase binary size.
Co-authored by: Jonathan Wright <jonathan.wri...@arm.com> --- source/common/aarch64/asm-primitives.cpp | 33 ---- source/common/aarch64/blockcopy8-common.S | 6 - source/common/aarch64/blockcopy8-sve.S | 220 ---------------------- source/common/aarch64/blockcopy8.S | 152 --------------- source/common/aarch64/pixel-prim.cpp | 38 ++++ 5 files changed, 38 insertions(+), 411 deletions(-) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index 981c6352a..1715ae115 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -404,23 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p) ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon); ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon); - // Blockcopy_sp - p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon); - p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon); - p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon); - p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon); - p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon); - - // chroma blockcopy_sp - p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon); - p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon); - p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon); - p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon); - p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp = PFX(blockcopy_sp_4x8_neon); - p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp = PFX(blockcopy_sp_8x16_neon); - p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_neon); - p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon); - // Block_fill ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, neon); ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, neon); @@ -639,22 +622,6 @@ void setupSvePrimitives(EncoderPrimitives &p) CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]); LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]); - // Blockcopy_sp - p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve); - p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve); - p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve); - p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve); - - // chroma blockcopy_sp - p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve); - p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve); - p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve); - p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve); - p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp = PFX(blockcopy_sp_4x8_sve); - p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp = PFX(blockcopy_sp_8x16_sve); - p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_sve); - p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_sve); - // Block_fill LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s); LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s); diff --git a/source/common/aarch64/blockcopy8-common.S b/source/common/aarch64/blockcopy8-common.S index 2f2ab556d..6599bb49e 100644 --- a/source/common/aarch64/blockcopy8-common.S +++ b/source/common/aarch64/blockcopy8-common.S @@ -46,9 +46,3 @@ sri v1.8h, v1.8h, #1 neg v0.8h, v0.8h .endm - -const xtn_xtn2_table, align=4 -.byte 0, 2, 4, 6, 8, 10, 12, 14 
-.byte 16, 18, 20, 22, 24, 26, 28, 30 -endconst - diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S index 976d80dd1..d724e8427 100644 --- a/source/common/aarch64/blockcopy8-sve.S +++ b/source/common/aarch64/blockcopy8-sve.S @@ -36,226 +36,6 @@ .text -/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb) - * - * r0 - a - * r1 - stridea - * r2 - b - * r3 - strideb */ - -function PFX(blockcopy_sp_4x4_sve) - ptrue p0.h, vl4 -.rept 2 - ld1h {z0.h}, p0/z, [x2] - add x2, x2, x3, lsl #1 - st1b {z0.h}, p0, [x0] - add x0, x0, x1 - ld1h {z1.h}, p0/z, [x2] - add x2, x2, x3, lsl #1 - st1b {z1.h}, p0, [x0] - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_8x8_sve) - ptrue p0.h, vl8 -.rept 4 - ld1h {z0.h}, p0/z, [x2] - add x2, x2, x3, lsl #1 - st1b {z0.h}, p0, [x0] - add x0, x0, x1 - ld1h {z1.h}, p0/z, [x2] - add x2, x2, x3, lsl #1 - st1b {z1.h}, p0, [x0] - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_16x16_sve) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_sp_16_16 - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.rept 8 - ld1 {v0.8h-v1.8h}, [x2], x3 - ld1 {v2.8h-v3.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x0], x1 -.endr - ret -.vl_gt_16_blockcopy_sp_16_16: - ptrue p0.h, vl16 -.rept 8 - ld1h {z0.h}, p0/z, [x2] - st1b {z0.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 - ld1h {z1.h}, p0/z, [x2] - st1b {z1.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_32x32_sve) - mov w12, #4 - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_sp_32_32 - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.Loop_csp32_sve: - sub w12, w12, #1 -.rept 4 - ld1 {v0.8h-v3.8h}, [x2], x3 - ld1 {v4.8h-v7.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - tbl v2.16b, {v4.16b,v5.16b}, v31.16b - tbl v3.16b, {v6.16b,v7.16b}, v31.16b - st1 {v0.16b-v1.16b}, [x0], x1 - st1 {v2.16b-v3.16b}, [x0], x1 -.endr - cbnz w12, .Loop_csp32_sve - ret -.vl_gt_16_blockcopy_sp_32_32: - cmp x9, #48 - bgt .vl_gt_48_blockcopy_sp_32_32 - ptrue p0.h, vl16 -.vl_gt_16_loop_csp32_sve: - sub w12, w12, #1 -.rept 4 - ld1h {z0.h}, p0/z, [x2] - ld1h {z1.h}, p0/z, [x2, #1, mul vl] - st1b {z0.h}, p0, [x0] - st1b {z1.h}, p0, [x0, #1, mul vl] - add x2, x2, x3, lsl #1 - add x0, x0, x1 - ld1h {z2.h}, p0/z, [x2] - ld1h {z3.h}, p0/z, [x2, #1, mul vl] - st1b {z2.h}, p0, [x0] - st1b {z3.h}, p0, [x0, #1, mul vl] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - cbnz w12, .vl_gt_16_loop_csp32_sve - ret -.vl_gt_48_blockcopy_sp_32_32: - ptrue p0.h, vl32 -.vl_gt_48_loop_csp32_sve: - sub w12, w12, #1 -.rept 4 - ld1h {z0.h}, p0/z, [x2] - st1b {z0.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 - ld1h {z1.h}, p0/z, [x2] - st1b {z1.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - cbnz w12, .vl_gt_48_loop_csp32_sve - ret -endfunc - -// chroma blockcopy_sp -function PFX(blockcopy_sp_4x8_sve) - ptrue p0.h, vl4 -.rept 8 - ld1h {z0.h}, p0/z, [x2] - st1b {z0.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_8x16_sve) - ptrue p0.h, vl8 -.rept 16 - ld1h {z0.h}, p0/z, [x2] - st1b {z0.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_16x32_sve) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_sp_16_32 
- ptrue p0.h, vl8 -.rept 32 - ld1h {z0.h}, p0/z, [x2] - ld1h {z1.h}, p0/z, [x2, #1, mul vl] - st1b {z0.h}, p0, [x0] - st1b {z1.h}, p0, [x0, #1, mul vl] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -.vl_gt_16_blockcopy_sp_16_32: - ptrue p0.h, vl16 -.rept 32 - ld1h {z0.h}, p0/z, [x2] - st1b {z0.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_32x64_sve) - rdvl x9, #1 - cmp x9, #16 - bgt .vl_gt_16_blockcopy_sp_32_64 - ptrue p0.h, vl8 -.rept 64 - ld1h {z0.h}, p0/z, [x2] - ld1h {z1.h}, p0/z, [x2, #1, mul vl] - ld1h {z2.h}, p0/z, [x2, #2, mul vl] - ld1h {z3.h}, p0/z, [x2, #3, mul vl] - st1b {z0.h}, p0, [x0] - st1b {z1.h}, p0, [x0, #1, mul vl] - st1b {z2.h}, p0, [x0, #2, mul vl] - st1b {z3.h}, p0, [x0, #3, mul vl] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -.vl_gt_16_blockcopy_sp_32_64: - cmp x9, #48 - bgt .vl_gt_48_blockcopy_sp_32_64 - ptrue p0.h, vl16 -.rept 64 - ld1h {z0.h}, p0/z, [x2] - ld1h {z1.h}, p0/z, [x2, #1, mul vl] - st1b {z0.h}, p0, [x0] - st1b {z1.h}, p0, [x0, #1, mul vl] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -.vl_gt_48_blockcopy_sp_32_64: - ptrue p0.h, vl32 -.rept 64 - ld1h {z0.h}, p0/z, [x2] - st1b {z0.h}, p0, [x0] - add x2, x2, x3, lsl #1 - add x0, x0, x1 -.endr - ret -endfunc - function PFX(blockfill_s_32x32_sve) rdvl x9, #1 cmp x9, #16 diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S index 8ac54a1e1..9db578d1e 100644 --- a/source/common/aarch64/blockcopy8.S +++ b/source/common/aarch64/blockcopy8.S @@ -34,158 +34,6 @@ .text -/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb) - * - * r0 - a - * r1 - stridea - * r2 - b - * r3 - strideb */ -function PFX(blockcopy_sp_4x4_neon) - lsl x3, x3, #1 -.rept 2 - ld1 {v0.8h}, [x2], x3 - ld1 {v1.8h}, [x2], x3 - xtn v0.8b, v0.8h - xtn v1.8b, v1.8h - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_8x8_neon) - lsl x3, x3, #1 -.rept 4 - ld1 {v0.8h}, [x2], x3 - ld1 {v1.8h}, [x2], x3 - xtn v0.8b, v0.8h - xtn v1.8b, v1.8h - st1 {v0.d}[0], [x0], x1 - st1 {v1.d}[0], [x0], x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_16x16_neon) - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.rept 8 - ld1 {v0.8h-v1.8h}, [x2], x3 - ld1 {v2.8h-v3.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x0], x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_32x32_neon) - mov w12, #4 - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.Loop_csp32: - sub w12, w12, #1 -.rept 4 - ld1 {v0.8h-v3.8h}, [x2], x3 - ld1 {v4.8h-v7.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - tbl v2.16b, {v4.16b,v5.16b}, v31.16b - tbl v3.16b, {v6.16b,v7.16b}, v31.16b - st1 {v0.16b-v1.16b}, [x0], x1 - st1 {v2.16b-v3.16b}, [x0], x1 -.endr - cbnz w12, .Loop_csp32 - ret -endfunc - -function PFX(blockcopy_sp_64x64_neon) - mov w12, #16 - lsl x3, x3, #1 - sub x3, x3, #64 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.Loop_csp64: - sub w12, w12, #1 -.rept 4 - ld1 {v0.8h-v3.8h}, [x2], #64 - ld1 {v4.8h-v7.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - tbl v2.16b, {v4.16b,v5.16b}, v31.16b - tbl v3.16b, {v6.16b,v7.16b}, v31.16b - st1 {v0.16b-v3.16b}, [x0], x1 -.endr - cbnz w12, .Loop_csp64 - ret -endfunc - -// chroma blockcopy_sp -function PFX(blockcopy_sp_4x8_neon) - 
lsl x3, x3, #1 -.rept 4 - ld1 {v0.8h}, [x2], x3 - ld1 {v1.8h}, [x2], x3 - xtn v0.8b, v0.8h - xtn v1.8b, v1.8h - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_8x16_neon) - lsl x3, x3, #1 -.rept 8 - ld1 {v0.8h}, [x2], x3 - ld1 {v1.8h}, [x2], x3 - xtn v0.8b, v0.8h - xtn v1.8b, v1.8h - st1 {v0.d}[0], [x0], x1 - st1 {v1.d}[0], [x0], x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_16x32_neon) - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.rept 16 - ld1 {v0.8h-v1.8h}, [x2], x3 - ld1 {v2.8h-v3.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x0], x1 -.endr - ret -endfunc - -function PFX(blockcopy_sp_32x64_neon) - mov w12, #8 - lsl x3, x3, #1 - movrel x11, xtn_xtn2_table - ld1 {v31.16b}, [x11] -.Loop_csp32x64: - sub w12, w12, #1 -.rept 4 - ld1 {v0.8h-v3.8h}, [x2], x3 - ld1 {v4.8h-v7.8h}, [x2], x3 - tbl v0.16b, {v0.16b,v1.16b}, v31.16b - tbl v1.16b, {v2.16b,v3.16b}, v31.16b - tbl v2.16b, {v4.16b,v5.16b}, v31.16b - tbl v3.16b, {v6.16b,v7.16b}, v31.16b - st1 {v0.16b-v1.16b}, [x0], x1 - st1 {v2.16b-v3.16b}, [x0], x1 -.endr - cbnz w12, .Loop_csp32x64 - ret -endfunc - // void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val) function PFX(blockfill_s_4x4_neon) dup v0.4h, w2 diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp index 4be409ab1..055b3e35c 100644 --- a/source/common/aarch64/pixel-prim.cpp +++ b/source/common/aarch64/pixel-prim.cpp @@ -1046,6 +1046,39 @@ void blockcopy_ss_neon(int16_t *dst, intptr_t dst_stride, const int16_t *src, } } +#if !HIGH_BIT_DEPTH +template<int width, int height> +void blockcopy_sp_neon(pixel *dst, intptr_t dst_stride, const int16_t *src, + intptr_t src_stride) +{ + for (int h = 0; h < height; h++) + { + int w = 0; + for (; w + 16 <= width; w += 16) { + int16x8_t s0 = vld1q_s16(src + w + 0); + int16x8_t s1 = vld1q_s16(src + w + 8); + int8x16_t s01 = vcombine_s8(vmovn_s16(s0), vmovn_s16(s1)); + vst1q_u8(dst + w, vreinterpretq_u8_s8(s01)); + } + if (width & 8) + { + int16x8_t s0 = vld1q_s16(src + w); + int8x8_t s0_s8 = vmovn_s16(s0); + vst1_u8(dst + w, vreinterpret_u8_s8(s0_s8)); + w += 8; + } + if (width & 4) + { + int16x4_t s0 = vld1_s16(src + w); + int8x8_t s0_s8 = vmovn_s16(vcombine_s16(s0, vdup_n_s16(0))); + store_u8x4x1(dst + w, vreinterpret_u8_s8(s0_s8)); + } + + dst += dst_stride; + src += src_stride; + } +} +#endif // !HIGH_BIT_DEPTH template<int bx, int by> void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixel *b1, intptr_t sstride0, @@ -1818,6 +1851,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \ + p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \ @@ -1992,6 +2026,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = 
blockcopy_ss_neon<W, H>; \ + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; @@ -2000,6 +2035,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \ + p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; @@ -2096,6 +2132,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \ + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; @@ -2104,6 +2141,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p) p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \ + p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; -- 2.39.5 (Apple Git-154)
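
For reference, a minimal standalone sketch of the per-row int16 -> uint8 narrowing copy that the new blockcopy_sp_neon kernel performs for the 8-bit build (!HIGH_BIT_DEPTH, where pixel is uint8_t), checked against a trivial scalar truncating copy. This is illustrative only and not part of the patch; the helper names below are hypothetical.

    /* Illustrative sketch only: one 16-wide row of the narrowing copy,
     * compared against a plain scalar reference. */
    #include <arm_neon.h>
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    static void copy_sp_row_scalar(uint8_t *dst, const int16_t *src, int width)
    {
        for (int w = 0; w < width; w++)
            dst[w] = (uint8_t)src[w];          // truncating narrow
    }

    static void copy_sp_row_neon(uint8_t *dst, const int16_t *src, int width)
    {
        for (int w = 0; w + 16 <= width; w += 16)
        {
            // Load 16 int16 coefficients and narrow them to 16 bytes.
            int16x8_t s0 = vld1q_s16(src + w + 0);
            int16x8_t s1 = vld1q_s16(src + w + 8);
            int8x16_t s01 = vcombine_s8(vmovn_s16(s0), vmovn_s16(s1));
            vst1q_u8(dst + w, vreinterpretq_u8_s8(s01));
        }
    }

    int main(void)
    {
        int16_t src[16];
        uint8_t ref[16], out[16];
        for (int i = 0; i < 16; i++)
            src[i] = (int16_t)(i * 17);        // values already in pixel range
        copy_sp_row_scalar(ref, src, 16);
        copy_sp_row_neon(out, src, 16);
        assert(memcmp(ref, out, sizeof(ref)) == 0);
        return 0;
    }

vmovn_s16 keeps the low byte of each coefficient, the same truncating behaviour as the intrinsics added in pixel-prim.cpp.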