This version looks good, thank you.
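
For anyone skimming the archive, the core of the standard bit-depth change is replacing the widen-then-add pattern (UXTL + ADD) with a single widening add (ADDW). A minimal sketch of the two patterns for one 8-pixel column, outside of x265 (the helper names here are illustrative, not taken from the patch):

    #include <arm_neon.h>

    // Old pattern: widen the 8-bit pixels first (UXTL), then ADD.
    static inline uint8x8_t add_ps_uxtl(uint8x8_t pix, int16x8_t res)
    {
        int16x8_t wide = vreinterpretq_s16_u16(vmovl_u8(pix)); // UXTL
        int16x8_t sum  = vaddq_s16(wide, res);                 // ADD
        return vqmovun_s16(sum);                               // SQXTUN: clamp to 0..255
    }

    // New pattern: a single widening add (ADDW) does both steps.
    static inline uint8x8_t add_ps_addw(uint8x8_t pix, int16x8_t res)
    {
        uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(res), pix); // ADDW
        return vqmovun_s16(vreinterpretq_s16_u16(sum));             // SQXTUN
    }

Processing two such columns per row, as the unrolled loops below do, also lets the compiler emit LDP/STP for the loads and stores.
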
At 2025-06-16 18:43:47, "Li Zhang" <li.zha...@arm.com> wrote:
>Optimize the standard bit-depth Neon intrinsics implementation to use
>ADDW instead of UXTL and ADD. Also unroll the Neon intrinsics
>implementations to enable the usage of LDP and STP. Implement Neon
>intrinsics for blocksizes of width 4.
>
>Delete the Neon and SVE2 assembly implementations as they are slower than
>the Neon intrinsics implementation.
>---
> source/common/aarch64/asm-primitives.cpp |  64 -----
> source/common/aarch64/pixel-prim.cpp     | 101 ++++++--
> source/common/aarch64/pixel-util-sve2.S  | 286 -----------------------
> source/common/aarch64/pixel-util.S       | 183 ---------------
> 4 files changed, 76 insertions(+), 558 deletions(-)
>
>diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
>index 5ce9352bd..f6203c857 100644
>--- a/source/common/aarch64/asm-primitives.cpp
>+++ b/source/common/aarch64/asm-primitives.cpp
>@@ -504,38 +504,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
>     p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
>     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
>
>-    // pixel_add_ps
>-    p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
>-    p.cu[BLOCK_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
>-    p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
>-    p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
>-    p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
>-
>-    p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
>-    p.cu[BLOCK_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
>-    p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
>-    p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
>-    p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_neon);
>-
>-    // chroma add_ps
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x8_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x16_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
>-
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED] = PFX(pixel_add_ps_4x8_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED] = PFX(pixel_add_ps_8x16_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_neon);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_neon);
>-
>     //scale2D_64to32
>     p.scale2D_64to32 = PFX(scale2D_64to32_neon);
>
>@@ -664,38 +632,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
>     p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_sve2);
>     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_sve2);
>
>-    // pixel_add_ps
>-    p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_sve2);
>-    p.cu[BLOCK_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_sve2);
>-    p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_sve2);
>-    p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_sve2);
>-    p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_sve2);
>-
>-    p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_sve2);
>-    p.cu[BLOCK_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_sve2);
>-    p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_sve2);
>-    p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_sve2);
>-    p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_sve2);
>-
>-    // chroma add_ps
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_sve2);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_sve2);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_sve2);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x8_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x16_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_sve2);
>-
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_sve2);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_sve2);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_sve2);
>-    p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED] = PFX(pixel_add_ps_4x8_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED] = PFX(pixel_add_ps_8x16_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_sve2);
>-    p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_sve2);
>-
>     // scale1D_128to64
>     p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2);
>     p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2);
>diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
>index f4df6786e..a72b782e9 100644
>--- a/source/common/aarch64/pixel-prim.cpp
>+++ b/source/common/aarch64/pixel-prim.cpp
>@@ -1110,38 +1110,89 @@ void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixe
>     }
> }
>
>-template<int bx, int by>
>-void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_t *b1, intptr_t sstride0,
>-                       intptr_t sstride1)
>+template<int width, int height>
>+void pixel_add_ps_neon(pixel *dst, intptr_t dstride, const pixel *src0,
>+                       const int16_t *src1, intptr_t sstride0, intptr_t sstride1)
> {
>-    for (int y = 0; y < by; y++)
>+    for (int h = 0; h < height; h++)
>     {
>-        int x = 0;
>-        for (; (x + 8) <= bx; x += 8)
>-        {
>-            int16x8_t t;
>-            int16x8_t b1e = vld1q_s16(b1 + x);
>-            int16x8_t b0e;
> #if HIGH_BIT_DEPTH
>-            b0e = vreinterpretq_s16_u16(vld1q_u16(b0 + x));
>-            t = vaddq_s16(b0e, b1e);
>-            t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
>-            t = vmaxq_s16(t, vdupq_n_s16(0));
>-            vst1q_u16(a + x, vreinterpretq_u16_s16(t));
>-#else
>-            b0e = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b0 + x)));
>-            t = vaddq_s16(b0e, b1e);
>-            vst1_u8(a + x, vqmovun_s16(t));
>-#endif
>+        for (int w = 0; w + 16 <= width; w += 16)
>+        {
>+            uint16x8_t s0_lo = vld1q_u16(src0 + w);
>+            uint16x8_t s0_hi = vld1q_u16(src0 + w + 8);
>+            int16x8_t s1_lo = vld1q_s16(src1 + w);
>+            int16x8_t s1_hi = vld1q_s16(src1 + w + 8);
>+
>+            uint16x8_t sum_lo = vsqaddq_u16(s0_lo, s1_lo);
>+            uint16x8_t sum_hi = vsqaddq_u16(s0_hi, s1_hi);
>+
>+            sum_lo = vminq_u16(sum_lo, vdupq_n_u16((1 << X265_DEPTH) - 1));
>+            sum_hi = vminq_u16(sum_hi, vdupq_n_u16((1 << X265_DEPTH) - 1));
>+
>+            vst1q_u16(dst + w, sum_lo);
>+            vst1q_u16(dst + w + 8, sum_hi);
>         }
>-        for (; x < bx; x++)
>+        if (width == 8)
>         {
>-            a[x] = (int16_t)x265_clip(b0[x] + b1[x]);
>+            uint16x8_t s0 = vld1q_u16(src0);
>+            int16x8_t s1 = vld1q_s16(src1);
>+
>+            uint16x8_t sum = vsqaddq_u16(s0, s1);
>+            sum = vminq_u16(sum, vdupq_n_u16((1 << X265_DEPTH) - 1));
>+
>+            vst1q_u16(dst, sum);
>         }
>+        if (width == 4)
>+        {
>+            int16x4_t s1 = vld1_s16(src1);
>+            uint16x4_t s0 = vld1_u16(src0);
>
>-        b0 += sstride0;
>-        b1 += sstride1;
>-        a += dstride;
>+            uint16x4_t sum = vsqadd_u16(s0, s1);
>+            sum = vmin_u16(sum, vdup_n_u16((1 << X265_DEPTH) - 1));
>+
>+            vst1_u16(dst, sum);
>+        }
>+#else // !HIGH_BIT_DEPTH
>+        for (int w = 0; w + 16 <= width; w += 16)
>+        {
>+            uint8x16_t s0 = vld1q_u8(src0 + w);
>+            int16x8_t s1_lo = vld1q_s16(src1 + w);
>+            int16x8_t s1_hi = vld1q_s16(src1 + w + 8);
>+
>+            uint16x8_t sum_lo = vaddw_u8(vreinterpretq_u16_s16(s1_lo), vget_low_u8(s0));
>+            uint16x8_t sum_hi = vaddw_u8(vreinterpretq_u16_s16(s1_hi), vget_high_u8(s0));
>+            uint8x8_t d0_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
>+            uint8x8_t d0_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
>+
>+            vst1_u8(dst + w, d0_lo);
>+            vst1_u8(dst + w + 8, d0_hi);
>+        }
>+        if (width == 8)
>+        {
>+            uint8x8_t s0 = vld1_u8(src0);
>+            int16x8_t s1 = vld1q_s16(src1);
>+
>+            uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(s1), s0);
>+            uint8x8_t d0 = vqmovun_s16(vreinterpretq_s16_u16(sum));
>+
>+            vst1_u8(dst, d0);
>+        }
>+        if (width == 4)
>+        {
>+            uint8x8_t s0 = load_u8x4x1(src0);
>+            int16x8_t s1 = vcombine_s16(vld1_s16(src1), vdup_n_s16(0));
>+
>+            uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(s1), s0);
>+            uint8x8_t d0 = vqmovun_s16(vreinterpretq_s16_u16(sum));
>+
>+            store_u8x4x1(dst, d0);
>+        }
>+#endif
>+
>+        src0 += sstride0;
>+        src1 += sstride1;
>+        dst += dstride;
>     }
> }
>
>diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
>index 56a2253ea..257bcd7aa 100644
>--- a/source/common/aarch64/pixel-util-sve2.S
>+++ b/source/common/aarch64/pixel-util-sve2.S
>@@ -531,292 +531,6 @@ function PFX(pixel_sub_ps_32x64_sve2)
>     ret
> endfunc
>
>-function PFX(pixel_add_ps_4x4_sve2)
>-    ptrue p0.h, vl8
>-    ptrue p1.h, vl4
>-.rept 4
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z2.h}, p1/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z4.h, z0.h, z2.h
>-    sqxtunb z4.b, z4.h
>-    st1b {z4.h}, p1, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-
>-function PFX(pixel_add_ps_8x8_sve2)
>-    ptrue p0.h, vl8
>-.rept 8
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z2.h}, p0/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z4.h, z0.h, z2.h
>-    sqxtunb z4.b, z4.h
>-    st1b {z4.h}, p0, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-
>-.macro pixel_add_ps_16xN_sve2 h
>-function PFX(pixel_add_ps_16x\h\()_sve2)
>-    rdvl x9, #1
>-    cmp x9, #16
>-    bgt .vl_gt_16_pixel_add_ps_16x\h
>-    ptrue p0.b, vl16
>-.rept \h
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1b {z1.h}, p0/z, [x2, #1, mul vl]
>-    ld1h {z2.h}, p0/z, [x3]
>-    ld1h {z3.h}, p0/z, [x3, #1, mul vl]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z2.h
>-    add z25.h, z1.h, z3.h
>-    sqxtunb z6.b, z24.h
>-    sqxtunb z7.b, z25.h
>-    st1b {z6.h}, p0, [x0]
>-    st1b {z7.h}, p0, [x0, #1, mul vl]
>-    add x0, x0, x1
>-.endr
>-    ret
>-.vl_gt_16_pixel_add_ps_16x\h\():
>-    ptrue p0.b, vl32
>-.rept \h
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z2.h}, p0/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z2.h
>-    sqxtunb z6.b, z24.h
>-    st1b {z6.h}, p0, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-.endm
>-
>-pixel_add_ps_16xN_sve2 16
>-pixel_add_ps_16xN_sve2 32
>-
>-.macro pixel_add_ps_32xN_sve2 h
>-    function PFX(pixel_add_ps_32x\h\()_sve2)
>-    rdvl x9, #1
>-    cmp x9, #16
>-    bgt .vl_gt_16_pixel_add_ps_32x\h
>-    lsl x5, x5, #1
>-    mov w12, #\h / 4
>-.Loop_add_ps__sve2_32x\h\():
>-    sub w12, w12, #1
>-.rept 4
>-    ld1 {v0.16b-v1.16b}, [x2], x4
>-    ld1 {v16.8h-v19.8h}, [x3], x5
>-    uxtl v4.8h, v0.8b
>-    uxtl2 v5.8h, v0.16b
>-    uxtl v6.8h, v1.8b
>-    uxtl2 v7.8h, v1.16b
>-    add v24.8h, v4.8h, v16.8h
>-    add v25.8h, v5.8h, v17.8h
>-    add v26.8h, v6.8h, v18.8h
>-    add v27.8h, v7.8h, v19.8h
>-    sqxtun v4.8b, v24.8h
>-    sqxtun2 v4.16b, v25.8h
>-    sqxtun v5.8b, v26.8h
>-    sqxtun2 v5.16b, v27.8h
>-    st1 {v4.16b-v5.16b}, [x0], x1
>-.endr
>-    cbnz w12, .Loop_add_ps__sve2_32x\h
>-    ret
>-.vl_gt_16_pixel_add_ps_32x\h\():
>-    cmp x9, #48
>-    bgt .vl_gt_48_pixel_add_ps_32x\h
>-    ptrue p0.b, vl32
>-.rept \h
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1b {z1.h}, p0/z, [x2, #1, mul vl]
>-    ld1h {z4.h}, p0/z, [x3]
>-    ld1h {z5.h}, p0/z, [x3, #1, mul vl]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z4.h
>-    add z25.h, z1.h, z5.h
>-    sqxtunb z6.b, z24.h
>-    sqxtunb z7.b, z25.h
>-    st1b {z6.h}, p0, [x0]
>-    st1b {z7.h}, p0, [x0, #1, mul vl]
>-    add x0, x0, x1
>-.endr
>-    ret
>-.vl_gt_48_pixel_add_ps_32x\h\():
>-    ptrue p0.b, vl64
>-.rept \h
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z4.h}, p0/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z4.h
>-    sqxtunb z6.b, z24.h
>-    st1b {z6.h}, p0, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-.endm
>-
>-pixel_add_ps_32xN_sve2 32
>-pixel_add_ps_32xN_sve2 64
>-
>-function PFX(pixel_add_ps_64x64_sve2)
>-    rdvl x9, #1
>-    cmp x9, #16
>-    bgt .vl_gt_16_pixel_add_ps_64x64
>-    ptrue p0.b, vl16
>-.rept 64
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1b {z1.h}, p0/z, [x2, #1, mul vl]
>-    ld1b {z2.h}, p0/z, [x2, #2, mul vl]
>-    ld1b {z3.h}, p0/z, [x2, #3, mul vl]
>-    ld1b {z4.h}, p0/z, [x2, #4 ,mul vl]
>-    ld1b {z5.h}, p0/z, [x2, #5, mul vl]
>-    ld1b {z6.h}, p0/z, [x2, #6, mul vl]
>-    ld1b {z7.h}, p0/z, [x2, #7, mul vl]
>-    ld1h {z8.h}, p0/z, [x3]
>-    ld1h {z9.h}, p0/z, [x3, #1, mul vl]
>-    ld1h {z10.h}, p0/z, [x3, #2, mul vl]
>-    ld1h {z11.h}, p0/z, [x3, #3, mul vl]
>-    ld1h {z12.h}, p0/z, [x3, #4, mul vl]
>-    ld1h {z13.h}, p0/z, [x3, #5, mul vl]
>-    ld1h {z14.h}, p0/z, [x3, #6, mul vl]
>-    ld1h {z15.h}, p0/z, [x3, #7, mul vl]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z8.h
>-    add z25.h, z1.h, z9.h
>-    add z26.h, z2.h, z10.h
>-    add z27.h, z3.h, z11.h
>-    add z28.h, z4.h, z12.h
>-    add z29.h, z5.h, z13.h
>-    add z30.h, z6.h, z14.h
>-    add z31.h, z7.h, z15.h
>-    sqxtunb z6.b, z24.h
>-    sqxtunb z7.b, z25.h
>-    sqxtunb z8.b, z26.h
>-    sqxtunb z9.b, z27.h
>-    sqxtunb z10.b, z28.h
>-    sqxtunb z11.b, z29.h
>-    sqxtunb z12.b, z30.h
>-    sqxtunb z13.b, z31.h
>-    st1b {z6.h}, p0, [x0]
>-    st1b {z7.h}, p0, [x0, #1, mul vl]
>-    st1b {z8.h}, p0, [x0, #2, mul vl]
>-    st1b {z9.h}, p0, [x0, #3, mul vl]
>-    st1b {z10.h}, p0, [x0, #4, mul vl]
>-    st1b {z11.h}, p0, [x0, #5, mul vl]
>-    st1b {z12.h}, p0, [x0, #6, mul vl]
>-    st1b {z13.h}, p0, [x0, #7, mul vl]
>-    add x0, x0, x1
>-.endr
>-    ret
>-.vl_gt_16_pixel_add_ps_64x64:
>-    cmp x9, #48
>-    bgt .vl_gt_48_pixel_add_ps_64x64
>-    ptrue p0.b, vl32
>-.rept 64
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1b {z1.h}, p0/z, [x2, #1, mul vl]
>-    ld1b {z2.h}, p0/z, [x2, #2, mul vl]
>-    ld1b {z3.h}, p0/z, [x2, #3, mul vl]
>-    ld1h {z8.h}, p0/z, [x3]
>-    ld1h {z9.h}, p0/z, [x3, #1, mul vl]
>-    ld1h {z10.h}, p0/z, [x3, #2, mul vl]
>-    ld1h {z11.h}, p0/z, [x3, #3, mul vl]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z8.h
>-    add z25.h, z1.h, z9.h
>-    add z26.h, z2.h, z10.h
>-    add z27.h, z3.h, z11.h
>-    sqxtunb z6.b, z24.h
>-    sqxtunb z7.b, z25.h
>-    sqxtunb z8.b, z26.h
>-    sqxtunb z9.b, z27.h
>-    st1b {z6.h}, p0, [x0]
>-    st1b {z7.h}, p0, [x0, #1, mul vl]
>-    st1b {z8.h}, p0, [x0, #2, mul vl]
>-    st1b {z9.h}, p0, [x0, #3, mul vl]
>-    add x0, x0, x1
>-.endr
>-    ret
>-.vl_gt_48_pixel_add_ps_64x64:
>-    cmp x9, #112
>-    bgt .vl_gt_112_pixel_add_ps_64x64
>-    ptrue p0.b, vl64
>-.rept 64
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1b {z1.h}, p0/z, [x2, #1, mul vl]
>-    ld1h {z8.h}, p0/z, [x3]
>-    ld1h {z9.h}, p0/z, [x3, #1, mul vl]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z8.h
>-    add z25.h, z1.h, z9.h
>-    sqxtunb z6.b, z24.h
>-    sqxtunb z7.b, z25.h
>-    st1b {z6.h}, p0, [x0]
>-    st1b {z7.h}, p0, [x0, #1, mul vl]
>-    add x0, x0, x1
>-.endr
>-    ret
>-.vl_gt_112_pixel_add_ps_64x64:
>-    ptrue p0.b, vl128
>-.rept 64
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z8.h}, p0/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z24.h, z0.h, z8.h
>-    sqxtunb z6.b, z24.h
>-    st1b {z6.h}, p0, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-
>-// Chroma add_ps
>-function PFX(pixel_add_ps_4x8_sve2)
>-    ptrue p0.h,vl4
>-.rept 8
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z2.h}, p0/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z4.h, z0.h, z2.h
>-    sqxtunb z4.b, z4.h
>-    st1b {z4.h}, p0, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-
>-function PFX(pixel_add_ps_8x16_sve2)
>-    ptrue p0.h,vl8
>-.rept 16
>-    ld1b {z0.h}, p0/z, [x2]
>-    ld1h {z2.h}, p0/z, [x3]
>-    add x2, x2, x4
>-    add x3, x3, x5, lsl #1
>-    add z4.h, z0.h, z2.h
>-    sqxtunb z4.b, z4.h
>-    st1b {z4.h}, p0, [x0]
>-    add x0, x0, x1
>-.endr
>-    ret
>-endfunc
>-
> // void scale1D_128to64(pixel *dst, const pixel *src)
> function PFX(scale1D_128to64_sve2)
>     rdvl x9, #1
>diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
>index 480278e5e..0751e0e7c 100644
>--- a/source/common/aarch64/pixel-util.S
>+++ b/source/common/aarch64/pixel-util.S
>@@ -340,189 +340,6 @@ function PFX(pixel_sub_ps_32x64_neon)
>     ret
> endfunc
>
>-// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>-function PFX(pixel_add_ps_4x4_neon)
>-    lsl x5, x5, #1
>-.rept 2
>-    ld1 {v0.8b}, [x2], x4
>-    ld1 {v1.8b}, [x2], x4
>-    ld1 {v2.4h}, [x3], x5
>-    ld1 {v3.4h}, [x3], x5
>-    uxtl v0.8h, v0.8b
>-    uxtl v1.8h, v1.8b
>-    add v4.8h, v0.8h, v2.8h
>-    add v5.8h, v1.8h, v3.8h
>-    sqxtun v4.8b, v4.8h
>-    sqxtun v5.8b, v5.8h
>-    st1 {v4.s}[0], [x0], x1
>-    st1 {v5.s}[0], [x0], x1
>-.endr
>-    ret
>-endfunc
>-
>-function PFX(pixel_add_ps_8x8_neon)
>-    lsl x5, x5, #1
>-.rept 4
>-    ld1 {v0.8b}, [x2], x4
>-    ld1 {v1.8b}, [x2], x4
>-    ld1 {v2.8h}, [x3], x5
>-    ld1 {v3.8h}, [x3], x5
>-    uxtl v0.8h, v0.8b
>-    uxtl v1.8h, v1.8b
>-    add v4.8h, v0.8h, v2.8h
>-    add v5.8h, v1.8h, v3.8h
>-    sqxtun v4.8b, v4.8h
>-    sqxtun v5.8b, v5.8h
>-    st1 {v4.8b}, [x0], x1
>-    st1 {v5.8b}, [x0], x1
>-.endr
>-    ret
>-endfunc
>-
>-.macro pixel_add_ps_16xN_neon h
>-function PFX(pixel_add_ps_16x\h\()_neon)
>-    lsl x5, x5, #1
>-    mov w12, #\h / 8
>-.Loop_add_ps_16x\h\():
>-    sub w12, w12, #1
>-.rept 4
>-    ld1 {v0.16b}, [x2], x4
>-    ld1 {v1.16b}, [x2], x4
>-    ld1 {v16.8h-v17.8h}, [x3], x5
>-    ld1 {v18.8h-v19.8h}, [x3], x5
>-    uxtl v4.8h, v0.8b
>-    uxtl2 v5.8h, v0.16b
>-    uxtl v6.8h, v1.8b
>-    uxtl2 v7.8h, v1.16b
>-    add v24.8h, v4.8h, v16.8h
>-    add v25.8h, v5.8h, v17.8h
>-    add v26.8h, v6.8h, v18.8h
>-    add v27.8h, v7.8h, v19.8h
>-    sqxtun v4.8b, v24.8h
>-    sqxtun2 v4.16b, v25.8h
>-    sqxtun v5.8b, v26.8h
>-    sqxtun2 v5.16b, v27.8h
>-    st1 {v4.16b}, [x0], x1
>-    st1 {v5.16b}, [x0], x1
>-.endr
>-    cbnz w12, .Loop_add_ps_16x\h
>-    ret
>-endfunc
>-.endm
>-
>-pixel_add_ps_16xN_neon 16
>-pixel_add_ps_16xN_neon 32
>-
>-.macro pixel_add_ps_32xN_neon h
>-    function PFX(pixel_add_ps_32x\h\()_neon)
>-    lsl x5, x5, #1
>-    mov w12, #\h / 4
>-.Loop_add_ps_32x\h\():
>-    sub w12, w12, #1
>-.rept 4
>-    ld1 {v0.16b-v1.16b}, [x2], x4
>-    ld1 {v16.8h-v19.8h}, [x3], x5
>-    uxtl v4.8h, v0.8b
>-    uxtl2 v5.8h, v0.16b
>-    uxtl v6.8h, v1.8b
>-    uxtl2 v7.8h, v1.16b
>-    add v24.8h, v4.8h, v16.8h
>-    add v25.8h, v5.8h, v17.8h
>-    add v26.8h, v6.8h, v18.8h
>-    add v27.8h, v7.8h, v19.8h
>-    sqxtun v4.8b, v24.8h
>-    sqxtun2 v4.16b, v25.8h
>-    sqxtun v5.8b, v26.8h
>-    sqxtun2 v5.16b, v27.8h
>-    st1 {v4.16b-v5.16b}, [x0], x1
>-.endr
>-    cbnz w12, .Loop_add_ps_32x\h
>-    ret
>-endfunc
>-.endm
>-
>-pixel_add_ps_32xN_neon 32
>-pixel_add_ps_32xN_neon 64
>-
>-function PFX(pixel_add_ps_64x64_neon)
>-    lsl x5, x5, #1
>-    sub x5, x5, #64
>-    mov w12, #32
>-.Loop_add_ps_64x64:
>-    sub w12, w12, #1
>-.rept 2
>-    ld1 {v0.16b-v3.16b}, [x2], x4
>-    ld1 {v16.8h-v19.8h}, [x3], #64
>-    ld1 {v20.8h-v23.8h}, [x3], x5
>-    uxtl v4.8h, v0.8b
>-    uxtl2 v5.8h, v0.16b
>-    uxtl v6.8h, v1.8b
>-    uxtl2 v7.8h, v1.16b
>-    uxtl v24.8h, v2.8b
>-    uxtl2 v25.8h, v2.16b
>-    uxtl v26.8h, v3.8b
>-    uxtl2 v27.8h, v3.16b
>-    add v0.8h, v4.8h, v16.8h
>-    add v1.8h, v5.8h, v17.8h
>-    add v2.8h, v6.8h, v18.8h
>-    add v3.8h, v7.8h, v19.8h
>-    add v4.8h, v24.8h, v20.8h
>-    add v5.8h, v25.8h, v21.8h
>-    add v6.8h, v26.8h, v22.8h
>-    add v7.8h, v27.8h, v23.8h
>-    sqxtun v0.8b, v0.8h
>-    sqxtun2 v0.16b, v1.8h
>-    sqxtun v1.8b, v2.8h
>-    sqxtun2 v1.16b, v3.8h
>-    sqxtun v2.8b, v4.8h
>-    sqxtun2 v2.16b, v5.8h
>-    sqxtun v3.8b, v6.8h
>-    sqxtun2 v3.16b, v7.8h
>-    st1 {v0.16b-v3.16b}, [x0], x1
>-.endr
>-    cbnz w12, .Loop_add_ps_64x64
>-    ret
>-endfunc
>-
>-// Chroma add_ps
>-function PFX(pixel_add_ps_4x8_neon)
>-    lsl x5, x5, #1
>-.rept 4
>-    ld1 {v0.8b}, [x2], x4
>-    ld1 {v1.8b}, [x2], x4
>-    ld1 {v2.4h}, [x3], x5
>-    ld1 {v3.4h}, [x3], x5
>-    uxtl v0.8h, v0.8b
>-    uxtl v1.8h, v1.8b
>-    add v4.8h, v0.8h, v2.8h
>-    add v5.8h, v1.8h, v3.8h
>-    sqxtun v4.8b, v4.8h
>-    sqxtun v5.8b, v5.8h
>-    st1 {v4.s}[0], [x0], x1
>-    st1 {v5.s}[0], [x0], x1
>-.endr
>-    ret
>-endfunc
>-
>-function PFX(pixel_add_ps_8x16_neon)
>-    lsl x5, x5, #1
>-.rept 8
>-    ld1 {v0.8b}, [x2], x4
>-    ld1 {v1.8b}, [x2], x4
>-    ld1 {v2.8h}, [x3], x5
>-    ld1 {v3.8h}, [x3], x5
>-    uxtl v0.8h, v0.8b
>-    uxtl v1.8h, v1.8b
>-    add v4.8h, v0.8h, v2.8h
>-    add v5.8h, v1.8h, v3.8h
>-    sqxtun v4.8b, v4.8h
>-    sqxtun v5.8b, v5.8h
>-    st1 {v4.8b}, [x0], x1
>-    st1 {v5.8b}, [x0], x1
>-.endr
>-    ret
>-endfunc
>-
> // void scale1D_128to64(pixel *dst, const pixel *src)
> function PFX(scale1D_128to64_neon)
> .rept 2
>--
>2.39.5 (Apple Git-154)
>
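P.S. For the high bit-depth path in the quoted patch, the clip is now an unsigned saturating add plus a single min against the pixel maximum, rather than the old add/min/max triple. A standalone sketch of one 8-sample column, assuming a 10-bit build for illustration (the real code uses X265_DEPTH):

    #include <arm_neon.h>

    #define BIT_DEPTH 10  // assumption for this sketch; x265 uses X265_DEPTH

    // pix holds unsigned 10-bit pixels, res holds signed residuals.
    static inline uint16x8_t add_ps_hbd(uint16x8_t pix, int16x8_t res)
    {
        // USQADD: add the signed residual to the unsigned pixel,
        // saturating at 0 on underflow and at 65535 on overflow.
        uint16x8_t sum = vsqaddq_u16(pix, res);
        // Clamp to the valid pixel range ((1 << BIT_DEPTH) - 1, i.e. 1023 for 10-bit).
        return vminq_u16(sum, vdupq_n_u16((1 << BIT_DEPTH) - 1));
    }
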
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel