From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Tue, 5 Nov 2024 16:48:23 +0100
Subject: [PATCH 3/3] AArch64: Add Neon asm implementation of HBD SAD4D

Add a Neon asm implementation of high bitdepth SAD4D functions for all
block sizes. This implementation is 6%-11% faster on Neoverse platforms
than the existing Neon intrinsics sad_x4_neon<w,h> implementation.
---
 source/common/aarch64/asm-primitives.cpp |   4 +-
 source/common/aarch64/sad-a.S            | 581 +++++++++++++++--------
 2 files changed, 373 insertions(+), 212 deletions(-)
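For context, here is a minimal scalar sketch of what the sad_x4 primitive computes, matching the prototype comment added in the diff below: one encode (fenc) block is compared against four candidate reference blocks that share a single stride, and the four SAD results are written to res. The function name, the W/H parameters and the FENC_STRIDE value of 64 are illustrative assumptions, not code from this patch.

```c
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar reference for one sad_x4 block size (W x H).
 * In high-bitdepth builds a pixel is 16 bits wide; FENC_STRIDE is the
 * fixed stride of the encode block and frefstride is shared by the four
 * reference blocks. Both strides are counted in pixels, not bytes, which
 * is why the asm doubles the incoming stride before using it. */
#define FENC_STRIDE 64
typedef uint16_t pixel;

static void sad_x4_WxH_ref(int W, int H,
                           const pixel *fenc,
                           const pixel *ref0, const pixel *ref1,
                           const pixel *ref2, const pixel *ref3,
                           intptr_t frefstride, int32_t *res)
{
    const pixel *refs[4] = { ref0, ref1, ref2, ref3 };

    for (int i = 0; i < 4; i++)
    {
        const pixel *p = fenc;
        const pixel *r = refs[i];
        int32_t sum = 0;

        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                sum += abs((int)p[x] - (int)r[x]);
            p += FENC_STRIDE;
            r += frefstride;
        }
        res[i] = sum;  /* one SAD per candidate reference block */
    }
}
```

The assembly below keeps the same semantics but processes all four candidates per loaded fenc row, accumulating each candidate in its own vector registers (v16-v23, plus v30/v31 in the widened paths), so the encode-block loads are shared across candidates rather than repeated.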
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index 283256679..0a20085bf 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -527,6 +527,7 @@ void setupNeonPrimitives(EncoderPrimitives &p) // sad ALL_LUMA_PU(sad, pixel_sad, neon); ALL_LUMA_PU(sad_x3, sad_x3, neon); + ALL_LUMA_PU(sad_x4, sad_x4, neon); #if !HIGH_BIT_DEPTH // pixel_avg_pp @@ -541,9 +542,6 @@ void setupNeonPrimitives(EncoderPrimitives &p) ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon); ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon); - // sad - ALL_LUMA_PU(sad_x4, sad_x4, neon); - // sse_pp p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_neon); p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon); diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S index 642fd29f3..bf5495ae4 100644 --- a/source/common/aarch64/sad-a.S +++ b/source/common/aarch64/sad-a.S @@ -921,46 +921,59 @@ SAD_FUNC_LOOP_LARGE 64, 32 SAD_FUNC_LOOP_LARGE 64, 48 SAD_FUNC_LOOP_LARGE 64, 64 -// void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res) -.macro SAD_x3_4 f - ld1 {v0.4h}, [x0], x6 - ld1 {v1.4h}, [x1], x4 - ld1 {v2.4h}, [x2], x4 - ld1 {v3.4h}, [x3], x4 +//void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res) +//void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res) +.macro SAD_xN_4 n, f + ld1 {v0.4h}, [x0], x7 + ld1 {v1.4h}, [x1], x5 + ld1 {v2.4h}, [x2], x5 + ld1 {v3.4h}, [x3], x5 \f v16.4s, v0.4h, v1.4h \f v17.4s, v0.4h, v2.4h \f v18.4s, v0.4h, v3.4h +.if \n == 4 + ld1 {v4.4h}, [x4], x5 + \f v19.4s, v0.4h, v4.4h +.endif .endm -.macro SAD_x3_4xH h - SAD_x3_4 uabdl +.macro SAD_xN_4xH n, h + SAD_xN_4 \n, uabdl .rept \h - 1 - SAD_x3_4 uabal + SAD_xN_4 \n, uabal .endr .endm -.macro SAD_x3_8x2 f - ld1 {v0.8h}, [x0], x6 - ld1 {v1.8h}, [x1], x4 - ld1 {v2.8h}, [x2], x4 - ld1 {v3.8h}, [x3], x4 +.macro SAD_xN_8x2 n, f + ld1 {v0.8h}, [x0], x7 + ld1 {v1.8h}, [x1], x5 + ld1 {v2.8h}, [x2], x5 + ld1 {v3.8h}, [x3], x5 \f v16.8h, v0.8h, v1.8h \f v17.8h, v0.8h, v2.8h \f v18.8h, v0.8h, v3.8h +.if \n == 4 + ld1 {v4.8h}, [x4], x5 + \f v22.8h, v0.8h, v4.8h +.endif - ld1 {v0.8h}, [x0], x6 - ld1 {v1.8h}, [x1], x4 - ld1 {v2.8h}, [x2], x4 - ld1 {v3.8h}, [x3], x4 + ld1 {v0.8h}, [x0], x7 + ld1 {v1.8h}, [x1], x5 + ld1 {v2.8h}, [x2], x5 + ld1 {v3.8h}, [x3], x5 \f v19.8h, v0.8h, v1.8h \f v20.8h, v0.8h, v2.8h \f v21.8h, v0.8h, v3.8h +.if \n == 4 + ld1 {v4.8h}, [x4], x5 + \f v23.8h, v0.8h, v4.8h +.endif .endm -.macro SAD_x3_8xH h - SAD_x3_8x2 uabd -.rept \h/2 - 1 - SAD_x3_8x2 uaba +.macro SAD_xN_8xH n, h + SAD_xN_8x2 \n, uabd +.rept \h /2 - 1 + SAD_xN_8x2 \n, uaba .endr uaddlp v16.4s, v16.8h uadalp v16.4s, v19.8h @@ -968,28 +981,45 @@ SAD_FUNC_LOOP_LARGE 64, 64 uadalp v17.4s, v20.8h uaddlp v18.4s, v18.8h uadalp v18.4s, v21.8h +.if \n == 4 + uaddlp v19.4s, v22.8h + uadalp v19.4s, v23.8h +.endif .endm -.macro SAD_x3_FUNC w, h -function PFX(sad_x3_\w\()x\h\()_neon) +.macro SAD_xN_FUNC n, w, h +function PFX(sad_x\n\()_\w\()x\h\()_neon) + // Make function arguments for n == 3 look like n == 4. +.if \n == 3 + mov x6, x5 + mov x5, x4 +.endif + // Stride is given in terms of pixel channel size, so double to get number of bytes. 
- add x4, x4, x4 - mov x6, #(FENC_STRIDE << 1) + add x5, x5, x5 + mov x7, #(FENC_STRIDE << 1) - SAD_x3_\w\()xH \h + SAD_xN_\w\()xH \n, \h +.if \n == 3 addp v0.4s, v16.4s, v17.4s addp v1.4s, v18.4s, v18.4s addp v0.4s, v0.4s, v1.4s - str d0, [x5] - add x5, x5, #8 - st1 {v0.s}[2], [x5] + str d0, [x6] + add x6, x6, #8 + st1 {v0.s}[2], [x6] +.else + addp v16.4s, v16.4s, v17.4s + addp v18.4s, v18.4s, v19.4s + addp v16.4s, v16.4s, v18.4s + str q16, [x6] +.endif ret endfunc .endm -.macro SAD_x3_12 f +.macro SAD_xN_12 n, f ldr q0, [x0] ldr q1, [x1] ldr q2, [x2] @@ -1004,57 +1034,82 @@ endfunc \f v17.8h, v4.8h, v5.8h \f v19.8h, v4.8h, v6.8h \f v21.8h, v4.8h, v7.8h - add x0, x0, x6 - add x1, x1, x4 - add x2, x2, x4 - add x3, x3, x4 + add x0, x0, x7 + add x1, x1, x5 + add x2, x2, x5 + add x3, x3, x5 +.if \n == 4 + ldr q3, [x4] + ldr d7, [x4, #16] + \f v22.8h, v0.8h, v3.8h + \f v23.8h, v4.8h, v7.8h + add x4, x4, x5 +.endif .endm -.macro SAD_x3_16 f - ld1 {v0.8h-v1.8h}, [x0], x6 - ld1 {v2.8h-v3.8h}, [x1], x4 +.macro SAD_xN_16 n f + ld1 {v0.8h-v1.8h}, [x0], x7 + ld1 {v2.8h-v3.8h}, [x1], x5 \f v16.8h, v0.8h, v2.8h \f v17.8h, v1.8h, v3.8h - ld1 {v4.8h-v5.8h}, [x2], x4 + ld1 {v4.8h-v5.8h}, [x2], x5 \f v18.8h, v0.8h, v4.8h \f v19.8h, v1.8h, v5.8h - ld1 {v6.8h-v7.8h}, [x3], x4 + ld1 {v6.8h-v7.8h}, [x3], x5 \f v20.8h, v0.8h, v6.8h \f v21.8h, v1.8h, v7.8h +.if \n == 4 + ld1 {v6.8h-v7.8h}, [x4], x5 + \f v22.8h, v0.8h, v6.8h + \f v23.8h, v1.8h, v7.8h +.endif .endm -.macro SAD_x3_32 f - ld1 {v0.8h-v3.8h}, [x0], x6 - ld1 {v4.8h-v7.8h}, [x1], x4 +.macro SAD_xN_32 n f + ld1 {v0.8h-v3.8h}, [x0], x7 + ld1 {v4.8h-v7.8h}, [x1], x5 \f v16.8h, v0.8h, v4.8h uaba v16.8h, v1.8h, v5.8h \f v17.8h, v2.8h, v6.8h uaba v17.8h, v3.8h, v7.8h - ld1 {v4.8h-v7.8h},[x2], x4 + ld1 {v4.8h-v7.8h}, [x2], x5 \f v18.8h, v0.8h, v4.8h uaba v18.8h, v1.8h, v5.8h \f v19.8h, v2.8h, v6.8h uaba v19.8h, v3.8h, v7.8h - ld1 {v4.8h-v7.8h},[x3], x4 + ld1 {v4.8h-v7.8h}, [x3], x5 \f v20.8h, v0.8h, v4.8h uaba v20.8h, v1.8h, v5.8h \f v21.8h, v2.8h, v6.8h uaba v21.8h, v3.8h, v7.8h +.if \n == 4 + ld1 {v4.8h-v7.8h}, [x4], x5 + \f v22.8h, v0.8h, v4.8h + uaba v22.8h, v1.8h, v5.8h + \f v23.8h, v2.8h, v6.8h + uaba v23.8h, v3.8h, v7.8h +.endif .endm -.macro SAD_x3_FUNC_LOOP w, h -function PFX(sad_x3_\w\()x\h\()_neon) +.macro SAD_xN_FUNC_LOOP n, w, h end_type +function PFX(sad_x\n\()_\w\()x\h\()_neon) + // Make function arguments for n == 3 look like n == 4. +.if \n == 3 + mov x6, x5 + mov x5, x4 +.endif + // Stride is given in terms of pixel channel size, so double to get number of bytes. 
- add x4, x4, x4 - mov x6, #(FENC_STRIDE << 1) + add x5, x5, x5 + mov x7, #(FENC_STRIDE << 1) - SAD_x3_\w uabd + SAD_xN_\w \n, uabd - mov w9, #\h - 1 -.Loop_x_\w\()x\h: - sub w9, w9, #1 - SAD_x3_\w uaba - cbnz w9, .Loop_x_\w\()x\h + mov w8, #\h - 1 +.Loop_x\n\()_\w\()x\h: + sub w8, w8, #1 + SAD_xN_\w \n, uaba + cbnz w8, .Loop_x\n\()_\w\()x\h uaddlp v16.4s, v16.8h uadalp v16.4s, v17.8h @@ -1062,61 +1117,86 @@ function PFX(sad_x3_\w\()x\h\()_neon) uadalp v18.4s, v19.8h uaddlp v20.4s, v20.8h uadalp v20.4s, v21.8h + +.if \n == 3 addp v0.4s, v16.4s, v18.4s addp v1.4s, v20.4s, v20.4s addp v0.4s, v0.4s, v1.4s - str d0, [x5] - add x5, x5, #8 - st1 {v0.s}[2], [x5] + str d0, [x6] + add x6, x6, #8 + st1 {v0.s}[2], [x6] +.else + uaddlp v22.4s, v22.8h + uadalp v22.4s, v23.8h + addp v16.4s, v16.4s, v18.4s + addp v20.4s, v20.4s, v22.4s + addp v16.4s, v16.4s, v20.4s + str q16, [x6] +.endif ret endfunc .endm -.macro SAD_x3_16_WIDEN f - ld1 {v0.8h-v1.8h}, [x0], x6 - ld1 {v2.8h-v3.8h}, [x1], x4 +.macro SAD_xN_16_WIDEN n f + ld1 {v0.8h-v1.8h}, [x0], x7 + ld1 {v2.8h-v3.8h}, [x1], x5 uabd v22.8h, v0.8h, v2.8h \f v16.4s, v22.8h uabd v23.8h, v1.8h, v3.8h \f v17.4s, v23.8h - ld1 {v4.8h-v5.8h}, [x2], x4 + ld1 {v4.8h-v5.8h}, [x2], x5 uabd v24.8h, v0.8h, v4.8h \f v18.4s, v24.8h uabd v25.8h, v1.8h, v5.8h \f v19.4s, v25.8h - ld1 {v6.8h-v7.8h}, [x3], x4 + ld1 {v6.8h-v7.8h}, [x3], x5 uabd v26.8h, v0.8h, v6.8h \f v20.4s, v26.8h uabd v27.8h, v1.8h, v7.8h \f v21.4s, v27.8h +.if \n == 4 + ld1 {v2.8h-v3.8h}, [x4], x5 + uabd v28.8h, v0.8h, v2.8h + \f v30.4s, v28.8h + uabd v29.8h, v1.8h, v3.8h + \f v31.4s, v29.8h +.endif .endm -.macro SAD_x3_24_WIDEN f - ld1 {v0.8h-v2.8h}, [x0], x6 - ld1 {v3.8h-v5.8h}, [x1], x4 - uabd v22.8h, v0.8h, v3.8h - uaba v22.8h, v1.8h, v4.8h - \f v16.4s, v22.8h - uabd v23.8h, v2.8h, v5.8h - \f v17.4s, v23.8h - ld1 {v28.8h-v30.8h}, [x2], x4 - uabd v24.8h, v0.8h, v28.8h - uaba v24.8h, v1.8h, v29.8h - \f v18.4s, v24.8h - uabd v25.8h, v2.8h, v30.8h - \f v19.4s, v25.8h - ld1 {v3.8h-v5.8h}, [x3], x4 - uabd v26.8h, v0.8h, v3.8h - uaba v26.8h, v1.8h, v4.8h - \f v20.4s, v26.8h - uabd v27.8h, v2.8h, v5.8h - \f v21.4s, v27.8h +.macro SAD_xN_24_WIDEN n f + ld1 {v0.8h-v2.8h}, [x0], x7 + ld1 {v3.8h-v5.8h}, [x1], x5 + uabd v6.8h, v0.8h, v3.8h + uaba v6.8h, v1.8h, v4.8h + \f v16.4s, v6.8h + uabd v7.8h, v2.8h, v5.8h + \f v17.4s, v7.8h + ld1 {v27.8h-v29.8h}, [x2], x5 + uabd v22.8h, v0.8h, v27.8h + uaba v22.8h, v1.8h, v28.8h + \f v18.4s, v22.8h + uabd v23.8h, v2.8h, v29.8h + \f v19.4s, v23.8h + ld1 {v3.8h-v5.8h}, [x3], x5 + uabd v24.8h, v0.8h, v3.8h + uaba v24.8h, v1.8h, v4.8h + \f v20.4s, v24.8h + uabd v25.8h, v2.8h, v5.8h + \f v21.4s, v25.8h +.if \n == 4 + ld1 {v27.8h-v29.8h}, [x4], x5 + uabd v22.8h, v0.8h, v27.8h + uaba v22.8h, v1.8h, v28.8h + \f v30.4s, v22.8h + uabd v23.8h, v2.8h, v29.8h + \f v31.4s, v23.8h +.endif .endm -.macro SAD_x3_32_WIDEN f - ld1 {v0.8h-v3.8h}, [x0], x6 - ld1 {v4.8h-v7.8h}, [x1], x4 +.macro SAD_xN_32_WIDEN n f + ld1 {v0.8h-v3.8h}, [x0], x7 + ld1 {v4.8h-v7.8h}, [x1], x5 uabd v22.8h, v0.8h, v4.8h uaba v22.8h, v1.8h, v5.8h \f v16.4s, v22.8h @@ -1124,7 +1204,7 @@ endfunc uaba v23.8h, v3.8h, v7.8h \f v17.4s, v23.8h - ld1 {v4.8h-v7.8h}, [x2], x4 + ld1 {v4.8h-v7.8h}, [x2], x5 uabd v24.8h, v0.8h, v4.8h uaba v24.8h, v1.8h, v5.8h \f v18.4s, v24.8h @@ -1132,174 +1212,257 @@ endfunc uaba v25.8h, v3.8h, v7.8h \f v19.4s, v25.8h - ld1 {v4.8h-v7.8h}, [x3], x4 + ld1 {v4.8h-v7.8h}, [x3], x5 uabd v26.8h, v0.8h, v4.8h uaba v26.8h, v1.8h, v5.8h \f v20.4s, v26.8h uabd v27.8h, v2.8h, v6.8h uaba v27.8h, v3.8h, v7.8h 
\f v21.4s, v27.8h + +.if \n == 4 + ld1 {v4.8h-v7.8h}, [x4], x5 + uabd v22.8h, v0.8h, v4.8h + uaba v22.8h, v1.8h, v5.8h + \f v30.4s, v22.8h + uabd v23.8h, v2.8h, v6.8h + uaba v23.8h, v3.8h, v7.8h + \f v31.4s, v23.8h +.endif .endm -.macro SAD_x3_48_WIDEN f +.macro SAD_xN_48_WIDEN n f ld1 {v0.8h-v3.8h}, [x0] - ld1 {v28.8h-v31.8h}, [x1] - uabd v6.8h, v0.8h, v28.8h - uaba v6.8h, v1.8h, v29.8h + ld1 {v26.8h-v29.8h}, [x1] + uabd v6.8h, v0.8h, v26.8h + uaba v6.8h, v1.8h, v27.8h \f v16.4s, v6.8h - uabd v7.8h, v2.8h, v30.8h - uaba v7.8h, v3.8h, v31.8h + uabd v7.8h, v2.8h, v28.8h + uaba v7.8h, v3.8h, v29.8h \f v17.4s, v7.8h ldp q4, q5, [x0, #64] - ldp q28, q29, [x1, #64] - uabd v22.8h, v4.8h, v28.8h - uaba v22.8h, v5.8h, v29.8h + ldp q26, q27, [x1, #64] + uabd v22.8h, v4.8h, v26.8h + uaba v22.8h, v5.8h, v27.8h uadalp v16.4s, v22.8h - ld1 {v28.8h-v31.8h}, [x2] - uabd v23.8h, v0.8h, v28.8h - uaba v23.8h, v1.8h, v29.8h + ld1 {v26.8h-v29.8h}, [x2] + uabd v23.8h, v0.8h, v26.8h + uaba v23.8h, v1.8h, v27.8h \f v18.4s, v23.8h - uabd v24.8h, v2.8h, v30.8h - uaba v24.8h, v3.8h, v31.8h + uabd v24.8h, v2.8h, v28.8h + uaba v24.8h, v3.8h, v29.8h \f v19.4s, v24.8h - ldp q28, q29, [x2, #64] - uabd v25.8h, v4.8h, v28.8h - uaba v25.8h, v5.8h, v29.8h + ldp q26, q27, [x2, #64] + uabd v25.8h, v4.8h, v26.8h + uaba v25.8h, v5.8h, v27.8h uadalp v18.4s, v25.8h - ld1 {v28.8h-v31.8h}, [x3] - uabd v26.8h, v0.8h, v28.8h - uaba v26.8h, v1.8h, v29.8h - \f v20.4s, v26.8h - uabd v27.8h, v2.8h, v30.8h - uaba v27.8h, v3.8h, v31.8h - \f v21.4s, v27.8h - ldp q28, q29, [x3, #64] - uabd v6.8h, v4.8h, v28.8h - uaba v6.8h, v5.8h, v29.8h - uadalp v20.4s, v6.8h + ld1 {v26.8h-v29.8h}, [x3] + uabd v6.8h, v0.8h, v26.8h + uaba v6.8h, v1.8h, v27.8h + \f v20.4s, v6.8h + uabd v7.8h, v2.8h, v28.8h + uaba v7.8h, v3.8h, v29.8h + \f v21.4s, v7.8h + ldp q26, q27, [x3, #64] + uabd v22.8h, v4.8h, v26.8h + uaba v22.8h, v5.8h, v27.8h + uadalp v20.4s, v22.8h + + add x0, x0, x7 + add x1, x1, x5 + add x2, x2, x5 + add x3, x3, x5 - add x0, x0, x6 - add x1, x1, x4 - add x2, x2, x4 - add x3, x3, x4 +.if \n == 4 + ld1 {v26.8h-v29.8h}, [x4] + uabd v6.8h, v0.8h, v26.8h + uaba v6.8h, v1.8h, v27.8h + \f v30.4s, v6.8h + uabd v7.8h, v2.8h, v28.8h + uaba v7.8h, v3.8h, v29.8h + \f v31.4s, v7.8h + ldp q26, q27, [x4, #64] + uabd v22.8h, v4.8h, v26.8h + uaba v22.8h, v5.8h, v27.8h + uadalp v30.4s, v22.8h + add x4, x4, x5 +.endif .endm -.macro SAD_x3_64_WIDEN f +.macro SAD_xN_64_WIDEN n f ld1 {v0.8h-v3.8h}, [x0] - ld1 {v28.8h-v31.8h}, [x1] - uabd v22.8h, v0.8h, v28.8h - uaba v22.8h, v1.8h, v29.8h + ld1 {v26.8h-v29.8h}, [x1] + uabd v22.8h, v0.8h, v26.8h + uaba v22.8h, v1.8h, v27.8h \f v16.4s, v22.8h - uabd v23.8h, v2.8h, v30.8h - uaba v23.8h, v3.8h, v31.8h + uabd v23.8h, v2.8h, v28.8h + uaba v23.8h, v3.8h, v29.8h \f v17.4s, v23.8h ldp q4, q5, [x0, #64] ldp q6, q7, [x0, #96] - ldp q28, q29, [x1, #64] - ldp q30, q31, [x1, #96] - uabd v24.8h, v4.8h, v28.8h - uaba v24.8h, v5.8h, v29.8h + ldp q26, q27, [x1, #64] + ldp q28, q29, [x1, #96] + uabd v24.8h, v4.8h, v26.8h + uaba v24.8h, v5.8h, v27.8h uadalp v16.4s, v24.8h - uabd v25.8h, v6.8h, v30.8h - uaba v25.8h, v7.8h, v31.8h + uabd v25.8h, v6.8h, v28.8h + uaba v25.8h, v7.8h, v29.8h uadalp v17.4s, v25.8h - ld1 {v28.8h-v31.8h}, [x2] - uabd v26.8h, v0.8h, v28.8h - uaba v26.8h, v1.8h, v29.8h - \f v18.4s, v26.8h - uabd v27.8h, v2.8h, v30.8h - uaba v27.8h, v3.8h, v31.8h - \f v19.4s, v27.8h - ldp q28, q29, [x2, #64] - ldp q30, q31, [x2, #96] - uabd v22.8h, v4.8h, v28.8h - uaba v22.8h, v5.8h, v29.8h - uadalp v18.4s, v22.8h - uabd v23.8h, 
v6.8h, v30.8h - uaba v23.8h, v7.8h, v31.8h - uadalp v19.4s, v23.8h + ld1 {v26.8h-v29.8h}, [x2] + uabd v22.8h, v0.8h, v26.8h + uaba v22.8h, v1.8h, v27.8h + \f v18.4s, v22.8h + uabd v23.8h, v2.8h, v28.8h + uaba v23.8h, v3.8h, v29.8h + \f v19.4s, v23.8h + ldp q26, q27, [x2, #64] + ldp q28, q29, [x2, #96] + uabd v24.8h, v4.8h, v26.8h + uaba v24.8h, v5.8h, v27.8h + uadalp v18.4s, v24.8h + uabd v25.8h, v6.8h, v28.8h + uaba v25.8h, v7.8h, v29.8h + uadalp v19.4s, v25.8h + + ld1 {v26.8h-v29.8h}, [x3] + uabd v22.8h, v0.8h, v26.8h + uaba v22.8h, v1.8h, v27.8h + \f v20.4s, v22.8h + uabd v23.8h, v2.8h, v28.8h + uaba v23.8h, v3.8h, v29.8h + \f v21.4s, v23.8h + ldp q26, q27, [x3, #64] + ldp q28, q29, [x3, #96] + uabd v24.8h, v4.8h, v26.8h + uaba v24.8h, v5.8h, v27.8h + uadalp v20.4s, v24.8h + uabd v25.8h, v6.8h, v28.8h + uaba v25.8h, v7.8h, v29.8h + uadalp v21.4s, v25.8h + + add x0, x0, x7 + add x1, x1, x5 + add x2, x2, x5 + add x3, x3, x5 + +.if \n == 4 + ld1 {v26.8h-v29.8h}, [x4] + uabd v22.8h, v0.8h, v26.8h + uaba v22.8h, v1.8h, v27.8h + \f v30.4s, v22.8h + uabd v23.8h, v2.8h, v28.8h + uaba v23.8h, v3.8h, v29.8h + \f v31.4s, v23.8h + ldp q26, q27, [x4, #64] + ldp q28, q29, [x4, #96] + uabd v24.8h, v4.8h, v26.8h + uaba v24.8h, v5.8h, v27.8h + uadalp v30.4s, v24.8h + uabd v25.8h, v6.8h, v28.8h + uaba v25.8h, v7.8h, v29.8h + uadalp v31.4s, v25.8h + add x4, x4, x5 +.endif +.endm + +.macro SAD_xN_FUNC_LOOP_LARGE n, w, h +function PFX(sad_x\n\()_\w\()x\h\()_neon) + // Make function arguments for n == 3 look like n == 4. +.if \n == 3 + mov x6, x5 + mov x5, x4 +.endif - ld1 {v28.8h-v31.8h}, [x3] - uabd v24.8h, v0.8h, v28.8h - uaba v24.8h, v1.8h, v29.8h - \f v20.4s, v24.8h - uabd v25.8h, v2.8h, v30.8h - uaba v25.8h, v3.8h, v31.8h - \f v21.4s, v25.8h - ldp q28, q29, [x3, #64] - ldp q30, q31, [x3, #96] - uabd v26.8h, v4.8h, v28.8h - uaba v26.8h, v5.8h, v29.8h - uadalp v20.4s, v26.8h - uabd v27.8h, v6.8h, v30.8h - uaba v27.8h, v7.8h, v31.8h - uadalp v21.4s, v27.8h - - add x0, x0, x6 - add x1, x1, x4 - add x2, x2, x4 - add x3, x3, x4 -.endm - -.macro SAD_x3_FUNC_LOOP_LARGE w, h -function PFX(sad_x3_\w\()x\h\()_neon) // Stride is given in terms of pixel channel size, so double to get number of bytes. 
- add x4, x4, x4 - mov x6, #(FENC_STRIDE << 1) + add x5, x5, x5 + mov x7, #(FENC_STRIDE << 1) - SAD_x3_\w\()_WIDEN uaddlp - SAD_x3_\w\()_WIDEN uadalp + SAD_xN_\w\()_WIDEN \n, uaddlp + SAD_xN_\w\()_WIDEN \n, uadalp - mov w9, #(\h - 2)/2 -.Loop_x_\w\()x\h: - sub w9, w9, #1 + mov w8, #(\h - 2)/2 +.Loop_x\n\()_\w\()x\h: + sub w8, w8, #1 .rept 2 - SAD_x3_\w\()_WIDEN uadalp + SAD_xN_\w\()_WIDEN \n, uadalp .endr - cbnz w9, .Loop_x_\w\()x\h + cbnz w8, .Loop_x\n\()_\w\()x\h add v16.4s, v16.4s, v17.4s add v17.4s, v18.4s, v19.4s add v18.4s, v20.4s, v21.4s +.if \n == 3 addp v0.4s, v16.4s, v17.4s addp v1.4s, v18.4s, v18.4s addp v0.4s, v0.4s, v1.4s - - str d0, [x5] - add x5, x5, #8 - st1 {v0.s}[2], [x5] + str d0, [x6] + add x6, x6, #8 + st1 {v0.s}[2], [x6] +.else + add v19.4s, v30.4s, v31.4s + addp v16.4s, v16.4s, v17.4s + addp v18.4s, v18.4s, v19.4s + addp v16.4s, v16.4s, v18.4s + str q16, [x6] +.endif ret endfunc .endm -SAD_x3_FUNC 4, 4 -SAD_x3_FUNC 4, 8 -SAD_x3_FUNC 4, 16 -SAD_x3_FUNC 8, 4 -SAD_x3_FUNC 8, 8 -SAD_x3_FUNC 8, 16 -SAD_x3_FUNC 8, 32 -SAD_x3_FUNC_LOOP 12, 16 -SAD_x3_FUNC_LOOP 16, 4 -SAD_x3_FUNC_LOOP 16, 8 -SAD_x3_FUNC_LOOP 16, 12 -SAD_x3_FUNC_LOOP 16, 16 -SAD_x3_FUNC_LOOP 32, 8 -SAD_x3_FUNC_LOOP_LARGE 16, 32 -SAD_x3_FUNC_LOOP_LARGE 16, 64 -SAD_x3_FUNC_LOOP_LARGE 24, 32 -SAD_x3_FUNC_LOOP_LARGE 32, 16 -SAD_x3_FUNC_LOOP_LARGE 32, 24 -SAD_x3_FUNC_LOOP_LARGE 32, 32 -SAD_x3_FUNC_LOOP_LARGE 32, 64 -SAD_x3_FUNC_LOOP_LARGE 48, 64 -SAD_x3_FUNC_LOOP_LARGE 64, 16 -SAD_x3_FUNC_LOOP_LARGE 64, 32 -SAD_x3_FUNC_LOOP_LARGE 64, 48 -SAD_x3_FUNC_LOOP_LARGE 64, 64 +SAD_xN_FUNC 3, 4, 4 +SAD_xN_FUNC 3, 4, 8 +SAD_xN_FUNC 3, 4, 16 +SAD_xN_FUNC 3, 8, 4 +SAD_xN_FUNC 3, 8, 8 +SAD_xN_FUNC 3, 8, 16 +SAD_xN_FUNC 3, 8, 32 +SAD_xN_FUNC_LOOP 3, 12, 16 +SAD_xN_FUNC_LOOP 3, 16, 4 +SAD_xN_FUNC_LOOP 3, 16, 8 +SAD_xN_FUNC_LOOP 3, 16, 12 +SAD_xN_FUNC_LOOP 3, 16, 16 +SAD_xN_FUNC_LOOP 3, 32, 8 +SAD_xN_FUNC_LOOP_LARGE 3, 16, 32 +SAD_xN_FUNC_LOOP_LARGE 3, 16, 64 +SAD_xN_FUNC_LOOP_LARGE 3, 24, 32 +SAD_xN_FUNC_LOOP_LARGE 3, 32, 16 +SAD_xN_FUNC_LOOP_LARGE 3, 32, 24 +SAD_xN_FUNC_LOOP_LARGE 3, 32, 32 +SAD_xN_FUNC_LOOP_LARGE 3, 32, 64 +SAD_xN_FUNC_LOOP_LARGE 3, 48, 64 +SAD_xN_FUNC_LOOP_LARGE 3, 64, 16 +SAD_xN_FUNC_LOOP_LARGE 3, 64, 32 +SAD_xN_FUNC_LOOP_LARGE 3, 64, 48 +SAD_xN_FUNC_LOOP_LARGE 3, 64, 64 + +SAD_xN_FUNC 4, 4, 4 +SAD_xN_FUNC 4, 4, 8 +SAD_xN_FUNC 4, 4, 16 +SAD_xN_FUNC 4, 8, 4 +SAD_xN_FUNC 4, 8, 8 +SAD_xN_FUNC 4, 8, 16 +SAD_xN_FUNC 4, 8, 32 +SAD_xN_FUNC_LOOP 4, 12, 16 +SAD_xN_FUNC_LOOP 4, 16, 4 +SAD_xN_FUNC_LOOP 4, 16, 8 +SAD_xN_FUNC_LOOP 4, 16, 12 +SAD_xN_FUNC_LOOP 4, 16, 16 +SAD_xN_FUNC_LOOP 4, 32, 8 +SAD_xN_FUNC_LOOP_LARGE 4, 16, 32 +SAD_xN_FUNC_LOOP_LARGE 4, 16, 64 +SAD_xN_FUNC_LOOP_LARGE 4, 24, 32 +SAD_xN_FUNC_LOOP_LARGE 4, 32, 16 +SAD_xN_FUNC_LOOP_LARGE 4, 32, 24 +SAD_xN_FUNC_LOOP_LARGE 4, 32, 32 +SAD_xN_FUNC_LOOP_LARGE 4, 32, 64 +SAD_xN_FUNC_LOOP_LARGE 4, 48, 64 +SAD_xN_FUNC_LOOP_LARGE 4, 64, 16 +SAD_xN_FUNC_LOOP_LARGE 4, 64, 32 +SAD_xN_FUNC_LOOP_LARGE 4, 64, 48 +SAD_xN_FUNC_LOOP_LARGE 4, 64, 64 #endif // !HIGH_BIT_DEPTH -- 2.39.5 (Apple Git-154)