Refactor the implementations of sse_pp_neon and sse_pp_neon_dotprod for block sizes of width 32 to dispatch to shared functions, to reduce code size. --- source/common/aarch64/ssd-a.S | 16 ++++++++++------ source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S index 4a5e80d49..a66d68617 100644 --- a/source/common/aarch64/ssd-a.S +++ b/source/common/aarch64/ssd-a.S @@ -101,13 +101,11 @@ SSE_PP_16xN 16 SSE_PP_16xN 32 // Loop unrolled to process 4 rows per iteration. -.macro SSE_PP_32xN h -function PFX(pixel_sse_pp_32x\h\()_neon) - mov w12, #(\h / 4) +function PFX(pixel_sse_pp_32xh_neon), export=0 movi v0.4s, #0 movi v1.4s, #0 -.Loop_sse_pp_32_x\h: - sub w12, w12, #1 +.Loop_sse_pp_32xh: + sub w4, w4, #1 .rept 4 ld1 {v16.16b,v17.16b}, [x0], x1 ld1 {v18.16b,v19.16b}, [x2], x3 @@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon) uadalp v0.4s, v22.8h uadalp v1.4s, v23.8h .endr - cbnz w12, .Loop_sse_pp_32_x\h + cbnz w4, .Loop_sse_pp_32xh add v0.4s, v0.4s, v1.4s ret_v0_w0 endfunc + +.macro SSE_PP_32xN h +function PFX(pixel_sse_pp_32x\h\()_neon) + mov w4, \h / 4 + b PFX(pixel_sse_pp_32xh_neon) +endfunc .endm SSE_PP_32xN 32 diff --git a/source/common/aarch64/ssd-neon-dotprod.S b/source/common/aarch64/ssd-neon-dotprod.S index 4df4fb35b..044412fba 100644 --- a/source/common/aarch64/ssd-neon-dotprod.S +++ b/source/common/aarch64/ssd-neon-dotprod.S @@ -110,13 +110,11 @@ SSE_PP_16xN 16 SSE_PP_16xN 32 // Loop unrolled to process 4 rows per iteration. 
-.macro SSE_PP_32xN h -function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) - mov w12, #(\h / 4) +function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0 movi v0.4s, #0 movi v1.4s, #0 -.Loop_sse_pp_32_x\h: - sub w12, w12, #1 +.Loop_sse_pp_32xh: + sub w4, w4, #1 .rept 4 ld1 {v16.16b,v17.16b}, [x0], x1 ld1 {v18.16b,v19.16b}, [x2], x3 @@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) uabd v3.16b, v17.16b, v19.16b udot v1.4s, v3.16b, v3.16b .endr - cbnz w12, .Loop_sse_pp_32_x\h + cbnz w4, .Loop_sse_pp_32xh add v0.4s, v0.4s, v1.4s addv s0, v0.4s fmov w0, s0 ret endfunc + +.macro SSE_PP_32xN h +function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) + mov w4, \h / 4 + b PFX(pixel_sse_pp_32xh_neon_dotprod) +endfunc .endm SSE_PP_32xN 32 -- 2.42.1
From 3f2420887b051b543388d928e26b3cda838926f9 Mon Sep 17 00:00:00 2001 From: Hari Limaye <hari.lim...@arm.com> Date: Thu, 18 Apr 2024 11:53:45 +0100 Subject: [PATCH] AArch64: Reuse code for sse_pp_neon and sse_pp_neon_dotprod Refactor the implementations of sse_pp_neon and sse_pp_neon_dotprod for block sizes of width 32 to dispatch to shared functions, to reduce code size. --- source/common/aarch64/ssd-a.S | 16 ++++++++++------ source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S index 4a5e80d49..a66d68617 100644 --- a/source/common/aarch64/ssd-a.S +++ b/source/common/aarch64/ssd-a.S @@ -101,13 +101,11 @@ SSE_PP_16xN 16 SSE_PP_16xN 32 // Loop unrolled to process 4 rows per iteration. -.macro SSE_PP_32xN h -function PFX(pixel_sse_pp_32x\h\()_neon) - mov w12, #(\h / 4) +function PFX(pixel_sse_pp_32xh_neon), export=0 movi v0.4s, #0 movi v1.4s, #0 -.Loop_sse_pp_32_x\h: - sub w12, w12, #1 +.Loop_sse_pp_32xh: + sub w4, w4, #1 .rept 4 ld1 {v16.16b,v17.16b}, [x0], x1 ld1 {v18.16b,v19.16b}, [x2], x3 @@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon) uadalp v0.4s, v22.8h uadalp v1.4s, v23.8h .endr - cbnz w12, .Loop_sse_pp_32_x\h + cbnz w4, .Loop_sse_pp_32xh add v0.4s, v0.4s, v1.4s ret_v0_w0 endfunc + +.macro SSE_PP_32xN h +function PFX(pixel_sse_pp_32x\h\()_neon) + mov w4, \h / 4 + b PFX(pixel_sse_pp_32xh_neon) +endfunc .endm SSE_PP_32xN 32 diff --git a/source/common/aarch64/ssd-neon-dotprod.S b/source/common/aarch64/ssd-neon-dotprod.S index 4df4fb35b..044412fba 100644 --- a/source/common/aarch64/ssd-neon-dotprod.S +++ b/source/common/aarch64/ssd-neon-dotprod.S @@ -110,13 +110,11 @@ SSE_PP_16xN 16 SSE_PP_16xN 32 // Loop unrolled to process 4 rows per iteration. 
-.macro SSE_PP_32xN h -function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) - mov w12, #(\h / 4) +function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0 movi v0.4s, #0 movi v1.4s, #0 -.Loop_sse_pp_32_x\h: - sub w12, w12, #1 +.Loop_sse_pp_32xh: + sub w4, w4, #1 .rept 4 ld1 {v16.16b,v17.16b}, [x0], x1 ld1 {v18.16b,v19.16b}, [x2], x3 @@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) uabd v3.16b, v17.16b, v19.16b udot v1.4s, v3.16b, v3.16b .endr - cbnz w12, .Loop_sse_pp_32_x\h + cbnz w4, .Loop_sse_pp_32xh add v0.4s, v0.4s, v1.4s addv s0, v0.4s fmov w0, s0 ret endfunc + +.macro SSE_PP_32xN h +function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) + mov w4, \h / 4 + b PFX(pixel_sse_pp_32xh_neon_dotprod) +endfunc .endm SSE_PP_32xN 32 -- 2.42.1
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel