Refactor the sse_pp_neon and sse_pp_neon_dotprod implementations for
block sizes of width 32 so that each fixed-height entry point dispatches
to a shared function, reducing code size.
---
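For reference, the computation these kernels perform and the dispatch
pattern introduced here can be sketched in scalar C as below. This is
only an illustrative sketch (function names and signatures are
placeholders, not the x265 primitive API): the shared body computes the
sum of squared differences over a 32-wide block of h rows, and each
fixed-height wrapper simply forwards its height, mirroring the
mov-then-branch dispatch in the assembly, which passes the unrolled-loop
count (h/4) in w4, a register the four-argument sse_pp entry points do
not use for arguments (only x0-x3 carry the pixel pointers and strides).

    /* Illustrative scalar sketch, assuming 8-bit pixels; not x265 code. */
    #include <stdint.h>

    static uint32_t sse_pp_32xh(const uint8_t *pix1, intptr_t stride1,
                                const uint8_t *pix2, intptr_t stride2, int h)
    {
        uint32_t sum = 0;
        for (int y = 0; y < h; y++)
        {
            /* Accumulate squared differences across the 32-pixel row. */
            for (int x = 0; x < 32; x++)
            {
                int d = pix1[x] - pix2[x];
                sum += (uint32_t)(d * d);
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }

    /* One thin wrapper per block height, e.g. 32x32: */
    uint32_t sse_pp_32x32(const uint8_t *pix1, intptr_t stride1,
                          const uint8_t *pix2, intptr_t stride2)
    {
        return sse_pp_32xh(pix1, stride1, pix2, stride2, 32);
    }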
 source/common/aarch64/ssd-a.S            | 16 ++++++++++------
 source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
index 4a5e80d49..a66d68617 100644
--- a/source/common/aarch64/ssd-a.S
+++ b/source/common/aarch64/ssd-a.S
@@ -101,13 +101,11 @@ SSE_PP_16xN 16
 SSE_PP_16xN 32
 
 // Loop unrolled to process 4 rows per iteration.
-.macro SSE_PP_32xN h
-function PFX(pixel_sse_pp_32x\h\()_neon)
-    mov             w12, #(\h / 4)
+function PFX(pixel_sse_pp_32xh_neon), export=0
     movi            v0.4s, #0
     movi            v1.4s, #0
-.Loop_sse_pp_32_x\h:
-    sub             w12, w12, #1
+.Loop_sse_pp_32xh:
+    sub             w4, w4, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
     ld1             {v18.16b,v19.16b}, [x2], x3
@@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon)
     uadalp          v0.4s, v22.8h
     uadalp          v1.4s, v23.8h
 .endr
-    cbnz            w12, .Loop_sse_pp_32_x\h
+    cbnz            w4, .Loop_sse_pp_32xh
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
+
+.macro SSE_PP_32xN h
+function PFX(pixel_sse_pp_32x\h\()_neon)
+    mov             w4, \h / 4
+    b               PFX(pixel_sse_pp_32xh_neon)
+endfunc
 .endm
 
 SSE_PP_32xN 32
diff --git a/source/common/aarch64/ssd-neon-dotprod.S b/source/common/aarch64/ssd-neon-dotprod.S
index 4df4fb35b..044412fba 100644
--- a/source/common/aarch64/ssd-neon-dotprod.S
+++ b/source/common/aarch64/ssd-neon-dotprod.S
@@ -110,13 +110,11 @@ SSE_PP_16xN 16
 SSE_PP_16xN 32
 
 // Loop unrolled to process 4 rows per iteration.
-.macro SSE_PP_32xN h
-function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
-    mov             w12, #(\h / 4)
+function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0
     movi            v0.4s, #0
     movi            v1.4s, #0
-.Loop_sse_pp_32_x\h:
-    sub             w12, w12, #1
+.Loop_sse_pp_32xh:
+    sub             w4, w4, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
     ld1             {v18.16b,v19.16b}, [x2], x3
@@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
     uabd            v3.16b, v17.16b, v19.16b
     udot            v1.4s, v3.16b, v3.16b
 .endr
-    cbnz            w12, .Loop_sse_pp_32_x\h
+    cbnz            w4, .Loop_sse_pp_32xh
     add             v0.4s, v0.4s, v1.4s
     addv            s0, v0.4s
     fmov            w0, s0
     ret
 endfunc
+
+.macro SSE_PP_32xN h
+function PFX(pixel_sse_pp_32x\h\()_neon_dotprod)
+    mov             w4, \h / 4
+    b               PFX(pixel_sse_pp_32xh_neon_dotprod)
+endfunc
 .endm
 
 SSE_PP_32xN 32
-- 
2.42.1
