Pushed to the master branch. *__________________________* *Karam Singh* *Ph.D. IIT Guwahati* Senior Software (Video Coding) Engineer Mobile: +91 8011279030 Block 9A, 6th floor, DLF Cyber City, Manapakkam, Chennai 600 089
On Thu, Aug 22, 2024 at 3:34 PM Hari Limaye <hari.lim...@arm.com> wrote: > Refactor the implementations of sse_pp_neon and sse_pp_neon_dotprod for > block sizes of width 32 to dispatch to shared functions, to reduce code > size. > --- > source/common/aarch64/ssd-a.S | 16 ++++++++++------ > source/common/aarch64/ssd-neon-dotprod.S | 16 ++++++++++------ > 2 files changed, 20 insertions(+), 12 deletions(-) > > diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S > index 4a5e80d49..a66d68617 100644 > --- a/source/common/aarch64/ssd-a.S > +++ b/source/common/aarch64/ssd-a.S > @@ -101,13 +101,11 @@ SSE_PP_16xN 16 > SSE_PP_16xN 32 > > // Loop unrolled to process 4 rows per iteration. > -.macro SSE_PP_32xN h > -function PFX(pixel_sse_pp_32x\h\()_neon) > - mov w12, #(\h / 4) > +function PFX(pixel_sse_pp_32xh_neon), export=0 > movi v0.4s, #0 > movi v1.4s, #0 > -.Loop_sse_pp_32_x\h: > - sub w12, w12, #1 > +.Loop_sse_pp_32xh: > + sub w4, w4, #1 > .rept 4 > ld1 {v16.16b,v17.16b}, [x0], x1 > ld1 {v18.16b,v19.16b}, [x2], x3 > @@ -125,10 +123,16 @@ function PFX(pixel_sse_pp_32x\h\()_neon) > uadalp v0.4s, v22.8h > uadalp v1.4s, v23.8h > .endr > - cbnz w12, .Loop_sse_pp_32_x\h > + cbnz w4, .Loop_sse_pp_32xh > add v0.4s, v0.4s, v1.4s > ret_v0_w0 > endfunc > + > +.macro SSE_PP_32xN h > +function PFX(pixel_sse_pp_32x\h\()_neon) > + mov w4, \h / 4 > + b PFX(pixel_sse_pp_32xh_neon) > +endfunc > .endm > > SSE_PP_32xN 32 > diff --git a/source/common/aarch64/ssd-neon-dotprod.S > b/source/common/aarch64/ssd-neon-dotprod.S > index 4df4fb35b..044412fba 100644 > --- a/source/common/aarch64/ssd-neon-dotprod.S > +++ b/source/common/aarch64/ssd-neon-dotprod.S > @@ -110,13 +110,11 @@ SSE_PP_16xN 16 > SSE_PP_16xN 32 > > // Loop unrolled to process 4 rows per iteration. 
> -.macro SSE_PP_32xN h > -function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) > - mov w12, #(\h / 4) > +function PFX(pixel_sse_pp_32xh_neon_dotprod), export=0 > movi v0.4s, #0 > movi v1.4s, #0 > -.Loop_sse_pp_32_x\h: > - sub w12, w12, #1 > +.Loop_sse_pp_32xh: > + sub w4, w4, #1 > .rept 4 > ld1 {v16.16b,v17.16b}, [x0], x1 > ld1 {v18.16b,v19.16b}, [x2], x3 > @@ -126,12 +124,18 @@ function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) > uabd v3.16b, v17.16b, v19.16b > udot v1.4s, v3.16b, v3.16b > .endr > - cbnz w12, .Loop_sse_pp_32_x\h > + cbnz w4, .Loop_sse_pp_32xh > add v0.4s, v0.4s, v1.4s > addv s0, v0.4s > fmov w0, s0 > ret > endfunc > + > +.macro SSE_PP_32xN h > +function PFX(pixel_sse_pp_32x\h\()_neon_dotprod) > + mov w4, \h / 4 > + b PFX(pixel_sse_pp_32xh_neon_dotprod) > +endfunc > .endm > > SSE_PP_32xN 32 > -- > 2.42.1 > > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel >
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel