Hi Martin,

Thanks for your review.

> Thanks for the patches. Functionally, they seem to work, and the issues I saw in the code are relatively minor. Unfortunately, some of the issues are issues that we've been through in many earlier patches, so I would hope that you would pay attention to them in the future before posting more patches.
Okay, I have noted the previously raised issues and revised the patches accordingly, and I have also completed the modifications based on your latest comments.

If I have missed anything that still needs correcting, please let me know.



On 2023/9/17 5:46, Martin Storsjö wrote:
On Thu, 14 Sep 2023, Logan.Lyu wrote:

Hi Martin,

You can try the attached patchset. If that doesn't work, my code branch is at https://github.com/myais2023/FFmpeg/tree/hevc-aarch64

Thanks for the patches. Functionally, they seem to work, and the issues I saw in the code are relatively minor. Unfortunately, some of the issues are issues that we've been through in many earlier patches, so I would hope that you would pay attention to them in the future before posting more patches.


In patch 1, you've got a bunch of sxtw instructions for src/dst stride parameters that have the type ptrdiff_t - that shouldn't be necessary?
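For instance (an illustrative sketch, not taken from the patch): per the AAPCS64 calling convention, a ptrdiff_t argument already arrives in a full 64-bit register, so a stride passed in x3 can be used directly, e.g.

ld1 {v16.8b}, [x2], x3

without any preceding "sxtw x3, w3".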

In patch 2, you're moving the macros calc_epelh, calc_epelh2, load_epel_filterh - can you split out the move into a separate commit? (This isn't strictly necessary but would make things even clearer.)

In patch 2, you're storing below the stack, then decrementing it afterwards - e.g. like this:

+        stp             x0, x30, [sp, #-16]
+        stp             x1, x2, [sp, #-32]
+        stp             x3, x4, [sp, #-48]
+        stp             x5, x6, [sp, #-64]!

Please change that so that you're first predecrementing the whole area, then storing the other elements above that stack pointer, e.g. like this:

stp x0, x30, [sp, #-64]!
stp x1, x2, [sp, #16]
stp x3, x4, [sp, #32]

etc.

The same issue also appears in various places within functions, like this:

+        stp             x0, x1, [sp, #-16]
+        stp             x4, x6, [sp, #-32]
+        stp             xzr, x30, [sp, #-48]!

Please fix all of these cases - you can search through your patches for anything related to storing on the stack. Also, storing xzr here seems superfluous - if you've got an odd number of registers to store, just make one instruction str instead of stp (but keep the stack aligned).
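For the example above, a fixed version could look like this (a sketch; keep the total decrement 16-byte aligned and adjust the matching reloads to the same offsets):

stp x0, x1, [sp, #-48]!
stp x4, x6, [sp, #16]
str x30, [sp, #32]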

Then in patch 4, you've got yet another pattern for doing these stores, where you have superfluous consecutive stack decrements like this:

+        stp             x6, x30, [sp, #-16]!
+        mov             x7, #16
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!

Please just do one stack decrement covering all the stack space you need.
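For the example above, that could look like this (a sketch; the matching reloads need the same offsets):

stp x6, x30, [sp, #-64]!
mov x7, #16
stp x0, x1, [sp, #16]
stp x2, x3, [sp, #32]
stp x4, x5, [sp, #48]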

I believe these issues have been raised in earlier reviews as well.

// Martin
From 62a59aa1fb7bc684ca0c216fd039dd0f231ad0c0 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Tue, 15 Aug 2023 16:42:25 +0800
Subject: [PATCH 04/10] lavc/aarch64: new optimization for 8-bit
 hevc_qpel_uni_v

checkasm bench:
put_hevc_qpel_uni_v4_8_c: 146.2
put_hevc_qpel_uni_v4_8_neon: 43.2
put_hevc_qpel_uni_v6_8_c: 303.9
put_hevc_qpel_uni_v6_8_neon: 69.7
put_hevc_qpel_uni_v8_8_c: 495.2
put_hevc_qpel_uni_v8_8_neon: 74.7
put_hevc_qpel_uni_v12_8_c: 1100.9
put_hevc_qpel_uni_v12_8_neon: 222.4
put_hevc_qpel_uni_v16_8_c: 1955.2
put_hevc_qpel_uni_v16_8_neon: 269.2
put_hevc_qpel_uni_v24_8_c: 4571.9
put_hevc_qpel_uni_v24_8_neon: 832.4
put_hevc_qpel_uni_v32_8_c: 8226.4
put_hevc_qpel_uni_v32_8_neon: 1035.7
put_hevc_qpel_uni_v48_8_c: 18324.2
put_hevc_qpel_uni_v48_8_neon: 2321.2
put_hevc_qpel_uni_v64_8_c: 37659.4
put_hevc_qpel_uni_v64_8_neon: 4122.2

Co-Authored-By: J. Dekker <j...@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 221 ++++++++++++++++++++++
 2 files changed, 226 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index d78954f440..51d212ff72 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -192,6 +192,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -295,6 +299,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index e38dff9645..2107e31a3c 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -44,6 +44,35 @@ endconst
         sxtl            v0.8h, v0.8b
 .endm
 
+.macro load_qpel_filterb freg, xreg
+        movrel          \xreg, qpel_filters_abs
+        add             \xreg, \xreg, \freg, lsl #3
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+        ld4r            {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+.endm
+
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
 .macro put_hevc type
 .ifc \type, qpel
         // void put_hevc_qpel_h(int16_t *dst,
@@ -595,6 +624,198 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
         ret
 endfunc
 
+.macro calc_all
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
+        b.eq            2f
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+        add             x2, x2, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().s}[0], [x2], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        subs            w4, w4, #1
+        st1             {v24.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x1, x1, #4
+        sub             x2, x2, x3
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+        add             x2, x2, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x2], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        st1             {v24.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1             {v24.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+        add             x2, x2, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x2], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        subs            w4, w4, #1
+        st1             {v24.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2: ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x1, x1, #8
+        sub             x2, x2, x3
+0:      mov             x8, x2          // src
+        mov             w11, w4         // height
+        mov             x10, x0         // dst
+        ldr             q16, [x8]
+        ldr             q17, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q18, [x8]
+        ldr             q19, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q20, [x8]
+        ldr             q21, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q22, [x8]
+        add             x8, x8, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x8], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1             {v24.8b}, [x10], #8
+        subs            x11, x11, #1
+        st1             {v24.s}[2], [x10], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      add             x0, x0, #12
+        add             x2, x2, #12
+        subs            w7, w7, #12
+        b.ne            0b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+0:      mov             x8, x2          // src
+        mov             w11, w4         // height
+        mov             x10, x0         // dst
+        ldr             q16, [x8]
+        ldr             q17, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q18, [x8]
+        ldr             q19, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q20, [x8]
+        ldr             q21, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q22, [x8]
+        add             x8, x8, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x8], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        subs            x11, x11, #1
+        st1             {v24.16b}, [x10], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      add             x0, x0, #16
+        add             x2, x2, #16
+        subs            w7, w7, #16
+        b.ne            0b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6
         sub             w10, w10, w5
-- 
2.38.0.windows.1

From 864036ece479923f71e05c5b232ee06c3ead3f84 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Tue, 15 Aug 2023 17:00:17 +0800
Subject: [PATCH 05/10] lavc/aarch64: new optimization for 8-bit
 hevc_qpel_uni_hv

checkasm bench:
put_hevc_qpel_uni_hv4_8_c: 489.2
put_hevc_qpel_uni_hv4_8_i8mm: 105.7
put_hevc_qpel_uni_hv6_8_c: 852.7
put_hevc_qpel_uni_hv6_8_i8mm: 268.7
put_hevc_qpel_uni_hv8_8_c: 1345.7
put_hevc_qpel_uni_hv8_8_i8mm: 300.4
put_hevc_qpel_uni_hv12_8_c: 2757.4
put_hevc_qpel_uni_hv12_8_i8mm: 581.4
put_hevc_qpel_uni_hv16_8_c: 4458.9
put_hevc_qpel_uni_hv16_8_i8mm: 860.2
put_hevc_qpel_uni_hv24_8_c: 9582.2
put_hevc_qpel_uni_hv24_8_i8mm: 2086.7
put_hevc_qpel_uni_hv32_8_c: 16401.9
put_hevc_qpel_uni_hv32_8_i8mm: 3217.4
put_hevc_qpel_uni_hv48_8_c: 36402.4
put_hevc_qpel_uni_hv48_8_i8mm: 7082.7
put_hevc_qpel_uni_hv64_8_c: 62713.2
put_hevc_qpel_uni_hv64_8_i8mm: 12408.9

Co-Authored-By: J. Dekker <j...@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 376 ++++++++++++++++++++++
 2 files changed, 381 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 51d212ff72..fc94b2c416 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -196,6 +196,10 @@ NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -310,6 +314,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
+            NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 2107e31a3c..4132d7a8a9 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -73,6 +73,45 @@ endconst
         umlsl2          \dst\().8h, \src7\().16b, v7.16b
 .endm
 
+.macro load_qpel_filterh freg, xreg
+        movrel          \xreg, qpel_filters
+        add             \xreg, \xreg, \freg, lsl #3
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sshr
+        sshr            \dst\().4s, \dst\().4s, \shift
+.else
+        \op             \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
+.ifc \op, sshr
+        sshr            \dst\().4s, \dstt\().4s, \shift
+.else
+        \op             \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
 .macro put_hevc type
 .ifc \type, qpel
         // void put_hevc_qpel_h(int16_t *dst,
@@ -1519,6 +1558,343 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
 endfunc
 
 #if HAVE_I8MM
+
+.macro calc_all2
+        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+        b.eq            2f
+        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+        b.eq            2f
+        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        b.eq            2f
+        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        b.eq            2f
+        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        b.eq            2f
+        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        b.eq            2f
+        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        b.eq            2f
+        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        b.hi            1b
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        mov             x9, #(MAX_PB_SIZE * 2)
+        load_qpel_filterh x6, x5
+        ldr             d16, [sp]
+        ldr             d17, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             d22, [sp]
+        add             sp, sp, x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().4h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        subs            w4, w4, #1
+        st1             {v1.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        mov             x9, #(MAX_PB_SIZE * 2)
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #4
+        ldr             q16, [sp]
+        ldr             q17, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1             {v1.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1             {v1.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        mov             x9, #(MAX_PB_SIZE * 2)
+        load_qpel_filterh x6, x5
+        ldr             q16, [sp]
+        ldr             q17, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x9]
+        add             sp, sp, x9, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        subs            w4, w4, #1
+        st1             {v1.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x7, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x0, sp, #48
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon_i8mm)
+        ldp             x7, x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        mov             x9, #(MAX_PB_SIZE * 2)
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #8
+        ld1             {v16.8h, v17.8h}, [sp], x9
+        ld1             {v18.8h, v19.8h}, [sp], x9
+        ld1             {v20.8h, v21.8h}, [sp], x9
+        ld1             {v22.8h, v23.8h}, [sp], x9
+        ld1             {v24.8h, v25.8h}, [sp], x9
+        ld1             {v26.8h, v27.8h}, [sp], x9
+        ld1             {v28.8h, v29.8h}, [sp], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x9
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn, #12
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn2, #12
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1             {v1.8b}, [x0], #8
+        subs            w4, w4, #1
+        st1             {v1.s}[2], [x0], x1
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x7, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon_i8mm)
+        ldp             x7, x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+.Lqpel_uni_hv16_loop:
+        mov             x9, #(MAX_PB_SIZE * 2)
+        load_qpel_filterh x6, x5
+        sub             w12, w9, w7, lsl #1
+0:      mov             x8, sp          // src
+        ld1             {v16.8h, v17.8h}, [x8], x9
+        mov             w11, w4         // height
+        ld1             {v18.8h, v19.8h}, [x8], x9
+        mov             x10, x0         // dst
+        ld1             {v20.8h, v21.8h}, [x8], x9
+        ld1             {v22.8h, v23.8h}, [x8], x9
+        ld1             {v24.8h, v25.8h}, [x8], x9
+        ld1             {v26.8h, v27.8h}, [x8], x9
+        ld1             {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+        calc_qpelh      v1,     \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn,  #12
+        calc_qpelh2     v1, v2, \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn2, #12
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn,  #12
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        subs            x11, x11, #1
+        sqxtun2         v1.16b, v2.8h
+        st1             {v1.16b}, [x10], x1
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #16
+        add             sp, sp, #32
+        subs            w7, w7, #16
+        b.ne            0b
+        add             w10, w4, #6
+        add             sp, sp, x12         // discard rest of first line
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon_i8mm, export=1
+        stp             x4, x5, [sp, #-64]!
+        stp             x2, x3, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        stp             x6, x30, [sp, #48]
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon_i8mm)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        add             x2, x2, #16
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        mov             x7, #8
+        add             x0, x0, #16
+        ldr             x6, [sp]
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon_i8mm)
+        ldr             x30, [sp, #8]
+        add             sp, sp, #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x7, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        sub             x1, x2, x3, lsl #1
+        add             x0, sp, #48
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon_i8mm)
+        ldp             x7, x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        b               .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x7, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x0, sp, #48
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon_i8mm)
+        ldp             x7, x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        b               .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon_i8mm, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x7, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        mov             x2, x3
+        sub             x1, x1, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon_i8mm)
+        ldp             x7, x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        b               .Lqpel_uni_hv16_loop
+endfunc
+
 .macro QPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #3
-- 
2.38.0.windows.1

From 8d9f5940413087d6327ddc421e84bce1c04fa3bf Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Tue, 15 Aug 2023 15:24:32 +0800
Subject: [PATCH 01/10] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_v

checkasm bench:
put_hevc_epel_uni_hv64_8_i8mm: 6568.7
put_hevc_epel_uni_v4_8_c: 88.7
put_hevc_epel_uni_v4_8_neon: 32.7
put_hevc_epel_uni_v6_8_c: 185.4
put_hevc_epel_uni_v6_8_neon: 44.9
put_hevc_epel_uni_v8_8_c: 333.9
put_hevc_epel_uni_v8_8_neon: 44.4
put_hevc_epel_uni_v12_8_c: 728.7
put_hevc_epel_uni_v12_8_neon: 119.7
put_hevc_epel_uni_v16_8_c: 1224.2
put_hevc_epel_uni_v16_8_neon: 139.7
put_hevc_epel_uni_v24_8_c: 2531.2
put_hevc_epel_uni_v24_8_neon: 329.9
put_hevc_epel_uni_v32_8_c: 4739.9
put_hevc_epel_uni_v32_8_neon: 562.7
put_hevc_epel_uni_v48_8_c: 10618.7
put_hevc_epel_uni_v48_8_neon: 1256.2
put_hevc_epel_uni_v64_8_c: 19169.9
put_hevc_epel_uni_v64_8_neon: 2179.2

Co-Authored-By: J. Dekker <j...@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 284 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 289 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index a8d694639b..83cb15b1db 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -32,6 +32,290 @@ const epel_filters, align=4
         .byte -2, 10, 58, -2
 endconst
 
+const epel_filters_abs, align=4
+        .byte  0,  0,  0,  0
+        .byte  2, 58, 10,  2
+        .byte  4, 54, 16,  2
+        .byte  6, 46, 28,  4
+        .byte  4, 36, 36,  4
+        .byte  4, 28, 46,  6
+        .byte  2, 16, 54,  4
+        .byte  2, 10, 58,  2
+endconst
+
+
+.macro load_epel_filterb freg, xreg
+        movrel          \xreg, epel_filters_abs
+        add             \xreg, \xreg, \freg, lsl #2
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter
+.endm
+
+.macro calc_epelb dst, src0, src1, src2, src3
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlal           \dst\().8h, \src2\().8b, v2.8b
+        umlsl           \dst\().8h, \src3\().8b, v3.8b
+.endm
+
+.macro calc_epelb2 dst, src0, src1, src2, src3
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlal2          \dst\().8h, \src2\().16b, v2.16b
+        umlsl2          \dst\().8h, \src3\().16b, v3.16b
+.endm
+
+.macro calc_all4
+        calc            v16, v17, v18, v19
+        b.eq            2f
+        calc            v17, v18, v19, v16
+        b.eq            2f
+        calc            v18, v19, v16, v17
+        b.eq            2f
+        calc            v19, v16, v17, v18
+        b.ne            1b
+.endm
+
+.macro calc_all8
+        calc            v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21
+        b.ne            1b
+.endm
+
+.macro calc_all12
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17, v18
+        b.eq            2f
+        calc            v22, v23, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v24
+        b.ne            1b
+.endm
+
+.macro calc_all16
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        b.eq            2f
+        calc            v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19
+        b.eq            2f
+        calc            v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        b.ne            1b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1             {v16.s}[0], [x2], x3
+        ld1             {v17.s}[0], [x2], x3
+        ld1             {v18.s}[0], [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x2], x3
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.s}[0], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        sub             x1, x1, #4
+        ld1             {v16.8b}, [x2], x3
+        ld1             {v17.8b}, [x2], x3
+        ld1             {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x2], x3
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        st1             {v4.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1             {v4.h}[2], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1             {v16.8b}, [x2], x3
+        ld1             {v17.8b}, [x2], x3
+        ld1             {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x2], x3
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b,  v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        sub             x1, x1, #8
+        ld1             {v16.16b}, [x2], x3
+        ld1             {v17.16b}, [x2], x3
+        ld1             {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x2], x3
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1             {v16.16b}, [x2], x3
+        ld1             {v17.16b}, [x2], x3
+        ld1             {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x2], x3
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1             {v16.8b, v17.8b, v18.8b}, [x2], x3
+        ld1             {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1             {v22.8b, v23.8b, v24.8b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x2], x3
+        calc_epelb      v4, \src0, \src3, \src6, \src9
+        calc_epelb      v5, \src1, \src4, \src7, \src10
+        calc_epelb      v6, \src2, \src5, \src8, \src11
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b-v6.8b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ld1             {v18.16b, v19.16b}, [x2], x3
+        ld1             {v20.16b, v21.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x2], x3
+        calc_epelb      v4, \src0, \src2, \src4, \src6
+        calc_epelb2     v5, \src0, \src2, \src4, \src6
+        calc_epelb      v6, \src1, \src3, \src5, \src7
+        calc_epelb2     v7, \src1, \src3, \src5, \src7
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b}, [x0], x1
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x2], x3
+        calc_epelb      v4,  \src0, \src3, \src6, \src9
+        calc_epelb2     v5,  \src0, \src3, \src6, \src9
+        calc_epelb      v6,  \src1, \src4, \src7, \src10
+        calc_epelb2     v7,  \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b, v29.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]
+        sub             x2, x2, x3
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b, \src13\().16b, \src14\().16b, \src15\().16b}, [x2], x3
+        calc_epelb      v10, \src3, \src7, \src11, \src15
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        calc_epelb      v4,  \src0, \src4, \src8,  \src12
+        calc_epelb2     v5,  \src0, \src4, \src8,  \src12
+        calc_epelb      v6,  \src1, \src5, \src9,  \src13
+        calc_epelb2     v7,  \src1, \src5, \src9,  \src13
+        calc_epelb      v8,  \src2, \src6, \src10, \src14
+        calc_epelb2     v9,  \src2, \src6, \src10, \src14
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b, v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b, v11.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+.endm
+1:      calc_all16
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp]
+        add             sp, sp, #32
+        ret
+endfunc
+
 #if HAVE_I8MM
 
 .macro EPEL_H_HEADER
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e125b0cfb2..f1e167c50b 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -161,6 +161,10 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -285,6 +289,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
-- 
2.38.0.windows.1

From 2c0d09c7c14c7ec479533fa9e5db229458a99038 Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sat, 23 Sep 2023 10:22:09 +0800
Subject: [PATCH 02/10] move macros calc_epelh, calc_epelh2, load_epel_filterh

---
 libavcodec/aarch64/hevcdsp_epel_neon.S | 44 ++++++++++++++------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 83cb15b1db..03d7ea4f68 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -64,6 +64,29 @@ endconst
         umlsl2          \dst\().8h, \src3\().16b, v3.16b
 .endm
 
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
 .macro calc_all4
         calc            v16, v17, v18, v19
         b.eq            2f
@@ -1102,28 +1125,7 @@ endfunc
         sqxtn2          v6.8h, v31.4s
 .endm
 
-.macro calc_epelh dst, src0, src1, src2, src3
-        smull           \dst\().4s, \src0\().4h, v0.h[0]
-        smlal           \dst\().4s, \src1\().4h, v0.h[1]
-        smlal           \dst\().4s, \src2\().4h, v0.h[2]
-        smlal           \dst\().4s, \src3\().4h, v0.h[3]
-        sqshrn          \dst\().4h, \dst\().4s, #6
-.endm
-
-.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
-        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
-        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
-        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
-        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
-        sqshrn2         \dst\().8h, \tmp\().4s, #6
-.endm
 
-.macro load_epel_filterh freg, xreg
-        movrel          \xreg, epel_filters
-        add             \xreg, \xreg, \freg, lsl #2
-        ld1             {v0.8b}, [\xreg]
-        sxtl            v0.8h, v0.8b
-.endm
 
 function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
         epel_uni_w_hv_start
-- 
2.38.0.windows.1

From 73d2775257f5bf65465ecc2fdc87e6881cb56cdb Mon Sep 17 00:00:00 2001
From: Logan Lyu <logan....@myais.com.cn>
Date: Sat, 23 Sep 2023 10:38:36 +0800
Subject: [PATCH 03/10] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_hv

checkasm bench:
put_hevc_epel_uni_hv4_8_c: 204.7
put_hevc_epel_uni_hv4_8_i8mm: 70.2
put_hevc_epel_uni_hv6_8_c: 378.2
put_hevc_epel_uni_hv6_8_i8mm: 131.9
put_hevc_epel_uni_hv8_8_c: 637.7
put_hevc_epel_uni_hv8_8_i8mm: 137.9
put_hevc_epel_uni_hv12_8_c: 1301.9
put_hevc_epel_uni_hv12_8_i8mm: 314.2
put_hevc_epel_uni_hv16_8_c: 2203.4
put_hevc_epel_uni_hv16_8_i8mm: 454.7
put_hevc_epel_uni_hv24_8_c: 4848.2
put_hevc_epel_uni_hv24_8_i8mm: 1065.2
put_hevc_epel_uni_hv32_8_c: 8517.4
put_hevc_epel_uni_hv32_8_i8mm: 1898.4
put_hevc_epel_uni_hv48_8_c: 19591.7
put_hevc_epel_uni_hv48_8_i8mm: 4107.2
put_hevc_epel_uni_hv64_8_c: 33880.2
put_hevc_epel_uni_hv64_8_i8mm: 6568.7

Co-Authored-By: J. Dekker <j...@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 302 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 307 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 03d7ea4f68..1c501775a6 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -683,6 +683,306 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_hevc_put_hevc_epel_uni_hv4_8_neon_i8mm, export=1
+        add             w10, w4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             w3, w4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().4h}, [sp], x10
+        calc_epelh      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.s}[0], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv6_8_neon_i8mm, export=1
+        add             w10, w4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             w3, w4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        st1             {v4.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1             {v4.h}[2], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv8_8_neon_i8mm, export=1
+        add             w10, w4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             w3, w4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src1, \src2, \src3
+        calc_epelh2     v4, v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv12_8_neon_i8mm, export=1
+        add             w10, w4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             w3, w4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().8h, \src7\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src2, \src4, \src6
+        calc_epelh2     v4, v5, \src0, \src2, \src4, \src6
+        calc_epelh      v5,     \src1, \src3, \src5, \src7
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            w4, w4, #1
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
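+// 16-wide: two 8-lane accumulators per row, narrowed into one 16-byte store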
+function ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm, export=1
+        add             w10, w4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             w3, w4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().8h, \src7\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src2, \src4, \src6
+        calc_epelh2     v4, v5, \src0, \src2, \src4, \src6
+        calc_epelh      v5,     \src1, \src3, \src5, \src7
+        calc_epelh2     v5, v6, \src1, \src3, \src5, \src7
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b}, [x0], x1
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
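+// 24-wide: three 8-lane accumulators per row, stored as three 8-byte vectors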
+function ff_hevc_put_hevc_epel_uni_hv24_8_neon_i8mm, export=1
+        add             w10, w4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             w3, w4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldr             x30, [sp]
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             sp, sp, #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8h, \src10\().8h, \src11\().8h}, [sp], x10
+        calc_epelh      v4,     \src0, \src3, \src6, \src9
+        calc_epelh2     v4, v5, \src0, \src3, \src6, \src9
+        calc_epelh      v5,     \src1, \src4, \src7, \src10
+        calc_epelh2     v5, v6, \src1, \src4, \src7, \src10
+        calc_epelh      v6,     \src2, \src5, \src8, \src11
+        calc_epelh2     v6, v7, \src2, \src5, \src8, \src11
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
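+// 32-wide: two 16-wide calls, advancing src and dst by 16 for the second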
+function ff_hevc_put_hevc_epel_uni_hv32_8_neon_i8mm, export=1
+        stp             x5, x6, [sp, #-64]!
+        stp             x3, x4, [sp, #16]
+        stp             x1, x2, [sp, #32]
+        stp             x0, x30, [sp, #48]
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+        ldr             x30, [sp, #56]
+        add             sp, sp, #64
+        ret
+endfunc
+
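+// 48-wide: two 24-wide calls, advancing src and dst by 24 for the second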
+function ff_hevc_put_hevc_epel_uni_hv48_8_neon_i8mm, export=1
+        stp             x5, x6, [sp, #-64]!
+        stp             x3, x4, [sp, #16]
+        stp             x1, x2, [sp, #32]
+        stp             x0, x30, [sp, #48]
+        mov             x7, #24
+        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_neon_i8mm)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #24
+        add             x2, x2, #24
+        mov             x7, #24
+        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_neon_i8mm)
+        ldr             x30, [sp, #56]
+        add             sp, sp, #64
+        ret
+endfunc
+
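+// 64-wide: four 16-wide calls, advancing src and dst by 16 each time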
+function ff_hevc_put_hevc_epel_uni_hv64_8_neon_i8mm, export=1
+        stp             x5, x6, [sp, #-64]!
+        stp             x3, x4, [sp, #16]
+        stp             x1, x2, [sp, #32]
+        stp             x0, x30, [sp, #48]
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #48
+        add             x2, x2, #48
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon_i8mm)
+        ldr             x30, [sp, #56]
+        add             sp, sp, #64
+        ret
+endfunc
+
 .macro EPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #1
@@ -1671,6 +1971,8 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
 endfunc
 
 
+
+
 #endif
 
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index f1e167c50b..d78954f440 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -165,6 +165,10 @@ NEON8_FNPROTO(epel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t _dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -298,6 +302,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 
         if (have_i8mm(cpu_flags)) {
             NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
-- 
2.38.0.windows.1
