hevc: add aarch64 NEON for qpel uni-weighted HV filter

Jun Zhao via ffmpeg-cvslog Fri, 13 Mar 2026 14:44:45 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit f5e6cca935e9f575659b3eb44a130dd95919ca7e
Author:     Jun Zhao <[email protected]>
AuthorDate: Tue Feb 3 13:30:25 2026 +0800
Commit:     Martin Storsjö <[email protected]>
CommitDate: Fri Mar 13 21:43:37 2026 +0000

    lavc/hevc: add aarch64 NEON for qpel uni-weighted HV filter
    
    Add NEON-optimized implementations for HEVC QPEL uni-directional
    weighted HV interpolation (put_hevc_qpel_uni_w_hv) at 8-bit depth,
    for block widths 6, 12, 24, and 48.
    
    These functions perform horizontal then vertical 8-tap QPEL filtering
    with weighting (wx, ox, denom) and output to uint8_t. Previously
    only widths 4, 8, 16, 32, 64 were implemented; this completes
    coverage for all standard HEVC block widths.
    
    Performance results on Apple M4:
    ./tests/checkasm/checkasm --test=hevc_pel --bench
    
    put_hevc_qpel_uni_w_hv6_8_neon:   3.11x
    put_hevc_qpel_uni_w_hv12_8_neon:  3.19x
    put_hevc_qpel_uni_w_hv24_8_neon:  2.26x
    put_hevc_qpel_uni_w_hv48_8_neon:  1.80x
    
    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/h26x/dsp.h             |   4 +-
 libavcodec/aarch64/h26x/qpel_neon.S       | 383 +++++++++++++++++++++++++++++-
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   4 +-
 3 files changed, 384 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index d4e79689fc..4b89049cde 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -222,12 +222,12 @@ NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
-NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
-NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index a22a8e0d78..651b81f301 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -600,8 +600,116 @@ function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
         ret
 endfunc
 
-.else // qpel_uni, qpel_bi
+.endif // qpel-only h16 functions
+
+// h24: filter 24 pixels per row as 16 + 8 columns, two rows per loop
+// iteration, reusing the shared h16 and h8 filter subroutines.
+// Register roles set up below:
+//   x13 = 2 * srcstride, x12 = second source row (srcb)
+//   x10 = second destination row (dstb)
+//   x14 = destination advance applied by the last store of an iteration
+//         (two rows minus the bytes already stepped by the first store)
+//   qpel_bi only: x4 / x15 = the two src2 rows, x16 = two src2 rows in bytes
+function ff_hevc_put_hevc_\type\()_h24_8_neon, export=1
+        load_filter     mx
+        sxtw            height, heightw
+.ifc \type, qpel_bi
+        ldrh            w8, [sp] // width (not read again in this function body)
+        mov             x16, #(HEVC_MAX_PB_SIZE << 2) // src2bstridel
+        lsl             x17, height, #7 // src2b reset (not read again in this function body)
+        add             x15, x4, #(HEVC_MAX_PB_SIZE << 1) // src2b
+.endif
+        sub             src, src, #3
+.ifnc \type, qpel
+        mov             mx, x30 // keep the return address: bl below overwrites x30
+.endif
+.ifc \type, qpel
+        mov             dststride, #(HEVC_MAX_PB_SIZE << 1)
+        lsl             x13, srcstride, #1 // srcstridel
+        mov             x14, #((HEVC_MAX_PB_SIZE << 2) - 32)
+.else
+        lsl             x14, dststride, #1 // dststridel
+        lsl             x13, srcstride, #1 // srcstridel
+        sub             x14, x14, #16
+.endif
+        add             x10, dst, dststride // dstb
+        add             x12, src, srcstride // srcb
+0:
+.ifc \type, qpel
+        // Save x4 and lr across the bl calls below; both are restored at the
+        // end of the iteration.
+        stp             x4, x30, [sp, #-16]!
+        ldr             x4, [sp] // reads back the x4 value just stored
+.endif
+        // Load 24 bytes for row1 and row2: columns 0-15 need 16+7=23 source
+        // bytes; columns 16-23 are reloaded from offset +16 further down.
+        ld1             {v16.8b-v18.8b}, [src], x13
+        ld1             {v19.8b-v21.8b}, [x12], x13
+
+        uxtl            v16.8h, v16.8b
+        uxtl            v19.8h, v19.8b
+        bl              ff_hevc_put_hevc_h16_8_neon
+        subs            heightw, heightw, #2 // flags are consumed by b.gt at the end of the loop
+
+.ifc \type, qpel
+        st1             {v26.8h, v27.8h}, [dst], #32
+        st1             {v28.8h, v29.8h}, [x10], #32
+.else
+.ifc \type, qpel_bi
+        // src2 columns 0-15. Step only past the 32 bytes consumed so that the
+        // second-half loads below read columns 16-23 of the same rows.
+        ld1             {v16.8h, v17.8h}, [ x4], #32
+        ld1             {v18.8h, v19.8h}, [x15], #32
+        sqadd           v26.8h, v26.8h, v16.8h
+        sqadd           v27.8h, v27.8h, v17.8h
+        sqadd           v28.8h, v28.8h, v18.8h
+        sqadd           v29.8h, v29.8h, v19.8h
+        sqrshrun        v26.8b, v26.8h, #7
+        sqrshrun        v27.8b, v27.8h, #7
+        sqrshrun        v28.8b, v28.8h, #7
+        sqrshrun        v29.8b, v29.8h, #7
+.else
+        sqrshrun        v26.8b, v26.8h, #6
+        sqrshrun        v27.8b, v27.8h, #6
+        sqrshrun        v28.8b, v28.8h, #6
+        sqrshrun        v29.8b, v29.8h, #6
+.endif
+        st1             {v26.8b, v27.8b}, [dst], #16
+        st1             {v28.8b, v29.8b}, [x10], #16
+.endif
+        // Reload for second half (8 pixels at offset +16)
+        sub             src, src, x13
+        sub             x12, x12, x13
+        add             src, src, #16
+        add             x12, x12, #16
+        ld1             {v16.8b, v17.8b}, [src], x13
+        ld1             {v18.8b, v19.8b}, [x12], x13
+        bl              ff_hevc_put_hevc_h8_8_neon
 
+.ifc \type, qpel
+        st1             {v23.8h}, [dst], x14
+        st1             {v24.8h}, [x10], x14
+.else
+.ifc \type, qpel_bi
+        // src2 columns 16-23 of the current rows, then step two rows down and
+        // rewind the 32 bytes consumed by the first-half loads, leaving
+        // x4 / x15 at column 0 of the next row pair.
+        ld1             {v25.8h}, [ x4], x16
+        ld1             {v26.8h}, [x15], x16
+        sub             x4,  x4,  #32
+        sub             x15, x15, #32
+        sqadd           v23.8h, v23.8h, v25.8h
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqrshrun        v23.8b, v23.8h, #7
+        sqrshrun        v24.8b, v24.8h, #7
+.else
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun        v24.8b, v24.8h, #6
+.endif
+        st1             {v23.8b}, [dst], x14
+        st1             {v24.8b}, [x10], x14
+.endif
+        // Reset src pointers back (undo +16 offset)
+        sub             src, src, #16
+        sub             x12, x12, #16
+.ifc \type, qpel
+        ldp             x4, x30, [sp], #16
+.endif
+        b.gt            0b
+.ifc \type, qpel
+        ret
+.else
+        ret             mx
+.endif
+endfunc
+
+.ifnc \type, qpel // qpel h16 already defined above; only emit for qpel_uni/qpel_bi
 .ifnc \type, qpel_bi
 function ff_vvc_put_\type\()_h16_8_neon, export=1
         vvc_load_filter mx
@@ -661,8 +769,7 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
         b.gt            1b // double line
         ret             mx
 endfunc
-
-.endif // qpel vs qpel_uni/qpel_bi
+.endif // !qpel
 
 .ifc \type, qpel
 // VVC qpel h32: self-contained int16-domain implementation
@@ -5265,7 +5372,10 @@ DISABLE_I8MM
 .if \width >= 32
         mov             w6,  #\width
         bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+.elseif \width == 24
+        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon)
 .else
+        mov             w6,  #\width
         bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_\suffix)
 .endif
 .else
@@ -5396,6 +5506,106 @@ function hevc_put_hevc_qpel_uni_w_hv4_8_end_neon
         ret
 endfunc
 
+// hv6: weight one filtered row held in v26/v27 (8 x 32-bit lanes) and store 6 output bytes (4 + 2) at [x20], advancing x20 by x21
+.macro QPEL_UNI_W_HV_6
+        sshr            v26.4s, v26.4s, #6 // arithmetic >> 6 on lanes 0-3
+        sshr            v27.4s, v27.4s, #6 // arithmetic >> 6 on lanes 4-7
+        mul             v24.4s, v26.4s, v28.4s // scale by v28 (presumably wx; set up outside this view)
+        mul             v25.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s // saturating rounding shift by the per-lane amount in v30 (presumably derived from denom - confirm)
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s // saturating add of v29 (presumably ox)
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s // narrow both halves to signed 16-bit with saturation
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h // narrow to unsigned 8-bit with saturation
+        add             x15, x20, #4            // x15 = dst + 4, taken before x20 is post-incremented (avoid st1 postincrement stall)
+        st1             {v24.s}[0], [x20], x21 // bytes 0-3, then step to the next row
+        st1             {v24.h}[2], [x15] // bytes 4-5
+.endm
+
+function hevc_put_hevc_qpel_uni_w_hv6_8_end_neon // second stage for 6-wide blocks: reads 16-byte rows from sp (row stride x10), filters and weights them, w22 = rows left
+        ldr             q16, [sp] // preload the first 7 rows into v16-v22
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1 // step two rows
+        ldr             q18, [sp]
+        ldr             q19, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x10
+1:
+        ldr             q23, [sp] // newest row; the 8-row window rotates through v16-v23 in the unrolled steps below
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23 // macro defined outside this view; result feeds QPEL_UNI_W_HV_6 via v26
+        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23 // macro defined outside this view; result feeds QPEL_UNI_W_HV_6 via v27
+        subs            w22, w22, #1 // flags are consumed by the branch after the macro; the macro's instructions do not write NZCV
+        QPEL_UNI_W_HV_6
+        b.eq            2f // last row done
+
+        ldr             q16, [sp] // oldest register (v16) receives the next row
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.eq            2f
+
+        ldr             q17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.eq            2f
+
+        ldr             q18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.eq            2f
+
+        ldr             q19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.eq            2f
+
+        ldr             q20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.eq            2f
+
+        ldr             q21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.eq            2f
+
+        ldr             q22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_6
+        b.hi            1b // window is back in its starting order; repeat while rows remain
+
+2:
+        QPEL_UNI_W_HV_END // epilogue macro defined outside this view
+        ret
+endfunc
+
 .macro QPEL_UNI_W_HV_8
         sshr            v26.4s, v26.4s, #6
         sshr            v27.4s, v27.4s, #6
@@ -5493,6 +5703,143 @@ function hevc_put_hevc_qpel_uni_w_hv8_8_end_neon
         ret
 endfunc
 
+// hv12: process 12 pixels = 8 + 4
+// Caller keeps source rows in v16-v23 (first 8 elements, q registers) and v1-v7,v31 (elements 8-11, d registers, only .4h used)
+// Input: v24/v25 = filtered elements 0-7, v26 = filtered elements 8-11 (32-bit lanes)
+// Output: 12 bytes to [x20], advance by x21
+.macro QPEL_UNI_W_HV_12
+        sshr            v24.4s, v24.4s, #6 // arithmetic >> 6 on all three accumulators
+        sshr            v25.4s, v25.4s, #6
+        sshr            v26.4s, v26.4s, #6
+        mul             v24.4s, v24.4s, v28.4s // scale by v28 (presumably wx; set up outside this view)
+        mul             v25.4s, v25.4s, v28.4s
+        mul             v26.4s, v26.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s // saturating rounding shift by the per-lane amount in v30 (presumably derived from denom - confirm)
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqrshl          v26.4s, v26.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s // saturating add of v29 (presumably ox)
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqxtn           v24.4h, v24.4s // elements 0-7 packed as signed 16-bit in v24
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s // elements 8-11 in the low half of v26
+        sqxtun          v24.8b, v24.8h // narrow to unsigned 8-bit with saturation
+        sqxtun          v26.8b, v26.8h
+        add             x15, x20, #8            // x15 = dst + 8, taken before x20 is post-incremented (avoid st1 postincrement stall)
+        st1             {v24.d}[0], [x20], x21 // bytes 0-7, then step to the next row
+        st1             {v26.s}[0], [x15] // bytes 8-11
+.endm
+
+function hevc_put_hevc_qpel_uni_w_hv12_8_end_neon
+        // Second stage for 12-wide blocks: rows are read from sp (row stride x10), w22 = rows left.
+        // Preload 7 rows: q16-q22 take the first 8 elements, d1-d7 take elements 8-11 from byte offset 16.
+        ldr             q16, [sp]
+        ldr             d1, [sp, #16]
+        add             sp, sp, x10
+        ldr             q17, [sp]
+        ldr             d2, [sp, #16]
+        add             sp, sp, x10
+        ldr             q18, [sp]
+        ldr             d3, [sp, #16]
+        add             sp, sp, x10
+        ldr             q19, [sp]
+        ldr             d4, [sp, #16]
+        add             sp, sp, x10
+        ldr             q20, [sp]
+        ldr             d5, [sp, #16]
+        add             sp, sp, x10
+        ldr             q21, [sp]
+        ldr             d6, [sp, #16]
+        add             sp, sp, x10
+        ldr             q22, [sp]
+        ldr             d7, [sp, #16]
+        add             sp, sp, x10
+1:
+        ldr             q23, [sp] // newest row; two parallel 8-row windows rotate through v16-v23 and v1-v7,v31
+        ldr             d31, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23 // macro defined outside this view; v24/v25/v26 feed QPEL_UNI_W_HV_12
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        subs            w22, w22, #1 // flags are consumed by the branch after the macro; the macro's instructions do not write NZCV
+        QPEL_UNI_W_HV_12
+        b.eq            2f // last row done
+
+        ldr             q16, [sp] // oldest registers (v16 / v1) receive the next row
+        ldr             d1, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.eq            2f
+
+        ldr             q17, [sp]
+        ldr             d2, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.eq            2f
+
+        ldr             q18, [sp]
+        ldr             d3, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.eq            2f
+
+        ldr             q19, [sp]
+        ldr             d4, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.eq            2f
+
+        ldr             q20, [sp]
+        ldr             d5, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.eq            2f
+
+        ldr             q21, [sp]
+        ldr             d6, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.eq            2f
+
+        ldr             q22, [sp]
+        ldr             d7, [sp, #16]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        subs            w22, w22, #1
+        QPEL_UNI_W_HV_12
+        b.hi            1b // windows are back in their starting order; repeat while rows remain
+
+2:
+        QPEL_UNI_W_HV_END // epilogue macro defined outside this view
+        ret
+endfunc
+
 .macro QPEL_UNI_W_HV_16
         sshr            v24.4s, v24.4s, #6
         sshr            v25.4s, v25.4s, #6
@@ -5625,11 +5972,21 @@ function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
         add             x11, x14, #32
         add             x20, x13, #16
         mov             w22, w12
+        cmp             w27, #8
+        b.eq            .Lqpel_uni_w_hv24_tail
         mov             x14, x11
         mov             x13, x20
         b.hi            3b
         QPEL_UNI_W_HV_END
         ret
+
+// hv24 tail: process remaining 8 columns (16-23) via tail-call to hv8
+// sp is set to the start of tail data.
+// This reuses the stack frame setup by ff_hevc_put_hevc_qpel_uni_w_hv24_8_neon.
+// hv8 will restore the original lr and return to the caller.
+.Lqpel_uni_w_hv24_tail:
+        mov             sp, x11
+        b               hevc_put_hevc_qpel_uni_w_hv8_8_end_neon
 endfunc
 
 .macro qpel_uni_w_hv suffix
@@ -5638,11 +5995,26 @@ function ff_hevc_put_hevc_qpel_uni_w_hv4_8_\suffix, export=1
         b               hevc_put_hevc_qpel_uni_w_hv4_8_end_neon
 endfunc
 
+function ff_hevc_put_hevc_qpel_uni_w_hv6_8_\suffix, export=1
+        QPEL_UNI_W_HV_HEADER 6, \suffix // shared prologue macro; its definition is only partly visible in this diff
+        b               hevc_put_hevc_qpel_uni_w_hv6_8_end_neon // tail-branch: the end routine finishes the block and executes the ret
+endfunc
+
 function ff_hevc_put_hevc_qpel_uni_w_hv8_8_\suffix, export=1
         QPEL_UNI_W_HV_HEADER 8, \suffix // shared prologue macro; its definition is only partly visible in this diff
         b               hevc_put_hevc_qpel_uni_w_hv8_8_end_neon // tail-branch into the 8-wide end routine (also the target of the hv24 tail)
 endfunc
 
+function ff_hevc_put_hevc_qpel_uni_w_hv12_8_\suffix, export=1
+        QPEL_UNI_W_HV_HEADER 12, \suffix // shared prologue macro; its definition is only partly visible in this diff
+        b               hevc_put_hevc_qpel_uni_w_hv12_8_end_neon // tail-branch: the end routine finishes the block and executes the ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_hv24_8_\suffix, export=1
+        QPEL_UNI_W_HV_HEADER 24, \suffix // shared prologue macro; its definition is only partly visible in this diff
+        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon // hv16 end routine diverts to the hv8 end routine via .Lqpel_uni_w_hv24_tail when w27 equals 8
+endfunc
+
 function ff_hevc_put_hevc_qpel_uni_w_hv16_8_\suffix, export=1
         QPEL_UNI_W_HV_HEADER 16, \suffix
         b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
@@ -5653,6 +6025,11 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_\suffix, export=1
         b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
 endfunc
 
+function ff_hevc_put_hevc_qpel_uni_w_hv48_8_\suffix, export=1
+        QPEL_UNI_W_HV_HEADER 48, \suffix // shared prologue macro; its definition is only partly visible in this diff
+        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon // same end routine as hv16/hv32/hv64; presumably it loops over 16-column strips (its loop is only partly visible here)
+endfunc
+
 function ff_hevc_put_hevc_qpel_uni_w_hv64_8_\suffix, export=1
         QPEL_UNI_W_HV_HEADER 64, \suffix
         b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 30560bafb9..b8448c24eb 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -288,7 +288,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 
         NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv,);
-        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,);
         NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv,);
 
         if (have_i8mm(cpu_flags)) {
@@ -302,7 +302,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
-            NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv, _i8mm);
         }
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/05: lavc/hevc: add aarch64 NEON for qpel uni-weighted HV filter

Reply via email to