hevc: add aarch64 NEON for epel uni horizontal filter

Jun Zhao via ffmpeg-cvslog Fri, 13 Mar 2026 14:45:04 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 489d36b5e107da7b9b861893c4280ae53bf67ba1
Author:     Jun Zhao <[email protected]>
AuthorDate: Tue Feb 3 11:33:03 2026 +0800
Commit:     Martin Storsjö <[email protected]>
CommitDate: Fri Mar 13 21:43:37 2026 +0000

    lavc/hevc: add aarch64 NEON for epel uni horizontal filter
    
    Add NEON-optimized implementations for HEVC EPEL uni-directional
    horizontal interpolation (put_hevc_epel_uni_h) at 8-bit depth.
    
    These functions apply the 4-tap horizontal EPEL filter and write
    the result directly to uint8_t pixels (no weighting):
    - Filter: 4-tap horizontal EPEL, coefficients selected by mx
    - Output: (filter_result + 32) >> 6, clipped to [0, 255]
    
    Supports all block widths: 4, 6, 8, 12, 16, 24, 32, 48, 64.
    
    Performance results on Apple M4:
    ./tests/checkasm/checkasm --test=hevc_pel --bench
    
    put_hevc_epel_uni_h4_8_neon:   2.26x
    put_hevc_epel_uni_h6_8_neon:   2.71x
    put_hevc_epel_uni_h8_8_neon:   4.40x
    put_hevc_epel_uni_h12_8_neon:  3.60x
    put_hevc_epel_uni_h16_8_neon:  3.00x
    put_hevc_epel_uni_h24_8_neon:  3.72x
    put_hevc_epel_uni_h32_8_neon:  3.14x
    put_hevc_epel_uni_h48_8_neon:  3.16x
    put_hevc_epel_uni_h64_8_neon:  3.15x
    
    Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/h26x/dsp.h             |   4 +
 libavcodec/aarch64/h26x/epel_neon.S       | 392 ++++++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   1 +
 3 files changed, 397 insertions(+)

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 4b89049cde..47a61d22c2 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -130,6 +130,10 @@ NEON8_FNPROTO(epel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t _dststride,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
index 235e3f5dd6..412f929992 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -1744,6 +1744,398 @@ function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
         ret
 endfunc
 
+// epel_uni_h: horizontal EPEL filter with output to uint8_t
+// void put_hevc_epel_uni_h(uint8_t *dst, ptrdiff_t dststride,
+//                          const uint8_t *src, ptrdiff_t srcstride,
+//                          int height, intptr_t mx, intptr_t my, int width)
+// x0: dst, x1: dststride, x2: src, x3: srcstride, w4: height, x5: mx
+
+.macro EPEL_UNI_H_HEADER                        // in: x2 = src, x5 = mx; out: v0.h[0..3] = taps, x2 = src - 1; clobbers x7, v30
+        movrel          x7, epel_filters        // x7 = address of the epel_filters table
+        add             x7, x7, x5, lsl #2      // x7 += mx * 4: the 4 bytes read below are the entry for mx
+        ld1r            {v30.4s}, [x7]          // load those 4 bytes as one 32-bit word, replicated to every lane
+        sxtl            v0.8h, v30.8b           // sign-extend the byte taps to 16 bit (v0.h[4..7] repeat h[0..3])
+        sub             x2, x2, #1              // start one pixel left of src; users multiply that pixel by v0.h[0]
+.endm
+
+function ff_hevc_put_hevc_epel_uni_h4_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 4 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+1:      ld1             {v1.8b}, [x2], x3                   // 8 source bytes (7 are used); src += srcstride
+        uxtl            v1.8h, v1.8b                        // widen pixels to unsigned 16 bit
+        ext             v2.16b, v1.16b, v1.16b, #2          // row shifted left by 1 pixel
+        ext             v3.16b, v1.16b, v1.16b, #4          // row shifted left by 2 pixels
+        ext             v4.16b, v1.16b, v1.16b, #6          // row shifted left by 3 pixels
+        subs            w4, w4, #1                          // height--; NEON ops below leave the flags alone
+        mul             v5.4h, v1.4h, v0.h[0]               // .4h form also clears the upper half of v5
+        mla             v5.4h, v2.4h, v0.h[1]
+        mla             v5.4h, v3.4h, v0.h[2]
+        mla             v5.4h, v4.4h, v0.h[3]
+        sqrshrun        v5.8b, v5.8h, #6                    // (sum + 32) >> 6, saturated to [0, 255]
+        st1             {v5.s}[0], [x0], x1                 // store 4 pixels; dst += dststride
+        b.ne            1b                                  // loop until height reaches 0
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h6_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 6 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+        sub             x1, x1, #4                          // first store advances dst by 4, second adds the rest
+1:      ld1             {v1.16b}, [x2], x3                  // 16 source bytes (9 are used); src += srcstride
+        uxtl2           v2.8h, v1.16b                       // bytes 8..15 widened (must precede the in-place uxtl)
+        uxtl            v1.8h, v1.8b                        // bytes 0..7 widened
+        ext             v3.16b, v1.16b, v2.16b, #2          // row shifted left by 1 pixel
+        ext             v4.16b, v1.16b, v2.16b, #4          // row shifted left by 2 pixels
+        ext             v5.16b, v1.16b, v2.16b, #6          // row shifted left by 3 pixels
+        subs            w4, w4, #1                          // height--; flags are consumed by b.ne
+        mul             v6.8h, v1.8h, v0.h[0]               // 8 lanes are computed, only 6 are stored
+        mla             v6.8h, v3.8h, v0.h[1]
+        mla             v6.8h, v4.8h, v0.h[2]
+        mla             v6.8h, v5.8h, v0.h[3]
+        sqrshrun        v6.8b, v6.8h, #6                    // (sum + 32) >> 6, saturated to [0, 255]
+        st1             {v6.s}[0], [x0], #4                 // pixels 0..3
+        st1             {v6.h}[2], [x0], x1                 // pixels 4..5; dst now at the next row
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h8_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 8 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+1:      ld1             {v1.16b}, [x2], x3                  // 16 source bytes (11 are used); src += srcstride
+        uxtl2           v2.8h, v1.16b                       // bytes 8..15 widened (must precede the in-place uxtl)
+        uxtl            v1.8h, v1.8b                        // bytes 0..7 widened
+        ext             v3.16b, v1.16b, v2.16b, #2          // row shifted left by 1 pixel
+        ext             v4.16b, v1.16b, v2.16b, #4          // row shifted left by 2 pixels
+        ext             v5.16b, v1.16b, v2.16b, #6          // row shifted left by 3 pixels
+        subs            w4, w4, #1                          // height--; flags are consumed by b.ne
+        mul             v6.8h, v1.8h, v0.h[0]
+        mla             v6.8h, v3.8h, v0.h[1]
+        mla             v6.8h, v4.8h, v0.h[2]
+        mla             v6.8h, v5.8h, v0.h[3]
+        sqrshrun        v6.8b, v6.8h, #6                    // (sum + 32) >> 6, saturated to [0, 255]
+        st1             {v6.8b}, [x0], x1                   // store 8 pixels; dst += dststride
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h12_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 12 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+        sub             x1, x1, #8                          // first store advances dst by 8, second adds the rest
+1:      ld1             {v1.16b}, [x2], x3                  // 16 source bytes (15 are used); src += srcstride
+        uxtl2           v2.8h, v1.16b                       // bytes 8..15 widened (must precede the in-place uxtl)
+        uxtl            v1.8h, v1.8b                        // bytes 0..7 widened
+        ext             v3.16b, v1.16b, v2.16b, #2          // pixels 0..7: shifted by 1
+        ext             v4.16b, v1.16b, v2.16b, #4          //              shifted by 2
+        ext             v5.16b, v1.16b, v2.16b, #6          //              shifted by 3
+        ext             v6.16b, v2.16b, v2.16b, #2          // pixels 8..11: shifted by 1 (only lanes 0..3 are used)
+        ext             v7.16b, v2.16b, v2.16b, #4          //               shifted by 2
+        ext             v16.16b, v2.16b, v2.16b, #6         //               shifted by 3
+        subs            w4, w4, #1                          // height--; flags are consumed by b.ne
+        mul             v17.8h, v1.8h, v0.h[0]
+        mla             v17.8h, v3.8h, v0.h[1]
+        mla             v17.8h, v4.8h, v0.h[2]
+        mla             v17.8h, v5.8h, v0.h[3]
+        mul             v18.4h, v2.4h, v0.h[0]              // .4h form also clears the upper half of v18
+        mla             v18.4h, v6.4h, v0.h[1]
+        mla             v18.4h, v7.4h, v0.h[2]
+        mla             v18.4h, v16.4h, v0.h[3]
+        sqrshrun        v17.8b, v17.8h, #6                  // (sum + 32) >> 6, saturated to [0, 255]
+        sqrshrun        v18.8b, v18.8h, #6
+        st1             {v17.8b}, [x0], #8                  // pixels 0..7
+        st1             {v18.s}[0], [x0], x1                // pixels 8..11; dst now at the next row
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h16_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 16 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+1:      ld1             {v1.16b, v2.16b}, [x2], x3          // 32 source bytes (19 are used); src += srcstride
+        uxtl            v3.8h, v1.8b                        // bytes 0..7 widened
+        uxtl2           v4.8h, v1.16b                       // bytes 8..15 widened
+        uxtl            v5.8h, v2.8b                        // bytes 16..23 widened (right-edge taps)
+        ext             v6.16b, v3.16b, v4.16b, #2          // pixels 0..7: shifted by 1
+        ext             v7.16b, v3.16b, v4.16b, #4          //              shifted by 2
+        ext             v16.16b, v3.16b, v4.16b, #6         //              shifted by 3
+        ext             v17.16b, v4.16b, v5.16b, #2         // pixels 8..15: shifted by 1
+        ext             v18.16b, v4.16b, v5.16b, #4         //               shifted by 2
+        ext             v19.16b, v4.16b, v5.16b, #6         //               shifted by 3
+        subs            w4, w4, #1                          // height--; flags are consumed by b.ne
+        mul             v20.8h, v3.8h, v0.h[0]
+        mla             v20.8h, v6.8h, v0.h[1]
+        mla             v20.8h, v7.8h, v0.h[2]
+        mla             v20.8h, v16.8h, v0.h[3]
+        mul             v21.8h, v4.8h, v0.h[0]
+        mla             v21.8h, v17.8h, v0.h[1]
+        mla             v21.8h, v18.8h, v0.h[2]
+        mla             v21.8h, v19.8h, v0.h[3]
+        sqrshrun        v20.8b, v20.8h, #6                  // (sum + 32) >> 6, saturated to [0, 255]
+        sqrshrun2       v20.16b, v21.8h, #6                 // second group fills the upper half
+        st1             {v20.16b}, [x0], x1                 // store 16 pixels; dst += dststride
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h24_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 24 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+        sub             x1, x1, #16                         // first store advances dst by 16, second adds the rest
+1:      ld1             {v6.16b, v7.16b}, [x2], x3          // 32 source bytes (27 are used); src += srcstride
+        uxtl            v1.8h, v6.8b                        // bytes 0..7 widened
+        uxtl2           v2.8h, v6.16b                       // bytes 8..15 widened
+        uxtl            v3.8h, v7.8b                        // bytes 16..23 widened
+        uxtl2           v4.8h, v7.16b                       // bytes 24..31 widened (right-edge taps)
+        // Output pixels 0..7
+        ext             v19.16b, v1.16b, v2.16b, #2
+        ext             v20.16b, v1.16b, v2.16b, #4
+        ext             v21.16b, v1.16b, v2.16b, #6
+        mul             v25.8h, v1.8h, v0.h[0]
+        mla             v25.8h, v19.8h, v0.h[1]
+        mla             v25.8h, v20.8h, v0.h[2]
+        mla             v25.8h, v21.8h, v0.h[3]
+        // Output pixels 8..15
+        ext             v19.16b, v2.16b, v3.16b, #2
+        ext             v20.16b, v2.16b, v3.16b, #4
+        ext             v21.16b, v2.16b, v3.16b, #6
+        mul             v26.8h, v2.8h, v0.h[0]
+        mla             v26.8h, v19.8h, v0.h[1]
+        mla             v26.8h, v20.8h, v0.h[2]
+        mla             v26.8h, v21.8h, v0.h[3]
+        // Output pixels 16..23
+        ext             v19.16b, v3.16b, v4.16b, #2
+        ext             v20.16b, v3.16b, v4.16b, #4
+        ext             v21.16b, v3.16b, v4.16b, #6
+        mul             v27.8h, v3.8h, v0.h[0]
+        mla             v27.8h, v19.8h, v0.h[1]
+        mla             v27.8h, v20.8h, v0.h[2]
+        mla             v27.8h, v21.8h, v0.h[3]
+        subs            w4, w4, #1                          // height--; flags are consumed by b.ne
+        sqrshrun        v25.8b, v25.8h, #6                  // (sum + 32) >> 6, saturated to [0, 255]
+        sqrshrun2       v25.16b, v26.8h, #6                 // v26 is consumed here, so it can be reused next
+        sqrshrun        v26.8b, v27.8h, #6
+        st1             {v25.16b}, [x0], #16                // pixels 0..15
+        st1             {v26.8b}, [x0], x1                  // pixels 16..23; dst now at the next row
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h32_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // 32 pixels per row; v0.h[0..3] = taps, x2 = src - 1
+1:      ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3   // 48 source bytes (35 are used); src += srcstride
+        uxtl            v1.8h, v16.8b                       // bytes 0..7 widened
+        uxtl2           v2.8h, v16.16b                      // bytes 8..15 widened
+        uxtl            v3.8h, v17.8b                       // bytes 16..23 widened
+        uxtl2           v4.8h, v17.16b                      // bytes 24..31 widened
+        uxtl            v5.8h, v18.8b                       // bytes 32..39 widened (right-edge taps)
+        subs            w4, w4, #1                          // height--; flags are consumed by b.ne
+        // Output pixels 0..7
+        ext             v19.16b, v1.16b, v2.16b, #2
+        ext             v20.16b, v1.16b, v2.16b, #4
+        ext             v21.16b, v1.16b, v2.16b, #6
+        mul             v24.8h, v1.8h, v0.h[0]
+        mla             v24.8h, v19.8h, v0.h[1]
+        mla             v24.8h, v20.8h, v0.h[2]
+        mla             v24.8h, v21.8h, v0.h[3]
+        // Output pixels 8..15
+        ext             v19.16b, v2.16b, v3.16b, #2
+        ext             v20.16b, v2.16b, v3.16b, #4
+        ext             v21.16b, v2.16b, v3.16b, #6
+        mul             v25.8h, v2.8h, v0.h[0]
+        mla             v25.8h, v19.8h, v0.h[1]
+        mla             v25.8h, v20.8h, v0.h[2]
+        mla             v25.8h, v21.8h, v0.h[3]
+        // Output pixels 16..23
+        ext             v19.16b, v3.16b, v4.16b, #2
+        ext             v20.16b, v3.16b, v4.16b, #4
+        ext             v21.16b, v3.16b, v4.16b, #6
+        mul             v26.8h, v3.8h, v0.h[0]
+        mla             v26.8h, v19.8h, v0.h[1]
+        mla             v26.8h, v20.8h, v0.h[2]
+        mla             v26.8h, v21.8h, v0.h[3]
+        // Output pixels 24..31
+        ext             v19.16b, v4.16b, v5.16b, #2
+        ext             v20.16b, v4.16b, v5.16b, #4
+        ext             v21.16b, v4.16b, v5.16b, #6
+        mul             v27.8h, v4.8h, v0.h[0]
+        mla             v27.8h, v19.8h, v0.h[1]
+        mla             v27.8h, v20.8h, v0.h[2]
+        mla             v27.8h, v21.8h, v0.h[3]
+        sqrshrun        v24.8b, v24.8h, #6                  // (sum + 32) >> 6, saturated to [0, 255]
+        sqrshrun2       v24.16b, v25.8h, #6                 // v25 is consumed here, so it can be reused next
+        sqrshrun        v25.8b, v26.8h, #6
+        sqrshrun2       v25.16b, v27.8h, #6
+        st1             {v24.16b, v25.16b}, [x0], x1        // store 32 pixels; dst += dststride
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h48_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // v0.h[0..3] = taps, x2 = src - 1
+        // 48 pixels per row, filtered as six groups of 8; one row per loop iteration.
+        // Every SIMD register used lies in v0-v7 / v16-v30, which are caller-saved
+        // under AAPCS64, so nothing is spilled and sp is never touched.
+1:      ld1             {v1.16b, v2.16b, v3.16b}, [x2]      // source bytes 0..47 (relative to src - 1)
+        add             x7, x2, #48
+        ld1             {v26.8b}, [x7]                      // bytes 48..55: right-edge taps of the last group
+        add             x2, x2, x3                          // src += srcstride
+        subs            w4, w4, #1                          // height--; NEON ops below leave the flags alone
+        uxtl            v4.8h, v1.8b                        // widen all 56 bytes to unsigned 16 bit
+        uxtl2           v5.8h, v1.16b
+        uxtl            v6.8h, v2.8b
+        uxtl2           v7.8h, v2.16b
+        uxtl            v19.8h, v3.8b
+        uxtl2           v20.8h, v3.16b
+        uxtl            v27.8h, v26.8b
+        // First 8 pixels
+        ext             v16.16b, v4.16b, v5.16b, #2
+        ext             v17.16b, v4.16b, v5.16b, #4
+        ext             v18.16b, v4.16b, v5.16b, #6
+        mul             v22.8h, v4.8h, v0.h[0]
+        mla             v22.8h, v16.8h, v0.h[1]
+        mla             v22.8h, v17.8h, v0.h[2]
+        mla             v22.8h, v18.8h, v0.h[3]
+        // Second 8 pixels
+        ext             v16.16b, v5.16b, v6.16b, #2
+        ext             v17.16b, v5.16b, v6.16b, #4
+        ext             v18.16b, v5.16b, v6.16b, #6
+        mul             v23.8h, v5.8h, v0.h[0]
+        mla             v23.8h, v16.8h, v0.h[1]
+        mla             v23.8h, v17.8h, v0.h[2]
+        mla             v23.8h, v18.8h, v0.h[3]
+        // Third 8 pixels
+        ext             v16.16b, v6.16b, v7.16b, #2
+        ext             v17.16b, v6.16b, v7.16b, #4
+        ext             v18.16b, v6.16b, v7.16b, #6
+        mul             v24.8h, v6.8h, v0.h[0]
+        mla             v24.8h, v16.8h, v0.h[1]
+        mla             v24.8h, v17.8h, v0.h[2]
+        mla             v24.8h, v18.8h, v0.h[3]
+        // Fourth 8 pixels
+        ext             v16.16b, v7.16b, v19.16b, #2
+        ext             v17.16b, v7.16b, v19.16b, #4
+        ext             v18.16b, v7.16b, v19.16b, #6
+        mul             v25.8h, v7.8h, v0.h[0]
+        mla             v25.8h, v16.8h, v0.h[1]
+        mla             v25.8h, v17.8h, v0.h[2]
+        mla             v25.8h, v18.8h, v0.h[3]
+        // Fifth 8 pixels
+        ext             v16.16b, v19.16b, v20.16b, #2
+        ext             v17.16b, v19.16b, v20.16b, #4
+        ext             v18.16b, v19.16b, v20.16b, #6
+        mul             v28.8h, v19.8h, v0.h[0]
+        mla             v28.8h, v16.8h, v0.h[1]
+        mla             v28.8h, v17.8h, v0.h[2]
+        mla             v28.8h, v18.8h, v0.h[3]
+        // Sixth 8 pixels
+        ext             v16.16b, v20.16b, v27.16b, #2
+        ext             v17.16b, v20.16b, v27.16b, #4
+        ext             v18.16b, v20.16b, v27.16b, #6
+        mul             v29.8h, v20.8h, v0.h[0]
+        mla             v29.8h, v16.8h, v0.h[1]
+        mla             v29.8h, v17.8h, v0.h[2]
+        mla             v29.8h, v18.8h, v0.h[3]
+        sqrshrun        v22.8b, v22.8h, #6                  // (sum + 32) >> 6, saturated to [0, 255]
+        sqrshrun2       v22.16b, v23.8h, #6                 // v23 is consumed here, so it can be reused next
+        sqrshrun        v23.8b, v24.8h, #6
+        sqrshrun2       v23.16b, v25.8h, #6
+        sqrshrun        v24.8b, v28.8h, #6
+        sqrshrun2       v24.16b, v29.8h, #6
+        st1             {v22.16b, v23.16b, v24.16b}, [x0], x1   // store 48 pixels; dst += dststride
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_h64_8_neon, export=1
+        EPEL_UNI_H_HEADER                                   // v0.h[0..3] = taps, x2 = src - 1
+        // 64 pixels per row, filtered as eight groups of 8; one row per loop iteration.
+        // Every SIMD register used lies in v0-v7 / v16-v30, which are caller-saved
+        // under AAPCS64, so nothing is spilled and sp is never touched.
+1:      add             x7, x2, #48
+        ld1             {v1.16b, v2.16b, v3.16b}, [x2]      // source bytes 0..47 (relative to src - 1)
+        ld1             {v26.16b, v27.16b}, [x7]            // bytes 48..79; only 48..71 are widened below
+        add             x2, x2, x3                          // src += srcstride
+        subs            w4, w4, #1                          // height--; NEON ops below leave the flags alone
+        uxtl            v4.8h, v1.8b                        // widen 72 bytes to unsigned 16 bit
+        uxtl2           v5.8h, v1.16b
+        uxtl            v6.8h, v2.8b
+        uxtl2           v7.8h, v2.16b
+        uxtl            v19.8h, v3.8b
+        uxtl2           v20.8h, v3.16b
+        uxtl            v21.8h, v26.8b
+        uxtl2           v29.8h, v26.16b
+        uxtl            v28.8h, v27.8b                      // v26/v27 are free again after this point
+        // First 8 pixels
+        ext             v16.16b, v4.16b, v5.16b, #2
+        ext             v17.16b, v4.16b, v5.16b, #4
+        ext             v18.16b, v4.16b, v5.16b, #6
+        mul             v22.8h, v4.8h, v0.h[0]
+        mla             v22.8h, v16.8h, v0.h[1]
+        mla             v22.8h, v17.8h, v0.h[2]
+        mla             v22.8h, v18.8h, v0.h[3]
+        // Second 8 pixels
+        ext             v16.16b, v5.16b, v6.16b, #2
+        ext             v17.16b, v5.16b, v6.16b, #4
+        ext             v18.16b, v5.16b, v6.16b, #6
+        mul             v23.8h, v5.8h, v0.h[0]
+        mla             v23.8h, v16.8h, v0.h[1]
+        mla             v23.8h, v17.8h, v0.h[2]
+        mla             v23.8h, v18.8h, v0.h[3]
+        // Third 8 pixels
+        ext             v16.16b, v6.16b, v7.16b, #2
+        ext             v17.16b, v6.16b, v7.16b, #4
+        ext             v18.16b, v6.16b, v7.16b, #6
+        mul             v24.8h, v6.8h, v0.h[0]
+        mla             v24.8h, v16.8h, v0.h[1]
+        mla             v24.8h, v17.8h, v0.h[2]
+        mla             v24.8h, v18.8h, v0.h[3]
+        // Fourth 8 pixels
+        ext             v16.16b, v7.16b, v19.16b, #2
+        ext             v17.16b, v7.16b, v19.16b, #4
+        ext             v18.16b, v7.16b, v19.16b, #6
+        mul             v25.8h, v7.8h, v0.h[0]
+        mla             v25.8h, v16.8h, v0.h[1]
+        mla             v25.8h, v17.8h, v0.h[2]
+        mla             v25.8h, v18.8h, v0.h[3]
+        sqrshrun        v22.8b, v22.8h, #6                  // (sum + 32) >> 6, saturated to [0, 255]
+        sqrshrun2       v22.16b, v23.8h, #6                 // v23 is consumed here, so it can be reused next
+        sqrshrun        v23.8b, v24.8h, #6
+        sqrshrun2       v23.16b, v25.8h, #6                 // v24/v25 are free for the next groups
+        // Fifth 8 pixels
+        ext             v16.16b, v19.16b, v20.16b, #2
+        ext             v17.16b, v19.16b, v20.16b, #4
+        ext             v18.16b, v19.16b, v20.16b, #6
+        mul             v24.8h, v19.8h, v0.h[0]
+        mla             v24.8h, v16.8h, v0.h[1]
+        mla             v24.8h, v17.8h, v0.h[2]
+        mla             v24.8h, v18.8h, v0.h[3]
+        // Sixth 8 pixels
+        ext             v16.16b, v20.16b, v21.16b, #2
+        ext             v17.16b, v20.16b, v21.16b, #4
+        ext             v18.16b, v20.16b, v21.16b, #6
+        mul             v25.8h, v20.8h, v0.h[0]
+        mla             v25.8h, v16.8h, v0.h[1]
+        mla             v25.8h, v17.8h, v0.h[2]
+        mla             v25.8h, v18.8h, v0.h[3]
+        // Seventh 8 pixels
+        ext             v16.16b, v21.16b, v29.16b, #2
+        ext             v17.16b, v21.16b, v29.16b, #4
+        ext             v18.16b, v21.16b, v29.16b, #6
+        mul             v26.8h, v21.8h, v0.h[0]
+        mla             v26.8h, v16.8h, v0.h[1]
+        mla             v26.8h, v17.8h, v0.h[2]
+        mla             v26.8h, v18.8h, v0.h[3]
+        // Eighth 8 pixels
+        ext             v16.16b, v29.16b, v28.16b, #2
+        ext             v17.16b, v29.16b, v28.16b, #4
+        ext             v18.16b, v29.16b, v28.16b, #6
+        mul             v27.8h, v29.8h, v0.h[0]
+        mla             v27.8h, v16.8h, v0.h[1]
+        mla             v27.8h, v17.8h, v0.h[2]
+        mla             v27.8h, v18.8h, v0.h[3]
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        sqrshrun        v25.8b, v26.8h, #6
+        sqrshrun2       v25.16b, v27.8h, #6
+        st1             {v22.16b, v23.16b, v24.16b, v25.16b}, [x0], x1   // store 64 pixels; dst += dststride
+        b.ne            1b
+        ret
+endfunc
 
 .macro EPEL_H_HEADER
         movrel          x5, epel_filters
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b8448c24eb..8544f1f17f 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -268,6 +268,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_qpel_bi_w, 0, 0, pel_bi_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_6(c->put_hevc_epel_bi_w, 0, 0, pel_bi_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 1, epel_uni_h,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v,);

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 04/05: lavc/hevc: add aarch64 NEON for epel uni horizontal filter

Reply via email to