This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit f5e6cca935e9f575659b3eb44a130dd95919ca7e Author: Jun Zhao <[email protected]> AuthorDate: Tue Feb 3 13:30:25 2026 +0800 Commit: Martin Storsjö <[email protected]> CommitDate: Fri Mar 13 21:43:37 2026 +0000 lavc/hevc: add aarch64 NEON for qpel uni-weighted HV filter Add NEON-optimized implementations for HEVC QPEL uni-directional weighted HV interpolation (put_hevc_qpel_uni_w_hv) at 8-bit depth, for block widths 6, 12, 24, and 48. These functions perform horizontal then vertical 8-tap QPEL filtering with weighting (wx, ox, denom) and output to uint8_t. Previously only widths 4, 8, 16, 32, 64 were implemented; this completes coverage for all standard HEVC block widths. Performance results on Apple M4: ./tests/checkasm/checkasm --test=hevc_pel --bench put_hevc_qpel_uni_w_hv6_8_neon: 3.11x put_hevc_qpel_uni_w_hv12_8_neon: 3.19x put_hevc_qpel_uni_w_hv24_8_neon: 2.26x put_hevc_qpel_uni_w_hv48_8_neon: 1.80x Signed-off-by: Jun Zhao <[email protected]> --- libavcodec/aarch64/h26x/dsp.h | 4 +- libavcodec/aarch64/h26x/qpel_neon.S | 383 +++++++++++++++++++++++++++++- libavcodec/aarch64/hevcdsp_init_aarch64.c | 4 +- 3 files changed, 384 insertions(+), 7 deletions(-) diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h index d4e79689fc..4b89049cde 100644 --- a/libavcodec/aarch64/h26x/dsp.h +++ b/libavcodec/aarch64/h26x/dsp.h @@ -222,12 +222,12 @@ NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width), _i8mm); -NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, +NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width),); -NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, +NEON8_FNPROTO(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, int 
height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width), _i8mm); diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S index a22a8e0d78..651b81f301 100644 --- a/libavcodec/aarch64/h26x/qpel_neon.S +++ b/libavcodec/aarch64/h26x/qpel_neon.S @@ -600,8 +600,116 @@ function ff_hevc_put_hevc_qpel_h16_8_neon, export=1 ret endfunc -.else // qpel_uni, qpel_bi +.endif // qpel-only h16 functions + +// h24: process 24 pixels per row as 16 + 8, reusing h16 subroutine. +function ff_hevc_put_hevc_\type\()_h24_8_neon, export=1 + load_filter mx + sxtw height, heightw +.ifc \type, qpel_bi + ldrh w8, [sp] // width + mov x16, #(HEVC_MAX_PB_SIZE << 2) // src2bstridel + lsl x17, height, #7 // src2b reset + add x15, x4, #(HEVC_MAX_PB_SIZE << 1) // src2b +.endif + sub src, src, #3 +.ifnc \type, qpel + mov mx, x30 +.endif +.ifc \type, qpel + mov dststride, #(HEVC_MAX_PB_SIZE << 1) + lsl x13, srcstride, #1 // srcstridel + mov x14, #((HEVC_MAX_PB_SIZE << 2) - 32) +.else + lsl x14, dststride, #1 // dststridel + lsl x13, srcstride, #1 // srcstridel + sub x14, x14, #16 +.endif + add x10, dst, dststride // dstb + add x12, src, srcstride // srcb +0: +.ifc \type, qpel + // Preserve filter index (mx) and lr so h16 gets correct mx; ret uses lr + stp x4, x30, [sp, #-16]! 
+ ldr x4, [sp] +.endif + // Load 24 bytes per row for the first 16 pixels (need 16+7=23); the last 8 pixels are loaded separately below + ld1 {v16.8b-v18.8b}, [src], x13 + ld1 {v19.8b-v21.8b}, [x12], x13 + + uxtl v16.8h, v16.8b + uxtl v19.8h, v19.8b + bl ff_hevc_put_hevc_h16_8_neon + subs heightw, heightw, #2 + +.ifc \type, qpel + st1 {v26.8h, v27.8h}, [dst], #32 + st1 {v28.8h, v29.8h}, [x10], #32 +.else +.ifc \type, qpel_bi + ld1 {v16.8h, v17.8h}, [ x4], x16 + ld1 {v18.8h, v19.8h}, [x15], x16 + sqadd v26.8h, v26.8h, v16.8h + sqadd v27.8h, v27.8h, v17.8h + sqadd v28.8h, v28.8h, v18.8h + sqadd v29.8h, v29.8h, v19.8h + sqrshrun v26.8b, v26.8h, #7 + sqrshrun v27.8b, v27.8h, #7 + sqrshrun v28.8b, v28.8h, #7 + sqrshrun v29.8b, v29.8h, #7 +.else + sqrshrun v26.8b, v26.8h, #6 + sqrshrun v27.8b, v27.8h, #6 + sqrshrun v28.8b, v28.8h, #6 + sqrshrun v29.8b, v29.8h, #6 +.endif + st1 {v26.8b, v27.8b}, [dst], #16 + st1 {v28.8b, v29.8b}, [x10], #16 +.endif + // Reload for second half (8 pixels at offset +16) + sub src, src, x13 + sub x12, x12, x13 + add src, src, #16 + add x12, x12, #16 + ld1 {v16.8b, v17.8b}, [src], x13 + ld1 {v18.8b, v19.8b}, [x12], x13 + bl ff_hevc_put_hevc_h8_8_neon +.ifc \type, qpel + st1 {v23.8h}, [dst], x14 + st1 {v24.8h}, [x10], x14 +.else +.ifc \type, qpel_bi + ld1 {v25.8h}, [ x4], x16 + ld1 {v26.8h}, [x15], x16 + sub x4, x4, #48 + sub x15, x15, #48 + sqadd v23.8h, v23.8h, v25.8h + sqadd v24.8h, v24.8h, v26.8h + sqrshrun v23.8b, v23.8h, #7 + sqrshrun v24.8b, v24.8h, #7 +.else + sqrshrun v23.8b, v23.8h, #6 + sqrshrun v24.8b, v24.8h, #6 +.endif + st1 {v23.8b}, [dst], x14 + st1 {v24.8b}, [x10], x14 +.endif + // Reset src pointers back (undo +16 offset) + sub src, src, #16 + sub x12, x12, #16 +.ifc \type, qpel + ldp x4, x30, [sp], #16 +.endif + b.gt 0b +.ifc \type, qpel + ret +.else + ret mx +.endif +endfunc + +.ifnc \type, qpel // qpel h16 already defined above; only emit for qpel_uni/qpel_bi .ifnc \type, qpel_bi function ff_vvc_put_\type\()_h16_8_neon, export=1 vvc_load_filter mx @@ -661,8 +769,7 @@ function 
ff_hevc_put_hevc_\type\()_h16_8_neon, export=1 b.gt 1b // double line ret mx endfunc - -.endif // qpel vs qpel_uni/qpel_bi +.endif // !qpel .ifc \type, qpel // VVC qpel h32: self-contained int16-domain implementation @@ -5265,7 +5372,10 @@ DISABLE_I8MM .if \width >= 32 mov w6, #\width bl X(ff_hevc_put_hevc_qpel_h32_8_neon) +.elseif \width == 24 + bl X(ff_hevc_put_hevc_qpel_h24_8_neon) .else + mov w6, #\width bl X(ff_hevc_put_hevc_qpel_h\width\()_8_\suffix) .endif .else @@ -5396,6 +5506,106 @@ function hevc_put_hevc_qpel_uni_w_hv4_8_end_neon ret endfunc +// hv6: process 6 pixels (4 + 2), use 8-element loads/stores +.macro QPEL_UNI_W_HV_6 + sshr v26.4s, v26.4s, #6 + sshr v27.4s, v27.4s, #6 + mul v24.4s, v26.4s, v28.4s + mul v25.4s, v27.4s, v28.4s + sqrshl v24.4s, v24.4s, v30.4s + sqrshl v25.4s, v25.4s, v30.4s + sqadd v24.4s, v24.4s, v29.4s + sqadd v25.4s, v25.4s, v29.4s + sqxtn v24.4h, v24.4s + sqxtn2 v24.8h, v25.4s + sqxtun v24.8b, v24.8h + add x15, x20, #4 // avoid st1 postincrement stall + st1 {v24.s}[0], [x20], x21 + st1 {v24.h}[2], [x15] +.endm + +function hevc_put_hevc_qpel_uni_w_hv6_8_end_neon + ldr q16, [sp] + ldr q17, [sp, x10] + add sp, sp, x10, lsl #1 + ldr q18, [sp] + ldr q19, [sp, x10] + add sp, sp, x10, lsl #1 + ldr q20, [sp] + ldr q21, [sp, x10] + add sp, sp, x10, lsl #1 + ldr q22, [sp] + add sp, sp, x10 +1: + ldr q23, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_H2 v27, v16, v17, v18, v19, v20, v21, v22, v23 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q16, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_H2 v27, v17, v18, v19, v20, v21, v22, v23, v16 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q17, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_H2 v27, v18, v19, v20, v21, v22, v23, v16, v17 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q18, [sp] + add sp, sp, x10 + 
QPEL_FILTER_H v26, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_H2 v27, v19, v20, v21, v22, v23, v16, v17, v18 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q19, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_H2 v27, v20, v21, v22, v23, v16, v17, v18, v19 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q20, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_H2 v27, v21, v22, v23, v16, v17, v18, v19, v20 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q21, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_H2 v27, v22, v23, v16, v17, v18, v19, v20, v21 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.eq 2f + + ldr q22, [sp] + add sp, sp, x10 + QPEL_FILTER_H v26, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_H2 v27, v23, v16, v17, v18, v19, v20, v21, v22 + subs w22, w22, #1 + QPEL_UNI_W_HV_6 + b.hi 1b + +2: + QPEL_UNI_W_HV_END + ret +endfunc + .macro QPEL_UNI_W_HV_8 sshr v26.4s, v26.4s, #6 sshr v27.4s, v27.4s, #6 @@ -5493,6 +5703,143 @@ function hevc_put_hevc_qpel_uni_w_hv8_8_end_neon ret endfunc +// hv12: process 12 pixels = 8 + 4 +// Input: v16-v23 hold first 8 elements per row (q registers) +// v1-v7,v31 hold elements 8-11 per row (d registers, only .4h used) +// Output: 12 bytes to [x20], advance by x21 +.macro QPEL_UNI_W_HV_12 + sshr v24.4s, v24.4s, #6 + sshr v25.4s, v25.4s, #6 + sshr v26.4s, v26.4s, #6 + mul v24.4s, v24.4s, v28.4s + mul v25.4s, v25.4s, v28.4s + mul v26.4s, v26.4s, v28.4s + sqrshl v24.4s, v24.4s, v30.4s + sqrshl v25.4s, v25.4s, v30.4s + sqrshl v26.4s, v26.4s, v30.4s + sqadd v24.4s, v24.4s, v29.4s + sqadd v25.4s, v25.4s, v29.4s + sqadd v26.4s, v26.4s, v29.4s + sqxtn v24.4h, v24.4s + sqxtn2 v24.8h, v25.4s + sqxtn v26.4h, v26.4s + sqxtun v24.8b, v24.8h + sqxtun v26.8b, v26.8h + add x15, x20, #8 // avoid st1 postincrement stall + st1 {v24.d}[0], [x20], x21 + st1 {v26.s}[0], 
[x15] +.endm + +function hevc_put_hevc_qpel_uni_w_hv12_8_end_neon + // Load first 7 rows of 12 elements each + // Each row: q16-q22 (first 8 elements) + d1-d7 (elements 8-11) + ldr q16, [sp] + ldr d1, [sp, #16] + add sp, sp, x10 + ldr q17, [sp] + ldr d2, [sp, #16] + add sp, sp, x10 + ldr q18, [sp] + ldr d3, [sp, #16] + add sp, sp, x10 + ldr q19, [sp] + ldr d4, [sp, #16] + add sp, sp, x10 + ldr q20, [sp] + ldr d5, [sp, #16] + add sp, sp, x10 + ldr q21, [sp] + ldr d6, [sp, #16] + add sp, sp, x10 + ldr q22, [sp] + ldr d7, [sp, #16] + add sp, sp, x10 +1: + ldr q23, [sp] + ldr d31, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23 + QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q16, [sp] + ldr d1, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16 + QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q17, [sp] + ldr d2, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17 + QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q18, [sp] + ldr d3, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18 + QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q19, [sp] + ldr d4, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19 + QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q20, [sp] + ldr d5, [sp, #16] + add sp, 
sp, x10 + QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20 + QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q21, [sp] + ldr d6, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21 + QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.eq 2f + + ldr q22, [sp] + ldr d7, [sp, #16] + add sp, sp, x10 + QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22 + QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7 + subs w22, w22, #1 + QPEL_UNI_W_HV_12 + b.hi 1b + +2: + QPEL_UNI_W_HV_END + ret +endfunc + .macro QPEL_UNI_W_HV_16 sshr v24.4s, v24.4s, #6 sshr v25.4s, v25.4s, #6 @@ -5625,11 +5972,21 @@ function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon add x11, x14, #32 add x20, x13, #16 mov w22, w12 + cmp w27, #8 + b.eq .Lqpel_uni_w_hv24_tail mov x14, x11 mov x13, x20 b.hi 3b QPEL_UNI_W_HV_END ret + +// hv24 tail: process remaining 8 columns (16-23) via tail-call to hv8 +// sp is set to the start of tail data. +// This reuses the stack frame setup by ff_hevc_put_hevc_qpel_uni_w_hv24_8_neon. +// hv8 will restore the original lr and return to the caller. 
+.Lqpel_uni_w_hv24_tail: + mov sp, x11 + b hevc_put_hevc_qpel_uni_w_hv8_8_end_neon endfunc .macro qpel_uni_w_hv suffix @@ -5638,11 +5995,26 @@ function ff_hevc_put_hevc_qpel_uni_w_hv4_8_\suffix, export=1 b hevc_put_hevc_qpel_uni_w_hv4_8_end_neon endfunc +function ff_hevc_put_hevc_qpel_uni_w_hv6_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 6, \suffix + b hevc_put_hevc_qpel_uni_w_hv6_8_end_neon +endfunc + function ff_hevc_put_hevc_qpel_uni_w_hv8_8_\suffix, export=1 QPEL_UNI_W_HV_HEADER 8, \suffix b hevc_put_hevc_qpel_uni_w_hv8_8_end_neon endfunc +function ff_hevc_put_hevc_qpel_uni_w_hv12_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 12, \suffix + b hevc_put_hevc_qpel_uni_w_hv12_8_end_neon +endfunc + +function ff_hevc_put_hevc_qpel_uni_w_hv24_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 24, \suffix + b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon +endfunc + function ff_hevc_put_hevc_qpel_uni_w_hv16_8_\suffix, export=1 QPEL_UNI_W_HV_HEADER 16, \suffix b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon @@ -5653,6 +6025,11 @@ function ff_hevc_put_hevc_qpel_uni_w_hv32_8_\suffix, export=1 b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon endfunc +function ff_hevc_put_hevc_qpel_uni_w_hv48_8_\suffix, export=1 + QPEL_UNI_W_HV_HEADER 48, \suffix + b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon +endfunc + function ff_hevc_put_hevc_qpel_uni_w_hv64_8_\suffix, export=1 QPEL_UNI_W_HV_HEADER 64, \suffix b hevc_put_hevc_qpel_uni_w_hv16_8_end_neon diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 30560bafb9..b8448c24eb 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -288,7 +288,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv,); NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv,); - NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,); + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv,); 
NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv,); if (have_i8mm(cpu_flags)) { @@ -302,7 +302,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm); - NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm); + NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm); NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv, _i8mm); } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
