The branch, master has been updated
       via  03c054d43c594211b89d5b2931430dccd7424d58 (commit)
       via  56a638d8365370b287c53768f2c8a34a4cf3e417 (commit)
      from  e5ac70042e91d19110a04a52b7e6fa4703f61200 (commit)

- Log ----------------------------------------------------------------- commit 03c054d43c594211b89d5b2931430dccd7424d58 Author: Krzysztof Pyrkosz <ffm...@szaka.eu> AuthorDate: Mon Sep 8 20:56:24 2025 +0200 Commit: Martin Storsjö <mar...@martin.st> CommitDate: Tue Sep 23 11:20:20 2025 +0000 avcodec/aarch64/vvc: Implement dmvr_v_8 A72 dmvr_v_8_12x20_neon: 207.0 ( 4.15x) dmvr_v_8_20x12_neon: 170.4 ( 4.37x) dmvr_v_8_20x20_neon: 273.4 ( 4.58x) A53 dmvr_v_8_12x20_neon: 450.6 ( 4.21x) dmvr_v_8_20x12_neon: 342.8 ( 3.70x) dmvr_v_8_20x20_neon: 550.9 ( 3.79x) diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index bdfa142a5a..b7dc1d89f8 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -101,6 +101,7 @@ DMVR_FUN(, 12) DMVR_FUN(h_, 8) DMVR_FUN(h_, 10) DMVR_FUN(h_, 12) +DMVR_FUN(v_, 8) DMVR_FUN(hv_, 8) DMVR_FUN(hv_, 10) DMVR_FUN(hv_, 12) @@ -195,6 +196,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.w_avg = vvc_w_avg_8; c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon; c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon; + c->inter.dmvr[1][0] = ff_vvc_dmvr_v_8_neon; c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon; c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon; diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index df6b59510d..a874edf889 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -385,6 +385,62 @@ function ff_vvc_dmvr_12_neon, export=1 ret endfunc +function ff_vvc_dmvr_v_8_neon, export=1 + movrel x7, X(ff_vvc_inter_luma_dmvr_filters) + add x7, x7, x5, lsl #1 + ld2r {v0.16b, v1.16b}, [x7] + tbz w6, #4, 12f + + ldr s16, [x1, #16] + ld1 {v2.16b}, [x1], x2 +20: + ldr s17, [x1, #16] + umull v4.8h, v0.8b, v2.8b + umull2 v5.8h, v0.16b, v2.16b + ld1 {v3.16b}, [x1], x2 + umull v16.8h, v0.8b, v16.8b + umull v6.8h, v1.8b, v3.8b + umull2 v7.8h, v1.16b, v3.16b + add v4.8h, v4.8h, v6.8h + umull v18.8h, v1.8b, v17.8b + add v5.8h, v5.8h, v7.8h 
+ urshr v4.8h, v4.8h, #2 + add v19.4h, v16.4h, v18.4h + urshr v5.8h, v5.8h, #2 + urshr v19.4h, v19.4h, #2 + st1 {v4.8h, v5.8h}, [x0], #32 + subs w3, w3, #1 + mov v2.16b, v3.16b + st1 {v19.4h}, [x0], #8 + mov v16.16b, v17.16b + add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) + b.ne 20b + ret + +12: + ldr s16, [x1, #8] + ld1 {v2.8b}, [x1], x2 +2: + ldr s17, [x1, #8] + umull v4.8h, v0.8b, v2.8b + ld1 {v3.8b}, [x1], x2 + umull v16.8h, v0.8b, v16.8b + umull v6.8h, v1.8b, v3.8b + add v4.8h, v4.8h, v6.8h + umull v18.8h, v1.8b, v17.8b + srshr v4.8h, v4.8h, #2 + add v19.4h, v16.4h, v18.4h + srshr v19.4h, v19.4h, #2 + st1 {v4.8h}, [x0], #16 + subs w3, w3, #1 + mov v2.16b, v3.16b + st1 {v19.4h}, [x0], #8 + mov v16.16b, v17.16b + add x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) + b.ne 2b + ret +endfunc + function ff_vvc_dmvr_h_8_neon, export=1 movrel x7, X(ff_vvc_inter_luma_dmvr_filters) add x7, x7, x4, lsl #1 commit 56a638d8365370b287c53768f2c8a34a4cf3e417 Author: Krzysztof Pyrkosz <ffm...@szaka.eu> AuthorDate: Sun Sep 14 19:13:24 2025 +0200 Commit: Martin Storsjö <mar...@martin.st> CommitDate: Tue Sep 23 11:20:11 2025 +0000 avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon Before and after: A53: apply_bdof_8_16x8_neon: 2733.1 ( 4.88x) apply_bdof_8_16x16_neon: 5458.6 ( 4.86x) apply_bdof_10_16x8_neon: 2789.8 ( 4.64x) apply_bdof_10_16x16_neon: 5523.8 ( 4.68x) apply_bdof_12_16x8_neon: 2792.8 ( 4.58x) apply_bdof_12_16x16_neon: 5519.5 ( 4.63x) apply_bdof_8_16x8_neon: 2571.8 ( 5.12x) apply_bdof_8_16x16_neon: 5173.3 ( 5.12x) apply_bdof_10_16x8_neon: 2635.1 ( 4.87x) apply_bdof_10_16x16_neon: 5243.0 ( 4.89x) apply_bdof_12_16x8_neon: 2613.0 ( 4.89x) apply_bdof_12_16x16_neon: 5231.7 ( 4.90x) A78: apply_bdof_8_16x8_neon: 565.3 ( 8.43x) apply_bdof_8_16x16_neon: 1109.5 ( 8.60x) apply_bdof_10_16x8_neon: 568.2 ( 7.92x) apply_bdof_10_16x16_neon: 1114.1 ( 8.08x) apply_bdof_12_16x8_neon: 570.2 ( 7.87x) apply_bdof_12_16x16_neon: 1116.3 ( 8.03x) apply_bdof_8_16x8_neon: 541.4 ( 8.81x) 
apply_bdof_8_16x16_neon: 1065.9 ( 8.97x) apply_bdof_10_16x8_neon: 543.2 ( 8.32x) apply_bdof_10_16x16_neon: 1071.5 ( 8.39x) apply_bdof_12_16x8_neon: 544.2 ( 8.25x) apply_bdof_12_16x16_neon: 1074.1 ( 8.37x) diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index 79ff720cdd..df6b59510d 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -803,28 +803,21 @@ function vvc_bdof_grad_filter_8x_neon, export=0 src1 .req x5 width .req w6 height .req w7 + tbnz w6, #4, 16f -1: - mov x10, src0 - mov w11, width - mov x12, gh0 - mov x13, gv0 - mov x14, src1 - mov x15, gh1 - mov x16, gv1 -2: - ldur q0, [x10, #2] - ldur q1, [x10, #-2] - ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)] - ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)] +8: + ldur q0, [src0, #2] + ldur q1, [src0, #-2] + ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)] + ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)] sshr v0.8h, v0.8h, #6 sshr v1.8h, v1.8h, #6 - ldur q4, [x14, #2] - ldur q5, [x14, #-2] + ldur q4, [src1, #2] + ldur q5, [src1, #-2] sshr v2.8h, v2.8h, #6 sshr v3.8h, v3.8h, #6 - ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)] - ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)] + ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)] + ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)] // results of gradient_h0 sub v0.8h, v0.8h, v1.8h // results of gradient_v0 @@ -839,26 +832,20 @@ function vvc_bdof_grad_filter_8x_neon, export=0 // results of gradient_v1 sub v6.8h, v6.8h, v7.8h - add x10, x10, #16 - add x14, x14, #16 - // (gradient_h0 + gradient_h1) >> 1 shadd v1.8h, v0.8h, v4.8h // gradient_h0 - gradient_h1 sub v5.8h, v0.8h, v4.8h - subs w11, w11, #8 - // (gradient_v0 + gradient_v1) >> 1 shadd v3.8h, v2.8h, v6.8h // gradient_v0 - gradient_v1 sub v7.8h, v2.8h, v6.8h - st1 {v1.8h}, [x12], #16 - st1 {v5.8h}, [x15], #16 - st1 {v3.8h}, [x13], #16 - st1 {v7.8h}, [x16], #16 - b.ne 2b + st1 {v1.8h}, [gh0] + st1 {v5.8h}, [gh1] + st1 {v3.8h}, [gv0] + st1 {v7.8h}, [gv1] subs height, height, #1 add gh0, gh0, #(BDOF_BLOCK_SIZE << 1) @@ 
-867,7 +854,84 @@ function vvc_bdof_grad_filter_8x_neon, export=0 add gh1, gh1, #(BDOF_BLOCK_SIZE << 1) add gv1, gv1, #(BDOF_BLOCK_SIZE << 1) add src1, src1, #(VVC_MAX_PB_SIZE << 1) - b.ne 1b + b.ne 8b + ret + +16: + ldur q0, [src0, #2] + ldur q1, [src0, #18] + ldur q16, [src0, #-2] + sshr v0.8h, v0.8h, #6 + ldur q17, [src0, #14] + sshr v1.8h, v1.8h, #6 + ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)] + sshr v16.8h, v16.8h, #6 + ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]! + ldur q20, [src1, #2] + sshr v17.8h, v17.8h, #6 + ldur q21, [src1, #18] + sshr v2.8h, v2.8h, #6 + ldur q22, [src1, #-2] + sshr v3.8h, v3.8h, #6 + ldur q23, [src1, #14] + sshr v18.8h, v18.8h, #6 + ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)] + sshr v19.8h, v19.8h, #6 + ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]! + + // results of gradient_h0 + sub v0.8h, v0.8h, v16.8h + sub v1.8h, v1.8h, v17.8h + + // results of gradient_v0 + sub v2.8h, v2.8h, v18.8h + sub v3.8h, v3.8h, v19.8h + + sshr v20.8h, v20.8h, #6 + sshr v21.8h, v21.8h, #6 + sshr v22.8h, v22.8h, #6 + sshr v23.8h, v23.8h, #6 + + // results of gradient_h1 + sub v20.8h, v20.8h, v22.8h + sub v21.8h, v21.8h, v23.8h + + sshr v24.8h, v24.8h, #6 + sshr v25.8h, v25.8h, #6 + + // gradient_h0 - gradient_h1 + sub v22.8h, v0.8h, v20.8h + sub v23.8h, v1.8h, v21.8h + + // (gradient_h0 + gradient_h1) >> 1 + shadd v16.8h, v0.8h, v20.8h + shadd v17.8h, v1.8h, v21.8h + + st1 {v22.8h, v23.8h}, [gh1], #32 + + sshr v26.8h, v26.8h, #6 + sshr v27.8h, v27.8h, #6 + + st1 {v16.8h, v17.8h}, [gh0], #32 + + // results of gradient_v1 + sub v24.8h, v24.8h, v26.8h + sub v25.8h, v25.8h, v27.8h + + // (gradient_v0 + gradient_v1) >> 1 + shadd v18.8h, v2.8h, v24.8h + shadd v19.8h, v3.8h, v25.8h + + // gradient_v0 - gradient_v1 + sub v26.8h, v2.8h, v24.8h + sub v27.8h, v3.8h, v25.8h + + st1 {v18.8h,v19.8h}, [gv0], #32 + + subs height, height, #1 + st1 {v26.8h,v27.8h}, [gv1], #32 + + b.ne 16b ret .unreq gh0 
-----------------------------------------------------------------------

Summary of changes:
 libavcodec/aarch64/vvc/dsp_init.c |   2 +
 libavcodec/aarch64/vvc/inter.S    | 176 ++++++++++++++++++++++++++++++++------
 2 files changed, 150 insertions(+), 28 deletions(-)


hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- ffmpeg-cvslog@ffmpeg.org
To unsubscribe send an email to ffmpeg-cvslog-le...@ffmpeg.org