The branch, master has been updated
       via  f1a155d9754f2f38da121b8935ea1a5483021a5a (commit)
      from  0bd5a7d3719456f049f4d29abb313968ccacb28c (commit)


- Log -----------------------------------------------------------------
commit f1a155d9754f2f38da121b8935ea1a5483021a5a
Author:     Krzysztof Pyrkosz <ffm...@szaka.eu>
AuthorDate: Fri Sep 5 19:52:11 2025 +0200
Commit:     Martin Storsjö <mar...@martin.st>
CommitDate: Sun Sep 21 19:39:27 2025 +0000

    avcodec/aarch64/vvc: Optimize dmvr_hv_10
    
    Benchmarks on A53 (first block: before, second block: after):
    dmvr_hv_10_12x20_neon:                                1838.2 ( 3.02x)
    dmvr_hv_10_20x12_neon:                                1330.2 ( 1.83x)
    dmvr_hv_10_20x20_neon:                                2148.2 ( 1.85x)
    dmvr_hv_12_12x20_neon:                                1839.2 ( 3.02x)
    dmvr_hv_12_20x12_neon:                                1330.6 ( 1.83x)
    dmvr_hv_12_20x20_neon:                                2147.2 ( 1.85x)
    
    dmvr_hv_10_12x20_neon:                                1755.0 ( 3.17x)
    dmvr_hv_10_20x12_neon:                                1165.8 ( 2.09x)
    dmvr_hv_10_20x20_neon:                                1876.1 ( 2.12x)
    dmvr_hv_12_12x20_neon:                                1754.4 ( 3.17x)
    dmvr_hv_12_20x12_neon:                                1167.8 ( 2.09x)
    dmvr_hv_12_20x20_neon:                                1878.8 ( 2.12x)

diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 01d2ff155c..79ff720cdd 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -599,18 +599,13 @@ function ff_vvc_dmvr_hv_8_neon, export=1
 endfunc
 
 function ff_vvc_dmvr_hv_12_neon, export=1
-        movi            v29.4s, #(12 - 6)
-        movi            v30.4s, #(1 << (12 - 7))    // offset1
+        mvni            v29.4s, #(12 - 6 - 1)
         b               0f
 endfunc
 
 function ff_vvc_dmvr_hv_10_neon, export=1
-        movi            v29.4s, #(10 - 6)
-        movi            v30.4s, #(1 << (10 - 7))    // offset1
+        mvni            v29.4s, #(10 - 6 - 1)
 0:
-        movi            v31.4s, #8                  // offset2
-        neg             v29.4s, v29.4s
-
         sub             sp, sp, #(VVC_MAX_PB_SIZE * 4)
 
         movrel          x9, X(ff_vvc_inter_luma_dmvr_filters)
@@ -626,7 +621,6 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         add             x12, x9, my, lsl #1
         ldrb            w10, [x12]
         ldrb            w11, [x12, #1]
-        sxtw            x6, w6
         dup             v2.8h, w10                  // filter_y[0]
         dup             v3.8h, w11                  // filter_y[1]
 
@@ -635,7 +629,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         mov             w10, #0                     // start filter_y or not
         add             height, height, #1
         sub             dst, dst, #(VVC_MAX_PB_SIZE * 2)
-        sub             src_stride, src_stride, x6, lsl #1
+        sub             src_stride, src_stride, w6, sxtw #1
         cset            w15, gt                     // width > 16
 1:
         mov             x12, tmp0
@@ -656,14 +650,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         umlal           v18.4s, v17.4h, v1.4h
         umlal2          v19.4s, v17.8h, v1.8h
 
-        add             v4.4s, v4.4s, v30.4s
-        add             v5.4s, v5.4s, v30.4s
-        add             v18.4s, v18.4s, v30.4s
-        add             v19.4s, v19.4s, v30.4s
-        ushl            v4.4s, v4.4s, v29.4s
-        ushl            v5.4s, v5.4s, v29.4s
-        ushl            v18.4s, v18.4s, v29.4s
-        ushl            v19.4s, v19.4s, v29.4s
+        urshl           v4.4s, v4.4s, v29.4s
+        urshl           v5.4s, v5.4s, v29.4s
+        urshl           v18.4s, v18.4s, v29.4s
+        urshl           v19.4s, v19.4s, v29.4s
         uqxtn           v6.4h, v4.4s
         uqxtn2          v6.8h, v5.4s
         uqxtn           v7.4h, v18.4s
@@ -681,18 +671,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         umlal2          v18.4s, v6.8h, v3.8h
         umlal           v19.4s, v7.4h, v3.4h
         umlal2          v20.4s, v7.8h, v3.8h
-        add             v17.4s, v17.4s, v31.4s
-        add             v18.4s, v18.4s, v31.4s
-        add             v19.4s, v19.4s, v31.4s
-        add             v20.4s, v20.4s, v31.4s
-        ushr            v17.4s, v17.4s, #4
-        ushr            v18.4s, v18.4s, #4
-        ushr            v19.4s, v19.4s, #4
-        ushr            v20.4s, v20.4s, #4
-        uqxtn           v6.4h, v17.4s
-        uqxtn2          v6.8h, v18.4s
-        uqxtn           v7.4h, v19.4s
-        uqxtn2          v7.8h, v20.4s
+        uqrshrn         v6.4h, v17.4s, #4
+        uqrshrn2        v6.8h, v18.4s, #4
+        uqrshrn         v7.4h, v19.4s, #4
+        uqrshrn2        v7.8h, v20.4s, #4
         stp             q6, q7, [x14], #32
         b               3f
 2:
@@ -704,10 +686,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         umlal           v4.4s, v7.4h, v1.4h
         umlal2          v5.4s, v7.8h, v1.8h
 
-        add             v4.4s, v4.4s, v30.4s
-        add             v5.4s, v5.4s, v30.4s
-        ushl            v4.4s, v4.4s, v29.4s
-        ushl            v5.4s, v5.4s, v29.4s
+        urshl           v4.4s, v4.4s, v29.4s
+        urshl           v5.4s, v5.4s, v29.4s
         uqxtn           v6.4h, v4.4s
         uqxtn2          v6.8h, v5.4s
         str             q6, [x13], #16
@@ -719,10 +699,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         umull2          v18.4s, v16.8h, v2.8h
         umlal           v17.4s, v6.4h, v3.4h
         umlal2          v18.4s, v6.8h, v3.8h
-        add             v17.4s, v17.4s, v31.4s
-        add             v18.4s, v18.4s, v31.4s
-        ushr            v17.4s, v17.4s, #4
-        ushr            v18.4s, v18.4s, #4
+        urshr           v17.4s, v17.4s, #4
+        urshr           v18.4s, v18.4s, #4
         uqxtn           v16.4h, v17.4s
         uqxtn2          v16.8h, v18.4s
         str             q16, [x14], #16
@@ -731,8 +709,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         ldr             d6, [src], #8
         umull           v4.4s, v7.4h, v1.4h
         umlal           v4.4s, v6.4h, v0.4h
-        add             v4.4s, v4.4s, v30.4s
-        ushl            v4.4s, v4.4s, v29.4s
+        urshl           v4.4s, v4.4s, v29.4s
         uqxtn           v6.4h, v4.4s
         str             d6, [x13], #8
 
@@ -741,8 +718,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
         ldr             d16, [x12], #8
         umull           v17.4s, v16.4h, v2.4h
         umlal           v17.4s, v6.4h, v3.4h
-        add             v17.4s, v17.4s, v31.4s
-        ushr            v17.4s, v17.4s, #4
+        urshr           v17.4s, v17.4s, #4
         uqxtn           v16.4h, v17.4s
         str             d16, [x14], #8
 4:

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/aarch64/vvc/inter.S | 58 +++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 41 deletions(-)


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- ffmpeg-cvslog@ffmpeg.org
To unsubscribe send an email to ffmpeg-cvslog-le...@ffmpeg.org

Reply via email to