The branch, master has been updated
       via  79080a547a39b0257bec081f72963b8f51e94416 (commit)
      from  3eb0cb3b0b0c4ad3ca14818adb26e0f2b6fa1c6c (commit)


- Log -----------------------------------------------------------------
commit 79080a547a39b0257bec081f72963b8f51e94416
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 12:46:50 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Nov 6 02:16:28 2025 +0100

    avcodec/x86/h264_chromamc: Use xmm regs in chroma_mc4 SSSE3 functions
    
    Doubling the register size makes it possible to avoid two pmaddubsw.
    It is also ABI compliant (the old version lacked an emms)
    and the average versions no longer rely on padding (the old versions
    used pavgb with a memory operand reading eight bytes,
    although only four are needed).
    
    Old benchmarks (the latter four refer to RV40):
    avg_h264_chroma_mc4_8_c:                               145.7 ( 1.00x)
    avg_h264_chroma_mc4_8_ssse3:                            32.3 ( 4.51x)
    put_h264_chroma_mc4_8_c:                               136.1 ( 1.00x)
    put_h264_chroma_mc4_8_ssse3:                            29.0 ( 4.70x)
    avg_chroma_mc4_c:                                      162.1 ( 1.00x)
    avg_chroma_mc4_ssse3:                                   31.1 ( 5.22x)
    put_chroma_mc4_c:                                      137.5 ( 1.00x)
    put_chroma_mc4_ssse3:                                   28.6 ( 4.81x)
    
    New benchmarks:
    avg_h264_chroma_mc4_8_c:                               146.7 ( 1.00x)
    avg_h264_chroma_mc4_8_ssse3:                            26.5 ( 5.53x)
    put_h264_chroma_mc4_8_c:                               136.8 ( 1.00x)
    put_h264_chroma_mc4_8_ssse3:                            22.5 ( 6.09x)
    avg_chroma_mc4_c:                                      165.5 ( 1.00x)
    avg_chroma_mc4_ssse3:                                   27.2 ( 6.08x)
    put_chroma_mc4_c:                                      138.1 ( 1.00x)
    put_chroma_mc4_ssse3:                                   23.2 ( 5.96x)
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index 6a65d5cabd..7c896db179 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -276,51 +276,57 @@ cglobal %1_%2_chroma_mc8%3, 6, 7+UNIX64, 8
 %endmacro
 
 %macro chroma_mc4_ssse3_func 2
-cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
-    movq          m5, [pw_32]
+cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 8
+    mova          m5, [pw_32]
 ..@%1_%2_chroma_mc4_after_init_ %+ cpuname:
-    mov           r6, r4
+    mov          r6d, r4d
     shl          r4d, 8
-    sub          r4d, r6d
-    mov           r6, 8
-    add          r4d, 8           ; x*288+8
-    sub          r6d, r5d
-    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
-    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
+    movd          m0, [r1]
+    sub          r6d, 8
+    sub          r4d, r6d         ; x << 8 | (8-x)
+    mov          r6d, r5d
+    shl          r5d, 16
+    movd          m1, [r1+1]
+    sub          r6d, 8
+    sub          r5d, r6d         ; y << 16 | (8-y)
+    imul         r4d, r5d         ; xy << 24 | (8-x)y << 16 | x(8-y) << 8 | (8-x)(8-y)
+    add           r1, r2
 
-    movd          m7, r6d
-    movd          m6, r4d
-    movd          m0, [r1  ]
-    pshufw        m7, m7, 0
-    punpcklbw     m0, [r1+1]
-    pshufw        m6, m6, 0
+    movd          m6, r4d         ; ABCD
+    punpcklwd     m6, m6          ; ABABCDCD
+    pshufd        m7, m6, 0x55    ; CDCDCDCDCDCDCDCD
+    punpcklbw     m0, m1
+    pshufd        m6, m6, 0x0     ; ABABABABABABABAB
 
 .next2rows:
-    movd          m1, [r1+r2*1  ]
-    movd          m3, [r1+r2*2  ]
-    punpcklbw     m1, [r1+r2*1+1]
-    punpcklbw     m3, [r1+r2*2+1]
-    lea           r1, [r1+r2*2]
-    movq          m2, m1
-    movq          m4, m3
-    pmaddubsw     m0, m7
-    pmaddubsw     m1, m6
-    pmaddubsw     m2, m7
-    pmaddubsw     m3, m6
+    movd          m1, [r1]
+    movd          m2, [r1+1]
+    movd          m3, [r1+r2]
+    movd          m4, [r1+r2+1]
+    punpcklbw     m1, m2
+    punpcklqdq    m0, m1
+    pmaddubsw     m0, m6
+    punpcklbw     m3, m4
+    punpcklqdq    m1, m3
+    pmaddubsw     m1, m7
+%ifidn %1, avg
+    movd          m2, [r0]
+    movd          m4, [r0+r2]
+%endif
     paddw         m0, m5
-    paddw         m2, m5
-    paddw         m1, m0
-    paddw         m3, m2
-    psrlw         m1, 6
-    movq          m0, m4
-    psrlw         m3, 6
-    packuswb      m1, m1
-    packuswb      m3, m3
-    CHROMAMC_AVG  m1, [r0  ]
-    CHROMAMC_AVG  m3, [r0+r2]
-    movd     [r0   ], m1
-    movd     [r0+r2], m3
+    lea           r1, [r1+r2*2]
+    paddw         m0, m1
+    psrlw         m0, 6
+    packuswb      m0, m0
+    pshufd        m1, m0, 0x1
+%ifidn %1, avg
+    pavgb         m0, m2
+    pavgb         m1, m4
+%endif
     sub          r3d, 2
+    movd        [r0], m0
+    movd     [r0+r2], m1
+    mova          m0, m3
     lea           r0, [r0+r2*2]
     jg .next2rows
     RET
@@ -379,26 +385,23 @@ cglobal %1_%2_chroma_mc4, 6, 7+UNIX64, 0
 
 %macro rv40_chroma_mc4_func 1 ; put vs avg
 %if CONFIG_RV40_DECODER
-    cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 0
+    cglobal rv40_%1_chroma_mc4, 6, 7+UNIX64, 8
     rv40_get_bias m5
     jmp           ..@%1_h264_chroma_mc4_after_init_ %+ cpuname
 %endif
 %endmacro
 
-%define CHROMAMC_AVG NOTHING
 INIT_XMM ssse3
+%define CHROMAMC_AVG NOTHING
 chroma_mc8_ssse3_func put, h264, _rnd
 chroma_mc8_ssse3_func put, vc1,  _nornd
 rv40_chroma_mc8_func put
-INIT_MMX ssse3
 chroma_mc4_ssse3_func put, h264
 rv40_chroma_mc4_func put
 
 %define CHROMAMC_AVG DIRECT_AVG
-INIT_XMM ssse3
 chroma_mc8_ssse3_func avg, h264, _rnd
 chroma_mc8_ssse3_func avg, vc1,  _nornd
 rv40_chroma_mc8_func avg
-INIT_MMX ssse3
 chroma_mc4_ssse3_func avg, h264
 rv40_chroma_mc4_func avg

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/x86/h264_chromamc.asm | 89 +++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 43 deletions(-)


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to