lossless_videodsp: Don't store in eight byte chunks

Andreas Rheinhardt via ffmpeg-cvslog Fri, 19 Dec 2025 12:39:22 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 6368d2baaea8121f9fa23fb40edb5308690d699d
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Dec 18 21:59:33 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Dec 19 20:55:37 2025 +0100

    avcodec/x86/lossless_videodsp: Don't store in eight byte chunks
    
    Use movu (movdqu) instead of movq+movhps.
    
    Old benchmarks:
    add_left_pred_int16_c:                                2265.5 ( 1.00x)
    add_left_pred_int16_ssse3:                             595.4 ( 3.81x)
    add_left_pred_rnd_acc_c:                              1255.0 ( 1.00x)
    add_left_pred_rnd_acc_ssse3:                           326.2 ( 3.85x)
    add_left_pred_rnd_acc_avx2:                            279.0 ( 4.50x)
    add_left_pred_zero_c:                                 1249.5 ( 1.00x)
    add_left_pred_zero_ssse3:                              326.1 ( 3.83x)
    add_left_pred_zero_avx2:                               277.0 ( 4.51x)
    
    New benchmarks:
    add_left_pred_int16_c:                                2266.9 ( 1.00x)
    add_left_pred_int16_ssse3:                             509.9 ( 4.45x)
    add_left_pred_rnd_acc_c:                              1251.4 ( 1.00x)
    add_left_pred_rnd_acc_ssse3:                           282.6 ( 4.43x)
    add_left_pred_rnd_acc_avx2:                            208.9 ( 5.99x)
    add_left_pred_zero_c:                                 1253.7 ( 1.00x)
    add_left_pred_zero_ssse3:                              280.0 ( 4.48x)
    add_left_pred_zero_avx2:                               206.8 ( 6.06x)
    
    The checkasm test has been modified to use an unaligned destination
    for this test.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 35 ++++++++---------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 359d1ee4ca..7dd10228fc 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -101,17 +101,13 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
     RET
 
 
-%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+%macro ADD_LEFT_LOOP 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
     add     srcq, wq
     add     dstq, wq
     neg     wq
 %%.loop:
     pshufb  xm0, xm5
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
+    mov%2   m1, [srcq+wq]
     psllw   m2, m1, 8
     paddb   m1, m2
     pshufb  m2, m1, m3
@@ -121,24 +117,14 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
     pshufb  m2, m1, m6
     paddb   m1, m2
     paddb   xm0, xm1
-%if %1
-    mova    [dstq+wq], xm0
-%else
-    movq    [dstq+wq], xm0
-    movhps  [dstq+wq+8], xm0
-%endif
+    mov%1   [dstq+wq], xm0
 
 %if mmsize == 32
     vextracti128    xm2, m1, 1 ; get second lane of the ymm
     pshufb          xm0, xm5   ; set alls val to last val of the first lane
     paddb           xm0, xm2
 ;store val
-%if %1
-    mova    [dstq+wq+16], xm0
-%else;
-    movq    [dstq+wq+16], xm0
-    movhps  [dstq+wq+16+8], xm0
-%endif
+    mov%1   [dstq+wq+16], xm0
 %endif
     add     wq, mmsize
     jl %%.loop
@@ -169,11 +155,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     jnz .src_unaligned
     test    dstq, mmsize - 1
     jnz .dst_unaligned
-    ADD_LEFT_LOOP 1, 1
+    ADD_LEFT_LOOP a, a
 .dst_unaligned:
-    ADD_LEFT_LOOP 0, 1
+    ADD_LEFT_LOOP u, a
 .src_unaligned:
-    ADD_LEFT_LOOP 0, 0
+    ADD_LEFT_LOOP u, u
 %endmacro
 
 INIT_XMM ssse3
@@ -247,12 +233,7 @@ ADD_BYTES
     paddw   m1, m2
     paddw   m0, m1
     pand    m0, m7
-%ifidn %1, a
-    mova    [dstq+wq], m0
-%else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
-%endif
+    mov%1   [dstq+wq], m0
     add     wq, mmsize
     jl %%.loop
     mov     eax, mmsize-1

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 04/06: avcodec/x86/lossless_videodsp: Don't store in eight byte chunks

Reply via email to