alf: Use xmm registers where sufficient

Andreas Rheinhardt via ffmpeg-cvslog Fri, 06 Mar 2026 11:38:18 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 01a897020e4d5d794cf16984796d4c8a2518bfaf
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Feb 28 21:44:38 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Mar 6 20:02:41 2026 +0100

    avcodec/x86/vvc/alf: Use xmm registers where sufficient
    
    One always has eight samples when processing the luma remainder,
    so xmm registers are sufficient for everything. In fact, this
    actually simplifies loading the luma parameters.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 8798d7b3c9..9563ae74d5 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -59,15 +59,15 @@ SECTION .text
 
 ;%1-%3 out
 ;%4 clip or filter
-%macro LOAD_LUMA_PARAMS_W16 4
+%macro LOAD_LUMA_PARAMS 4
     lea                 offsetq, [3 * xq]                       ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE
-    movu                    m%1, [%4q + 2 * offsetq + 0 * 32]   ; 2 * for sizeof(int16_t)
-    movu                    m%2, [%4q + 2 * offsetq + 1 * 32]
-    movu                    m%3, [%4q + 2 * offsetq + 2 * 32]
+    movu                    m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for sizeof(int16_t)
+    movu                    m%2, [%4q + 2 * offsetq + 1 * mmsize]
+    movu                    m%3, [%4q + 2 * offsetq + 2 * mmsize]
 %endmacro
 
 %macro LOAD_LUMA_PARAMS_W16 6
-    LOAD_LUMA_PARAMS_W16    %1, %2, %3, %4
+    LOAD_LUMA_PARAMS        %1, %2, %3, %4
     ;m%1 = 03 02 01 00
     ;m%2 = 07 06 05 04
     ;m%3 = 11 10 09 08
@@ -84,11 +84,26 @@ SECTION .text
     vpermpd                 m%3, m%3, 10000111b         ;11 08 05 02
 %endmacro
 
+%macro LOAD_LUMA_PARAMS_W8 5
+    LOAD_LUMA_PARAMS       %2, %3, %5, %4
+    ;m%2 = 01 00
+    ;m%3 = 03 02
+    ;m%5 = 05 04
+
+    shufpd                  m%1, m%2, m%3, 10b          ;03 00
+    shufpd                  m%2, m%2, m%5, 01b          ;04 01
+    shufpd                  m%3, m%3, m%5, 10b          ;05 02
+%endmacro
+
 ; %1-%3 out
 ; %4    clip or filter
 ; %5-%6 tmp
 %macro LOAD_LUMA_PARAMS 6
+%if mmsize == 32
     LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4, %5, %6
+%else
+    LOAD_LUMA_PARAMS_W8  %1, %2, %3, %4, %5
+%endif
 %endmacro
 
 %macro LOAD_CHROMA_PARAMS 4
@@ -483,8 +498,14 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 0-0x30, dst, dst_stride,
     cmp           widthq, 0
     je            .w_end
 
+%if LUMA
+INIT_XMM cpuname
+%endif
     LOAD_PARAMS
     FILTER_16x4  widthq
+%if LUMA
+INIT_YMM cpuname
+%endif
 
 .w_end:
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 15/28: avcodec/x86/vvc/alf: Use xmm registers where sufficient

Reply via email to