x86: move entry points to ops_common.asm

Niklas Haas via ffmpeg-cvslog Tue, 09 Jun 2026 10:05:51 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit b328e152a49ae6c96ae978c28a8205feb7777570
Author:     Niklas Haas <[email protected]>
AuthorDate: Sun Apr 19 13:42:00 2026 +0200
Commit:     Niklas Haas <[email protected]>
CommitDate: Tue Jun 9 18:27:20 2026 +0200

    swscale/x86: move entry points to ops_common.asm
    
    As well as the packed shuffle solver. These don't really interact with
    the rest of the code in ops_int.asm, which is, by name at least, intended for
    integer op kernels.
    
    More importantly, these functions will be shared with the uops rewrite.
    
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/Makefile       |   3 +-
 libswscale/x86/ops_common.asm | 200 ++++++++++++++++++++++++++++++++++++++++++
 libswscale/x86/ops_int.asm    | 177 -------------------------------------
 3 files changed, 202 insertions(+), 178 deletions(-)

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index a6aa89e21a..906d1a43f6 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -16,7 +16,8 @@ X86ASM-OBJS                     += x86/input.o                          \
                                    x86/yuv2yuvX.o                       \
 
 ifdef ARCH_X86_64
-X86ASM-OBJS-$(CONFIG_UNSTABLE)  += x86/ops_int.o                        \
+X86ASM-OBJS-$(CONFIG_UNSTABLE)  += x86/ops_common.o                     \
+                                   x86/ops_int.o                        \
                                    x86/ops_float.o                      \
                                    x86/ops.o
 endif
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
new file mode 100644
index 0000000000..a5dd105da5
--- /dev/null
+++ b/libswscale/x86/ops_common.asm
@@ -0,0 +1,200 @@
+;******************************************************************************
+;* Copyright (c) 2025 Niklas Haas
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "ops_include.asm"
+
+SECTION .text
+
+;---------------------------------------------------------
+; Global entry point. See `ops_include.asm` for info.
+
+%macro process_fn 1 ; number of planes
+cglobal sws_process%1_x86, 6, 7 + 2 * %1, 16
+            ; Args:
+            ;   execq, implq, bxd, yd as defined in ops_include.asm
+            ;   bx_end and y_end are initially in tmp0d / tmp1d
+            ;   (see SwsOpFunc signature)
+            ;
+            ; Stack layout:
+            ;   [rsp +  0] = [qword] impl->cont (address of first kernel)
+            ;   [rsp +  8] = [qword] &impl[1]   (restore implq after chain)
+            ;   [rsp + 16] = [dword] bx start   (restore after line finish)
+            ;   [rsp + 20] = [dword] bx end     (loop counter limit)
+            ;   [rsp + 24] = [dword] y end      (loop counter limit)
+            sub rsp, 32 ; reserve the scratch area described above
+            mov [rsp + 16], bxd ; bx start
+            mov [rsp + 20], tmp0d ; bx_end
+            mov [rsp + 24], tmp1d ; y_end
+            mov tmp0q, [implq + SwsOpImpl.cont] ; tmp0q is free again: bx_end was just spilled
+            add implq, SwsOpImpl.next ; implq = &impl[1]
+            mov [rsp +  0], tmp0q
+            mov [rsp +  8], implq
+            movsxdifnidn bxq, bxd
+            movsxdifnidn yq, yd ; yq is used below as a 64-bit index into in_bump_y
+
+            ; load plane pointers
+            mov in0q,  [execq + SwsOpExec.in0]
+IF %1 > 1,  mov in1q,  [execq + SwsOpExec.in1]
+IF %1 > 2,  mov in2q,  [execq + SwsOpExec.in2]
+IF %1 > 3,  mov in3q,  [execq + SwsOpExec.in3]
+            mov out0q, [execq + SwsOpExec.out0]
+IF %1 > 1,  mov out1q, [execq + SwsOpExec.out1]
+IF %1 > 2,  mov out2q, [execq + SwsOpExec.out2]
+IF %1 > 3,  mov out3q, [execq + SwsOpExec.out3]
+.loop:
+            call [rsp] ; call into op chain
+            mov implq, [rsp + 8] ; rewind implq to &impl[1] for the next call
+            inc bxd
+            cmp bxd, [rsp + 20]
+            jne .loop ; equality test: only terminates if bx start is below bx end
+            ; end of line
+            inc yd
+            cmp yd, [rsp + 24]
+            je .end ; equality test: only terminates if y start is below y end
+            ; bump addresses to point to start of next line
+            add in0q,  [execq + SwsOpExec.in_bump0]
+IF %1 > 1,  add in1q,  [execq + SwsOpExec.in_bump1]
+IF %1 > 2,  add in2q,  [execq + SwsOpExec.in_bump2]
+IF %1 > 3,  add in3q,  [execq + SwsOpExec.in_bump3]
+            add out0q, [execq + SwsOpExec.out_bump0]
+IF %1 > 1,  add out1q, [execq + SwsOpExec.out_bump1]
+IF %1 > 2,  add out2q, [execq + SwsOpExec.out_bump2]
+IF %1 > 3,  add out3q, [execq + SwsOpExec.out_bump3]
+            mov bxd, [rsp + 16] ; restart bx from its saved start value
+            ; conditionally apply y bump (if non-NULL)
+            mov tmp0q, [execq + SwsOpExec.in_bump_y]
+            test tmp0q, tmp0q
+            jz .loop ; NULL table: nothing more to add, start the next line
+            movsxd tmp0q, [tmp0q + yq * 4 - 4] ; load (signed) y bump: dword entry [y - 1], y was already incremented
+%if %1 > 3
+            mov tmp1q, tmp0q ; work on a copy, tmp0q is still needed for the other planes
+            imul tmp1q, [execq + SwsOpExec.in_stride3]
+            add in3q, tmp1q
+%endif
+%if %1 > 2
+            mov tmp1q, tmp0q
+            imul tmp1q, [execq + SwsOpExec.in_stride2]
+            add in2q, tmp1q
+%endif
+%if %1 > 1
+            mov tmp1q, tmp0q
+            imul tmp1q, [execq + SwsOpExec.in_stride1]
+            add in1q, tmp1q
+%endif
+            imul tmp0q, [execq + SwsOpExec.in_stride0] ; plane 0 is handled last, so tmp0q can be consumed in place
+            add in0q, tmp0q
+            jmp .loop
+.end:
+            add rsp, 32 ; release the scratch area
+            RET
+%endmacro
+
+process_fn 1 ; plane 0 only
+process_fn 2 ; planes 0-1
+process_fn 3 ; planes 0-2
+process_fn 4 ; planes 0-3
+
+;---------------------------------------------------------
+; Packed shuffle fast-path
+
+; This is a special entry point for handling a subset of operation chains
+; that can be reduced down to a single `pshufb` shuffle mask. For more details
+; about when this works, refer to the documentation of `ff_sws_solve_shuffle`.
+;
+; We specialize this function for every possible combination of pixel strides.
+; For example, gray -> gray16 is classified as an "8, 16" operation because it
+; takes 8 bytes and expands them out to 16 bytes in each application of the
+; 128-bit shuffle mask.
+;
+; Since pshufb can't shuffle across lanes, we only instantiate SSE4 versions for
+; all shuffles that are not a clean multiple of 128 bits (e.g. rgb24 -> rgb0).
+; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
+; versions that can handle a larger number of bytes at once.
+
+%macro MOVSIZE 3 ; size, dst, src (the move width is rounded up from size, see below)
+    %if %1 <= 4
+        movd %2, %3 ; size 1..4: 32-bit move
+    %elif %1 <= 8
+        movq %2, %3 ; size 5..8: 64-bit move
+    %else
+        movu %2, %3 ; size > 8: unaligned vector move
+    %endif
+%endmacro
+
+%macro packed_shuffle 2 ; size_in, size_out
+cglobal packed_shuffle%1_%2, 6, 10, 2, \
+    exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
+            mov srcq, [execq + SwsOpExec.in0]
+            mov dstq, [execq + SwsOpExec.out0]
+            mov src_strideq, [execq + SwsOpExec.in_stride0]
+            mov dst_strideq, [execq + SwsOpExec.out_stride0]
+            VBROADCASTI128 m1, [shuffleq] ; m1 = shuffle mask, loaded once for all iterations
+            sub bxendd, bxd ; bxend = bx end - bx start (iterations per line)
+            sub yendd, yd ; yend = y end - y start (number of lines)
+            ; reuse now-unneeded regs
+    %define srcidxq execq
+            imul srcidxq, bxendq, -%1 ; srcidx = -size_in * bxend
+%if %1 = %2
+    %define dstidxq srcidxq ; equal sizes: a single index serves both pointers
+%else
+    %define dstidxq shuffleq ; no longer needed reg
+            imul dstidxq, bxendq, -%2 ; dstidx = -size_out * bxend
+%endif
+            sub srcq, srcidxq ; move the base forward so [srcq + srcidxq] is the line start
+            sub dstq, dstidxq ; same for the destination
+.loop:
+            MOVSIZE %1, m0, [srcq + srcidxq]
+            pshufb m0, m1
+            MOVSIZE %2, [dstq + dstidxq], m0
+            add srcidxq, %1
+IF %1 != %2,add dstidxq, %2
+            jnz .loop ; the negative index counts up; zero means the line is done
+            add srcq, src_strideq ; advance both bases by one line
+            add dstq, dst_strideq
+            imul srcidxq, bxendq, -%1 ; rewind the index for the new line
+IF %1 != %2,imul dstidxq, bxendq, -%2
+            dec yendd
+            jnz .loop ; more lines left
+            RET
+%endmacro
+
+INIT_XMM sse4 ; every size combination gets an SSE4 version
+packed_shuffle  5, 15 ;  8 -> 24
+packed_shuffle  4, 16 ;  8 -> 32, 16 -> 64
+packed_shuffle  2, 12 ;  8 -> 48
+packed_shuffle 16,  8 ; 16 -> 8
+packed_shuffle 10, 15 ; 16 -> 24
+packed_shuffle  8, 16 ; 16 -> 32, 32 -> 64
+packed_shuffle  4, 12 ; 16 -> 48
+packed_shuffle 15,  5 ; 24 -> 8
+packed_shuffle 15, 15 ; 24 -> 24
+packed_shuffle 12, 16 ; 24 -> 32
+packed_shuffle  6, 12 ; 24 -> 48
+packed_shuffle 16,  4 ; 32 -> 8,  64 -> 16
+packed_shuffle 16, 12 ; 32 -> 24, 64 -> 48
+packed_shuffle 16, 16 ; 32 -> 32, 64 -> 64
+packed_shuffle  8, 12 ; 32 -> 48
+packed_shuffle 12, 12 ; 48 -> 48
+
+INIT_YMM avx2 ; wider version, clean multiples of 128 bits only
+packed_shuffle 32, 32 ; index advances 32 bytes per iteration
+
+INIT_ZMM avx512 ; wider version, clean multiples of 128 bits only
+packed_shuffle 64, 64 ; index advances 64 bytes per iteration
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index e261b3eccd..7c366a8f8d 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -57,183 +57,6 @@ const1w: times 16 dw 0x01
 
 SECTION .text
 
-;---------------------------------------------------------
-; Global entry point. See `ops_include.asm` for info.
-
-%macro process_fn 1 ; number of planes
-cglobal sws_process%1_x86, 6, 7 + 2 * %1, 16
-            ; Args:
-            ;   execq, implq, bxd, yd as defined in ops_include.asm
-            ;   bx_end and y_end are initially in tmp0d / tmp1d
-            ;   (see SwsOpFunc signature)
-            ;
-            ; Stack layout:
-            ;   [rsp +  0] = [qword] impl->cont (address of first kernel)
-            ;   [rsp +  8] = [qword] &impl[1]   (restore implq after chain)
-            ;   [rsp + 16] = [dword] bx start   (restore after line finish)
-            ;   [rsp + 20] = [dword] bx end     (loop counter limit)
-            ;   [rsp + 24] = [dword] y end      (loop counter limit)
-            sub rsp, 32
-            mov [rsp + 16], bxd
-            mov [rsp + 20], tmp0d ; bx_end
-            mov [rsp + 24], tmp1d ; y_end
-            mov tmp0q, [implq + SwsOpImpl.cont]
-            add implq, SwsOpImpl.next
-            mov [rsp +  0], tmp0q
-            mov [rsp +  8], implq
-            movsxdifnidn bxq, bxd
-            movsxdifnidn yq, yd
-
-            ; load plane pointers
-            mov in0q,  [execq + SwsOpExec.in0]
-IF %1 > 1,  mov in1q,  [execq + SwsOpExec.in1]
-IF %1 > 2,  mov in2q,  [execq + SwsOpExec.in2]
-IF %1 > 3,  mov in3q,  [execq + SwsOpExec.in3]
-            mov out0q, [execq + SwsOpExec.out0]
-IF %1 > 1,  mov out1q, [execq + SwsOpExec.out1]
-IF %1 > 2,  mov out2q, [execq + SwsOpExec.out2]
-IF %1 > 3,  mov out3q, [execq + SwsOpExec.out3]
-.loop:
-            call [rsp] ; call into op chain
-            mov implq, [rsp + 8]
-            inc bxd
-            cmp bxd, [rsp + 20]
-            jne .loop
-            ; end of line
-            inc yd
-            cmp yd, [rsp + 24]
-            je .end
-            ; bump addresses to point to start of next line
-            add in0q,  [execq + SwsOpExec.in_bump0]
-IF %1 > 1,  add in1q,  [execq + SwsOpExec.in_bump1]
-IF %1 > 2,  add in2q,  [execq + SwsOpExec.in_bump2]
-IF %1 > 3,  add in3q,  [execq + SwsOpExec.in_bump3]
-            add out0q, [execq + SwsOpExec.out_bump0]
-IF %1 > 1,  add out1q, [execq + SwsOpExec.out_bump1]
-IF %1 > 2,  add out2q, [execq + SwsOpExec.out_bump2]
-IF %1 > 3,  add out3q, [execq + SwsOpExec.out_bump3]
-            mov bxd, [rsp + 16]
-            ; conditionally apply y bump (if non-NULL)
-            mov tmp0q, [execq + SwsOpExec.in_bump_y]
-            test tmp0q, tmp0q
-            jz .loop
-            movsxd tmp0q, [tmp0q + yq * 4 - 4] ; load (signed) y bump
-%if %1 > 3
-            mov tmp1q, tmp0q
-            imul tmp1q, [execq + SwsOpExec.in_stride3]
-            add in3q, tmp1q
-%endif
-%if %1 > 2
-            mov tmp1q, tmp0q
-            imul tmp1q, [execq + SwsOpExec.in_stride2]
-            add in2q, tmp1q
-%endif
-%if %1 > 1
-            mov tmp1q, tmp0q
-            imul tmp1q, [execq + SwsOpExec.in_stride1]
-            add in1q, tmp1q
-%endif
-            imul tmp0q, [execq + SwsOpExec.in_stride0]
-            add in0q, tmp0q
-            jmp .loop
-.end:
-            add rsp, 32
-            RET
-%endmacro
-
-process_fn 1
-process_fn 2
-process_fn 3
-process_fn 4
-
-;---------------------------------------------------------
-; Packed shuffle fast-path
-
-; This is a special entry point for handling a subset of operation chains
-; that can be reduced down to a single `pshufb` shuffle mask. For more details
-; about when this works, refer to the documentation of `ff_sws_solve_shuffle`.
-;
-; We specialize this function for every possible combination of pixel strides.
-; For example, gray -> gray16 is classified as an "8, 16" operation because it
-; takes 8 bytes and expands them out to 16 bytes in each application of the
-; 128-bit shuffle mask.
-;
-; Since pshufb can't shuffle across lanes, we only instantiate SSE4 versions for
-; all shuffles that are not a clean multiple of 128 bits (e.g. rgb24 -> rgb0).
-; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
-; versions that can handle a larger number of bytes at once.
-
-%macro MOVSIZE 3 ; size, dst, src
-    %if %1 <= 4
-        movd %2, %3
-    %elif %1 <= 8
-        movq %2, %3
-    %else
-        movu %2, %3
-    %endif
-%endmacro
-
-%macro packed_shuffle 2 ; size_in, size_out
-cglobal packed_shuffle%1_%2, 6, 10, 2, \
-    exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
-            mov srcq, [execq + SwsOpExec.in0]
-            mov dstq, [execq + SwsOpExec.out0]
-            mov src_strideq, [execq + SwsOpExec.in_stride0]
-            mov dst_strideq, [execq + SwsOpExec.out_stride0]
-            VBROADCASTI128 m1, [shuffleq]
-            sub bxendd, bxd
-            sub yendd, yd
-            ; reuse now-unneeded regs
-    %define srcidxq execq
-            imul srcidxq, bxendq, -%1
-%if %1 = %2
-    %define dstidxq srcidxq
-%else
-    %define dstidxq shuffleq ; no longer needed reg
-            imul dstidxq, bxendq, -%2
-%endif
-            sub srcq, srcidxq
-            sub dstq, dstidxq
-.loop:
-            MOVSIZE %1, m0, [srcq + srcidxq]
-            pshufb m0, m1
-            MOVSIZE %2, [dstq + dstidxq], m0
-            add srcidxq, %1
-IF %1 != %2,add dstidxq, %2
-            jnz .loop
-            add srcq, src_strideq
-            add dstq, dst_strideq
-            imul srcidxq, bxendq, -%1
-IF %1 != %2,imul dstidxq, bxendq, -%2
-            dec yendd
-            jnz .loop
-            RET
-%endmacro
-
-INIT_XMM sse4
-packed_shuffle  5, 15 ;  8 -> 24
-packed_shuffle  4, 16 ;  8 -> 32, 16 -> 64
-packed_shuffle  2, 12 ;  8 -> 48
-packed_shuffle 16,  8 ; 16 -> 8
-packed_shuffle 10, 15 ; 16 -> 24
-packed_shuffle  8, 16 ; 16 -> 32, 32 -> 64
-packed_shuffle  4, 12 ; 16 -> 48
-packed_shuffle 15,  5 ; 24 -> 8
-packed_shuffle 15, 15 ; 24 -> 24
-packed_shuffle 12, 16 ; 24 -> 32
-packed_shuffle  6, 12 ; 24 -> 48
-packed_shuffle 16,  4 ; 32 -> 8,  64 -> 16
-packed_shuffle 16, 12 ; 32 -> 24, 64 -> 48
-packed_shuffle 16, 16 ; 32 -> 32, 64 -> 64
-packed_shuffle  8, 12 ; 32 -> 48
-packed_shuffle 12, 12 ; 48 -> 48
-
-INIT_YMM avx2
-packed_shuffle 32, 32
-
-INIT_ZMM avx512
-packed_shuffle 64, 64
-
 ;---------------------------------------------------------
 ; Planar reads / writes
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 13/34: swscale/x86: move entry points to ops_common.asm

Reply via email to