This is an automated email from the git hooks/post-receive script. A commit was pushed to branch master in repository ffmpeg.
commit 4bf51d661531384946cacaa4cc80e0c688f9fe32 Author: Niklas Haas <[email protected]> AuthorDate: Mon Mar 16 20:59:28 2026 +0100 Commit: Niklas Haas <[email protected]> CommitDate: Sat Mar 28 18:50:14 2026 +0100 swscale/x86/ops: add reference SWS_OP_FILTER_H implementation This uses a naive gather-based loop, similar to the existing legacy hscale SIMD. This has provably correct semantics (and avoids overflow as long as the filter scale is 1 << 14 or so), though it's not particularly fast for larger filter sizes. We can specialize this to more efficient implementations in a subset of cases, but for now, this guarantees a match to the C code. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 98 +++++++++++++++++++++++- libswscale/x86/ops_float.asm | 176 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 269 insertions(+), 5 deletions(-) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index 053d258c5d..9bf87273d0 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -329,6 +329,98 @@ static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out) return 0; } +static int hscale_sizeof_weight(const SwsOp *op) +{ + switch (op->type) { + case SWS_PIXEL_U8: return sizeof(int16_t); + case SWS_PIXEL_U16: return sizeof(int16_t); + case SWS_PIXEL_F32: return sizeof(float); + default: return 0; + } +} + +static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out) +{ + const SwsOp *op = params->op; + const SwsFilterWeights *filter = op->rw.kernel; + + /** + * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller + * size, we need to gather 2/4 taps simultaneously and unroll the inner + * loop over several packed samples. 
+ */ + const int taps_align = sizeof(int32_t) / ff_sws_pixel_type_size(op->type); + const int filter_size = filter->filter_size; + const int block_size = params->table->block_size; + const size_t aligned_size = FFALIGN(filter_size, taps_align); + const size_t line_size = FFALIGN(filter->dst_size, block_size); + av_assert1(FFALIGN(line_size, taps_align) == line_size); + if (aligned_size > INT_MAX) + return AVERROR(EINVAL); + + union { + void *ptr; + int16_t *i16; + float *f32; + } weights; + + const int sizeof_weight = hscale_sizeof_weight(op); + weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size); + if (!weights.ptr) + return AVERROR(ENOMEM); + + /** + * Transpose filter weights to group (aligned) taps by block + */ + const int mmsize = block_size * 2; + const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */ + for (size_t x = 0; x < line_size; x += block_size) { + const int elems = FFMIN(block_size, filter->dst_size - x); + for (int j = 0; j < filter_size; j++) { + const int jb = j & ~(taps_align - 1); + const int ji = j - jb; + const size_t idx_base = x * aligned_size + jb * block_size + ji; + for (int i = 0; i < elems; i++) { + const int w = filter->weights[(x + i) * filter_size + j]; + size_t idx = idx_base; + if (op->type == SWS_PIXEL_U8) { + /* Interleave the pixels within each lane, i.e.: + * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0) + * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1) + * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0) + * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1) + * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0) + * ... + * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1) + * (repeat for taps 4-7, etc.) 
+ */ + const int gather_base = i & ~(gather_size - 1); + const int gather_pos = i - gather_base; + const int lane_idx = gather_pos >> 2; + const int pos_in_lane = gather_pos & 3; + idx += gather_base * 4 /* which gather (m0 or m1) */ + + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */ + + lane_idx * 8 /* 8 ints per lane */ + + (pos_in_lane & 1) * 4; /* 4 taps per pair */ + } else { + idx += i * taps_align; + } + + switch (op->type) { + case SWS_PIXEL_U8: weights.i16[idx] = w; break; + case SWS_PIXEL_U16: weights.i16[idx] = w; break; + case SWS_PIXEL_F32: weights.f32[idx] = w; break; + } + } + } + } + + out->priv.ptr = weights.ptr; + out->priv.uptr[1] = aligned_size; + out->free = ff_op_priv_free; + return 0; +} + #define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \ DECL_ASM(TYPE, NAME##ELEMS##_##TYPE##EXT, \ .op = SWS_OP_READ, \ @@ -346,7 +438,8 @@ static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out) #define DECL_FILTERS_GENERIC(EXT, TYPE) \ DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \ DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \ - .check = check_filter_fma) + .check = check_filter_fma) \ + DECL_FILTERS(EXT, TYPE, H, filter_h, .setup = setup_filter_h) #define REF_FILTERS(NAME, SUFFIX) \ &op_##NAME##1##SUFFIX, \ @@ -628,6 +721,9 @@ static const SwsOpTable ops32##EXT = { REF_FILTERS(filter_v, _U8##EXT), \ REF_FILTERS(filter_v, _U16##EXT), \ REF_FILTERS(filter_v, _F32##EXT), \ + REF_FILTERS(filter_h, _U8##EXT), \ + REF_FILTERS(filter_h, _U16##EXT), \ + REF_FILTERS(filter_h, _F32##EXT), \ NULL \ }, \ }; diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm index da2eb8e3ae..6b4ce6fa34 100644 --- a/libswscale/x86/ops_float.asm +++ b/libswscale/x86/ops_float.asm @@ -19,6 +19,30 @@ ;****************************************************************************** %include "ops_common.asm" +%define SWS_FILTER_SCALE (1 << 14) + +SECTION_RODATA + +align 32 +bias16: times 16 dw 0x8000 ; 
shift unsigned to signed range +bias32: times 8 dd 0x8000 * SWS_FILTER_SCALE +scale_inv: times 8 dd 0x38800000 ; 1.0f / SWS_FILTER_SCALE + +; block_size = mmsize * 2 / sizeof(float) (two grouped registers) +%macro get_block_size 0 + %define block_size (mmsize >> 1) +%if mmsize == 64 + %define block_shift 5 +%elif mmsize == 32 + %define block_shift 4 +%elif mmsize == 16 + %define block_shift 3 +%elif mmsize == 8 + %define block_shift 2 +%else + %error "Unsupported mmsize" +%endif +%endmacro SECTION .text @@ -459,7 +483,146 @@ IF %1 > 1, add in1q, (mmsize >> 1) * %3 %undef fltsize %endmacro -%macro generic_filter_fns 2 ; type, sizeof_type +%macro filter_h_iter_U8 4 ; acc, acc2, src, first + pcmpeqb m12, m12 + pcmpeqb m13, m13 + vpgatherdd m8, [%3 + m14], m12 ; { ABCD | EFGH } 4 pixel per word + vpgatherdd m9, [%3 + m15], m13 ; { IJKL | MNOP } + ; unpack 4 bytes into separate 16-bit integer registers + punpckhbw m10, m8, m12 ; { CCDD | GGHH } 2 pixels per word + punpcklbw m8, m8, m12 ; { AABB | EEFF } + punpckhbw m11, m9, m12 ; { KKLL | OOPP } + punpcklbw m9, m9, m12 ; { IIJJ | MMNN } + pmaddwd m8, [weights] + pmaddwd m10, [weights + mmsize] + pmaddwd m9, [weights + mmsize * 2] + pmaddwd m11, [weights + mmsize * 3] +%if %4 + phaddd %1, m8, m10 ; { ABCD | EFGH } + phaddd %2, m9, m11 ; { IJKL | MNOP } +%else + phaddd m8, m10 + phaddd m9, m11 + paddd %1, m8 + paddd %2, m9 +%endif +%endmacro + +%macro filter_h_iter_U16 4 ; acc, acc2, src, first + pcmpeqb m12, m12 + pcmpeqb m13, m13 + vpgatherdd m8, [%3 + m14], m12 + vpgatherdd m9, [%3 + m15], m13 + psubw m8, m10 + psubw m9, m10 +%if %4 + pmaddwd %1, m8, [weights] + pmaddwd %2, m9, [weights + mmsize] +%else + pmaddwd m8, [weights] + pmaddwd m9, [weights + mmsize] + paddd %1, m8 + paddd %2, m9 +%endif +%endmacro + +%macro filter_h_iter_F32 4 ; acc, acc2, src, first + pcmpeqb m12, m12 + pcmpeqb m13, m13 + vpgatherdd m8, [%3 + m14], m12 + vpgatherdd m9, [%3 + m15], m13 +%if %4 + mulps %1, m8, [weights] + mulps %2, m9, 
[weights + mmsize] +%else + mulps m8, [weights] + mulps m9, [weights + mmsize] + addps %1, m8 + addps %2, m9 +%endif +%endmacro + +%macro filter_h 4 ; elems, type, sizeof_type, sizeof_weight +op filter_h%1_%2 +%xdefine weights tmp0q +%xdefine fltsize tmp1q + mov tmp0q, [execq + SwsOpExec.in_offset_x] + mov fltsize, [implq + SwsOpImpl.priv + 8] ; size_t filter_size + get_block_size + mov tmp2d, bxd + shl tmp2q, block_shift ; x := bx * block_size + movu m14, [tmp0q + 4 * tmp2q] ; &exec->in_offset_x[x] + movu m15, [tmp0q + 4 * tmp2q + mmsize] + mov weights, [implq + SwsOpImpl.priv] +%ifidn %2, U16 + mova m10, [bias16] + mova m11, [bias32] +%endif + imul tmp2q, fltsize + lea weights, [weights + tmp2q * %4] ; weights += x * filter_size + filter_h_iter_%2 mx, mx2, in0q, 1 +IF1 %1 > 1, filter_h_iter_%2 my, my2, in1q, 1 +IF1 %1 > 2, filter_h_iter_%2 mz, mz2, in2q, 1 +IF1 %1 > 3, filter_h_iter_%2 mw, mw2, in3q, 1 + sub fltsize, 4 / %3 + jz .done + push in0q +IF %1 > 1, push in1q +IF %1 > 2, push in2q +IF %1 > 3, push in3q +.loop: + add in0q, 4 +IF %1 > 1, add in1q, 4 +IF %1 > 2, add in2q, 4 +IF %1 > 3, add in3q, 4 + add weights, mmsize * 2 * (%4 / %3) + filter_h_iter_%2 mx, mx2, in0q, 0 +IF1 %1 > 1, filter_h_iter_%2 my, my2, in1q, 0 +IF1 %1 > 2, filter_h_iter_%2 mz, mz2, in2q, 0 +IF1 %1 > 3, filter_h_iter_%2 mw, mw2, in3q, 0 + sub fltsize, 4 / %3 + jnz .loop +IF %1 > 3, pop in3q +IF %1 > 2, pop in2q +IF %1 > 1, pop in1q + pop in0q +.done: +%ifidn %2, U16 + paddd mx, m11 +IF %1 > 1, paddd my, m11 +IF %1 > 2, paddd mz, m11 +IF %1 > 3, paddd mw, m11 + paddd mx2, m11 +IF %1 > 1, paddd my2, m11 +IF %1 > 2, paddd mz2, m11 +IF %1 > 3, paddd mw2, m11 +%endif +%ifnidn %2, F32 + vcvtdq2ps mx, mx +IF %1 > 1, vcvtdq2ps my, my +IF %1 > 2, vcvtdq2ps mz, mz +IF %1 > 3, vcvtdq2ps mw, mw + vcvtdq2ps mx2, mx2 +IF %1 > 1, vcvtdq2ps my2, my2 +IF %1 > 2, vcvtdq2ps mz2, mz2 +IF %1 > 3, vcvtdq2ps mw2, mw2 +%endif + mova m12, [scale_inv] + LOAD_CONT tmp0q + mulps mx, m12 +IF %1 > 1, mulps my, m12 
+IF %1 > 2, mulps mz, m12 +IF %1 > 3, mulps mw, m12 + mulps mx2, m12 +IF %1 > 1, mulps my2, m12 +IF %1 > 2, mulps mz2, m12 +IF %1 > 3, mulps mw2, m12 + CONTINUE tmp0q +%undef weights +%undef fltsize +%endmacro + +%macro generic_filter_fns 3 ; type, sizeof_type, sizeof_weight filter_v 1, %1, %2, v filter_v 2, %1, %2, v filter_v 3, %1, %2, v @@ -469,12 +632,17 @@ IF %1 > 1, add in1q, (mmsize >> 1) * %3 filter_v 2, %1, %2, fma_v filter_v 3, %1, %2, fma_v filter_v 4, %1, %2, fma_v + + filter_h 1, %1, %2, %3 + filter_h 2, %1, %2, %3 + filter_h 3, %1, %2, %3 + filter_h 4, %1, %2, %3 %endmacro %macro filter_fns 0 - generic_filter_fns U8, 1 - generic_filter_fns U16, 2 - generic_filter_fns F32, 4 + generic_filter_fns U8, 1, 2 + generic_filter_fns U16, 2, 2 + generic_filter_fns F32, 4, 4 %endmacro INIT_YMM avx2 _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
