This is an automated email from the git hooks/post-receive script. A commit was pushed to branch master in repository ffmpeg.
commit 4bf51d661531384946cacaa4cc80e0c688f9fe32 Author: Niklas Haas <[email protected]> AuthorDate: Mon Mar 16 20:59:28 2026 +0100 Commit: Niklas Haas <[email protected]> CommitDate: Sat Mar 28 18:50:14 2026 +0100 swscale/x86/ops: add reference SWS_OP_FILTER_H implementation This uses a naive gather-based loop, similar to the existing legacy hscale SIMD. This has provably correct semantics (and avoids overflow as long as the filter scale is 1 << 14 or so), though it's not particularly fast for larger filter sizes. We can specialize this to more efficient implementations in a subset of cases, but for now, this guarantees a match to the C code. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 98 +++++++++++++++++++++++- libswscale/x86/ops_float.asm | 176 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 269 insertions(+), 5 deletions(-) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index 053d258c5d..9bf87273d0 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -329,6 +329,98 @@ static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out) return 0; } +static int hscale_sizeof_weight(const SwsOp *op) +{ + switch (op->type) { + case SWS_PIXEL_U8: return sizeof(int16_t); + case SWS_PIXEL_U16: return sizeof(int16_t); + case SWS_PIXEL_F32: return sizeof(float); + default: return 0; + } +} + +static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out) +{ + const SwsOp *op = params->op; + const SwsFilterWeights *filter = op->rw.kernel; + + /** + * `vpgatherdd` gathers 32 bits at a time; so if we're filtering a smaller + * size, we need to gather 2/4 taps simultaneously and unroll the inner + * loop over several packed samples. 
+ */ + const int taps_align = sizeof(int32_t) / ff_sws_pixel_type_size(op->type); + const int filter_size = filter->filter_size; + const int block_size = params->table->block_size; + const size_t aligned_size = FFALIGN(filter_size, taps_align); + const size_t line_size = FFALIGN(filter->dst_size, block_size); + av_assert1(FFALIGN(line_size, taps_align) == line_size); + if (aligned_size > INT_MAX) + return AVERROR(EINVAL); + + union { + void *ptr; + int16_t *i16; + float *f32; + } weights; + + const int sizeof_weight = hscale_sizeof_weight(op); + weights.ptr = av_calloc(line_size, sizeof_weight * aligned_size); + if (!weights.ptr) + return AVERROR(ENOMEM); + + /** + * Transpose filter weights to group (aligned) taps by block + */ + const int mmsize = block_size * 2; + const int gather_size = mmsize / sizeof(int32_t); /* pixels per vpgatherdd */ + for (size_t x = 0; x < line_size; x += block_size) { + const int elems = FFMIN(block_size, filter->dst_size - x); + for (int j = 0; j < filter_size; j++) { + const int jb = j & ~(taps_align - 1); + const int ji = j - jb; + const size_t idx_base = x * aligned_size + jb * block_size + ji; + for (int i = 0; i < elems; i++) { + const int w = filter->weights[(x + i) * filter_size + j]; + size_t idx = idx_base; + if (op->type == SWS_PIXEL_U8) { + /* Interleave the pixels within each lane, i.e.: + * [a0 a1 a2 a3 | b0 b1 b2 b3 ] pixels 0-1, taps 0-3 (lane 0) + * [e0 e1 e2 e3 | f0 f1 f2 f3 ] pixels 4-5, taps 0-3 (lane 1) + * [c0 c1 c2 c3 | d0 d1 d2 d3 ] pixels 2-3, taps 0-3 (lane 0) + * [g0 g1 g2 g3 | h0 h1 h2 h3 ] pixels 6-7, taps 0-3 (lane 1) + * [i0 i1 i2 i3 | j0 j1 j2 j3 ] pixels 8-9, taps 0-3 (lane 0) + * ... + * [o0 o1 o2 o3 | p0 p1 p2 p3 ] pixels 14-15, taps 0-3 (lane 1) + * (repeat for taps 4-7, etc.) 
+ */ + const int gather_base = i & ~(gather_size - 1); + const int gather_pos = i - gather_base; + const int lane_idx = gather_pos >> 2; + const int pos_in_lane = gather_pos & 3; + idx += gather_base * 4 /* which gather (m0 or m1) */ + + (pos_in_lane >> 1) * (mmsize / 2) /* lo/hi unpack */ + + lane_idx * 8 /* 8 ints per lane */ + + (pos_in_lane & 1) * 4; /* 4 taps per pair */ + } else { + idx += i * taps_align; + } + + switch (op->type) { + case SWS_PIXEL_U8: weights.i16[idx] = w; break; + case SWS_PIXEL_U16: weights.i16[idx] = w; break; + case SWS_PIXEL_F32: weights.f32[idx] = w; break; + } + } + } + } + + out->priv.ptr = weights.ptr; + out->priv.uptr[1] = aligned_size; + out->free = ff_op_priv_free; + return 0; +} + #define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \ DECL_ASM(TYPE, NAME##ELEMS##_##TYPE##EXT, \ .op = SWS_OP_READ, \ @@ -346,7 +438,8 @@ static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out) #define DECL_FILTERS_GENERIC(EXT, TYPE) \ DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \ DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \ - .check = check_filter_fma) + .check = check_filter_fma) \ + DECL_FILTERS(EXT, TYPE, H, filter_h, .setup = setup_filter_h) #define REF_FILTERS(NAME, SUFFIX) \ &op_##NAME##1##SUFFIX, \ @@ -628,6 +721,9 @@ static const SwsOpTable ops32##EXT = { REF_FILTERS(filter_v, _U8##EXT), \ REF_FILTERS(filter_v, _U16##EXT), \ REF_FILTERS(filter_v, _F32##EXT), \ + REF_FILTERS(filter_h, _U8##EXT), \ + REF_FILTERS(filter_h, _U16##EXT), \ + REF_FILTERS(filter_h, _F32##EXT), \ NULL \ }, \ }; diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm index da2eb8e3ae..6b4ce6fa34 100644 --- a/libswscale/x86/ops_float.asm +++ b/libswscale/x86/ops_float.asm @@ -19,6 +19,30 @@ ;****************************************************************************** %include "ops_common.asm" +%define SWS_FILTER_SCALE (1 << 14) + +SECTION_RODATA + +align 32 +bias16: times 16 dw 0x8000 ; 
shift unsigned to signed range +bias32: times 8 dd 0x8000 * SWS_FILTER_SCALE +scale_inv: times 8 dd 0x38800000 ; 1.0f / SWS_FILTER_SCALE + +; block_size = mmsize * 2 / sizeof(float) (two grouped registers) +%macro get_block_size 0 + %define block_size (mmsize >> 1) +%if mmsize == 64 + %define block_shift 5 +%elif mmsize == 32 + %define block_shift 4 +%elif mmsize == 16 + %define block_shift 3 +%elif mmsize == 8 + %define block_shift 2 +%else + %error "Unsupported mmsize" +%endif +%endmacro SECTION .text @@ -459,7 +483,146 @@ IF %1 > 1, add in1q, (mmsize >> 1) * %3 %undef fltsize %endmacro -%macro generic_filter_fns 2 ; type, sizeof_type +%macro filter_h_iter_U8 4 ; acc, acc2, src, first + pcmpeqb m12, m12 + pcmpeqb m13, m13 + vpgatherdd m8, [%3 + m14], m12 ; { ABCD | EFGH } 4 pixel per word + vpgatherdd m9, [%3 + m15], m13 ; { IJKL | MNOP } + ; unpack 4 bytes into separate 16-bit integer registers + punpckhbw m10, m8, m12 ; { CCDD | GGHH } 2 pixels per word + punpcklbw m8, m8, m12 ; { AABB | EEFF } + punpckhbw m11, m9, m12 ; { KKLL | OOPP } + punpcklbw m9, m9, m12 ; { IIJJ | MMNN } + pmaddwd m8, [weights] + pmaddwd m10, [weights + mmsize] + pmaddwd m9, [weights + mmsize * 2] + pmaddwd m11, [weights + mmsize * 3] +%if %4 + phaddd %1, m8, m10 ; { ABCD | EFGH } + phaddd %2, m9, m11 ; { IJKL | MNOP } +%else + phaddd m8, m10 + phaddd m9, m11 + paddd %1, m8 + paddd %2, m9 +%endif +%endmacro + +%macro filter_h_iter_U16 4 ; acc, acc2, src, first + pcmpeqb m12, m12 + pcmpeqb m13, m13 + vpgatherdd m8, [%3 + m14], m12 + vpgatherdd m9, [%3 + m15], m13 + psubw m8, m10 + psubw m9, m10 +%if %4 + pmaddwd %1, m8, [weights] + pmaddwd %2, m9, [weights + mmsize] +%else + pmaddwd m8, [weights] + pmaddwd m9, [weights + mmsize] + paddd %1, m8 + paddd %2, m9 +%endif +%endmacro + +%macro filter_h_iter_F32 4 ; acc, acc2, src, first + pcmpeqb m12, m12 + pcmpeqb m13, m13 + vpgatherdd m8, [%3 + m14], m12 + vpgatherdd m9, [%3 + m15], m13 +%if %4 + mulps %1, m8, [weights] + mulps %2, m9, 
[weights + mmsize] +%else + mulps m8, [weights] + mulps m9, [weights + mmsize] + addps %1, m8 + addps %2, m9 +%endif +%endmacro + +%macro filter_h 4 ; elems, type, sizeof_type, sizeof_weight +op filter_h%1_%2 +%xdefine weights tmp0q +%xdefine fltsize tmp1q + mov tmp0q, [execq + SwsOpExec.in_offset_x] + mov fltsize, [implq + SwsOpImpl.priv + 8] ; size_t filter_size + get_block_size + mov tmp2d, bxd + shl tmp2q, block_shift ; x := bx * block_size + movu m14, [tmp0q + 4 * tmp2q] ; &exec->in_offset_x[x] + movu m15, [tmp0q + 4 * tmp2q + mmsize] + mov weights, [implq + SwsOpImpl.priv] +%ifidn %2, U16 + mova m10, [bias16] + mova m11, [bias32] +%endif + imul tmp2q, fltsize + lea weights, [weights + tmp2q * %4] ; weights += x * filter_size + filter_h_iter_%2 mx, mx2, in0q, 1 +IF1 %1 > 1, filter_h_iter_%2 my, my2, in1q, 1 +IF1 %1 > 2, filter_h_iter_%2 mz, mz2, in2q, 1 +IF1 %1 > 3, filter_h_iter_%2 mw, mw2, in3q, 1 + sub fltsize, 4 / %3 + jz .done + push in0q +IF %1 > 1, push in1q +IF %1 > 2, push in2q +IF %1 > 3, push in3q +.loop: + add in0q, 4 +IF %1 > 1, add in1q, 4 +IF %1 > 2, add in2q, 4 +IF %1 > 3, add in3q, 4 + add weights, mmsize * 2 * (%4 / %3) + filter_h_iter_%2 mx, mx2, in0q, 0 +IF1 %1 > 1, filter_h_iter_%2 my, my2, in1q, 0 +IF1 %1 > 2, filter_h_iter_%2 mz, mz2, in2q, 0 +IF1 %1 > 3, filter_h_iter_%2 mw, mw2, in3q, 0 + sub fltsize, 4 / %3 + jnz .loop +IF %1 > 3, pop in3q +IF %1 > 2, pop in2q +IF %1 > 1, pop in1q + pop in0q +.done: +%ifidn %2, U16 + paddd mx, m11 +IF %1 > 1, paddd my, m11 +IF %1 > 2, paddd mz, m11 +IF %1 > 3, paddd mw, m11 + paddd mx2, m11 +IF %1 > 1, paddd my2, m11 +IF %1 > 2, paddd mz2, m11 +IF %1 > 3, paddd mw2, m11 +%endif +%ifnidn %2, F32 + vcvtdq2ps mx, mx +IF %1 > 1, vcvtdq2ps my, my +IF %1 > 2, vcvtdq2ps mz, mz +IF %1 > 3, vcvtdq2ps mw, mw + vcvtdq2ps mx2, mx2 +IF %1 > 1, vcvtdq2ps my2, my2 +IF %1 > 2, vcvtdq2ps mz2, mz2 +IF %1 > 3, vcvtdq2ps mw2, mw2 +%endif + mova m12, [scale_inv] + LOAD_CONT tmp0q + mulps mx, m12 +IF %1 > 1, mulps my, m12 
+IF %1 > 2, mulps mz, m12 +IF %1 > 3, mulps mw, m12 + mulps mx2, m12 +IF %1 > 1, mulps my2, m12 +IF %1 > 2, mulps mz2, m12 +IF %1 > 3, mulps mw2, m12 + CONTINUE tmp0q +%undef weights +%undef fltsize +%endmacro + +%macro generic_filter_fns 3 ; type, sizeof_type, sizeof_weight filter_v 1, %1, %2, v filter_v 2, %1, %2, v filter_v 3, %1, %2, v @@ -469,12 +632,17 @@ IF %1 > 1, add in1q, (mmsize >> 1) * %3 filter_v 2, %1, %2, fma_v filter_v 3, %1, %2, fma_v filter_v 4, %1, %2, fma_v + + filter_h 1, %1, %2, %3 + filter_h 2, %1, %2, %3 + filter_h 3, %1, %2, %3 + filter_h 4, %1, %2, %3 %endmacro %macro filter_fns 0 - generic_filter_fns U8, 1 - generic_filter_fns U16, 2 - generic_filter_fns F32, 4 + generic_filter_fns U8, 1, 2 + generic_filter_fns U16, 2, 2 + generic_filter_fns F32, 4, 4 %endmacro INIT_YMM avx2 _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
