This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 9cbd8896708050f12ccfcb1dd9aefda9e57dc349 Author: Niklas Haas <[email protected]> AuthorDate: Tue Jun 9 12:49:20 2026 +0200 Commit: Niklas Haas <[email protected]> CommitDate: Sat Jun 20 14:08:49 2026 +0000 swscale/x86/ops: add AVX2/SSE4 path for SWS_UOP_READ_PALETTE The AVX2 is a fairly straightforward vpgatherdd + 4x4 transpose. The SSE4 fallback is an unrolled scalar loop, for lack of anything better to do. checkasm: - CPU: AMD Ryzen 9 9950X3D 16-Core Processor (00B40F40) - Timing source: x86 (rdtsc) - Bench duration: 10000 µs per function (45898205 cycles) - Random seed: 2518020648 Benchmark results: name cycles (vs ref) u8_read_palette_xyzw_c: 2877.5 u8_read_palette_xyzw_x86_sse4: 1951.9 ( 1.47x) u8_read_palette_xyzw_x86_avx2: 1051.6 ( 2.74x) Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 2 ++ libswscale/x86/ops_int.asm | 57 ++++++++++++++++++++++++++++++++++++++++ libswscale/x86/uops_macros.asm.h | 1 + 3 files changed, 60 insertions(+) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index b522ca06e3..31c2199622 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -312,6 +312,7 @@ static bool uop_is_type_invariant(const SwsUOpType uop) SWS_FOR_STRUCT(TYPE, READ_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \ SWS_FOR_STRUCT(TYPE, READ_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \ SWS_FOR_STRUCT(TYPE, READ_BIT, DECL_ENTRY, EXT, NULL, NULL) \ +SWS_FOR_STRUCT(TYPE, READ_PALETTE, DECL_ENTRY, EXT, NULL, NULL) \ SWS_FOR_STRUCT(TYPE, WRITE_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \ SWS_FOR_STRUCT(TYPE, WRITE_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \ SWS_FOR_STRUCT(TYPE, WRITE_BIT, DECL_ENTRY, EXT, NULL, NULL) \ @@ -334,6 +335,7 @@ SWS_FOR_STRUCT(TYPE, DITHER, DECL_ENTRY, EXT, NULL, setup_dither) SWS_FOR(TYPE, READ_PACKED, REF_ENTRY, EXT) \ SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY, EXT) \ SWS_FOR(TYPE, READ_BIT, REF_ENTRY, EXT) \ + SWS_FOR(TYPE, READ_PALETTE, REF_ENTRY, EXT) \ SWS_FOR(TYPE, WRITE_PACKED, REF_ENTRY, EXT) \ SWS_FOR(TYPE, WRITE_NIBBLE, REF_ENTRY, EXT) \ SWS_FOR(TYPE, WRITE_BIT, REF_ENTRY, EXT) \ diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm index 111e6d0796..ce9ab1fdc9 100644 --- a/libswscale/x86/ops_int.asm +++ b/libswscale/x86/ops_int.asm @@ -282,6 +282,62 @@ IF1 V2, read_packed34 mx2, my2, mz2, mw2, in0q + mmsize * COMPS CONTINUE tmp0q %endmacro +%macro read_pal8 6 ; x, y, z, w, palette, index +%if cpuflag(avx2) + pmovzxbd %1, [%6 + 0] + pmovzxbd %2, [%6 + 8] + pmovzxbd %3, [%6 + 16] + pmovzxbd %4, [%6 + 24] + vperm2i128 m8, %1, %3, q0200 + vperm2i128 m9, %1, %3, q0301 + vperm2i128 m10, %2, %4, q0200 + vperm2i128 m11, %2, %4, q0301 + pcmpeqb m14, m14 + pcmpeqb m15, m15 + vpgatherdd %1, [%5 + 4 * m8], m14 + vpgatherdd %2, [%5 + 4 * m9], m15 + pcmpeqb m14, m14 + pcmpeqb m15, m15 + vpgatherdd %3, [%5 + 4 * m10], m14 + vpgatherdd %4, [%5 + 4 * m11], m15 + pshufb %1, m12 + pshufb %2, m12 + pshufb %3, m12 + pshufb %4, m12 + punpckldq m8, %1, %2 + punpckldq m9, %3, %4 + punpckhdq m10, %1, %2 + punpckhdq m11, %3, %4 + punpcklqdq %1, m8, m9 + punpckhqdq %2, m8, m9 + punpcklqdq %3, m10, m11 + punpckhqdq %4, m10, m11 +%else ; !cpuflag(avx2) + %assign i 0 + %rep 16 + movzx tmp1d, byte [%6 + i] + pinsrb %1, [%5 + 4 * tmp1q + 0], i + pinsrb %2, [%5 + 4 * tmp1q + 1], i + pinsrb %3, [%5 + 4 * tmp1q + 2], i + pinsrb %4, [%5 + 4 * tmp1q + 3], i + %assign i i+1 + %endrep +%endif +%endmacro + +%macro READ_PALETTE 0 +assert COMPS == 4 +assert BITS == 8 + %if cpuflag(avx2) + VBROADCASTI128 m12, [read8_unpack4] + %endif + LOAD_CONT tmp0q + read_pal8 mx, my, mz, mw, in1q, in0q +IF1 V2, read_pal8 mx2, my2, mz2, mw2, in1q, in0q + mmsize + add in0q, BLOCK_SIZE + CONTINUE tmp0q +%endmacro + %macro write_packed2 0 %if cpuflag(avx2) vpermq mx, mx, q3120 ; { X0 X2 | X1 X3 } @@ -716,6 +772,7 @@ assert 0, SWS_UOP_DITHER is not implemented for integer types DECL_%1_READ_PACKED (READ_PACKED) DECL_%1_READ_NIBBLE (READ_NIBBLE) DECL_%1_READ_BIT (READ_BIT) + DECL_%1_READ_PALETTE (READ_PALETTE) DECL_%1_WRITE_PACKED (WRITE_PACKED) DECL_%1_WRITE_NIBBLE (WRITE_NIBBLE) DECL_%1_WRITE_BIT (WRITE_BIT) diff --git a/libswscale/x86/uops_macros.asm.h b/libswscale/x86/uops_macros.asm.h index d9565d12f2..fce08f320f 100644 --- a/libswscale/x86/uops_macros.asm.h +++ b/libswscale/x86/uops_macros.asm.h @@ -57,6 +57,7 @@ {DEF_MACRO(READ_PLANAR_FH, TYPE)}, \ {DEF_MACRO(READ_PLANAR_FV, TYPE)}, \ {DEF_MACRO(READ_PLANAR_FV_FMA, TYPE)}, \ + {DEF_MACRO(READ_PALETTE, TYPE)}, \ {DEF_MACRO(WRITE_BIT, TYPE)}, \ {DEF_MACRO(WRITE_NIBBLE, TYPE)}, \ {DEF_MACRO(WRITE_PACKED, TYPE)}, \ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
