[FFmpeg-devel] [PR] swscale/ops_optimizer: rewrite shuffle solver as a dedicated UOP (PR #23440)

Niklas Haas via ffmpeg-devel Wed, 10 Jun 2026 08:41:05 -0700

PR #23440 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23440
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23440.patch



>From 205dc63dee80405f341d2cb782576ddbe76dcd88 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 21 May 2026 18:39:39 +0200
Subject: [PATCH 1/9] swscale/ops_optimizer: add ff_sws_uop_list_optimize()
 placeholder

This will be populated with logic in the near future. I decided to
also include the skeleton of the loop here, guarded by #if 0 to
avoid triggering unused variable warnings until we actually use it.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 19 +++++++++++++++++++
 libswscale/uops.c          |  3 ++-
 libswscale/uops.h          |  5 +++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 619c6cf42b..c1ba326727 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -891,6 +891,25 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
     return AVERROR(EINVAL);
 }
 
+int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops)
+{
+#if 0
+    static const SwsUOp dummy = {0};
+
+    /* Skeleton of the optimization pass; compiled out until rules are added */
+retry:
+    for (int i = 0; i < uops->num_ops; i++) {
+        /* `next` is the following uop, or the zeroed `dummy` at the end */
+        const SwsUOp *next = i < uops->num_ops - 1 ? &uops->ops[i + 1] : 
&dummy;
+        SwsUOp *op = &uops->ops[i];
+
+        switch (op->uop) {
+            /* placeholder */
+        }
+    }
+#endif
+
+    /* No optimizations are implemented yet; the list is left untouched */
+    return 0;
+}
+
 /**
  * Determine a suitable intermediate buffer format for a given combination
  * of pixel types and number of planes. The exact interpretation of these
diff --git a/libswscale/uops.c b/libswscale/uops.c
index 7af8a8af51..88528e10f7 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -857,7 +857,8 @@ int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList 
*ops,
             return ret;
         input = ops->ops[i].comps;
     }
-    return 0;
+
+    return ff_sws_uop_list_optimize(ctx, flags, uops);
 }
 
 static int register_uop(struct AVTreeNode **root, const SwsUOp *uop)
diff --git a/libswscale/uops.h b/libswscale/uops.h
index b2e9af30a4..2ac385733e 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -243,6 +243,11 @@ void ff_sws_uop_list_free(SwsUOpList **ops);
 /* Takes over ownership of `uop` and sets it to {0}, even on failure. */
 int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop);
 
+/**
+ * Called internally by ff_sws_ops_translate().
+ */
+int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops);
+
 /**
  * Translate a list of operations down to micro-ops, which can be further
  * optimized and then directly executed by backends.
-- 
2.52.0


>From c51723015938c0b153ffad03e89b9288d395f547 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 21 May 2026 18:38:49 +0200
Subject: [PATCH 2/9] swscale/uops: add ff_sws_uop_list_remove_at()

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c | 11 +++++++++++
 libswscale/uops.h |  1 +
 2 files changed, 12 insertions(+)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index 88528e10f7..01808f685a 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -397,6 +397,17 @@ int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop)
     return 0;
 }
 
+/**
+ * Remove `count` consecutive uops from the list, starting at `index`.
+ *
+ * Every removed entry is first passed to uop_uninit(); the entries behind
+ * the removed range are then shifted down to close the gap, and the list
+ * length is reduced by `count`.
+ */
+void ff_sws_uop_list_remove_at(SwsUOpList *uops, int index, int count)
+{
+    av_assert2(index >= 0 && count >= 0 && index + count <= uops->num_ops);
+    const int new_len = uops->num_ops - count;
+
+    /* Tear down the entries that are about to be dropped */
+    for (int pos = index; pos < index + count; pos++)
+        uop_uninit(&uops->ops[pos]);
+
+    /* Slide the tail of the list forward over the vacated slots */
+    for (int dst = index, src = index + count; dst < new_len; dst++, src++)
+        uops->ops[dst] = uops->ops[src];
+
+    uops->num_ops = new_len;
+}
+
 int ff_sws_dither_height(const SwsDitherUOp *dither)
 {
     int max_offset = 0;
diff --git a/libswscale/uops.h b/libswscale/uops.h
index 2ac385733e..eac8ee191e 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -239,6 +239,7 @@ typedef struct SwsUOpList {
 
 SwsUOpList *ff_sws_uop_list_alloc(void);
 void ff_sws_uop_list_free(SwsUOpList **ops);
+void ff_sws_uop_list_remove_at(SwsUOpList *uops, int index, int count);
 
 /* Takes over ownership of `uop` and sets it to {0}, even on failure. */
 int ff_sws_uop_list_append(SwsUOpList *uops, SwsUOp *uop);
-- 
2.52.0


>From 3dd60a0b0a59fd9ad9a783f72dfcd22faebfe867 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 17:19:48 +0200
Subject: [PATCH 3/9] swscale/uops: add SWS_UOP_RW_SHUFFLE

This directly encodes the "pshufb" loop style processing seen in the x86
backend (and, presumably, other backends down the line). The major
advantage of encoding this as a separate UOP is that it allows the macro
generator to exhaustively enumerate all needed variants of the read/write
chunk sizes, which so far have been hand-maintained through trial and error.

I decided to also go ahead and pre-emptively generalize the clear
value to allow us to encode RGBA clears in addition to RGB0 clears,
which makes this UOP useful for e.g. Gray -> RGBA32 as well; something
that the current pshufb loop did not handle.

This commit merely adds the UOP definition; it does not yet port
the packed shuffle solver or x86 implementation.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c        | 16 ++++++++++++++++
 libswscale/uops.h        | 13 ++++++++++++-
 libswscale/uops_macros.h |  8 ++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index 01808f685a..d7a642b5ca 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -58,6 +58,7 @@ static const struct {
     UOP_NAME(WRITE_PACKED,      "write_packed"),
     UOP_NAME(WRITE_NIBBLE,      "write_nibble"),
     UOP_NAME(WRITE_BIT,         "write_bit"),
+    UOP_NAME(RW_SHUFFLE,        "rw_shuffle"),
     UOP_NAME(PERMUTE,           "permute"),
     UOP_NAME(COPY,              "copy"),
     UOP_NAME(MOVE,              "move"),
@@ -150,6 +151,10 @@ void ff_sws_uop_name(const SwsUOp *op, char 
buf[SWS_UOP_NAME_MAX])
     case SWS_UOP_READ_PLANAR_FV_FMA:
         av_bprintf(&bp, "_%s", ff_sws_pixel_type_name(par->filter.type));
         break;
+    case SWS_UOP_RW_SHUFFLE:
+        av_bprintf(&bp, "_%x_%u_%u", par->shuffle.clear_value,
+                   par->shuffle.read_size, par->shuffle.write_size);
+        break;
     case SWS_UOP_LSHIFT:
     case SWS_UOP_RSHIFT:
         av_bprintf(&bp, "_%u", par->shift.amount);
@@ -238,6 +243,13 @@ static int generate_entry_struct(void *opaque, void *key)
     case SWS_UOP_READ_PLANAR_FV_FMA:
         av_bprintf(bp, ", .par.filter.type = %s", 
pixel_types[par->filter.type].full);
         break;
+    case SWS_UOP_RW_SHUFFLE:
+        av_bprintf(bp, ", .par.shuffle.clear_value = 0x%x"
+                       ", .par.shuffle.read_size = %u"
+                       ", .par.shuffle.write_size = %u",
+                   par->shuffle.clear_value,
+                   par->shuffle.read_size, par->shuffle.write_size);
+        break;
     case SWS_UOP_LSHIFT:
     case SWS_UOP_RSHIFT:
         av_bprintf(bp, ", .par.shift.amount = %u", par->shift.amount);
@@ -303,6 +315,10 @@ static int generate_entry_args(void *opaque, void *key)
     case SWS_UOP_READ_PLANAR_FV_FMA:
         av_bprintf(bp, ", %s", pixel_types[par->filter.type].full);
         break;
+    case SWS_UOP_RW_SHUFFLE:
+        av_bprintf(bp, ", 0x%x, %u, %u", par->shuffle.clear_value,
+                   par->shuffle.read_size, par->shuffle.write_size);
+        break;
     case SWS_UOP_LSHIFT:
     case SWS_UOP_RSHIFT:
         av_bprintf(bp, ", %u", par->shift.amount);
diff --git a/libswscale/uops.h b/libswscale/uops.h
index eac8ee191e..082124a7d6 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -102,6 +102,9 @@ typedef enum SwsUOpType {
     SWS_UOP_WRITE_NIBBLE,    /* fractional write (4 bits) to single plane */
     SWS_UOP_WRITE_BIT,       /* fractional write (1 bit) to single plane */
 
+    /* Packed shuffle / gather uops */
+    SWS_UOP_RW_SHUFFLE,      /* in-place (packed) indexed shuffle/gather */
+
     /* Data rearrangement uops; mask = non-trivial and needed components */
     SWS_UOP_PERMUTE,         /* rearrange components (no duplicates) */
     SWS_UOP_COPY,            /* copy/duplicate components */
@@ -137,6 +140,12 @@ typedef enum SwsUOpType {
     SWS_UOP_TYPE_NB,
 } SwsUOpType;
 
+/* Parameters of SWS_UOP_RW_SHUFFLE; both sizes are counted in bytes per pixel */
+typedef struct SwsShuffleUOp {
+    uint8_t clear_value; /* value to clear elements with negative indices to */
+    uint8_t read_size;   /* input bytes per pixel */
+    uint8_t write_size;  /* output bytes per pixel */
+} SwsShuffleUOp;
+
 typedef struct SwsFilterUOp {
     SwsPixelType type; /* pixel type to store result as */
 } SwsFilterUOp;
@@ -188,7 +197,8 @@ typedef struct SwsDitherUOp {
 int ff_sws_dither_height(const SwsDitherUOp *dither);
 
 typedef union SwsUOpParams {
-    SwsFilterUOp    filter; /* for SWS_UOP_READ_*_FV/FH */
+    SwsShuffleUOp   shuffle; /* for SWS_UOP_RW_SHUFFLE */
+    SwsFilterUOp    filter;  /* for SWS_UOP_READ_*_FV/FH */
     SwsShiftUOp     shift;
     SwsSwizzleUOp   swizzle;
     SwsMoveUOp      move;
@@ -212,6 +222,7 @@ typedef struct SwsUOp {
         SwsPixel scalar;
         SwsPixel vec4[4];
         SwsPixel mat4[4][5];        /* row major */
+        int8_t shuffle[16];         /* mask for SWS_UOP_RW_SHUFFLE */
         void *opaque;               /* reserved for internal use */
     } data;
 } SwsUOp;
diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h
index f63d046aa3..a870a27c8b 100644
--- a/libswscale/uops_macros.h
+++ b/libswscale/uops_macros.h
@@ -101,6 +101,8 @@
     MACRO(__VA_ARGS__, u8_write_bit_x                          , SWS_PIXEL_U8 
, SWS_UOP_WRITE_BIT       , 0x1)
 #define SWS_FOR_STRUCT_U8_WRITE_BIT(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_write_bit_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_WRITE_BIT       , .mask = 0x1)
+#define SWS_FOR_U8_RW_SHUFFLE(MACRO, ...)
+#define SWS_FOR_STRUCT_U8_RW_SHUFFLE(MACRO, ...)
 #define SWS_FOR_U8_PERMUTE(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_permute_x_y                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
     MACRO(__VA_ARGS__, u8_permute_x_z                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
@@ -439,6 +441,8 @@
 #define SWS_FOR_STRUCT_U16_WRITE_NIBBLE(MACRO, ...)
 #define SWS_FOR_U16_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_STRUCT_U16_WRITE_BIT(MACRO, ...)
+#define SWS_FOR_U16_RW_SHUFFLE(MACRO, ...)
+#define SWS_FOR_STRUCT_U16_RW_SHUFFLE(MACRO, ...)
 #define SWS_FOR_U16_PERMUTE(MACRO, ...) \
     MACRO(__VA_ARGS__, u16_permute_x_y                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
     MACRO(__VA_ARGS__, u16_permute_x_z                         , 
SWS_PIXEL_U16, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
@@ -743,6 +747,8 @@
 #define SWS_FOR_STRUCT_U32_WRITE_NIBBLE(MACRO, ...)
 #define SWS_FOR_U32_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_STRUCT_U32_WRITE_BIT(MACRO, ...)
+#define SWS_FOR_U32_RW_SHUFFLE(MACRO, ...)
+#define SWS_FOR_STRUCT_U32_RW_SHUFFLE(MACRO, ...)
 #define SWS_FOR_U32_PERMUTE(MACRO, ...) \
     MACRO(__VA_ARGS__, u32_permute_x_y                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
     MACRO(__VA_ARGS__, u32_permute_x_z                         , 
SWS_PIXEL_U32, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
@@ -1019,6 +1025,8 @@
 #define SWS_FOR_STRUCT_F32_WRITE_NIBBLE(MACRO, ...)
 #define SWS_FOR_F32_WRITE_BIT(MACRO, ...)
 #define SWS_FOR_STRUCT_F32_WRITE_BIT(MACRO, ...)
+#define SWS_FOR_F32_RW_SHUFFLE(MACRO, ...)
+#define SWS_FOR_STRUCT_F32_RW_SHUFFLE(MACRO, ...)
 #define SWS_FOR_F32_PERMUTE(MACRO, ...)
 #define SWS_FOR_STRUCT_F32_PERMUTE(MACRO, ...)
 #define SWS_FOR_F32_COPY(MACRO, ...)
-- 
2.52.0


>From 833fe6cfafe86414f25238e42a9952bf45e7e2c8 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 17:20:44 +0200
Subject: [PATCH 4/9] swscale/uops: add SWS_UOP_FLAG_PSHUFB

Exposes the SWS_UOP_RW_SHUFFLE capability. While this translates
to the exact `pshufb` instruction on x86, we don't expose a separate
capability for e.g. `vpermb` on AVX-512, since the underlying semantics
are essentially equivalent.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c    | 2 +-
 libswscale/uops.h    | 7 ++++---
 libswscale/x86/ops.c | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index d7a642b5ca..6cafc1081f 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -933,7 +933,7 @@ fail:
 
 static const SwsUOpFlags uop_flags[] = {
     0,
-    SWS_UOP_FLAG_FMA | SWS_UOP_FLAG_MOVE, /* x86 backend */
+    SWS_UOP_FLAG_FMA | SWS_UOP_FLAG_MOVE | SWS_UOP_FLAG_PSHUFB, /* x86 backend 
*/
 };
 
 static int register_uops(SwsContext *ctx, const SwsOpList *ops,
diff --git a/libswscale/uops.h b/libswscale/uops.h
index 082124a7d6..e0da662aa8 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -80,9 +80,10 @@ enum {
 
 typedef uint32_t SwsUOpFlags;
 typedef enum SwsUOpFlagBits {
-    SWS_UOP_FLAG_NONE = 0,
-    SWS_UOP_FLAG_FMA  = (1 << 0), /* platform supports FMA ops */
-    SWS_UOP_FLAG_MOVE = (1 << 1), /* platform supports SWS_UOP_MOVE */
+    SWS_UOP_FLAG_NONE   = 0,
+    SWS_UOP_FLAG_FMA    = (1 << 0), /* platform supports FMA ops */
+    SWS_UOP_FLAG_MOVE   = (1 << 1), /* platform supports SWS_UOP_MOVE */
+    SWS_UOP_FLAG_PSHUFB = (1 << 2), /* platform supports pshufb equivalent */
 } SwsUOpFlagBits;
 
 typedef enum SwsUOpType {
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 4c8eceb1cb..db445135f8 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -591,7 +591,7 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
         goto fail;
     }
 
-    SwsUOpFlags flags = SWS_UOP_FLAG_MOVE;
+    SwsUOpFlags flags = SWS_UOP_FLAG_MOVE | SWS_UOP_FLAG_PSHUFB;
     if (X86_FMA4(cpu_flags))
         flags |= SWS_UOP_FLAG_FMA;
 
-- 
2.52.0


>From d8e13cbbe99909cd11e4f2e1c30e759d8271f74e Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 3 Jun 2026 15:33:25 +0200
Subject: [PATCH 5/9] swscale/uops: add ff_sws_shuffle_mask() helper

This takes a (minimal) shuffle mask and expands it out to the needed
number of bytes by repeating pixels.

I decided to have the implementation live in ops_optimizer.c to locate
it closer to the code where these shuffle masks are generated, to aid
in understanding.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 24 ++++++++++++++++++++++++
 libswscale/uops.h          | 12 ++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index c1ba326727..08b4fd75b8 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -891,6 +891,30 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
     return AVERROR(EINVAL);
 }
 
+/**
+ * Expand the single-pixel shuffle pattern stored in `uop->data.shuffle` into
+ * a mask of up to `size` bytes, by repeating it once per whole pixel.
+ *
+ * Non-negative indices are rebased by the input offset of the pixel they
+ * belong to (`n * read_size`); negative indices (bytes to be cleared) are
+ * copied through unchanged. Mask bytes past the last whole pixel are set
+ * to 0.
+ *
+ * @return the number of whole pixels covered by the mask, or AVERROR(EINVAL)
+ *         if not even a single pixel fits into `size` bytes.
+ */
+int ff_sws_shuffle_mask(const SwsUOp *uop, int8_t shuffle[], int size)
+{
+    const SwsShuffleUOp *par = &uop->par.shuffle;
+    av_assert1(uop->uop == SWS_UOP_RW_SHUFFLE);
+    av_assert1(par->write_size <= sizeof(uop->data.shuffle));
+    av_assert1(size <= INT8_MAX);
+
+    /* A pixel must fit on both the read and the write side. Reject
+     * zero-sized pixels and non-positive `size` up front, so that we never
+     * divide by zero or hand a negative length to memset(). */
+    const int pixel_size = FFMAX(par->read_size, par->write_size);
+    if (pixel_size <= 0 || size < pixel_size)
+        return AVERROR(EINVAL);
+    const int num_pixels = size / pixel_size;
+
+    memset(shuffle, 0, size);
+    for (int n = 0; n < num_pixels; n++) {
+        const int base_in  = n * par->read_size;
+        const int base_out = n * par->write_size;
+        for (int i = 0; i < par->write_size; i++) {
+            const int8_t idx = uop->data.shuffle[i];
+            /* Only real source indices are rebased; negative (clear)
+             * markers are copied through as-is */
+            shuffle[base_out + i] = idx + (idx >= 0) * base_in;
+        }
+    }
+
+    return num_pixels;
+}
+
 int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops)
 {
 #if 0
diff --git a/libswscale/uops.h b/libswscale/uops.h
index e0da662aa8..5d31146775 100644
--- a/libswscale/uops.h
+++ b/libswscale/uops.h
@@ -282,4 +282,16 @@ int ff_sws_ops_translate(SwsContext *ctx, const SwsOpList 
*ops,
  */
 int ff_sws_uops_macros_gen(char **out_str);
 
+/**
+ * Compute a shuffle mask for `pshufb`-style ASM functions, by repeating
+ * the shuffle pattern for a single pixel as many times as it will fit.
+ *
+ * @param uop         An operation of type SWS_UOP_RW_SHUFFLE.
+ * @param shuffle     The output shuffle index mask (or -1 to clear bytes).
+ * @param size        The maximum size (in bytes) of the output shuffle mask.
+ *
+ * @return the number of pixels on success, or a negative error code.
+ */
+int ff_sws_shuffle_mask(const SwsUOp *uop, int8_t shuffle[], int size);
+
 #endif
-- 
2.52.0


>From d08b72634762831b664d5e19596d7df549f0b613 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 17:21:33 +0200
Subject: [PATCH 6/9] swscale/uops: rewrite ff_sws_solve_shuffle() in terms of
 uops

This allows the uop optimizer stage to directly generate RW_SHUFFLE
uops, which will replace the original ff_sws_solve_shuffle() over the next
couple of commits.

This way of solving it has a number of additional upsides, including but not
limited to:

- The ability to move certain low-level optimizations (like expanding
  conversions) out of the higher-level abstraction and have them arise
  purely during uops translation / optimization

- The ability to stop passing SwsOpList to backends at all

The major downside is that, due to our formulation of SWS_UOP_RW_SHUFFLE
as encoding the size for a single pixel directly, we end up with some
duplicate definitions - e.g. `2_4` and `4_8`. This is a bit unavoidable
because we would otherwise lose metadata about the number of pixels, which
the loop needs to know for setting the correct block size for the dispatch
layer.

Otherwise, we could avoid this by pre-emptively aligning up the shuffle mask
to a multiple of 16 bytes. But the only cost is a tiny bit of code duplication
on an already small function.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 131 +++++++++++++++++++++++++++++++++++++
 libswscale/uops_macros.h   |  56 +++++++++++++++-
 2 files changed, 185 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 08b4fd75b8..980cc41b69 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -915,8 +915,139 @@ int ff_sws_shuffle_mask(const SwsUOp *uop, int8_t 
shuffle[], int size)
     return num_pixels;
 }
 
+/**
+ * Check whether `val`, read as a pixel of type `type`, consists of one byte
+ * value (its `u8` member) repeated across the full width of the type.
+ */
+static bool pixel_is_repeating(SwsPixelType type, SwsPixel val)
+{
+    const int size = ff_sws_pixel_type_size(type);
+
+    /* A single byte trivially repeats itself */
+    if (size == 1)
+        return true;
+
+    /* Multiplying by 0x0101.. replicates the byte across the whole word */
+    if (size == 2)
+        return val.u16 == val.u8 * 0x101ul;
+    if (size == 4)
+        return val.u32 == val.u8 * 0x1010101ul;
+
+    av_unreachable("Invalid pixel type!");
+    return false;
+}
+
+/**
+ * Try to express an entire uop list as a single SWS_UOP_RW_SHUFFLE.
+ *
+ * Walks the list from the initial read up to the first write, tracking for
+ * each of the four components which input byte ends up in each of its (up to
+ * four) bytes.
+ *
+ * @param uops        The uop list to analyze; it is not modified.
+ * @param out_shuffle Receives the resulting shuffle uop on success.
+ *
+ * @return 0 on success, AVERROR(ENOTSUP) if the list contains something that
+ *         cannot be represented this way, or AVERROR(EINVAL) if the list is
+ *         empty or does not contain a write.
+ */
+static int solve_shuffle(const SwsUOpList *const uops, SwsUOp *out_shuffle)
+{
+    if (!uops->num_ops)
+        return AVERROR(EINVAL);
+    /* The first uop must be a packed read or a single-component planar read */
+    const SwsUOp *read = &uops->ops[0];
+    switch (read->uop) {
+    case SWS_UOP_READ_PACKED:
+        break;
+    case SWS_UOP_READ_PLANAR:
+        if (read->mask != SWS_COMP_ELEMS(1))
+             return AVERROR(ENOTSUP);
+        break;
+    default:
+        return AVERROR(ENOTSUP);
+    }
+
+    const int read_size = ff_sws_pixel_type_size(read->type);
+    /* mask[i] packs four byte indices for component i, lowest byte first */
+    uint32_t mask[4] = {0};
+    /* -1 until a CLEAR has been seen, then the byte value being cleared to */
+    int clear_val = -1;
+    int read_elems = 0;
+    for (int i = 0; i < 4; i++) {
+        if (SWS_COMP_TEST(read->mask, i)) {
+            /* Indices { 0, 1, 2, 3 }, each offset by i * read_size */
+            mask[i] = 0x01010101 * i * read_size + 0x03020100;
+            read_elems++;
+        }
+    }
+
+    for (int opidx = 1; opidx < uops->num_ops; opidx++) {
+        const SwsUOp *uop = &uops->ops[opidx];
+        const SwsUOpParams *par = &uop->par;
+        switch (uop->uop) {
+        case SWS_UOP_COPY:
+        case SWS_UOP_PERMUTE: {
+            /* Component i takes over the indices of component swizzle.in[i] */
+            uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
+            for (int i = 0; i < 4; i++)
+                mask[i] = orig[par->swizzle.in[i]];
+            break;
+        }
+
+        case SWS_UOP_SWAP_BYTES:
+            /* Reverse the tracked byte order; other sizes are left alone */
+            for (int i = 0; i < 4; i++) {
+                switch (ff_sws_pixel_type_size(uop->type)) {
+                case 2: mask[i] = av_bswap16(mask[i]); break;
+                case 4: mask[i] = av_bswap32(mask[i]); break;
+                }
+            }
+            break;
+
+        case SWS_UOP_CLEAR:
+            /* The shuffle uop carries a single clear byte, so every cleared
+             * component must consist of that same byte repeated */
+            for (int i = 0; i < 4; i++) {
+                if (!SWS_COMP_TEST(uop->mask, i))
+                    continue;
+                SwsPixel val = uop->data.vec4[i];
+                if (!pixel_is_repeating(uop->type, val) ||
+                    (clear_val >= 0 && clear_val != val.u8))
+                    return AVERROR(ENOTSUP); /* would require different bytes 
*/
+                mask[i] = 0xFFFFFFFFul; /* (uint8_t[4]) { -1, -1, -1, -1 } */
+                clear_val = val.u8;
+            }
+            break;
+
+        case SWS_UOP_EXPAND_PAIR:
+        case SWS_UOP_EXPAND_QUAD:
+            /* Replicate the index of the lowest byte into all four slots */
+            for (int i = 0; i < 4; i++)
+                mask[i] = 0x01010101 * (mask[i] & 0xFF);
+            break;
+
+        case SWS_UOP_WRITE_PLANAR:
+            if (uop->mask != SWS_COMP_ELEMS(1))
+                return AVERROR(ENOTSUP);
+            av_fallthrough;
+        case SWS_UOP_WRITE_PACKED: {
+            /* The first write ends the analysis and emits the shuffle uop */
+            const int write_elems = av_popcount(uop->mask);
+            const int write_size  = ff_sws_pixel_type_size(uop->type);
+            *out_shuffle = (SwsUOp) {
+                .uop  = SWS_UOP_RW_SHUFFLE,
+                .type = SWS_PIXEL_U8,
+                .mask = SWS_COMP_ELEMS(1), /* single plane for now */
+            };
+
+            SwsShuffleUOp *par = &out_shuffle->par.shuffle;
+            *par = (SwsShuffleUOp) {
+                .read_size   = read_elems * read_size,
+                .write_size  = write_elems * write_size,
+                .clear_value = clear_val >= 0 ? clear_val : 0,
+            };
+
+            /* Generate baseline shuffle for a single pixel */
+            int8_t *shuffle = out_shuffle->data.shuffle;
+            for (int i = 0; i < write_elems; i++) {
+                const int offset = i * write_size;
+                /* Unpack the low write_size indices of component i */
+                for (int b = 0; b < write_size; b++)
+                    shuffle[offset + b] = mask[i] >> (b * 8);
+            }
+
+            return 0;
+        }
+
+        default:
+            return AVERROR(ENOTSUP);
+        }
+    }
+
+    /* Reached the end of the list without encountering a write */
+    return AVERROR(EINVAL);
+}
+
 int ff_sws_uop_list_optimize(SwsContext *ctx, SwsUOpFlags flags, SwsUOpList 
*uops)
 {
+    /* Try promoting the entire uop list to a packed shuffle operation */
+    if (flags & SWS_UOP_FLAG_PSHUFB) {
+        SwsUOp shuffle;
+        int ret = solve_shuffle(uops, &shuffle);
+        if (ret >= 0) {
+            ff_sws_uop_list_remove_at(uops, 0, uops->num_ops);
+            return ff_sws_uop_list_append(uops, &shuffle);
+        } else if (ret < 0 && ret != AVERROR(ENOTSUP)) {
+            return ret;
+        }
+    }
+
 #if 0
     static const SwsUOp dummy = {0};
 
diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h
index a870a27c8b..995dcd8be3 100644
--- a/libswscale/uops_macros.h
+++ b/libswscale/uops_macros.h
@@ -101,8 +101,60 @@
     MACRO(__VA_ARGS__, u8_write_bit_x                          , SWS_PIXEL_U8 
, SWS_UOP_WRITE_BIT       , 0x1)
 #define SWS_FOR_STRUCT_U8_WRITE_BIT(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_write_bit_x                          , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_WRITE_BIT       , .mask = 0x1)
-#define SWS_FOR_U8_RW_SHUFFLE(MACRO, ...)
-#define SWS_FOR_STRUCT_U8_RW_SHUFFLE(MACRO, ...)
+#define SWS_FOR_U8_RW_SHUFFLE(MACRO, ...) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_1_2                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 1, 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_2_1                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 2, 1) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_2_2                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 2, 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_2_4                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 2, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_3_4                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 3, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_3_6                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 3, 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_2                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 4, 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_3                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 4, 3) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_4                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 4, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_6                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 4, 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_8                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 4, 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_6_6                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 6, 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_8_4                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 8, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_8_6                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 8, 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_8_8                   , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 8, 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_12_12                 , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 12, 12) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_16_12                 , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 16, 12) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_16_16                 , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0x0, 16, 16) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_1_2                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 1, 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_1_4                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 1, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_2_4                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 2, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_3_4                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 3, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_3_8                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 3, 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_4_4                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 4, 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_4_8                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 4, 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_6_8                  , SWS_PIXEL_U8 
, SWS_UOP_RW_SHUFFLE      , 0x1, 0xff, 6, 8)
+#define SWS_FOR_STRUCT_U8_RW_SHUFFLE(MACRO, ...) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_1_2                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 1, 
.par.shuffle.write_size = 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_2_1                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 2, 
.par.shuffle.write_size = 1) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_2_2                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 2, 
.par.shuffle.write_size = 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_2_4                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 2, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_3_4                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 3, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_3_6                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 3, 
.par.shuffle.write_size = 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_2                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_3                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 3) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_4                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_6                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_4_8                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_6_6                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 6, 
.par.shuffle.write_size = 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_8_4                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 8, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_8_6                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 8, 
.par.shuffle.write_size = 6) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_8_8                   , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 8, 
.par.shuffle.write_size = 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_12_12                 , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 12, 
.par.shuffle.write_size = 12) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_16_12                 , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 16, 
.par.shuffle.write_size = 12) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_0_16_16                 , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0x0, .par.shuffle.read_size = 16, 
.par.shuffle.write_size = 16) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_1_2                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 1, 
.par.shuffle.write_size = 2) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_1_4                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 1, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_2_4                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 2, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_3_4                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 3, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_3_8                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 3, 
.par.shuffle.write_size = 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_4_4                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 4) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_4_8                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 4, 
.par.shuffle.write_size = 8) \
+    MACRO(__VA_ARGS__, u8_rw_shuffle_x_ff_6_8                  , .type = 
SWS_PIXEL_U8 , .uop = SWS_UOP_RW_SHUFFLE      , .mask = 0x1, 
.par.shuffle.clear_value = 0xff, .par.shuffle.read_size = 6, 
.par.shuffle.write_size = 8)
 #define SWS_FOR_U8_PERMUTE(MACRO, ...) \
     MACRO(__VA_ARGS__, u8_permute_x_y                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 1, 0, 2, 3) \
     MACRO(__VA_ARGS__, u8_permute_x_z                          , SWS_PIXEL_U8 
, SWS_UOP_PERMUTE         , 0x1, 2, 1, 0, 3) \
-- 
2.52.0


>From 3e6ca63a0ad371fd82216442a5b8f593eca574d3 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 10 Jun 2026 17:22:08 +0200
Subject: [PATCH 7/9] swscale/x86/ops: rewrite packed_shuffle() in terms of
 uops macro

This is mostly the same, with the added complication of now needing to handle
the case of clearing to a nonzero value, which we do using `pblendvb` on SSE4,
and respectively `vpblendvb` / `vpblendmb` on AVX-2 and AVX-512.

Since `pblendvb` hard-codes XMM0 implicitly, we have to flip the order of `m0`
and `m1` in the rest of the code as well; though that's not a huge deal.

The bigger complication comes from the fact that the macro gives us the size
per pixel, i.e. not rounded up to 16 bytes, so we need to recreate this logic
in the NASM macro itself. Fortunately, it's fairly straightforward.

The big upside is that we now gain a fast path for e.g. rgb24 -> rgba, which
is arguably more common than rgb24 -> rgb0:

  rgb24 1920x1080 -> rgba 1920x1080, speedup=7.106x faster

It's worth pointing out that, because checkasm sometimes generates weird ops
that don't occur in real pixel formats, it's not a guarantee that we will
actually implement all paths that the optimizer spits out.

This should be improved by a future checkasm refactor; but for now, we need
to return ENOTSUP in the case that we encounter a shuffle mask combination
that doesn't actually exist. (In practice, this only happens for degenerate
no-op tests like 1->1 shuffles)

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c             | 102 ++++++++++++-------------
 libswscale/x86/ops_common.asm    | 127 ++++++++++++++++++++-----------
 libswscale/x86/uops_macros.asm.h |   1 +
 3 files changed, 132 insertions(+), 98 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index db445135f8..0e0ecdfd21 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -474,6 +474,11 @@ SWS_DECL_FUNC(ff_sws_process2_x86);
 SWS_DECL_FUNC(ff_sws_process3_x86);
 SWS_DECL_FUNC(ff_sws_process4_x86);
 
+/* Declare packed shuffle functions */
+SWS_FOR_STRUCT(U8, RW_SHUFFLE, DECL_ENTRY, _sse4,   NULL, NULL)
+SWS_FOR_STRUCT(U8, RW_SHUFFLE, DECL_ENTRY, _avx2,   NULL, NULL)
+SWS_FOR_STRUCT(U8, RW_SHUFFLE, DECL_ENTRY, _avx512, NULL, NULL)
+
 static int movsize(const int bytes, const int mmsize)
 {
     return bytes <= 4 ? 4 : /* movd */
@@ -481,28 +486,30 @@ static int movsize(const int bytes, const int mmsize)
            mmsize;          /* movu */
 }
 
-static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
+static int translate_shuffle(const SwsUOp *uop, int mmsize, SwsCompiledOp *out)
 {
-    uint8_t shuffle[16];
-    int read_bytes, write_bytes;
-    int pixels;
-
-    /* Solve the shuffle mask for one 128-bit lane only */
-    pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, 
&write_bytes);
-    if (pixels < 0)
-        return pixels;
-
     /* We can't shuffle across lanes, so restrict the vector size to XMM
      * whenever the read/write size would be a subset of the full vector */
-    if (read_bytes < 16 || write_bytes < 16)
+    const SwsShuffleUOp *par = &uop->par.shuffle;
+    const int lane_aligned = par->read_size == par->write_size &&
+                             16 % par->read_size == 0;
+    if (!lane_aligned)
         mmsize = 16;
 
-    const int num_lanes = mmsize / 16;
-    const int in_total  = num_lanes * read_bytes;
-    const int out_total = num_lanes * write_bytes;
+    /* Generate the shuffle mask */
+    const int mask_size = lane_aligned ? 16 : mmsize;
+    int8_t *mask = av_malloc(mask_size);
+    if (!mask)
+        return AVERROR(ENOMEM);
 
+    const int pixels = ff_sws_shuffle_mask(uop, mask, mask_size);
+    const int read_chunk  = pixels * par->read_size;
+    const int write_chunk = pixels * par->write_size;
+    const int num_lanes   = lane_aligned ? mmsize / 16 : 1;
+    const int in_total    = num_lanes * read_chunk;
+    const int out_total   = num_lanes * write_chunk;
     *out = (SwsCompiledOp) {
-        .priv        = av_memdup(shuffle, sizeof(shuffle)),
+        .priv        = mask,
         .free        = av_free,
         .slice_align = 1,
         .block_size  = pixels * num_lanes,
@@ -513,36 +520,20 @@ static int solve_shuffle(const SwsOpList *ops, int 
mmsize, SwsCompiledOp *out)
                                      AV_CPU_FLAG_SSE4,
     };
 
-    if (!out->priv)
-        return AVERROR(ENOMEM);
-
-#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)                                      
 \
+#define ASSIGN_SHUFFLE_FUNC(EXT, NAME, ...)                                    
 \
 do {                                                                           
 \
-    SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT);                      
 \
-    if (in_total == IN && out_total == OUT)                                    
 \
-        out->func = ff_packed_shuffle##IN##_##OUT##_##EXT;                     
 \
-} while (0)
+    const SwsOpEntry *entry = &op_##NAME##EXT;                                 
 \
+    if (!memcmp(&uop->par, &entry->par, sizeof(uop->par)))                     
 \
+        out->func = (SwsOpFunc) entry->func;                                   
 \
+} while (0);
 
-    ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
-    ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
-    ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
-    ASSIGN_SHUFFLE_FUNC(16,  8, sse4);
-    ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
-    ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
-    ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
-    ASSIGN_SHUFFLE_FUNC(15,  5, sse4);
-    ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
-    ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
-    ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
-    ASSIGN_SHUFFLE_FUNC(16,  4, sse4);
-    ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
-    ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
-    ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
-    ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
-    ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
-    ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
-    av_assert1(out->func);
-    return 0;
+    switch (mmsize) {
+    case 16: SWS_FOR(U8, RW_SHUFFLE, ASSIGN_SHUFFLE_FUNC, _sse4);   break;
+    case 32: SWS_FOR(U8, RW_SHUFFLE, ASSIGN_SHUFFLE_FUNC, _avx2);   break;
+    case 64: SWS_FOR(U8, RW_SHUFFLE, ASSIGN_SHUFFLE_FUNC, _avx512); break;
+    }
+
+    return out->func ? 0 : AVERROR(ENOTSUP);
 }
 
 /* Expand pixel value to 32-bits by repeating as necessary */
@@ -576,15 +567,7 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
     else
         return AVERROR(ENOTSUP);
 
-    /* Special fast path for in-place packed shuffle */
-    ret = solve_shuffle(ops, mmsize, out);
-    if (ret != AVERROR(ENOTSUP))
-        return ret;
-
-    SwsOpChain *chain = ff_sws_op_chain_alloc();
-    if (!chain)
-        return AVERROR(ENOMEM);
-
+    SwsOpChain *chain = NULL;
     SwsUOpList *uops = ff_sws_uop_list_alloc();
     if (!uops) {
         ret = AVERROR(ENOMEM);
@@ -599,6 +582,21 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
     if (ret < 0)
         goto fail;
 
+    if (uops->num_ops == 1 && uops->ops[0].uop == SWS_UOP_RW_SHUFFLE) {
+        const SwsUOp *uop = &uops->ops[0];
+        char name[SWS_UOP_NAME_MAX];
+        ff_sws_uop_name(uop, name);
+        ret = translate_shuffle(uop, mmsize, out);
+        if (ret >= 0)
+            av_log(ctx, AV_LOG_VERBOSE, "Using x86 packed shuffle fast path: 
%s\n", name);
+        ff_sws_uop_list_free(&uops);
+        return ret;
+    }
+
+    chain = ff_sws_op_chain_alloc();
+    if (!chain)
+        return AVERROR(ENOMEM);
+
     *out = (SwsCompiledOp) {
         /* Use at most two full YMM regs during the widest precision section */
         .block_size  = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
index a5dd105da5..821b7b44e9 100644
--- a/libswscale/x86/ops_common.asm
+++ b/libswscale/x86/ops_common.asm
@@ -116,17 +116,16 @@ process_fn 4
 
 ; This is a special entry point for handling a subset of operation chains
 ; that can be reduced down to a single `pshufb` shuffle mask. For more details
-; about when this works, refer to the documentation of `ff_sws_solve_shuffle`.
+; about when this works, refer to `solve_shuffle()` in ops_optimizer.c.
 ;
-; We specialize this function for every possible combination of pixel strides.
-; For example, gray -> gray16 is classified as an "8, 16" operation because it
-; takes 8 bytes and expands them out to 16 bytes in each application of the
-; 128-bit shuffle mask.
+; This macro gets instantiated for every parameter combination of
+; SWS_UOP_RW_SHUFFLE, which embeds the clear value and read and write sizes.
 ;
-; Since pshufb can't shuffle across lanes, we only instantiate SSE4 versions 
for
-; all shuffles that are not a clean multiple of 128 bits (e.g. rgb24 -> rgb0).
-; For the clean multiples (e.g. rgba -> argb), we also define AVX2 and AVX512
-; versions that can handle a larger number of bytes at once.
+; Since pshufb can't shuffle across lanes, we only call SSE4 versions for
+; all shuffles that are not a clean multiple of 128 bits (e.g. rgb24 -> rgb0),
+; unless we have access to AVX-512 vpermb, or if the lanes are all independent,
+; in which case we can also use `pshufb` on mmsize == 32/64. This is detected
+; by the `LANE_ALIGNED` condition.
 
 %macro MOVSIZE 3 ; size, dst, src
     %if %1 <= 4
@@ -138,63 +137,99 @@ process_fn 4
     %endif
 %endmacro
 
-%macro packed_shuffle 2 ; size_in, size_out
-cglobal packed_shuffle%1_%2, 6, 10, 2, \
-    exec, shuffle, bx, y, bxend, yend, src, dst, src_stride, dst_stride
+%macro RW_SHUFFLE 3
+%assign CLEAR_VALUE %1
+%assign READ_SIZE   %2
+%assign WRITE_SIZE  %3
+
+%assign LANE_ALIGNED (READ_SIZE == WRITE_SIZE && 16 % READ_SIZE == 0)
+%assign MAX_SIZE     (READ_SIZE > WRITE_SIZE ? READ_SIZE : WRITE_SIZE)
+
+; Round up sizes to multiple of mmsize
+%assign PIXELS       (mmsize / MAX_SIZE)
+%assign READ_SIZE    (READ_SIZE * PIXELS)
+%assign WRITE_SIZE   (WRITE_SIZE * PIXELS)
+
+cglobal NAME, 6, 10, 2, exec, shuffle, bx, y, bxend, yend, src, dst, 
src_stride, dst_stride
+%if mmsize > 16 && !LANE_ALIGNED
+            ud2 ; runtime checks should prevent this variant from being called
+%else
             mov srcq, [execq + SwsOpExec.in0]
             mov dstq, [execq + SwsOpExec.out0]
             mov src_strideq, [execq + SwsOpExec.in_stride0]
             mov dst_strideq, [execq + SwsOpExec.out_stride0]
-            VBROADCASTI128 m1, [shuffleq]
+
+            ; setup shuffle mask
+            VBROADCASTI128 m0, [shuffleq]
+    %if cpuflag(avx512) && CLEAR_VALUE != 0
+            vpmovb2m k1, m0 ; needed for vpblendmb
+    %endif
+
+            ; setup clear value register if needed
+    %if CLEAR_VALUE == 0xFF
+        %if cpuflag(avx512)
+            vpternlogd m2, m2, m2, 0xff
+        %else
+            pcmpeqb m2, m2
+        %endif
+    %elif CLEAR_VALUE != 0 ; clear-to-0 is implicitly handled by pshufb
+            mov shuffled, CLEAR_VALUE * 0x1010101
+            movd xm2, shuffled
+            VPBROADCASTD m2, xm2
+    %endif
+
+            ; setup loop bounds and variables
             sub bxendd, bxd
             sub yendd, yd
             ; reuse now-unneeded regs
-    %define srcidxq execq
-            imul srcidxq, bxendq, -%1
-%if %1 = %2
-    %define dstidxq srcidxq
-%else
-    %define dstidxq shuffleq ; no longer needed reg
-            imul dstidxq, bxendq, -%2
-%endif
+            %define srcidxq execq
+            imul srcidxq, bxendq, -READ_SIZE
+    %if READ_SIZE == WRITE_SIZE
+            %define dstidxq srcidxq
+    %else
+            %define dstidxq shuffleq ; no longer needed reg
+            imul dstidxq, bxendq, -WRITE_SIZE
+    %endif
             sub srcq, srcidxq
             sub dstq, dstidxq
+
 .loop:
-            MOVSIZE %1, m0, [srcq + srcidxq]
-            pshufb m0, m1
-            MOVSIZE %2, [dstq + dstidxq], m0
-            add srcidxq, %1
-IF %1 != %2,add dstidxq, %2
+            MOVSIZE READ_SIZE, m1, [srcq + srcidxq]
+            pshufb m1, m0
+
+    %if CLEAR_VALUE != 0
+        %if cpuflag(avx512)
+            vpblendmb m1{k1}, m1, m2
+        %elif avx_enabled
+            vpblendvb m1, m1, m2, m0
+        %else
+            pblendvb m1, m2
+        %endif
+    %endif
+
+            MOVSIZE WRITE_SIZE, [dstq + dstidxq], m1
+            add srcidxq, READ_SIZE
+    %if READ_SIZE != WRITE_SIZE
+            add dstidxq, WRITE_SIZE
+    %endif
             jnz .loop
             add srcq, src_strideq
             add dstq, dst_strideq
-            imul srcidxq, bxendq, -%1
-IF %1 != %2,imul dstidxq, bxendq, -%2
+            imul srcidxq, bxendq, -READ_SIZE
+    %if READ_SIZE != WRITE_SIZE
+            imul dstidxq, bxendq, -WRITE_SIZE
+    %endif
             dec yendd
             jnz .loop
             RET
+%endif
 %endmacro
 
 INIT_XMM sse4
-packed_shuffle  5, 15 ;  8 -> 24
-packed_shuffle  4, 16 ;  8 -> 32, 16 -> 64
-packed_shuffle  2, 12 ;  8 -> 48
-packed_shuffle 16,  8 ; 16 -> 8
-packed_shuffle 10, 15 ; 16 -> 24
-packed_shuffle  8, 16 ; 16 -> 32, 32 -> 64
-packed_shuffle  4, 12 ; 16 -> 48
-packed_shuffle 15,  5 ; 24 -> 8
-packed_shuffle 15, 15 ; 24 -> 24
-packed_shuffle 12, 16 ; 24 -> 32
-packed_shuffle  6, 12 ; 24 -> 48
-packed_shuffle 16,  4 ; 32 -> 8,  64 -> 16
-packed_shuffle 16, 12 ; 32 -> 24, 64 -> 48
-packed_shuffle 16, 16 ; 32 -> 32, 64 -> 64
-packed_shuffle  8, 12 ; 32 -> 48
-packed_shuffle 12, 12 ; 48 -> 48
+DECL_U8_RW_SHUFFLE (RW_SHUFFLE)
 
 INIT_YMM avx2
-packed_shuffle 32, 32
+DECL_U8_RW_SHUFFLE (RW_SHUFFLE)
 
 INIT_ZMM avx512
-packed_shuffle 64, 64
+DECL_U8_RW_SHUFFLE (RW_SHUFFLE)
diff --git a/libswscale/x86/uops_macros.asm.h b/libswscale/x86/uops_macros.asm.h
index d9565d12f2..c33af49da4 100644
--- a/libswscale/x86/uops_macros.asm.h
+++ b/libswscale/x86/uops_macros.asm.h
@@ -61,6 +61,7 @@
     {DEF_MACRO(WRITE_NIBBLE,        TYPE)}, \
     {DEF_MACRO(WRITE_PACKED,        TYPE)}, \
     {DEF_MACRO(WRITE_PLANAR,        TYPE)}, \
+    {DEF_MACRO(RW_SHUFFLE,          TYPE)}, \
     {DEF_MACRO(MOVE,                TYPE)}, \
     {DEF_MACRO(SWAP_BYTES,          TYPE)}, \
     {DEF_MACRO(EXPAND_BIT,          TYPE)}, \
-- 
2.52.0


>From b7bfb80494fff99ac615e860c3a149d132d734fd Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 3 Jun 2026 20:43:06 +0200
Subject: [PATCH 8/9] swscale/x86/ops: add AVX512 vpermb packed shuffle path

We can quite easily generalize the existing AVX512 pshufb implementation to
support vpermb when the shuffle mask is cross-lane.

  rgb24 1920x1080 -> rgba 1920x1080, speedup=1.372x faster

This does require some finesse w.r.t. the MOVSIZE macro, to avoid reading or
writing too many bits. In particular, for gray->rgba64le, the pattern even
devolves into 8/64 which would trigger an illegal operand error when combining
movq with zmm registers.

Other than that, we can use {k1}{z} zero masking to cleanly recreate the
semantics of pshufb's implicit zero masking, except for non-zero clear
values where we continue doing what we already do.

Lastly, while EVEX-encoded vpermb can take the source operand directly, this
somewhat defeats the purpose of the READ size optimization; in practice it
was better to keep the read and vpermb separate.

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c          | 13 +++++----
 libswscale/x86/ops_common.asm | 55 +++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 0e0ecdfd21..4aacb960c4 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -481,19 +481,22 @@ SWS_FOR_STRUCT(U8, RW_SHUFFLE, DECL_ENTRY, _avx512, NULL, 
NULL)
 
 static int movsize(const int bytes, const int mmsize)
 {
-    return bytes <= 4 ? 4 : /* movd */
-           bytes <= 8 ? 8 : /* movq */
-           mmsize;          /* movu */
+    return bytes <= 4  ? 4  : /* movd */
+           bytes <= 8  ? 8  : /* movq */
+           bytes <= 16 ? 16 : /* xmm movu */
+           bytes <= 32 ? 32 : /* ymm movu */
+           mmsize;            /* zmm movu */
 }
 
 static int translate_shuffle(const SwsUOp *uop, int mmsize, SwsCompiledOp *out)
 {
     /* We can't shuffle across lanes, so restrict the vector size to XMM
-     * whenever the read/write size would be a subset of the full vector */
+     * whenever the read/write size would be a subset of the full vector,
+     * unless we have access to AVX-512 vpermb */
     const SwsShuffleUOp *par = &uop->par.shuffle;
     const int lane_aligned = par->read_size == par->write_size &&
                              16 % par->read_size == 0;
-    if (!lane_aligned)
+    if (!lane_aligned && mmsize < 64)
         mmsize = 16;
 
     /* Generate the shuffle mask */
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
index 821b7b44e9..24c606426a 100644
--- a/libswscale/x86/ops_common.asm
+++ b/libswscale/x86/ops_common.asm
@@ -127,13 +127,31 @@ process_fn 4
 ; in which case we can also use `pshufb` on mmsize == 32/64. This is detected
 ; by the `LANE_ALIGNED` condition.
 
-%macro MOVSIZE 3 ; size, dst, src
-    %if %1 <= 4
-        movd %2, %3
-    %elif %1 <= 8
-        movq %2, %3
+%macro READ 2 ; dst, src
+    %if READ_SIZE <= 4
+        movd xmm%1, %2
+    %elif READ_SIZE <= 8
+        movq xmm%1, %2
+    %elif READ_SIZE <= 16
+        movu xmm%1, %2
+    %elif READ_SIZE <= 32
+        movu ymm%1, %2
     %else
-        movu %2, %3
+        movu zmm%1, %2
+    %endif
+%endmacro
+
+%macro WRITE 2 ; dst, src
+    %if WRITE_SIZE <= 4
+        movd %1, xmm%2
+    %elif WRITE_SIZE <= 8
+        movq %1, xmm%2
+    %elif WRITE_SIZE <= 16
+        movu %1, xmm%2
+    %elif WRITE_SIZE <= 32
+        movu %1, ymm%2
+    %else
+        movu %1, zmm%2
     %endif
 %endmacro
 
@@ -151,7 +169,7 @@ process_fn 4
 %assign WRITE_SIZE   (WRITE_SIZE * PIXELS)
 
 cglobal NAME, 6, 10, 2, exec, shuffle, bx, y, bxend, yend, src, dst, 
src_stride, dst_stride
-%if mmsize > 16 && !LANE_ALIGNED
+%if mmsize > 16 && !LANE_ALIGNED && !cpuflag(avx512)
             ud2 ; runtime checks should prevent this variant from being called
 %else
             mov srcq, [execq + SwsOpExec.in0]
@@ -160,9 +178,16 @@ cglobal NAME, 6, 10, 2, exec, shuffle, bx, y, bxend, yend, 
src, dst, src_stride,
             mov dst_strideq, [execq + SwsOpExec.out_stride0]
 
             ; setup shuffle mask
+    %if LANE_ALIGNED
             VBROADCASTI128 m0, [shuffleq]
-    %if cpuflag(avx512) && CLEAR_VALUE != 0
-            vpmovb2m k1, m0 ; needed for vpblendmb
+    %else
+            mova m0, [shuffleq]
+    %endif
+    %if cpuflag(avx512)
+            vpmovb2m k1, m0 ; needed for vpblendmb / vpermb
+        %if CLEAR_VALUE == 0
+            knotq k1, k1
+        %endif
     %endif
 
             ; setup clear value register if needed
@@ -172,7 +197,7 @@ cglobal NAME, 6, 10, 2, exec, shuffle, bx, y, bxend, yend, 
src, dst, src_stride,
         %else
             pcmpeqb m2, m2
         %endif
-    %elif CLEAR_VALUE != 0 ; clear-to-0 is implicitly handled by pshufb
+    %elif CLEAR_VALUE != 0 ; clear-to-0 is implicitly handled by pshufb / 
vpermb
             mov shuffled, CLEAR_VALUE * 0x1010101
             movd xm2, shuffled
             VPBROADCASTD m2, xm2
@@ -194,8 +219,14 @@ cglobal NAME, 6, 10, 2, exec, shuffle, bx, y, bxend, yend, 
src, dst, src_stride,
             sub dstq, dstidxq
 
 .loop:
-            MOVSIZE READ_SIZE, m1, [srcq + srcidxq]
+            READ m1, [srcq + srcidxq]
+    %if LANE_ALIGNED
             pshufb m1, m0
+    %elif CLEAR_VALUE == 0
+            vpermb m1{k1}{z}, m0, m1
+    %else
+            vpermb m1, m0, m1
+    %endif
 
     %if CLEAR_VALUE != 0
         %if cpuflag(avx512)
@@ -207,7 +238,7 @@ cglobal NAME, 6, 10, 2, exec, shuffle, bx, y, bxend, yend, 
src, dst, src_stride,
         %endif
     %endif
 
-            MOVSIZE WRITE_SIZE, [dstq + dstidxq], m1
+            WRITE [dstq + dstidxq], m1
             add srcidxq, READ_SIZE
     %if READ_SIZE != WRITE_SIZE
             add dstidxq, WRITE_SIZE
-- 
2.52.0


>From 8f8796b24e0dd272ea0e06c10176d2897e1cfeb8 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 3 Jun 2026 19:16:21 +0200
Subject: [PATCH 9/9] swscale/ops_optimizer: remove now-unused
 ff_sws_solve_shuffle()

Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_internal.h  | 28 -----------
 libswscale/ops_optimizer.c | 96 --------------------------------------
 2 files changed, 124 deletions(-)

diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h
index 9d8da6bbb5..0bf3180a0d 100644
--- a/libswscale/ops_internal.h
+++ b/libswscale/ops_internal.h
@@ -52,34 +52,6 @@ static inline void ff_sws_pack_op_decode(const SwsOp *op, 
uint64_t mask[4], int
     }
 }
 
-/**
- * "Solve" an op list into a fixed shuffle mask, with an optional ability to
- * also directly clear the output value (for e.g. rgb24 -> rgb0). This can
- * accept any operation chain that only consists of the following operations:
- *
- * - SWS_OP_READ (non-planar, non-fractional)
- * - SWS_OP_SWIZZLE
- * - SWS_OP_SWAP_BYTES
- * - SWS_OP_CLEAR to zero (when clear_val is specified)
- * - SWS_OP_CONVERT (integer expand)
- * - SWS_OP_WRITE (non-planar, non-fractional)
- *
- * Basically, any operation that purely consists of moving around and 
reordering
- * bytes within a single plane, can be turned into a shuffle mask.
- *
- * @param ops         The operation list to decompose.
- * @param shuffle     The output shuffle mask.
- * @param size        The size (in bytes) of the output shuffle mask.
- * @param clear_val   If nonzero, this index will be used to clear the output.
- * @param read_bytes  Returns the number of bytes read per shuffle iteration.
- * @param write_bytes Returns the number of bytes written per shuffle 
iteration.
- *
- * @return  The number of pixels processed per iteration, or a negative error
-            code; in particular AVERROR(ENOTSUP) for unsupported operations.
- */
-int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size,
-                         uint8_t clear_val, int *read_bytes, int *write_bytes);
-
 /**
  * Eliminate SWS_OP_FILTER_* operations by merging them with prior SWS_OP_READ
  * operations. This may require splitting the op list into multiple subpasses,
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 980cc41b69..b4d2894edf 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -795,102 +795,6 @@ retry:
     return 0;
 }
 
-int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
-                         int size, uint8_t clear_val,
-                         int *read_bytes, int *write_bytes)
-{
-    if (!ops->num_ops)
-        return AVERROR(EINVAL);
-
-    const SwsOp *read = ff_sws_op_list_input(ops);
-    if (!read || read->rw.frac || read->rw.filter ||
-        (!read->rw.packed && read->rw.elems > 1))
-        return AVERROR(ENOTSUP);
-
-    const int read_size = ff_sws_pixel_type_size(read->type);
-    uint32_t mask[4] = {0};
-    for (int i = 0; i < read->rw.elems; i++)
-        mask[i] = 0x01010101 * i * read_size + 0x03020100;
-
-    for (int opidx = 1; opidx < ops->num_ops; opidx++) {
-        const SwsOp *op = &ops->ops[opidx];
-        switch (op->op) {
-        case SWS_OP_SWIZZLE: {
-            uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
-            for (int i = 0; i < 4; i++)
-                mask[i] = orig[op->swizzle.in[i]];
-            break;
-        }
-
-        case SWS_OP_SWAP_BYTES:
-            for (int i = 0; i < 4; i++) {
-                switch (ff_sws_pixel_type_size(op->type)) {
-                case 2: mask[i] = av_bswap16(mask[i]); break;
-                case 4: mask[i] = av_bswap32(mask[i]); break;
-                }
-            }
-            break;
-
-        case SWS_OP_CLEAR:
-            for (int i = 0; i < 4; i++) {
-                if (!SWS_COMP_TEST(op->clear.mask, i))
-                    continue;
-                if (op->clear.value[i].num != 0 || !clear_val)
-                    return AVERROR(ENOTSUP);
-                mask[i] = 0x1010101ul * clear_val;
-            }
-            break;
-
-        case SWS_OP_CONVERT: {
-            if (!op->convert.expand)
-                return AVERROR(ENOTSUP);
-            for (int i = 0; i < 4; i++) {
-                switch (ff_sws_pixel_type_size(op->type)) {
-                case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
-                case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
-                }
-            }
-            break;
-        }
-
-        case SWS_OP_WRITE: {
-            if (op->rw.frac || op->rw.filter ||
-                (!op->rw.packed && op->rw.elems > 1))
-                return AVERROR(ENOTSUP);
-
-            /* Initialize to no-op */
-            memset(shuffle, clear_val, size);
-
-            const int write_size  = ff_sws_pixel_type_size(op->type);
-            const int read_chunk  = read->rw.elems * read_size;
-            const int write_chunk = op->rw.elems * write_size;
-            const int num_groups  = size / FFMAX(read_chunk, write_chunk);
-            for (int n = 0; n < num_groups; n++) {
-                const int base_in  = n * read_chunk;
-                const int base_out = n * write_chunk;
-                for (int i = 0; i < op->rw.elems; i++) {
-                    const int offset = base_out + i * write_size;
-                    for (int b = 0; b < write_size; b++) {
-                        const uint8_t idx = mask[i] >> (b * 8);
-                        if (idx != clear_val)
-                            shuffle[offset + b] = base_in + idx;
-                    }
-                }
-            }
-
-            *read_bytes  = num_groups * read_chunk;
-            *write_bytes = num_groups * write_chunk;
-            return num_groups;
-        }
-
-        default:
-            return AVERROR(ENOTSUP);
-        }
-    }
-
-    return AVERROR(EINVAL);
-}
-
 int ff_sws_shuffle_mask(const SwsUOp *uop, int8_t shuffle[], int size)
 {
     const SwsShuffleUOp *par = &uop->par.shuffle;
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] swscale/ops_optimizer: rewrite shuffle solver as a dedicated UOP (PR #23440)

Reply via email to