PR #23439 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23439
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23439.patch

Split off from my internal swscale development branch; I wanted to get these in 
early-ish as they affect other swscale backends a bit.


>From 7592415faaaa44686d2daad42fa81cf6dc632d71 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 2 Jun 2026 14:28:56 +0200
Subject: [PATCH 01/14] swscale/ops: group filtered rw metadata into struct

This is a minor cosmetic improvement that allows me to use more
convenient names for the filter-related metadata fields, without
confusion.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/aarch64/ops_impl_conv.c |  4 ++--
 libswscale/ops.c                   | 16 +++++++--------
 libswscale/ops.h                   |  6 ++++--
 libswscale/ops_dispatch.c          |  6 +++---
 libswscale/ops_memcpy.c            |  4 ++--
 libswscale/ops_optimizer.c         | 20 +++++++++----------
 libswscale/uops.c                  |  8 ++++----
 libswscale/vulkan/ops.c            | 32 +++++++++++++++---------------
 tests/checkasm/sw_ops.c            | 14 +++++++------
 9 files changed, 57 insertions(+), 53 deletions(-)

diff --git a/libswscale/aarch64/ops_impl_conv.c 
b/libswscale/aarch64/ops_impl_conv.c
index a66b91b6fb..b360e86a8d 100644
--- a/libswscale/aarch64/ops_impl_conv.c
+++ b/libswscale/aarch64/ops_impl_conv.c
@@ -78,7 +78,7 @@ static int convert_to_aarch64_impl(SwsContext *ctx, const 
SwsOpList *ops, int n,
     /* Map SwsOpType to SwsAArch64OpType */
     switch (op->op) {
     case SWS_OP_READ:
-        if (op->rw.filter)
+        if (op->rw.filter.op)
             return AVERROR(ENOTSUP);
         /**
          * The different types of read operations have been split into
@@ -94,7 +94,7 @@ static int convert_to_aarch64_impl(SwsContext *ctx, const 
SwsOpList *ops, int n,
             out->op = AARCH64_SWS_OP_READ_PLANAR;
         break;
     case SWS_OP_WRITE:
-        if (op->rw.filter)
+        if (op->rw.filter.op)
             return AVERROR(ENOTSUP);
         /**
          * The different types of write operations have been split into
diff --git a/libswscale/ops.c b/libswscale/ops.c
index b28dbec75f..f18249cc1f 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -382,9 +382,9 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
                 op->comps.max[i]   = prev.max[i];
             }
 
-            if (op->rw.filter) {
+            if (op->rw.filter.op) {
                 const SwsComps prev = op->comps;
-                apply_filter_weights(&op->comps, &prev, op->rw.kernel);
+                apply_filter_weights(&op->comps, &prev, op->rw.filter.kernel);
             }
             break;
         case SWS_OP_SWAP_BYTES:
@@ -590,7 +590,7 @@ static void op_uninit(SwsOp *op)
 {
     switch (op->op) {
     case SWS_OP_READ:
-        av_refstruct_unref(&op->rw.kernel);
+        av_refstruct_unref(&op->rw.filter.kernel);
         break;
     case SWS_OP_DITHER:
         av_refstruct_unref(&op->dither.matrix);
@@ -652,8 +652,8 @@ SwsOpList *ff_sws_op_list_duplicate(const SwsOpList *ops)
         const SwsOp *op = &copy->ops[i];
         switch (op->op) {
         case SWS_OP_READ:
-            if (op->rw.kernel)
-                av_refstruct_ref(op->rw.kernel);
+            if (op->rw.filter.kernel)
+                av_refstruct_ref(op->rw.filter.kernel);
             break;
         case SWS_OP_DITHER:
             av_refstruct_ref(op->dither.matrix);
@@ -866,12 +866,12 @@ void ff_sws_op_desc(AVBPrint *bp, const SwsOp *op)
         av_bprintf(bp, "%-20s: %d elem(s) %s >> %d", name,
                    op->rw.elems,  op->rw.packed ? "packed" : "planar",
                    op->rw.frac);
-        if (!op->rw.filter)
+        if (!op->rw.filter.op)
             break;
-        const SwsFilterWeights *kernel = op->rw.kernel;
+        const SwsFilterWeights *kernel = op->rw.filter.kernel;
         av_bprintf(bp, " + %d tap %s filter (%c)",
                    kernel->filter_size, kernel->name,
-                   op->rw.filter == SWS_OP_FILTER_H ? 'H' : 'V');
+                   op->rw.filter.op == SWS_OP_FILTER_H ? 'H' : 'V');
         break;
     case SWS_OP_LSHIFT:
         av_bprintf(bp, "%-20s: << %u", name, op->shift.amount);
diff --git a/libswscale/ops.h b/libswscale/ops.h
index ad0888f7a0..b58a060c6e 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -104,8 +104,10 @@ typedef struct SwsReadWriteOp {
      * Note: As with SWS_OP_FILTER_*, if a filter kernel is in use, the read
      * operation will always output floating point values.
      */
-    SwsOpType filter;         /* some value of SWS_OP_FILTER_* */
-    SwsFilterWeights *kernel; /* (refstruct) */
+    struct {
+        SwsOpType op;               /* some value of SWS_OP_FILTER_* */
+        SwsFilterWeights *kernel;   /* (refstruct) */
+    } filter;
 } SwsReadWriteOp;
 
 typedef struct SwsPackOp {
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 7a9a527bcb..1186a8a73e 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -551,8 +551,8 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         p->idx_out[i] = i < p->planes_out ? ops->plane_dst[i] : -1;
     }
 
-    const SwsFilterWeights *filter = read->rw.kernel;
-    if (read->rw.filter == SWS_OP_FILTER_V) {
+    const SwsFilterWeights *filter = read->rw.filter.kernel;
+    if (read->rw.filter.op == SWS_OP_FILTER_V) {
         p->offsets_y = av_refstruct_ref(filter->offsets);
 
         /* Compute relative pointer bumps for each output line */
@@ -570,7 +570,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         }
         bump[filter->dst_size - 1] = 0;
         p->exec_base.in_bump_y = bump;
-    } else if (read->rw.filter == SWS_OP_FILTER_H) {
+    } else if (read->rw.filter.op == SWS_OP_FILTER_H) {
         /* Compute pixel offset map for each output line */
         const int pixels = FFALIGN(filter->dst_size, p->comp.block_size);
         int32_t *offset = av_malloc_array(pixels, sizeof(*offset));
diff --git a/libswscale/ops_memcpy.c b/libswscale/ops_memcpy.c
index cce7aa3013..00f3e79608 100644
--- a/libswscale/ops_memcpy.c
+++ b/libswscale/ops_memcpy.c
@@ -78,7 +78,7 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
         const SwsOp *op = &ops->ops[n];
         switch (op->op) {
         case SWS_OP_READ:
-            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac || 
op->rw.filter)
+            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac || 
op->rw.filter.op)
                 return AVERROR(ENOTSUP);
             for (int i = 0; i < op->rw.elems; i++)
                 p.index[i] = i;
@@ -121,7 +121,7 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
             break;
 
         case SWS_OP_WRITE:
-            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac || 
op->rw.filter)
+            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac || 
op->rw.filter.op)
                 return AVERROR(ENOTSUP);
             p.num_planes = op->rw.elems;
             break;
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 619c6cf42b..829bd3d0c9 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -412,7 +412,7 @@ retry:
                     op->rw.elems = nb_planes;
                     RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
                         .op = SWS_OP_SWIZZLE,
-                        .type = op->rw.filter ? SWS_PIXEL_F32 : op->type,
+                        .type = op->rw.filter.op ? SWS_PIXEL_F32 : op->type,
                         .swizzle = swiz,
                     }));
                     goto retry;
@@ -735,10 +735,10 @@ retry:
         case SWS_OP_FILTER_H:
         case SWS_OP_FILTER_V:
             /* Merge with prior simple planar read */
-            if (prev->op == SWS_OP_READ && !prev->rw.filter &&
+            if (prev->op == SWS_OP_READ && !prev->rw.filter.op &&
                 !prev->rw.packed && !prev->rw.frac) {
-                prev->rw.filter = op->op;
-                prev->rw.kernel = av_refstruct_ref(op->filter.kernel);
+                prev->rw.filter.op = op->op;
+                prev->rw.filter.kernel = av_refstruct_ref(op->filter.kernel);
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
@@ -803,7 +803,7 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
         return AVERROR(EINVAL);
 
     const SwsOp *read = ff_sws_op_list_input(ops);
-    if (!read || read->rw.frac || read->rw.filter ||
+    if (!read || read->rw.frac || read->rw.filter.op ||
         (!read->rw.packed && read->rw.elems > 1))
         return AVERROR(ENOTSUP);
 
@@ -854,7 +854,7 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
         }
 
         case SWS_OP_WRITE: {
-            if (op->rw.frac || op->rw.filter ||
+            if (op->rw.frac || op->rw.filter.op ||
                 (!op->rw.packed && op->rw.elems > 1))
                 return AVERROR(ENOTSUP);
 
@@ -938,10 +938,10 @@ static void get_input_size(const SwsOpList *ops, 
SwsFormat *fmt)
     fmt->height = ops->src.height;
 
     const SwsOp *read = ff_sws_op_list_input(ops);
-    if (read && read->rw.filter == SWS_OP_FILTER_V) {
-        fmt->height = read->rw.kernel->dst_size;
-    } else if (read && read->rw.filter == SWS_OP_FILTER_H) {
-        fmt->width = read->rw.kernel->dst_size;
+    if (read && read->rw.filter.op == SWS_OP_FILTER_V) {
+        fmt->height = read->rw.filter.kernel->dst_size;
+    } else if (read && read->rw.filter.op == SWS_OP_FILTER_H) {
+        fmt->width = read->rw.filter.kernel->dst_size;
     }
 }
 
diff --git a/libswscale/uops.c b/libswscale/uops.c
index 7af8a8af51..2de9c33eac 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -475,16 +475,16 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList 
*ops, SwsUOpFlags flags,
     };
 
     /* Non-filtered reads don't care about the exact pixel contents */
-    if (!op->rw.filter)
+    if (!op->rw.filter.op)
         uop.type = pixel_type_to_int(op->type);
 
     const bool is_read = op->op == SWS_OP_READ;
-    if (op->rw.filter) {
+    if (op->rw.filter.op) {
         if (op->op == SWS_OP_WRITE || op->rw.frac || op->rw.packed)
             return AVERROR(ENOTSUP);
         uop.par.filter.type = SWS_PIXEL_F32;
-        uop.data.kernel = av_refstruct_ref(op->rw.kernel);
-        if (op->rw.filter == SWS_OP_FILTER_H) {
+        uop.data.kernel = av_refstruct_ref(op->rw.filter.kernel);
+        if (op->rw.filter.op == SWS_OP_FILTER_H) {
             uop.uop = SWS_UOP_READ_PLANAR_FH;
         } else if (check_filter_fma(ctx, flags, op)) {
             uop.uop = SWS_UOP_READ_PLANAR_FV_FMA;
diff --git a/libswscale/vulkan/ops.c b/libswscale/vulkan/ops.c
index 702bbdf38f..e4f815124e 100644
--- a/libswscale/vulkan/ops.c
+++ b/libswscale/vulkan/ops.c
@@ -272,9 +272,9 @@ static int create_bufs(FFVulkanOpsCtx *s, VulkanPriv *p, 
const SwsOpList *ops)
                 goto fail;
             p->nb_data_bufs++;
         } else if ((op->op == SWS_OP_READ ||
-                    op->op == SWS_OP_WRITE) && op->rw.filter) {
+                    op->op == SWS_OP_WRITE) && op->rw.filter.op) {
             av_assert0(p->nb_data_bufs + 1 <= FF_ARRAY_ELEMS(p->data_bufs));
-            err = create_filter_buf(s, p, op->rw.kernel,
+            err = create_filter_buf(s, p, op->rw.filter.kernel,
                                     &p->data_bufs[p->nb_data_bufs]);
             if (err < 0)
                 goto fail;
@@ -981,12 +981,12 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
             d->id        = spi_get_id(spi);
             d->binding   = nb_data_bufs;
             var_id       = d->id;
-        } else if (op->op == SWS_OP_READ && op->rw.filter) {
+        } else if (op->op == SWS_OP_READ && op->rw.filter.op) {
             if (id->nb_filter_bufs >= MAX_FILT_BUFS)
                 return AVERROR(ENOTSUP);
-            const SwsFilterWeights *wd = op->rw.kernel;
+            const SwsFilterWeights *wd = op->rw.filter.kernel;
             struct FilterData *f = &id->filt[id->nb_filter_bufs++];
-            f->filter       = op->rw.filter;
+            f->filter       = op->rw.filter.op;
             f->filter_size  = wd->filter_size;
             f->dst_size     = wd->dst_size;
             f->num_weights  = wd->num_weights;
@@ -1130,9 +1130,9 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
 
         switch (op->op) {
         case SWS_OP_READ:
-            if (op->rw.frac) {
+            if (op->rw.frac || op->rw.filter.op) {
                 return AVERROR(ENOTSUP);
-            } else if (op->rw.filter) {
+            } else if (op->rw.filter.op) {
                 data = read_filtered(spi, id, ops, op,
                                      &id->filt[nb_filter_used++],
                                      in_img, gid, gi2);
@@ -1152,7 +1152,7 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
             }
             break;
         case SWS_OP_WRITE:
-            if (op->rw.frac || op->rw.filter) {
+            if (op->rw.frac || op->rw.filter.op) {
                 return AVERROR(ENOTSUP);
             } else if (op->rw.packed) {
                 spi_OpImageWrite(spi, out_img[ops->plane_dst[0]], dst_gid, 
data,
@@ -1316,13 +1316,13 @@ static void read_glsl(const SwsOpList *ops, const SwsOp 
*op, FFVulkanShader *shd
                       int idx, const char *type_name,
                       const char *type_v, const char *type_s)
 {
-    const SwsFilterWeights *wd = op->rw.kernel;
+    const SwsFilterWeights *wd = op->rw.filter.kernel;
     const int interlaced = ops->src.interlaced;
-    if (op->rw.filter) {
-        const char *axis    = op->rw.filter == SWS_OP_FILTER_H ? "pos.x" : 
"pos.y";
-        const char *coord_x = op->rw.filter == SWS_OP_FILTER_H ? "o + i" : 
"pos.x";
+    if (op->rw.filter.op) {
+        const char *axis    = op->rw.filter.op == SWS_OP_FILTER_H ? "pos.x" : 
"pos.y";
+        const char *coord_x = op->rw.filter.op == SWS_OP_FILTER_H ? "o + i" : 
"pos.x";
         const char *coord_y;
-        if (op->rw.filter == SWS_OP_FILTER_H)
+        if (op->rw.filter.op == SWS_OP_FILTER_H)
             coord_y = interlaced ? "spos.y" : "pos.y";
         else
             coord_y = interlaced ? "((o + i) * 2 + int(params.field))" : "o + 
i";
@@ -1411,10 +1411,10 @@ static int add_ops_glsl(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
             nb_desc++;
         } else if (op->op == SWS_OP_FILTER_H || op->op == SWS_OP_FILTER_V ||
                    ((op->op == SWS_OP_READ || op->op == SWS_OP_WRITE) &&
-                    op->rw.filter)) {
+                    op->rw.filter.op)) {
             const SwsFilterWeights *wd = (op->op == SWS_OP_READ ||
                                           op->op == SWS_OP_WRITE) ?
-                                         op->rw.kernel : op->filter.kernel;
+                                         op->rw.filter.kernel : 
op->filter.kernel;
             snprintf(data_buf_name[nb_desc], 256, "filter_buf%i", n);
             snprintf(data_str_name[nb_desc], 256,
                      "float filter_w%i[%i][%i];\n"
@@ -1486,7 +1486,7 @@ static int add_ops_glsl(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
         }
         case SWS_OP_WRITE: {
             const char *dst_pos = ops->dst.interlaced ? "dpos" : "pos";
-            if (op->rw.frac || op->rw.filter) {
+            if (op->rw.frac || op->rw.filter.op) {
                 return AVERROR(ENOTSUP);
             } else if (op->rw.packed) {
                 GLSLF(1, imageStore(dst_img[%i], %s, %s(%s));                  
 ,
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index 90eb9d3fb7..01ffb0e048 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -173,8 +173,8 @@ static void check_compiled(const char *name,
     }
 
     int32_t in_bump_y[LINES];
-    if (read_op->rw.filter == SWS_OP_FILTER_V) {
-        const int *offsets = read_op->rw.kernel->offsets;
+    if (read_op->rw.filter.op == SWS_OP_FILTER_V) {
+        const int *offsets = read_op->rw.filter.kernel->offsets;
         for (int y = 0; y < LINES - 1; y++)
             in_bump_y[y] = offsets[y + 1] - offsets[y] - 1;
         in_bump_y[LINES - 1] = 0;
@@ -182,8 +182,8 @@ static void check_compiled(const char *name,
     }
 
     int32_t in_offset_x[PIXELS];
-    if (read_op->rw.filter == SWS_OP_FILTER_H) {
-        const int *offsets = read_op->rw.kernel->offsets;
+    if (read_op->rw.filter.op == SWS_OP_FILTER_H) {
+        const int *offsets = read_op->rw.filter.kernel->offsets;
         const int rw_bits = rw_pixel_bits(read_op);
         for (int x = 0; x < PIXELS; x++)
             in_offset_x[x] = offsets[x] * rw_bits >> 3;
@@ -466,8 +466,10 @@ static void check_filter(const char *name, const SwsUOp 
*uop)
                     .op        = SWS_OP_READ,
                     .type      = uop->type,
                     .rw.elems  = num,
-                    .rw.filter = is_vert ? SWS_OP_FILTER_V : SWS_OP_FILTER_H,
-                    .rw.kernel = filter,
+                    .rw.filter = {
+                        .op     = is_vert ? SWS_OP_FILTER_V : SWS_OP_FILTER_H,
+                        .kernel = filter,
+                    },
                 }, {
                     .op        = SWS_OP_WRITE,
                     .type      = SWS_PIXEL_F32,
-- 
2.52.0


>From 01828db25724d5b5638034daa3c86b3ebe4722ea Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 12 May 2026 20:12:08 +0200
Subject: [PATCH 02/14] swscale/ops_optimizer: fix broken convert->filter
 commute check

This one failed to adjust prev->type to the result of filtering,
leading to basically broken intermediate op lists. Fortunately, the
optimizer usually ended up eliminating these cases altogether.

Replace it by a fixed check to merge filters with any prior conversion
op that satisfies the criteria (and is deemed beneficial).

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 829bd3d0c9..b998fc6bfd 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -191,12 +191,6 @@ static bool op_commute_filter(SwsOp *op, SwsOp *prev)
         prev->type = SWS_PIXEL_F32;
         return true;
     case SWS_OP_CONVERT:
-        if (prev->convert.to == SWS_PIXEL_F32) {
-            av_assert0(!prev->convert.expand);
-            FFSWAP(SwsPixelType, op->type, prev->type);
-            return true;
-        }
-        return false;
     case SWS_OP_INVALID:
     case SWS_OP_READ:
     case SWS_OP_WRITE:
@@ -367,6 +361,17 @@ retry:
                 FFSWAP(SwsOp, *op, *prev);
                 goto retry;
             }
+
+            /* Merge filter with prior conversion */
+            if (prev->op == SWS_OP_CONVERT && prev->convert.to == op->type) {
+                int size_from = ff_sws_pixel_type_size(prev->type);
+                int size_to   = ff_sws_pixel_type_size(op->type);
+                if (size_from < size_to && !prev->convert.expand) {
+                    op->type = prev->type;
+                    ff_sws_op_list_remove_at(ops, n - 1, 1);
+                    goto retry;
+                }
+            }
             break;
         }
     }
-- 
2.52.0


>From 014648a7431a193c4b1d8d0c1b316da2ae3a58a1 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 12 May 2026 21:12:55 +0200
Subject: [PATCH 03/14] swscale/ops: generalize SWS_OP_FILTER_* result type

Instead of hard-coding SWS_PIXEL_F32 here. This is not really useful
yet, but I wanted to clean up the semantics here regardless.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/format.c        |  1 +
 libswscale/ops.h           |  2 ++
 libswscale/ops_optimizer.c | 13 ++++++++++---
 libswscale/uops.c          |  2 +-
 tests/checkasm/sw_ops.c    |  1 +
 5 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/libswscale/format.c b/libswscale/format.c
index b9a25ed5a1..16ab524ce0 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -1640,6 +1640,7 @@ static int add_filter(SwsContext *ctx, SwsPixelType type, 
SwsOpList *ops,
         .type = type,
         .op   = filter,
         .filter.kernel = kernel,
+        .filter.type   = type,
     });
 }
 
diff --git a/libswscale/ops.h b/libswscale/ops.h
index b58a060c6e..b38cd915de 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -107,6 +107,7 @@ typedef struct SwsReadWriteOp {
     struct {
         SwsOpType op;               /* some value of SWS_OP_FILTER_* */
         SwsFilterWeights *kernel;   /* (refstruct) */
+        SwsPixelType type;          /* pixel type to store result as */
     } filter;
 } SwsReadWriteOp;
 
@@ -205,6 +206,7 @@ uint32_t ff_sws_linear_mask(SwsLinearOp);
 
 typedef struct SwsFilterOp {
     SwsFilterWeights *kernel; /* filter kernel (refstruct) */
+    SwsPixelType type;        /* pixel type to store result as */
 } SwsFilterOp;
 
 typedef struct SwsOp {
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index b998fc6bfd..d0c9fcbd82 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -54,9 +54,11 @@ static bool op_commute_clear(SwsOp *op, SwsOp *next)
     case SWS_OP_MAX:
     case SWS_OP_SCALE:
     case SWS_OP_READ:
+        ff_sws_apply_op_q(next, op->clear.value);
+        return true;
     case SWS_OP_FILTER_H:
     case SWS_OP_FILTER_V:
-        ff_sws_apply_op_q(next, op->clear.value);
+        op->type = next->filter.type;
         return true;
     case SWS_OP_SWIZZLE:
         op->clear.mask = ff_sws_comp_mask_swizzle(op->clear.mask, 
next->swizzle);
@@ -115,8 +117,10 @@ static bool op_commute_swizzle(SwsOp *op, SwsOp *next)
     case SWS_OP_LSHIFT:
     case SWS_OP_RSHIFT:
     case SWS_OP_SCALE:
+        return true;
     case SWS_OP_FILTER_H:
     case SWS_OP_FILTER_V:
+        op->type = next->filter.type;
         return true;
 
     /**
@@ -183,12 +187,14 @@ static bool op_commute_swizzle(SwsOp *op, SwsOp *next)
  */
 static bool op_commute_filter(SwsOp *op, SwsOp *prev)
 {
+    av_assert0(!ff_sws_pixel_type_is_int(op->filter.type));
+
     switch (prev->op) {
     case SWS_OP_SWIZZLE:
     case SWS_OP_SCALE:
     case SWS_OP_LINEAR:
     case SWS_OP_DITHER:
-        prev->type = SWS_PIXEL_F32;
+        prev->type = op->filter.type;
         return true;
     case SWS_OP_CONVERT:
     case SWS_OP_INVALID:
@@ -417,7 +423,7 @@ retry:
                     op->rw.elems = nb_planes;
                     RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
                         .op = SWS_OP_SWIZZLE,
-                        .type = op->rw.filter.op ? SWS_PIXEL_F32 : op->type,
+                        .type = op->rw.filter.op ? op->rw.filter.type : 
op->type,
                         .swizzle = swiz,
                     }));
                     goto retry;
@@ -744,6 +750,7 @@ retry:
                 !prev->rw.packed && !prev->rw.frac) {
                 prev->rw.filter.op = op->op;
                 prev->rw.filter.kernel = av_refstruct_ref(op->filter.kernel);
+                prev->rw.filter.type = op->filter.type;
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
diff --git a/libswscale/uops.c b/libswscale/uops.c
index 2de9c33eac..b2d11d996c 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -482,7 +482,7 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList 
*ops, SwsUOpFlags flags,
     if (op->rw.filter.op) {
         if (op->op == SWS_OP_WRITE || op->rw.frac || op->rw.packed)
             return AVERROR(ENOTSUP);
-        uop.par.filter.type = SWS_PIXEL_F32;
+        uop.par.filter.type = op->rw.filter.type;
         uop.data.kernel = av_refstruct_ref(op->rw.filter.kernel);
         if (op->rw.filter.op == SWS_OP_FILTER_H) {
             uop.uop = SWS_UOP_READ_PLANAR_FH;
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index 01ffb0e048..e09635abd4 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -469,6 +469,7 @@ static void check_filter(const char *name, const SwsUOp 
*uop)
                     .rw.filter = {
                         .op     = is_vert ? SWS_OP_FILTER_V : SWS_OP_FILTER_H,
                         .kernel = filter,
+                        .type   = SWS_PIXEL_F32,
                     },
                 }, {
                     .op        = SWS_OP_WRITE,
-- 
2.52.0


>From 544637766c91fcff49d9d709bf70ff95c000e673 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 2 Jun 2026 14:29:39 +0200
Subject: [PATCH 04/14] swscale/ops_dispatch: generalize over_read/over_write to
 array

I want to introduce operations like semiplanar reads, which would
possibly require a different number of over_read bytes per plane.

That aside, this is just a general cleanliness improvement.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_chain.c    |  7 +++++--
 libswscale/ops_chain.h    | 10 +++++-----
 libswscale/ops_dispatch.c | 27 ++++++++++++++++++---------
 libswscale/ops_dispatch.h |  6 +++---
 libswscale/uops_backend.c |  5 +++--
 libswscale/x86/ops.c      | 26 +++++++++++++++++---------
 6 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/libswscale/ops_chain.c b/libswscale/ops_chain.c
index 331f2f696b..d60f7ccf28 100644
--- a/libswscale/ops_chain.c
+++ b/libswscale/ops_chain.c
@@ -173,9 +173,12 @@ int ff_sws_uop_lookup(SwsContext *ctx, const SwsOpTable 
*const tables[],
         return ret;
     }
 
+    for (int i = 0; i < 4; i++) {
+        chain->over_read[i]  = FFMAX(chain->over_read[i],  res.over_read[i]);
+        chain->over_write[i] = FFMAX(chain->over_write[i], res.over_write[i]);
+    }
+
     chain->cpu_flags |= params.table->cpu_flags;
-    chain->over_read  = FFMAX(chain->over_read,  res.over_read);
-    chain->over_write = FFMAX(chain->over_write, res.over_write);
     return 0;
 }
 
diff --git a/libswscale/ops_chain.h b/libswscale/ops_chain.h
index 2126787782..abe4c545cd 100644
--- a/libswscale/ops_chain.h
+++ b/libswscale/ops_chain.h
@@ -86,9 +86,9 @@ typedef struct SwsOpChain {
     SwsOpImpl impl[SWS_MAX_OPS + 1]; /* reserve extra space for the entrypoint 
*/
     void (*free[SWS_MAX_OPS + 1])(SwsOpPriv *);
     int num_impl;
-    int cpu_flags;  /* set of all used CPU flags */
-    int over_read;  /* chain over-reads input by this many bytes */
-    int over_write; /* chain over-writes output by this many bytes */
+    int cpu_flags;      /* set of all used CPU flags */
+    int over_read[4];   /* chain over-reads input by this many bytes */
+    int over_write[4];  /* chain over-writes output by this many bytes */
 } SwsOpChain;
 
 SwsOpChain *ff_sws_op_chain_alloc(void);
@@ -115,8 +115,8 @@ typedef struct SwsImplResult {
     SwsFuncPtr func; /* overrides `SwsOpEntry.func` if non-NULL */
     SwsOpPriv priv; /* private data for this implementation instance */
     void (*free)(SwsOpPriv *priv); /* free function for `priv` */
-    int over_read;  /* implementation over-reads input by this many bytes */
-    int over_write; /* implementation over-writes output by this many bytes */
+    int over_read[4];  /* implementation over-reads input by this many bytes */
+    int over_write[4]; /* implementation over-writes output by this many bytes 
*/
 } SwsImplResult;
 
 typedef struct SwsOpEntry {
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 1186a8a73e..fcf77204db 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -81,8 +81,12 @@ static int compile_backend(SwsContext *ctx, const 
SwsOpBackend *backend,
     *out = compiled;
 
     av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': "
-           "block size = %d, over-read = %d, over-write = %d, cpu flags = 
0x%x\n",
-           backend->name, out->block_size, out->over_read, out->over_write,
+           "block size = %d, over-read = {%d %d %d %d}, over-write = {%d %d %d 
%d}, "
+           "cpu flags = 0x%x\n", backend->name, out->block_size,
+           out->over_read[0], out->over_read[1],
+           out->over_read[2], out->over_read[3],
+           out->over_write[0], out->over_write[1],
+           out->over_write[2], out->over_write[3],
            out->cpu_flags);
 
     ff_sws_op_list_print(ctx, AV_LOG_VERBOSE, AV_LOG_TRACE, ops);
@@ -229,7 +233,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
             input_bytes = pixel_bytes(plane_w, p->pixel_bits_in, AV_ROUND_UP);
         }
 
-        size_t safe_bytes = safe_bytes_pad(input_bytes, comp->over_read);
+        size_t safe_bytes = safe_bytes_pad(input_bytes, comp->over_read[i]);
         size_t safe_blocks_in;
         if (exec->in_offset_x) {
             size_t filter_size = pixel_bytes(p->filter_size_h, 
p->pixel_bits_in,
@@ -260,7 +264,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
         int chroma = idx == 1 || idx == 2;
         int sub_x  = chroma ? outdesc->log2_chroma_w : 0;
         int sub_y  = chroma ? outdesc->log2_chroma_h : 0;
-        size_t safe_bytes = safe_bytes_pad(out->linesize[idx], 
comp->over_write);
+        size_t safe_bytes = safe_bytes_pad(out->linesize[idx], 
comp->over_write[i]);
         size_t safe_blocks_out = safe_bytes / exec->block_size_out;
         if (safe_blocks_out < num_blocks) {
             p->memcpy_out = true;
@@ -314,7 +318,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
             needed_size = pixel_bytes(alloc_width, p->pixel_bits_in, 
AV_ROUND_UP);
         }
         size_t loop_size   = p->tail_blocks * exec->block_size_in;
-        tail->in_stride[i] = FFALIGN(needed_size + comp->over_read, align);
+        tail->in_stride[i] = FFALIGN(needed_size + comp->over_read[i], align);
         tail->in_bump[i]   = tail->in_stride[i] - loop_size;
         alloc_size += tail->in_stride[i] * in->height;
     }
@@ -322,7 +326,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
     for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
         size_t needed_size  = pixel_bytes(alloc_width, p->pixel_bits_out, 
AV_ROUND_UP);
         size_t loop_size    = p->tail_blocks * exec->block_size_out;
-        tail->out_stride[i] = FFALIGN(needed_size + comp->over_write, align);
+        tail->out_stride[i] = FFALIGN(needed_size + comp->over_write[i], 
align);
         tail->out_bump[i]   = tail->out_stride[i] - loop_size;
         alloc_size += tail->out_stride[i] * out->height;
     }
@@ -484,17 +488,22 @@ static int rw_pixel_bits(const SwsOp *op)
     return elems * size * bits;
 }
 
-static void align_pass(SwsPass *pass, int block_size, int over_rw, int 
pixel_bits)
+static void align_pass(SwsPass *pass, int block_size, const int *over_rw,
+                       int pixel_bits)
 {
     if (!pass)
         return;
 
     /* Add at least as many pixels as needed to cover the padding requirement 
*/
-    const int pad = (over_rw * 8 + pixel_bits - 1) / pixel_bits;
+    int pad_max = 0;
+    for (int i = 0; i < 4; i++) {
+        const int pad = (over_rw[i] * 8 + pixel_bits - 1) / pixel_bits;
+        pad_max = FFMAX(pad_max, pad);
+    }
 
     SwsPassBuffer *buf = pass->output;
     buf->width_align = FFMAX(buf->width_align, block_size);
-    buf->width_pad = FFMAX(buf->width_pad, pad);
+    buf->width_pad = FFMAX(buf->width_pad, pad_max);
 }
 
 static int compile(SwsGraph *graph, const SwsOpBackend *backend,
diff --git a/libswscale/ops_dispatch.h b/libswscale/ops_dispatch.h
index 7f1304dcc4..237a036f69 100644
--- a/libswscale/ops_dispatch.h
+++ b/libswscale/ops_dispatch.h
@@ -119,9 +119,9 @@ typedef struct SwsCompiledOp {
     int cpu_flags;   /* active set of CPU flags (informative) */
 
     /* Execution parameters for non-opaque functions only */
-    int block_size;  /* number of pixels processed per iteration */
-    int over_read;   /* implementation over-reads input by this many bytes */
-    int over_write;  /* implementation over-writes output by this many bytes */
+    int block_size;     /* number of pixels processed per iteration */
+    int over_read[4];   /* implementation over-reads input by this many bytes 
*/
+    int over_write[4];  /* implementation over-writes output by this many 
bytes */
 
     /* Arbitrary private data */
     void *priv;
diff --git a/libswscale/uops_backend.c b/libswscale/uops_backend.c
index fd7220a57a..50f5302ca6 100644
--- a/libswscale/uops_backend.c
+++ b/libswscale/uops_backend.c
@@ -165,13 +165,14 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
         .slice_align = 1,
         .block_size  = SWS_BLOCK_SIZE,
         .cpu_flags   = chain->cpu_flags,
-        .over_read   = chain->over_read,
-        .over_write  = chain->over_write,
         .priv        = chain,
         .free        = ff_sws_op_chain_free_cb,
         .func        = process,
     };
 
+    memcpy(out->over_read,  chain->over_read,  sizeof(out->over_read));
+    memcpy(out->over_write, chain->over_write, sizeof(out->over_write));
+
     av_log(ctx, AV_LOG_DEBUG, "Compiled micro-ops:\n");
     for (int i = 0; i < uops->num_ops; i++) {
         char name[SWS_UOP_NAME_MAX];
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 4c8eceb1cb..e8b0a20a1c 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -35,8 +35,8 @@ static int setup_rw_packed(const SwsImplParams *params, 
SwsImplResult *out)
     /* 3-component packed reads/writes process one extra garbage word */
     if (uop->mask == SWS_COMP_ELEMS(3)) {
         switch (uop->uop) {
-        case SWS_UOP_READ_PACKED:  out->over_read  = sizeof(uint32_t); break;
-        case SWS_UOP_WRITE_PACKED: out->over_write = sizeof(uint32_t); break;
+        case SWS_UOP_READ_PACKED:  out->over_read[0]  = sizeof(uint32_t); 
break;
+        case SWS_UOP_WRITE_PACKED: out->over_write[0] = sizeof(uint32_t); 
break;
         }
     }
 
@@ -153,7 +153,11 @@ static int setup_filter_h(const SwsImplParams *params, 
SwsImplResult *out)
     out->priv.ptr = weights.ptr;
     out->priv.uptr[1] = aligned_size;
     out->free = ff_op_priv_free;
-    out->over_read = (aligned_size - filter_size) * pixel_size;
+
+    for (int i = 0; i < 4; i++) {
+        if (uop->mask & SWS_COMP(i))
+            out->over_read[i] = (aligned_size - filter_size) * pixel_size;
+    }
     return 0;
 }
 
@@ -236,7 +240,11 @@ static int setup_filter_h_4x4(const SwsImplParams *params, 
SwsImplResult *out)
     out->priv.ptr = weights.ptr;
     out->priv.uptr[1] = aligned_size * sizeof_weights;
     out->free = ff_op_priv_free;
-    out->over_read = (aligned_size - filter_size) * pixel_size;
+
+    for (int i = 0; i < 4; i++) {
+        if (uop->mask & SWS_COMP(i))
+            out->over_read[i] = (aligned_size - filter_size) * pixel_size;
+    }
     return 0;
 }
 
@@ -506,8 +514,8 @@ static int solve_shuffle(const SwsOpList *ops, int mmsize, 
SwsCompiledOp *out)
         .free        = av_free,
         .slice_align = 1,
         .block_size  = pixels * num_lanes,
-        .over_read   = movsize(in_total,  mmsize) - in_total,
-        .over_write  = movsize(out_total, mmsize) - out_total,
+        .over_read   = { movsize(in_total,  mmsize) - in_total },
+        .over_write  = { movsize(out_total, mmsize) - out_total },
         .cpu_flags   = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
                        mmsize > 16 ? AV_CPU_FLAG_AVX2 :
                                      AV_CPU_FLAG_SSE4,
@@ -640,9 +648,9 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
         return ret;
     }
 
-    out->cpu_flags  = chain->cpu_flags;
-    out->over_read  = chain->over_read;
-    out->over_write = chain->over_write;
+    out->cpu_flags = chain->cpu_flags;
+    memcpy(out->over_read,  chain->over_read,  sizeof(out->over_read));
+    memcpy(out->over_write, chain->over_write, sizeof(out->over_write));
     ff_sws_uop_list_free(&uops);
     return 0;
 
-- 
2.52.0


>From 55fc730bd28e8973fb50a7f6be59d1d3bc1aaf6d Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 2 Jun 2026 14:30:17 +0200
Subject: [PATCH 05/14] swscale/ops_dispatch: generalize block_size_in/out to
 array

See previous commit for justification. I decided to split these
refactors up into several independent commits to make it easier
to review and bisect, since they are all independent atomic changes.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c      | 29 ++++++++++++++++-------------
 libswscale/ops_dispatch.h      |  6 +++---
 libswscale/ops_memcpy.c        |  2 +-
 libswscale/x86/ops_include.asm |  4 ++--
 tests/checkasm/sw_ops.c        |  8 ++++----
 5 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index fcf77204db..069474c056 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -242,7 +242,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
                                                 safe_bytes - filter_size,
                                                 exec->in_offset_x);
         } else {
-            safe_blocks_in = safe_bytes / exec->block_size_in;
+            safe_blocks_in = safe_bytes / exec->block_size_in[i];
         }
 
         if (safe_blocks_in < num_blocks) {
@@ -251,7 +251,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
             safe_blocks = FFMIN(safe_blocks, safe_blocks_in);
         }
 
-        size_t loop_size   = num_blocks * exec->block_size_in;
+        size_t loop_size   = num_blocks * exec->block_size_in[i];
         exec->in[i]        = in->data[idx];
         exec->in_stride[i] = in->linesize[idx];
         exec->in_bump[i]   = in->linesize[idx] - loop_size;
@@ -265,13 +265,13 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
         int sub_x  = chroma ? outdesc->log2_chroma_w : 0;
         int sub_y  = chroma ? outdesc->log2_chroma_h : 0;
         size_t safe_bytes = safe_bytes_pad(out->linesize[idx], 
comp->over_write[i]);
-        size_t safe_blocks_out = safe_bytes / exec->block_size_out;
+        size_t safe_blocks_out = safe_bytes / exec->block_size_out[i];
         if (safe_blocks_out < num_blocks) {
             p->memcpy_out = true;
             safe_blocks   = FFMIN(safe_blocks, safe_blocks_out);
         }
 
-        size_t loop_size    = num_blocks * exec->block_size_out;
+        size_t loop_size    = num_blocks * exec->block_size_out[i];
         exec->out[i]        = out->data[idx];
         exec->out_stride[i] = out->linesize[idx];
         exec->out_bump[i]   = out->linesize[idx] - loop_size;
@@ -317,7 +317,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
         } else {
             needed_size = pixel_bytes(alloc_width, p->pixel_bits_in, 
AV_ROUND_UP);
         }
-        size_t loop_size   = p->tail_blocks * exec->block_size_in;
+        size_t loop_size   = p->tail_blocks * exec->block_size_in[i];
         tail->in_stride[i] = FFALIGN(needed_size + comp->over_read[i], align);
         tail->in_bump[i]   = tail->in_stride[i] - loop_size;
         alloc_size += tail->in_stride[i] * in->height;
@@ -325,7 +325,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
 
     for (int i = 0; p->memcpy_out && i < p->planes_out; i++) {
         size_t needed_size  = pixel_bytes(alloc_width, p->pixel_bits_out, 
AV_ROUND_UP);
-        size_t loop_size    = p->tail_blocks * exec->block_size_out;
+        size_t loop_size    = p->tail_blocks * exec->block_size_out[i];
         tail->out_stride[i] = FFALIGN(needed_size + comp->over_write[i], 
align);
         tail->out_bump[i]   = tail->out_stride[i] - loop_size;
         alloc_size += tail->out_stride[i] * out->height;
@@ -419,8 +419,8 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame 
*in, const int y,
             /* We process fewer blocks, so the in_bump needs to be increased
              * to reflect that the plane pointers are left on the last block,
              * not the end of the processed line, after each loop iteration */
-            exec.in_bump[i]  += exec.block_size_in  * tail_blocks;
-            exec.out_bump[i] += exec.block_size_out * tail_blocks;
+            exec.in_bump[i]  += exec.block_size_in[i]  * tail_blocks;
+            exec.out_bump[i] += exec.block_size_out[i] * tail_blocks;
         }
 
         comp->func(&exec, comp->priv, 0, y, num_blocks - tail_blocks, y + h);
@@ -448,7 +448,7 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame 
*in, const int y,
                        exec.in[i], exec.in_stride[i], lines, p->tail_size_in);
         } else {
             /* Reuse input pointers directly */
-            const size_t loop_size = tail_blocks * exec.block_size_in;
+            const size_t loop_size = tail_blocks * exec.block_size_in[i];
             tail.in[i]        = exec.in[i];
             tail.in_stride[i] = exec.in_stride[i];
             tail.in_bump[i]   = exec.in_stride[i] - loop_size;
@@ -457,7 +457,7 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame 
*in, const int y,
 
     for (int i = 0; !memcpy_out && i < p->planes_out; i++) {
         /* Reuse output pointers directly */
-        const size_t loop_size = tail_blocks * exec.block_size_out;
+        const size_t loop_size = tail_blocks * exec.block_size_out[i];
         tail.out[i]        = exec.out[i];
         tail.out_stride[i] = exec.out_stride[i];
         tail.out_bump[i]   = exec.out_stride[i] - loop_size;
@@ -552,8 +552,10 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         goto fail;
     }
 
-    p->exec_base.block_size_in  = block_bits_in  >> 3;
-    p->exec_base.block_size_out = block_bits_out >> 3;
+    for (int i = 0; i < 4; i++) {
+        p->exec_base.block_size_in[i]  = block_bits_in  >> 3;
+        p->exec_base.block_size_out[i] = block_bits_out >> 3;
+    }
 
     for (int i = 0; i < 4; i++) {
         p->idx_in[i]  = i < p->planes_in  ? ops->plane_src[i] : -1;
@@ -602,7 +604,8 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         }
         for (int x = filter->dst_size; x < pixels; x++)
             offset[x] = offset[filter->dst_size - 1];
-        p->exec_base.block_size_in = 0; /* ptr does not advance */
+        for (int i = 0; i < 4; i++)
+            p->exec_base.block_size_in[i] = 0; /* ptr does not advance */
         p->filter_size_h = filter->filter_size;
     }
 
diff --git a/libswscale/ops_dispatch.h b/libswscale/ops_dispatch.h
index 237a036f69..4097b8e947 100644
--- a/libswscale/ops_dispatch.h
+++ b/libswscale/ops_dispatch.h
@@ -54,8 +54,8 @@ typedef struct SwsOpExec {
     /* Extra metadata, may or may not be useful */
     int32_t width, height;      /* Overall output image dimensions */
     int32_t slice_y, slice_h;   /* Start and height of current slice */
-    int32_t block_size_in;      /* Size of a block of pixels in bytes */
-    int32_t block_size_out;
+    int32_t block_size_in[4];   /* Size of a block of pixels in bytes */
+    int32_t block_size_out[4];
 
     /* Subsampling factors for each plane */
     uint8_t in_sub_y[4], out_sub_y[4];
@@ -81,7 +81,7 @@ typedef struct SwsOpExec {
 } SwsOpExec;
 
 static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) +
-                                   6  * sizeof(int32_t) +
+                                   12 * sizeof(int32_t) +
                                    16 * sizeof(uint8_t) +
                                    2  * sizeof(void *),
               "SwsOpExec layout mismatch");
diff --git a/libswscale/ops_memcpy.c b/libswscale/ops_memcpy.c
index 00f3e79608..26634049dd 100644
--- a/libswscale/ops_memcpy.c
+++ b/libswscale/ops_memcpy.c
@@ -43,12 +43,12 @@ static void process(const SwsOpExec *exec, const void *priv,
 {
     const MemcpyPriv *p = priv;
     const int lines = y_end - y_start;
-    const int bytes = x_end * exec->block_size_out;
     av_assert1(x_start == 0 && x_end == exec->width);
 
     for (int i = 0; i < p->num_planes; i++) {
         uint8_t *out = exec->out[i];
         const int idx = p->index[i];
+        const int bytes = x_end * exec->block_size_out[i];
         const int use_loop = exec->out_stride[i] > bytes + SWS_MAX_PADDING;
         if (idx < 0 && !use_loop) {
             memset(out, p->clear_value[i], exec->out_stride[i] * lines);
diff --git a/libswscale/x86/ops_include.asm b/libswscale/x86/ops_include.asm
index cc44a247b0..073ed31e57 100644
--- a/libswscale/x86/ops_include.asm
+++ b/libswscale/x86/ops_include.asm
@@ -123,8 +123,8 @@ struc SwsOpExec
     .height resd 1
     .slice_y resd 1
     .slice_h resd 1
-    .block_size_in resd 1
-    .block_size_out resd 1
+    .block_size_in resd 4
+    .block_size_out resd 4
     .in_sub_y4 resb 4
     .out_sub_y4 resb 4
     .in_sub_x4 resb 4
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index e09635abd4..fdc17bffa8 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -190,19 +190,19 @@ static void check_compiled(const char *name,
         exec.in_offset_x = in_offset_x;
     }
 
-    exec.block_size_in  = comp_ref->block_size * rw_pixel_bits(read_op)  >> 3;
-    exec.block_size_out = comp_ref->block_size * rw_pixel_bits(write_op) >> 3;
     for (int i = 0; i < NB_PLANES; i++) {
         exec.in[i]  = (void *) src0[i];
         exec.out[i] = (void *) dst0[i];
+        exec.block_size_in[i]  = comp_ref->block_size * rw_pixel_bits(read_op) 
 >> 3;
+        exec.block_size_out[i] = comp_ref->block_size * 
rw_pixel_bits(write_op) >> 3;
     }
     checkasm_call(comp_ref->func, &exec, comp_ref->priv, 0, 0, PIXELS / 
comp_ref->block_size, LINES);
 
-    exec.block_size_in  = comp_new->block_size * rw_pixel_bits(read_op)  >> 3;
-    exec.block_size_out = comp_new->block_size * rw_pixel_bits(write_op) >> 3;
     for (int i = 0; i < NB_PLANES; i++) {
         exec.in[i]  = (void *) src1[i];
         exec.out[i] = (void *) dst1[i];
+        exec.block_size_in[i]  = comp_new->block_size * rw_pixel_bits(read_op) 
 >> 3;
+        exec.block_size_out[i] = comp_new->block_size * 
rw_pixel_bits(write_op) >> 3;
     }
     checkasm_call_checked(comp_new->func, &exec, comp_new->priv, 0, 0, PIXELS 
/ comp_new->block_size, LINES);
 
-- 
2.52.0


>From a3cd6a8f64b85d30498b14e6ded3eb119de678cf Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 14 May 2026 14:57:16 +0200
Subject: [PATCH 06/14] swscale/ops_dispatch: reword misleading error

The block size is given in units of pixels, so this message as written
does not even make sense.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 069474c056..a612fd53cd 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -547,7 +547,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
     const int64_t block_bits_in  = (int64_t) comp->block_size * 
p->pixel_bits_in;
     const int64_t block_bits_out = (int64_t) comp->block_size * 
p->pixel_bits_out;
     if (block_bits_in & 0x7 || block_bits_out & 0x7) {
-        av_log(ctx, AV_LOG_ERROR, "Block size must be a multiple of the pixel 
size.\n");
+        av_log(ctx, AV_LOG_ERROR, "Block size must be byte-aligned.\n");
         ret = AVERROR(EINVAL);
         goto fail;
     }
-- 
2.52.0


>From db3701712136fb0288470c55ea6ac7d190ec2342 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 2 Jun 2026 14:38:15 +0200
Subject: [PATCH 07/14] swscale/ops_dispatch: move some code from setup() to
 compile()

This won't change from frame to frame, so there's no reason to redundantly
re-setup these fields. Paves the way for the next change as well.

The one minor annoyance is that this relies on SwsOpList.src/dst being
populated, to gain access to the sub_x/sub_y fields. However, that's not
a big ask, given that e.g. the dispatch layer already relies on the
pixel dimensions from this field being accurate for sizing intermediate
buffers during filter splitting.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 48 ++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index a612fd53cd..5f40010758 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -201,7 +201,6 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
                          const SwsPass *pass)
 {
     const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(in->format);
-    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format);
     const bool float_in = indesc->flags & AV_PIX_FMT_FLAG_FLOAT;
 
     SwsOpPass *p = pass->priv;
@@ -221,15 +220,11 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
 
     size_t safe_blocks = num_blocks;
     for (int i = 0; i < p->planes_in; i++) {
-        int idx    = p->idx_in[i];
-        int chroma = idx == 1 || idx == 2;
-        int sub_x  = chroma ? indesc->log2_chroma_w : 0;
-        int sub_y  = chroma ? indesc->log2_chroma_h : 0;
-
+        const int idx = p->idx_in[i];
         size_t input_bytes = in->linesize[idx];
         if (p->filter_size_h && float_in) {
             /* Floating point inputs may contain NaN / Infinity in the padding 
*/
-            const int plane_w = AV_CEIL_RSHIFT(in->width, sub_x);
+            const int plane_w = AV_CEIL_RSHIFT(in->width, exec->in_sub_x[i]);
             input_bytes = pixel_bytes(plane_w, p->pixel_bits_in, AV_ROUND_UP);
         }
 
@@ -255,15 +250,10 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
         exec->in[i]        = in->data[idx];
         exec->in_stride[i] = in->linesize[idx];
         exec->in_bump[i]   = in->linesize[idx] - loop_size;
-        exec->in_sub_y[i]  = sub_y;
-        exec->in_sub_x[i]  = sub_x;
     }
 
     for (int i = 0; i < p->planes_out; i++) {
-        int idx    = p->idx_out[i];
-        int chroma = idx == 1 || idx == 2;
-        int sub_x  = chroma ? outdesc->log2_chroma_w : 0;
-        int sub_y  = chroma ? outdesc->log2_chroma_h : 0;
+        const int idx = p->idx_out[i];
         size_t safe_bytes = safe_bytes_pad(out->linesize[idx], 
comp->over_write[i]);
         size_t safe_blocks_out = safe_bytes / exec->block_size_out[i];
         if (safe_blocks_out < num_blocks) {
@@ -275,8 +265,6 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
         exec->out[i]        = out->data[idx];
         exec->out_stride[i] = out->linesize[idx];
         exec->out_bump[i]   = out->linesize[idx] - loop_size;
-        exec->out_sub_y[i]  = sub_y;
-        exec->out_sub_x[i]  = sub_x;
     }
 
     const bool memcpy_in = p->memcpy_first || p->memcpy_last;
@@ -521,6 +509,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         goto fail; /* nothing to do, just return */
 
     const SwsCompiledOp *comp = &p->comp;
+    const SwsFormat *src = &ops->src;
     const SwsFormat *dst = &ops->dst;
     if (p->comp.opaque) {
         SwsCompiledOp c = *comp;
@@ -533,6 +522,8 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         return ret;
     }
 
+    const AVPixFmtDescriptor *indesc  = av_pix_fmt_desc_get(src->format);
+    const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(dst->format);
     const SwsOp *read  = ff_sws_op_list_input(ops);
     const SwsOp *write = ff_sws_op_list_output(ops);
     p->planes_in  = rw_planes(read);
@@ -552,14 +543,29 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         goto fail;
     }
 
-    for (int i = 0; i < 4; i++) {
-        p->exec_base.block_size_in[i]  = block_bits_in  >> 3;
-        p->exec_base.block_size_out[i] = block_bits_out >> 3;
+    for (int i = 0; i < 4; i++)
+        p->idx_in[i] = p->idx_out[i] = -1;
+
+    for (int i = 0; i < p->planes_in; i++) {
+        const int idx = ops->plane_src[i];
+        const int chroma = idx == 1 || idx == 2;
+        const int sub_x = chroma ? indesc->log2_chroma_w : 0;
+        const int sub_y = chroma ? indesc->log2_chroma_h : 0;
+        p->exec_base.in_sub_x[i] = sub_x;
+        p->exec_base.in_sub_y[i] = sub_y;
+        p->exec_base.block_size_in[i] = block_bits_in >> 3;
+        p->idx_in[i] = idx;
     }
 
-    for (int i = 0; i < 4; i++) {
-        p->idx_in[i]  = i < p->planes_in  ? ops->plane_src[i] : -1;
-        p->idx_out[i] = i < p->planes_out ? ops->plane_dst[i] : -1;
+    for (int i = 0; i < p->planes_out; i++) {
+        const int idx = ops->plane_dst[i];
+        const int chroma = idx == 1 || idx == 2;
+        const int sub_x = chroma ? outdesc->log2_chroma_w : 0;
+        const int sub_y = chroma ? outdesc->log2_chroma_h : 0;
+        p->exec_base.out_sub_x[i] = sub_x;
+        p->exec_base.out_sub_y[i] = sub_y;
+        p->exec_base.block_size_out[i] = block_bits_out >> 3;
+        p->idx_out[i] = idx;
     }
 
     const SwsFilterWeights *filter = read->rw.filter.kernel;
-- 
2.52.0


>From 43dfc5665b5e6c1e5aa4ca6f829593d6d397cb13 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 14 May 2026 18:54:18 +0200
Subject: [PATCH 08/14] swscale/ops_dispatch: adjust block_size_in[] by sub_x

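The horizontal subsampling shift (sub_x) was previously ignored when computing
the per-plane block sizes, but it must be taken into account for subsampled
planes. This applies to both block_size_in[] and block_size_out[].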

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 5f40010758..9f17aea343 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -553,7 +553,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         const int sub_y = chroma ? indesc->log2_chroma_h : 0;
         p->exec_base.in_sub_x[i] = sub_x;
         p->exec_base.in_sub_y[i] = sub_y;
-        p->exec_base.block_size_in[i] = block_bits_in >> 3;
+        p->exec_base.block_size_in[i] = block_bits_in >> (3 + sub_x);
         p->idx_in[i] = idx;
     }
 
@@ -564,7 +564,7 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
         const int sub_y = chroma ? outdesc->log2_chroma_h : 0;
         p->exec_base.out_sub_x[i] = sub_x;
         p->exec_base.out_sub_y[i] = sub_y;
-        p->exec_base.block_size_out[i] = block_bits_out >> 3;
+        p->exec_base.block_size_out[i] = block_bits_out >> (3 + sub_x);
         p->idx_out[i] = idx;
     }
 
-- 
2.52.0


>From 777e9c84193e1d288544e70fd54ca860fa343cd2 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 19 May 2026 11:48:34 +0200
Subject: [PATCH 09/14] swscale/ops: add and use ff_sws_rw_op_planes()

This is rw_planes() from ops_dispatch.c, but exposed internally.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/aarch64/ops.c  |  4 ++--
 libswscale/ops.c          | 10 ++++++++--
 libswscale/ops.h          |  5 +++++
 libswscale/ops_dispatch.c |  9 ++-------
 libswscale/vulkan/ops.c   |  4 ++--
 libswscale/x86/ops.c      |  4 ++--
 6 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/libswscale/aarch64/ops.c b/libswscale/aarch64/ops.c
index 0b9a39fe90..9a3b1f237c 100644
--- a/libswscale/aarch64/ops.c
+++ b/libswscale/aarch64/ops.c
@@ -229,8 +229,8 @@ static int aarch64_compile(SwsContext *ctx, const SwsOpList 
*ops,
 
     const SwsOp *read  = ff_sws_op_list_input(&rest);
     const SwsOp *write = ff_sws_op_list_output(&rest);
-    const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
-    const int write_planes = write->rw.packed ? 1 : write->rw.elems;
+    const int read_planes  = read ? ff_sws_rw_op_planes(read) : 0;
+    const int write_planes = ff_sws_rw_op_planes(write);
     SwsOpFunc process_func = NULL;
     switch (FFMAX(read_planes, write_planes)) {
     case 1: process_func = (SwsOpFunc) ff_sws_process_0001_neon; break;
diff --git a/libswscale/ops.c b/libswscale/ops.c
index f18249cc1f..e4d79a2c60 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -166,6 +166,12 @@ SwsCompMask ff_sws_comp_mask_needed(const SwsOp *op)
     return mask;
 }
 
+int ff_sws_rw_op_planes(const SwsOp *op)
+{
+    av_assert2(op->op == SWS_OP_READ || op->op == SWS_OP_WRITE);
+    return op->rw.packed ? 1 : op->rw.elems;
+}
+
 /* biased towards `a` */
 static AVRational av_min_q(AVRational a, AVRational b)
 {
@@ -736,7 +742,7 @@ bool ff_sws_op_list_is_noop(const SwsOpList *ops)
      * between them, e.g. rgbap <-> gbrap, which doesn't currently exist.
      * However, the check is cheap and lets me sleep at night.
      */
-    const int num_planes = read->rw.packed ? 1 : read->rw.elems;
+    const int num_planes = ff_sws_rw_op_planes(read);
     for (int i = 0; i < num_planes; i++) {
         if (ops->plane_src[i] != ops->plane_dst[i])
             return false;
@@ -983,7 +989,7 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra,
         ff_sws_op_desc(&bp, op);
 
         if (op->op == SWS_OP_READ || op->op == SWS_OP_WRITE) {
-            const int planes = op->rw.packed ? 1 : op->rw.elems;
+            const int planes = ff_sws_rw_op_planes(op);
             desc_plane_order(&bp, planes,
                 op->op == SWS_OP_READ ? ops->plane_src : ops->plane_dst);
         }
diff --git a/libswscale/ops.h b/libswscale/ops.h
index b38cd915de..657726f02f 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -241,6 +241,11 @@ typedef struct SwsOp {
 /* Compute SwsCompMask from a mask of needed components */
 SwsCompMask ff_sws_comp_mask_needed(const SwsOp *op);
 
+/**
+ * Return the number of planes involved in a read/write operation.
+ */
+int ff_sws_rw_op_planes(const SwsOp *op);
+
 /**
  * Describe an operation in human-readable form.
  */
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 9f17aea343..9b81379bcd 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -462,11 +462,6 @@ static void op_pass_run(const SwsFrame *out, const 
SwsFrame *in, const int y,
     }
 }
 
-static int rw_planes(const SwsOp *op)
-{
-    return op->rw.packed ? 1 : op->rw.elems;
-}
-
 static int rw_pixel_bits(const SwsOp *op)
 {
     const int elems = op->rw.packed ? op->rw.elems : 1;
@@ -526,8 +521,8 @@ static int compile(SwsGraph *graph, const SwsOpBackend 
*backend,
     const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(dst->format);
     const SwsOp *read  = ff_sws_op_list_input(ops);
     const SwsOp *write = ff_sws_op_list_output(ops);
-    p->planes_in  = rw_planes(read);
-    p->planes_out = rw_planes(write);
+    p->planes_in  = ff_sws_rw_op_planes(read);
+    p->planes_out = ff_sws_rw_op_planes(write);
     p->pixel_bits_in  = rw_pixel_bits(read);
     p->pixel_bits_out = rw_pixel_bits(write);
     p->exec_base = (SwsOpExec) {
diff --git a/libswscale/vulkan/ops.c b/libswscale/vulkan/ops.c
index e4f815124e..cfafa1c27c 100644
--- a/libswscale/vulkan/ops.c
+++ b/libswscale/vulkan/ops.c
@@ -929,11 +929,11 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
 
     /* Image ops, to determine types */
     const SwsOp *op_w = ff_sws_op_list_output(ops);
-    int out_img_count = op_w->rw.packed ? 1 : op_w->rw.elems;
+    int out_img_count = ff_sws_rw_op_planes(op_w);
     p->dst_rep = op_w->type == SWS_PIXEL_F32 ? FF_VK_REP_FLOAT : 
FF_VK_REP_UINT;
 
     const SwsOp *op_r = ff_sws_op_list_input(ops);
-    int in_img_count = op_r ? op_r->rw.packed ? 1 : op_r->rw.elems : 0;
+    int in_img_count = op_r ? ff_sws_rw_op_planes(op_r) : 0;
     if (op_r)
         p->src_rep = op_r->type == SWS_PIXEL_F32 ? FF_VK_REP_FLOAT : 
FF_VK_REP_UINT;
 
diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index e8b0a20a1c..1adb73e21c 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -634,8 +634,8 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
 
     const SwsOp *read      = ff_sws_op_list_input(ops);
     const SwsOp *write     = ff_sws_op_list_output(ops);
-    const int read_planes  = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
-    const int write_planes = write->rw.packed ? 1 : write->rw.elems;
+    const int read_planes  = read ? ff_sws_rw_op_planes(read) : 0;
+    const int write_planes = ff_sws_rw_op_planes(write);
     switch (FFMAX(read_planes, write_planes)) {
     case 1: out->func = ff_sws_process1_x86; break;
     case 2: out->func = ff_sws_process2_x86; break;
-- 
2.52.0


>From 7e739dc0dae0caebdf37463fa2e0fd04c8be9007 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 19 May 2026 12:02:57 +0200
Subject: [PATCH 10/14] swscale/ops_optimizer: simplify shuffle solver plane
 checks

Use ff_sws_rw_op_planes() to directly encode the relevant condition (i.e.
physical access to multiple planes).

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_optimizer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index d0c9fcbd82..3bc9fa02ff 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -815,8 +815,7 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
         return AVERROR(EINVAL);
 
     const SwsOp *read = ff_sws_op_list_input(ops);
-    if (!read || read->rw.frac || read->rw.filter.op ||
-        (!read->rw.packed && read->rw.elems > 1))
+    if (!read || read->rw.frac || read->rw.filter.op || 
ff_sws_rw_op_planes(read) > 1)
         return AVERROR(ENOTSUP);
 
     const int read_size = ff_sws_pixel_type_size(read->type);
@@ -866,8 +865,7 @@ int ff_sws_solve_shuffle(const SwsOpList *const ops, 
uint8_t shuffle[],
         }
 
         case SWS_OP_WRITE: {
-            if (op->rw.frac || op->rw.filter.op ||
-                (!op->rw.packed && op->rw.elems > 1))
+            if (op->rw.frac || op->rw.filter.op || ff_sws_rw_op_planes(op) > 1)
                 return AVERROR(ENOTSUP);
 
             /* Initialize to no-op */
-- 
2.52.0


>From a79fe7dc89ef9b7a5aa1b7fc244bbe58ef216111 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 19 May 2026 12:20:40 +0200
Subject: [PATCH 11/14] swscale/ops_memcpy: simplify plane count check

Instead of testing for this condition indirectly via packed and rw.elems,
we can now express the relevant condition directly. The memcpy backend works
if and only if each component lives on a separate plane.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_memcpy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/ops_memcpy.c b/libswscale/ops_memcpy.c
index 26634049dd..2cbe8f314b 100644
--- a/libswscale/ops_memcpy.c
+++ b/libswscale/ops_memcpy.c
@@ -78,7 +78,7 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
         const SwsOp *op = &ops->ops[n];
         switch (op->op) {
         case SWS_OP_READ:
-            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac || 
op->rw.filter.op)
+            if (ff_sws_rw_op_planes(op) != op->rw.elems || op->rw.frac || 
op->rw.filter.op)
                 return AVERROR(ENOTSUP);
             for (int i = 0; i < op->rw.elems; i++)
                 p.index[i] = i;
@@ -121,7 +121,7 @@ static int compile(SwsContext *ctx, const SwsOpList *ops, 
SwsCompiledOp *out)
             break;
 
         case SWS_OP_WRITE:
-            if ((op->rw.packed && op->rw.elems != 1) || op->rw.frac || 
op->rw.filter.op)
+            if (ff_sws_rw_op_planes(op) != op->rw.elems || op->rw.frac || 
op->rw.filter.op)
                 return AVERROR(ENOTSUP);
             p.num_planes = op->rw.elems;
             break;
-- 
2.52.0


>From 10666e7bab73e5499007c5a21b142336764eddf6 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Tue, 2 Jun 2026 14:39:34 +0200
Subject: [PATCH 12/14] swscale/ops: generalize SwsReadWriteOp.packed to enum

I want to start adding more data layouts, like semiplanar formats (nv12), or
palette formats. I made an effort to distinguish existing checks for rw.packed
into "mode != PLANAR" and "mode == PACKED", based on the intent of the
surrounding code, in anticipation of these new layouts.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/format.c        | 22 +++++++++++++---------
 libswscale/ops.c           | 24 ++++++++++++++++++++----
 libswscale/ops.h           |  7 ++++++-
 libswscale/ops_dispatch.c  |  7 ++++++-
 libswscale/ops_optimizer.c |  8 ++++----
 libswscale/uops.c          |  4 ++--
 libswscale/vulkan/ops.c    | 16 ++++++++--------
 tests/checkasm/sw_ops.c    | 31 +++++++++++++++++++++++++++----
 8 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/libswscale/format.c b/libswscale/format.c
index 16ab524ce0..4f538d43ab 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -734,20 +734,20 @@ typedef struct FmtInfo {
     int            shift;
 } FmtInfo;
 
-#define BITSTREAM_FMT(SWIZ, FRAC, PACKED, ...) (FmtInfo) {      \
-    .rw = { .elems = 1, .frac = FRAC, .packed = PACKED },       \
+#define BITSTREAM_FMT(SWIZ, FRAC, MODE, ...) (FmtInfo) {        \
+    .rw = { .elems = 1, .frac = FRAC, .mode = SWS_RW_##MODE },  \
     .swizzle = SWIZ,                                            \
     __VA_ARGS__                                                 \
 }
 
 #define SUBPACKED_FMT(SWIZ, ...) (FmtInfo) {                    \
-    .rw = { .elems = 1, .packed = true },                       \
+    .rw = { .elems = 1, .mode = SWS_RW_PACKED },                \
     .swizzle = SWIZ,                                            \
     .pack.pattern = {__VA_ARGS__},                              \
 }
 
 #define PACKED_FMT(SWIZ, N, ...) (FmtInfo) {                    \
-    .rw = { .elems = N, .packed = (N) > 1 },                    \
+    .rw = { .elems = N, .mode = (N) > 1 ? SWS_RW_PACKED : SWS_RW_PLANAR }, \
     .swizzle = SWIZ,                                            \
     __VA_ARGS__                                                 \
 }
@@ -767,9 +767,9 @@ static FmtInfo fmt_info_irregular(enum AVPixelFormat fmt)
     /* Bitstream formats */
     case AV_PIX_FMT_MONOWHITE:
     case AV_PIX_FMT_MONOBLACK:
-        return BITSTREAM_FMT(RGBA, 3, false);
-    case AV_PIX_FMT_RGB4: return BITSTREAM_FMT(RGBA, 1, true, .pack = {{ 1, 2, 
1 }});
-    case AV_PIX_FMT_BGR4: return BITSTREAM_FMT(BGRA, 1, true, .pack = {{ 1, 2, 
1 }});
+        return BITSTREAM_FMT(RGBA, 3, PLANAR);
+    case AV_PIX_FMT_RGB4: return BITSTREAM_FMT(RGBA, 1, PACKED, .pack = {{ 1, 
2, 1 }});
+    case AV_PIX_FMT_BGR4: return BITSTREAM_FMT(BGRA, 1, PACKED, .pack = {{ 1, 
2, 1 }});
 
     /* Sub-packed 8-bit aligned formats */
     case AV_PIX_FMT_RGB4_BYTE:  return SUBPACKED_FMT(RGBA, 1, 2, 1);
@@ -865,10 +865,14 @@ static int fmt_analyze_regular(const AVPixFmtDescriptor 
*desc, SwsReadWriteOp *r
         *swizzle = swiz;
     }
 
+    SwsReadWriteMode mode = SWS_RW_PLANAR;
+    if (desc->nb_components > 1 && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR))
+        mode = SWS_RW_PACKED;
+
     *shift = (SwsShiftOp) { desc->comp[0].shift };
     *rw_op = (SwsReadWriteOp) {
-        .elems  = desc->nb_components,
-        .packed = desc->nb_components > 1 && !(desc->flags & 
AV_PIX_FMT_FLAG_PLANAR),
+        .elems = desc->nb_components,
+        .mode  = mode,
     };
     return 0;
 }
diff --git a/libswscale/ops.c b/libswscale/ops.c
index e4d79a2c60..4afeaabf54 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -169,7 +169,13 @@ SwsCompMask ff_sws_comp_mask_needed(const SwsOp *op)
 int ff_sws_rw_op_planes(const SwsOp *op)
 {
     av_assert2(op->op == SWS_OP_READ || op->op == SWS_OP_WRITE);
-    return op->rw.packed ? 1 : op->rw.elems;
+    switch (op->rw.mode) {
+    case SWS_RW_PLANAR: return op->rw.elems;
+    case SWS_RW_PACKED: return 1;
+    }
+
+    av_unreachable("Invalid read/write mode!");
+    return 0;
 }
 
 /* biased towards `a` */
@@ -376,7 +382,12 @@ void ff_sws_op_list_update_comps(SwsOpList *ops)
             /* Active components are taken from the user-provided values,
              * other components are explicitly stripped */
             for (int i = 0; i < op->rw.elems; i++) {
-                const int idx = op->rw.packed ? i : ops->plane_src[i];
+                int idx = 0;
+                switch (op->rw.mode) {
+                case SWS_RW_PACKED: idx = i; break;
+                case SWS_RW_PLANAR: idx = ops->plane_src[i]; break;
+                }
+
                 av_assert0(!(ops->comps_src.flags[idx] & SWS_COMP_GARBAGE));
                 op->comps.flags[i] = ops->comps_src.flags[idx];
                 op->comps.min[i]   = ops->comps_src.min[idx];
@@ -731,7 +742,7 @@ bool ff_sws_op_list_is_noop(const SwsOpList *ops)
     const SwsOp *write = ff_sws_op_list_output(ops);
     if (!read || !write || ops->num_ops > 2 ||
         read->type != write->type ||
-        read->rw.packed != write->rw.packed ||
+        read->rw.mode != write->rw.mode ||
         read->rw.elems != write->rw.elems ||
         read->rw.frac != write->rw.frac)
         return false;
@@ -857,6 +868,11 @@ static void print_q4(AVBPrint *bp, const AVRational q4[4], 
SwsCompMask mask)
     av_bprintf(bp, "}");
 }
 
+static const char *const rw_mode_names[] = {
+    [SWS_RW_PLANAR] = "planar",
+    [SWS_RW_PACKED] = "packed",
+};
+
 void ff_sws_op_desc(AVBPrint *bp, const SwsOp *op)
 {
     const char *name  = ff_sws_op_type_name(op->op);
@@ -870,7 +886,7 @@ void ff_sws_op_desc(AVBPrint *bp, const SwsOp *op)
     case SWS_OP_READ:
     case SWS_OP_WRITE:
         av_bprintf(bp, "%-20s: %d elem(s) %s >> %d", name,
-                   op->rw.elems,  op->rw.packed ? "packed" : "planar",
+                   op->rw.elems, rw_mode_names[op->rw.mode],
                    op->rw.frac);
         if (!op->rw.filter.op)
             break;
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 657726f02f..f3bc46a041 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -84,6 +84,11 @@ typedef struct SwsComps {
     AVRational min[4], max[4];
 } SwsComps;
 
+typedef enum SwsReadWriteMode {
+    SWS_RW_PLANAR,  /* one plane per component */
+    SWS_RW_PACKED,  /* all components on a single plane */
+} SwsReadWriteMode;
+
 typedef struct SwsReadWriteOp {
     /**
      * Examples:
@@ -93,9 +98,9 @@ typedef struct SwsReadWriteOp {
      *   monow     = 1x u8 (frac 3)
      *   rgb4      = 1x u8 (frac 1)
      */
+    SwsReadWriteMode mode; /* how data is laid out in memory */
     uint8_t elems; /* number of elements (of type `op.type`) to read/write */
     uint8_t frac;  /* fractional pixel step factor (log2) */
-    bool packed;   /* read multiple elements from a single plane */
 
     /**
      * Filter kernel to apply to each plane while sampling. Currently, only
diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index 9b81379bcd..eb8d1e0e51 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -464,7 +464,12 @@ static void op_pass_run(const SwsFrame *out, const 
SwsFrame *in, const int y,
 
 static int rw_pixel_bits(const SwsOp *op)
 {
-    const int elems = op->rw.packed ? op->rw.elems : 1;
+    int elems = 0;
+    switch (op->rw.mode) {
+    case SWS_RW_PLANAR: elems = 1; break;
+    case SWS_RW_PACKED: elems = op->rw.elems; break;
+    }
+
     const int size  = ff_sws_pixel_type_size(op->type);
     const int bits  = 8 >> op->rw.frac;
     av_assert1(bits >= 1);
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index 3bc9fa02ff..3b09c35e89 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -404,7 +404,7 @@ retry:
         switch (op->op) {
         case SWS_OP_READ:
             /* "Compress" planar reads where not all components are needed */
-            if (!op->rw.packed) {
+            if (op->rw.mode == SWS_RW_PLANAR) {
                 SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
                 int nb_planes = 0;
                 for (int i = 0; i < op->rw.elems; i++) {
@@ -528,7 +528,7 @@ retry:
             }
 
             /* Swizzle planes instead of components, if possible */
-            if (prev->op == SWS_OP_READ && !prev->rw.packed) {
+            if (prev->op == SWS_OP_READ && prev->rw.mode == SWS_RW_PLANAR) {
                 for (int dst = 0; dst < prev->rw.elems; dst++) {
                     const int src = op->swizzle.in[dst];
                     if (src > dst && src < prev->rw.elems) {
@@ -544,7 +544,7 @@ retry:
                 }
             }
 
-            if (next->op == SWS_OP_WRITE && !next->rw.packed) {
+            if (next->op == SWS_OP_WRITE && next->rw.mode == SWS_RW_PLANAR) {
                 for (int dst = 0; dst < next->rw.elems; dst++) {
                     const int src = op->swizzle.in[dst];
                     if (src > dst && src < next->rw.elems) {
@@ -747,7 +747,7 @@ retry:
         case SWS_OP_FILTER_V:
             /* Merge with prior simple planar read */
             if (prev->op == SWS_OP_READ && !prev->rw.filter.op &&
-                !prev->rw.packed && !prev->rw.frac) {
+                prev->rw.mode == SWS_RW_PLANAR && !prev->rw.frac) {
                 prev->rw.filter.op = op->op;
                 prev->rw.filter.kernel = av_refstruct_ref(op->filter.kernel);
                 prev->rw.filter.type = op->filter.type;
diff --git a/libswscale/uops.c b/libswscale/uops.c
index b2d11d996c..7f779a504c 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -480,7 +480,7 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList 
*ops, SwsUOpFlags flags,
 
     const bool is_read = op->op == SWS_OP_READ;
     if (op->rw.filter.op) {
-        if (op->op == SWS_OP_WRITE || op->rw.frac || op->rw.packed)
+        if (op->op == SWS_OP_WRITE || op->rw.frac || op->rw.mode != 
SWS_RW_PLANAR)
             return AVERROR(ENOTSUP);
         uop.par.filter.type = op->rw.filter.type;
         uop.data.kernel = av_refstruct_ref(op->rw.filter.kernel);
@@ -491,7 +491,7 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList 
*ops, SwsUOpFlags flags,
         } else {
             uop.uop = SWS_UOP_READ_PLANAR_FV;
         }
-    } else if (op->rw.packed && op->rw.elems > 1) {
+    } else if (op->rw.mode == SWS_RW_PACKED && op->rw.elems > 1) {
         if (op->rw.frac)
             return AVERROR(ENOTSUP);
         uop.uop = is_read ? SWS_UOP_READ_PACKED : SWS_UOP_WRITE_PACKED;
diff --git a/libswscale/vulkan/ops.c b/libswscale/vulkan/ops.c
index cfafa1c27c..4f5b4160af 100644
--- a/libswscale/vulkan/ops.c
+++ b/libswscale/vulkan/ops.c
@@ -852,7 +852,7 @@ static int read_filtered(SPICtx *spi, SPIRVIDs *id, const 
SwsOpList *ops,
     /* Accumulators, initialized to zero */
     int acc_s[4] = { id->f32_0, id->f32_0, id->f32_0, id->f32_0 };
     int acc_v = id->f32_0;
-    if (op->rw.packed)
+    if (op->rw.mode == SWS_RW_PACKED)
         acc_v = spi_OpCompositeConstruct(spi, id->f32vec4_type,
                                          id->f32_0, id->f32_0,
                                          id->f32_0, id->f32_0);
@@ -877,7 +877,7 @@ static int read_filtered(SPICtx *spi, SPIRVIDs *id, const 
SwsOpList *ops,
             spi_OpCompositeConstruct(spi, id->i32vec2_type, c, pos_y) :
             spi_OpCompositeConstruct(spi, id->i32vec2_type, pos_x, c);
 
-        if (op->rw.packed) {
+        if (op->rw.mode == SWS_RW_PACKED) {
             int px = spi_OpImageRead(spi, read_vtype,
                                      in_img[ops->plane_src[0]], coord,
                                      SpvImageOperandsMaskNone);
@@ -902,7 +902,7 @@ static int read_filtered(SPICtx *spi, SPIRVIDs *id, const 
SwsOpList *ops,
         }
     }
 
-    if (op->rw.packed)
+    if (op->rw.mode == SWS_RW_PACKED)
         return acc_v;
     return spi_OpCompositeConstruct(spi, id->f32vec4_type,
                                     acc_s[0], acc_s[1], acc_s[2], acc_s[3]);
@@ -1136,7 +1136,7 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
                 data = read_filtered(spi, id, ops, op,
                                      &id->filt[nb_filter_used++],
                                      in_img, gid, gi2);
-            } else if (op->rw.packed) {
+            } else if (op->rw.mode == SWS_RW_PACKED) {
                 data = spi_OpImageRead(spi, type_v, in_img[ops->plane_src[0]],
                                        src_gid, SpvImageOperandsMaskNone);
             } else {
@@ -1154,7 +1154,7 @@ static int add_ops_spirv(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
         case SWS_OP_WRITE:
             if (op->rw.frac || op->rw.filter.op) {
                 return AVERROR(ENOTSUP);
-            } else if (op->rw.packed) {
+            } else if (op->rw.mode == SWS_RW_PACKED) {
                 spi_OpImageWrite(spi, out_img[ops->plane_dst[0]], dst_gid, 
data,
                                  SpvImageOperandsMaskNone);
             } else {
@@ -1332,7 +1332,7 @@ static void read_glsl(const SwsOpList *ops, const SwsOp 
*op, FFVulkanShader *shd
                    wd->filter_size);
         av_bprintf(&shd->src, "        float w = filter_w%i[%s][i];\n",
                    idx, axis);
-        if (op->rw.packed) {
+        if (op->rw.mode == SWS_RW_PACKED) {
             GLSLF(2, tmp += w * %s(imageLoad(src_img[%i], ivec2(%s, %s)));     
,
                   type_v, ops->plane_src[0], coord_x, coord_y);
         } else {
@@ -1345,7 +1345,7 @@ static void read_glsl(const SwsOpList *ops, const SwsOp 
*op, FFVulkanShader *shd
         GLSLC(1, f32 = tmp;                                                   
);
     } else {
         const char *src_pos = interlaced ? "spos" : "pos";
-        if (op->rw.packed) {
+        if (op->rw.mode == SWS_RW_PACKED) {
             GLSLF(1, %s = %s(imageLoad(src_img[%i], %s));                      
,
                   type_name, type_v, ops->plane_src[0], src_pos);
         } else {
@@ -1488,7 +1488,7 @@ static int add_ops_glsl(SwsContext *sws, VulkanPriv *p, 
FFVulkanOpsCtx *s,
             const char *dst_pos = ops->dst.interlaced ? "dpos" : "pos";
             if (op->rw.frac || op->rw.filter.op) {
                 return AVERROR(ENOTSUP);
-            } else if (op->rw.packed) {
+            } else if (op->rw.mode == SWS_RW_PACKED) {
                 GLSLF(1, imageStore(dst_img[%i], %s, %s(%s));                  
 ,
                       ops->plane_dst[0], dst_pos, type_v, type_name);
             } else {
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index fdc17bffa8..99140ced52 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -56,7 +56,12 @@ static const char *tprintf(char buf[], size_t size, const 
char *fmt, ...)
 
 static int rw_pixel_bits(const SwsOp *op)
 {
-    const int elems = op->rw.packed ? op->rw.elems : 1;
+    int elems = 0;
+    switch (op->rw.mode) {
+    case SWS_RW_PLANAR: elems = 1; break;
+    case SWS_RW_PACKED: elems = op->rw.elems; break;
+    }
+
     const int size  = ff_sws_pixel_type_size(op->type);
     const int bits  = 8 >> op->rw.frac;
     av_assert1(bits >= 1);
@@ -233,7 +238,7 @@ static void check_compiled(const char *name,
             break;
         }
 
-        if (write_op->rw.packed)
+        if (write_op->rw.mode == SWS_RW_PACKED)
             break;
     }
 
@@ -394,13 +399,22 @@ static AVRational rndq(SwsPixelType t)
 
 static void check_read(const char *name, const SwsUOp *uop)
 {
+    SwsReadWriteMode mode;
+    switch (uop->uop) {
+    case SWS_UOP_READ_PACKED:
+    case SWS_UOP_READ_BIT:
+    case SWS_UOP_READ_NIBBLE: mode = SWS_RW_PACKED; break;
+    case SWS_UOP_READ_PLANAR: mode = SWS_RW_PLANAR; break;
+    default: return;
+    }
+
     const int num = mask_num(uop->mask);
     check_ops(name, NULL, (SwsOp[]) {
         {
             .op        = SWS_OP_READ,
             .type      = uop->type,
             .rw.elems  = num,
-            .rw.packed = uop->uop != SWS_UOP_READ_PLANAR,
+            .rw.mode   = mode,
             .rw.frac   = uop->uop == SWS_UOP_READ_BIT    ? 3 :
                          uop->uop == SWS_UOP_READ_NIBBLE ? 1 : 0,
         }, {
@@ -413,6 +427,15 @@ static void check_read(const char *name, const SwsUOp *uop)
 
 static void check_write(const char *name, const SwsUOp *uop)
 {
+    SwsReadWriteMode mode;
+    switch (uop->uop) {
+    case SWS_UOP_WRITE_BIT:
+    case SWS_UOP_WRITE_NIBBLE:
+    case SWS_UOP_WRITE_PACKED: mode = SWS_RW_PACKED; break;
+    case SWS_UOP_WRITE_PLANAR: mode = SWS_RW_PLANAR; break;
+    default: return; /* any other uop: nothing to test here */
+    }
+
     const int frac = uop->uop == SWS_UOP_WRITE_BIT    ? 3 :
                      uop->uop == SWS_UOP_WRITE_NIBBLE ? 1 : 0;
     const int num = mask_num(uop->mask);
@@ -428,7 +451,7 @@ static void check_write(const char *name, const SwsUOp *uop)
             .op        = SWS_OP_WRITE,
             .type      = uop->type,
             .rw.elems  = num,
-            .rw.packed = uop->uop != SWS_UOP_WRITE_PLANAR,
+            .rw.mode   = mode,
             .rw.frac   = frac,
         }, {0}
     });
-- 
2.52.0


>From de300b2a30403593a9379af1642fc0bac216c65f Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Thu, 21 May 2026 10:53:59 +0200
Subject: [PATCH 13/14] swscale/uops: add default fallback for translate_op()

Makes it a bit easier to add ops and uops in separate commits.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/uops.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libswscale/uops.c b/libswscale/uops.c
index 7f779a504c..b73aedb6e1 100644
--- a/libswscale/uops.c
+++ b/libswscale/uops.c
@@ -841,6 +841,8 @@ static int translate_op(SwsContext *ctx, SwsUOpList *uops, 
SwsUOpFlags flags,
         uop.uop = SWS_UOP_SWAP_BYTES;
         uop.type = pixel_type_to_int(op->type);
         break;
+    default:
+        return AVERROR(ENOTSUP);
     }
 
     av_assert0(uop.uop != SWS_UOP_INVALID);
-- 
2.52.0


>From f1c2648878224ee116b5528400b8d5ac32334d4a Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 8 Jun 2026 16:19:44 +0200
Subject: [PATCH 14/14] swscale/ops_dispatch: generalize tail_offset/size
 parameters

These values could differ per plane, e.g. when combining subsampled chroma
with non-subsampled luma.

Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_dispatch.c | 45 ++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c
index eb8d1e0e51..dbb5809d5f 100644
--- a/libswscale/ops_dispatch.c
+++ b/libswscale/ops_dispatch.c
@@ -35,10 +35,10 @@ typedef struct SwsOpPass {
     SwsOpExec exec_base;
     SwsOpExec exec_tail;
     size_t num_blocks;
-    int tail_off_in;
-    int tail_off_out;
-    int tail_size_in;
-    int tail_size_out;
+    int tail_off_in[4];
+    int tail_off_out[4];
+    int tail_size_in[4];
+    int tail_size_out[4];
     int planes_in;
     int planes_out;
     int pixel_bits_in;
@@ -281,17 +281,22 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
 
     const size_t safe_width = safe_blocks * block_size;
     const size_t tail_size  = pass->width - safe_width;
-    p->tail_off_out  = pixel_bytes(safe_width, p->pixel_bits_out, 
AV_ROUND_DOWN);
-    p->tail_size_out = pixel_bytes(tail_size,  p->pixel_bits_out, AV_ROUND_UP);
-    p->tail_blocks   = num_blocks - safe_blocks;
+    p->tail_blocks = num_blocks - safe_blocks;
 
-    if (exec->in_offset_x) {
-        p->tail_off_in  = exec->in_offset_x[safe_width];
-        p->tail_size_in = exec->in_offset_x[pass->width - 1] - p->tail_off_in;
-        p->tail_size_in += pixel_bytes(p->filter_size_h, p->pixel_bits_in, 
AV_ROUND_UP);
-    } else {
-        p->tail_off_in  = pixel_bytes(safe_width, p->pixel_bits_in, 
AV_ROUND_DOWN);
-        p->tail_size_in = pixel_bytes(tail_size,  p->pixel_bits_in, 
AV_ROUND_UP);
+    for (int i = 0; i < p->planes_out; i++) {
+        p->tail_off_out[i]  = pixel_bytes(safe_width, p->pixel_bits_out, 
AV_ROUND_DOWN);
+        p->tail_size_out[i] = pixel_bytes(tail_size,  p->pixel_bits_out, 
AV_ROUND_UP);
+    }
+
+    for (int i = 0; i < p->planes_in; i++) {
+        if (exec->in_offset_x) {
+            p->tail_off_in[i]  = exec->in_offset_x[safe_width];
+            p->tail_size_in[i] = exec->in_offset_x[pass->width - 1] - 
p->tail_off_in[i];
+            p->tail_size_in[i] += pixel_bytes(p->filter_size_h, 
p->pixel_bits_in, AV_ROUND_UP);
+        } else {
+            p->tail_off_in[i]  = pixel_bytes(safe_width, p->pixel_bits_in, 
AV_ROUND_DOWN);
+            p->tail_size_in[i] = pixel_bytes(tail_size,  p->pixel_bits_in, 
AV_ROUND_UP);
+        }
     }
 
     const size_t alloc_width = aligned_w - safe_width;
@@ -301,7 +306,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
             /* The input offset map is already padded to multiples of the block
              * size, and clamps the input offsets to the image boundaries; so
              * we just need to compensate for the comp->over_read */
-            needed_size = p->tail_size_in;
+            needed_size = p->tail_size_in[i];
         } else {
             needed_size = pixel_bytes(alloc_width, p->pixel_bits_in, 
AV_ROUND_UP);
         }
@@ -345,7 +350,7 @@ static int op_pass_setup(const SwsFrame *out, const 
SwsFrame *in,
     if (memcpy_in && exec->in_offset_x) {
         tail->in_offset_x = (int32_t *) tail_buf;
         for (int i = safe_width; i < aligned_w; i++)
-            tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in;
+            tail->in_offset_x[i] = exec->in_offset_x[i] - p->tail_off_in[0];
     }
 
     return 0;
@@ -421,11 +426,11 @@ static void op_pass_run(const SwsFrame *out, const 
SwsFrame *in, const int y,
     for (int i = 0; i < p->planes_in; i++) {
         /* Input offsets are relative to the base pointer */
         if (!exec.in_offset_x || memcpy_in)
-            exec.in[i] += p->tail_off_in;
+            exec.in[i] += p->tail_off_in[i];
         tail.in[i] += y * tail.in_stride[i];
     }
     for (int i = 0; i < p->planes_out; i++) {
-        exec.out[i] += p->tail_off_out;
+        exec.out[i] += p->tail_off_out[i];
         tail.out[i] += y * tail.out_stride[i];
     }
 
@@ -433,7 +438,7 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame 
*in, const int y,
         if (memcpy_in) {
             const int lines = get_lines_in(p, y, h, i);
             copy_lines((uint8_t *) tail.in[i], tail.in_stride[i],
-                       exec.in[i], exec.in_stride[i], lines, p->tail_size_in);
+                       exec.in[i], exec.in_stride[i], lines, 
p->tail_size_in[i]);
         } else {
             /* Reuse input pointers directly */
             const size_t loop_size = tail_blocks * exec.block_size_in[i];
@@ -458,7 +463,7 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame 
*in, const int y,
     for (int i = 0; memcpy_out && i < p->planes_out; i++) {
         const int lines = h >> tail.out_sub_y[i];
         copy_lines(exec.out[i], exec.out_stride[i],
-                   tail.out[i], tail.out_stride[i], lines, p->tail_size_out);
+                   tail.out[i], tail.out_stride[i], lines, 
p->tail_size_out[i]);
     }
 }
 
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to