PR #21099 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21099
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21099.patch

This eliminates the hard-coded {0, 3, 2, 5} per-row offsets in favor of values 
chosen by the SwsOp provider. The main advantage of this is that it allows us 
to e.g. internally reorder or split components without affecting the overall 
result. Paves the way towards e.g. processing luma and chroma separately, which 
was previously impossible because the fixed coordinate index <-> dither offset
mapping meant that any such split or reorder would change the result.

TODO:
- [ ] Re-define dither swap operations if needed (test for performance 
regressions!)


>From d1781277e34a3d9c8d7f6fc3a255669b36f3b12f Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 1 Dec 2025 16:07:26 +0100
Subject: [PATCH 1/6] swscale/ops: add explicit row offset to SwsDitherOp

To improve decorrelation between components, we offset the dither matrix
slightly for each component. This is currently done by adding a hard-coded
offset of {0, 3, 2, 5} to each of the four components, respectively.

However, this represents a serious challenge when re-ordering SwsDitherOp
past a swizzle, or when splitting an SwsOpList into multiple sub-operations
(e.g. for decoupling luma from subsampled chroma when they are independent).

To fix this on a fundamental level, we have to keep track of the offset per
channel as part of the SwsDitherOp metadata, and respect those values at
runtime.

This commit merely adds the metadata; the update to the underlying backends
will come in a follow-up commit.
---
 libswscale/format.c        | 7 +++++++
 libswscale/ops.c           | 6 ++++--
 libswscale/ops.h           | 1 +
 libswscale/ops_optimizer.c | 2 +-
 tests/checkasm/sw_ops.c    | 4 +++-
 5 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/libswscale/format.c b/libswscale/format.c
index 2ae6d50523..8ec5d94440 100644
--- a/libswscale/format.c
+++ b/libswscale/format.c
@@ -1324,6 +1324,13 @@ static int fmt_dither(SwsContext *ctx, SwsOpList *ops,
         dither.matrix = generate_bayer_matrix(dither.size_log2);
         if (!dither.matrix)
             return AVERROR(ENOMEM);
+
+        /* Brute-forced offsets; minimizes quantization error across a 16x16
 * bayer dither pattern for standard RGB and YUV pixel formats */
+        const int offsets_16x16[4] = {0, 3, 2, 5};
+        for (int i = 0; i < 4; i++)
+            dither.y_offset[i] = offsets_16x16[i];
+
         return ff_sws_op_list_append(ops, &(SwsOp) {
             .op     = SWS_OP_DITHER,
             .type   = type,
diff --git a/libswscale/ops.c b/libswscale/ops.c
index 1c408d7482..f33dd02a37 100644
--- a/libswscale/ops.c
+++ b/libswscale/ops.c
@@ -460,8 +460,10 @@ void ff_sws_op_list_print(void *log, int lev, const 
SwsOpList *ops)
                    op->convert.expand ? " (expand)" : "");
             break;
         case SWS_OP_DITHER:
-            av_log(log, lev, "%-20s: %dx%d matrix\n", "SWS_OP_DITHER",
-                    1 << op->dither.size_log2, 1 << op->dither.size_log2);
+            av_log(log, lev, "%-20s: %dx%d matrix + {%d %d %d %d}\n", 
"SWS_OP_DITHER",
+                    1 << op->dither.size_log2, 1 << op->dither.size_log2,
+                    op->dither.y_offset[0], op->dither.y_offset[1],
+                    op->dither.y_offset[2], op->dither.y_offset[3]);
             break;
         case SWS_OP_MIN:
             av_log(log, lev, "%-20s: x <= {%s %s %s %s}\n", "SWS_OP_MIN",
diff --git a/libswscale/ops.h b/libswscale/ops.h
index dccc00d2f0..36d592e556 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -132,6 +132,7 @@ typedef struct SwsConvertOp {
 typedef struct SwsDitherOp {
     AVRational *matrix; /* tightly packed dither matrix (refstruct) */
     int size_log2; /* size (in bits) of the dither matrix */
+    uint8_t y_offset[4]; /* row offset for each component */
 } SwsDitherOp;
 
 typedef struct SwsLinearOp {
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
index b13e7ebdc1..a0cf0bcaf1 100644
--- a/libswscale/ops_optimizer.c
+++ b/libswscale/ops_optimizer.c
@@ -40,7 +40,6 @@ static bool op_type_is_independent(SwsOpType op)
     case SWS_OP_LSHIFT:
     case SWS_OP_RSHIFT:
     case SWS_OP_CONVERT:
-    case SWS_OP_DITHER:
     case SWS_OP_MIN:
     case SWS_OP_MAX:
     case SWS_OP_SCALE:
@@ -53,6 +52,7 @@ static bool op_type_is_independent(SwsOpType op)
     case SWS_OP_LINEAR:
     case SWS_OP_PACK:
     case SWS_OP_UNPACK:
+    case SWS_OP_DITHER:
         return false;
     case SWS_OP_TYPE_NB:
         break;
diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c
index aea25bbbbc..bd70adc212 100644
--- a/tests/checkasm/sw_ops.c
+++ b/tests/checkasm/sw_ops.c
@@ -624,6 +624,7 @@ static void check_dither(void)
         /* Test all sizes up to 256x256 */
         for (int size_log2 = 0; size_log2 <= 8; size_log2++) {
             const int size = 1 << size_log2;
+            const int mask = size - 1;
             AVRational *matrix = av_refstruct_allocz(size * size * 
sizeof(*matrix));
             if (!matrix) {
                 fail();
@@ -641,7 +642,8 @@ static void check_dither(void)
                 .op = SWS_OP_DITHER,
                 .type = t,
                 .dither.size_log2 = size_log2,
-                .dither.matrix = matrix,
+                .dither.matrix    = matrix,
+                .dither.y_offset  = {0, 3 & mask, 2 & mask, 5 & mask},
             });
 
             av_refstruct_unref(&matrix);
-- 
2.49.1


>From 02b879424842e49d407a93ee0644180fdff1eba6 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 1 Dec 2025 16:10:38 +0100
Subject: [PATCH 2/6] swscale/ops_tmpl_float: respect specified dither matrix
 offsets

Since we only need 8 bytes to store the dither matrix pointer, we actually
still have 8 bytes left-over. That means we could either store the 8-bit
row offset directly, or alternatively compute 16-bit pointer offsets.

I have chosen to do the former for the C backend, in the interest of
simplicity.

The one downside of this approach is that it would fail on hypothetical
128-bit platforms; although I seriously hope that this code does not live
long enough to see the need for 128-bit addressable memory.
---
 libswscale/ops_tmpl_float.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
index 345f9dd57b..78b3a9c4fc 100644
--- a/libswscale/ops_tmpl_float.c
+++ b/libswscale/ops_tmpl_float.c
@@ -56,6 +56,11 @@ DECL_SETUP(setup_dither)
     if (!matrix)
         return AVERROR(ENOMEM);
 
+    static_assert(sizeof(out->ptr) <= sizeof(uint8_t[8]), ">8 byte pointers 
not supported");
+    uint8_t *offset = &out->u8[8];
+    for (int i = 0; i < 4; i++)
+        offset[i] = op->dither.y_offset[i];
+
     for (int y = 0; y < size; y++) {
         for (int x = 0; x < size; x++)
             matrix[y * width + x] = av_q2pixel(op->dither.matrix[y * size + 
x]);
@@ -69,12 +74,13 @@ DECL_SETUP(setup_dither)
 DECL_FUNC(dither, const int size_log2)
 {
     const pixel_t *restrict matrix = impl->priv.ptr;
+    const uint8_t *offset = &impl->priv.u8[8];
     const int mask = (1 << size_log2) - 1;
     const int y_line = iter->y;
-    const int row0 = (y_line +  0) & mask;
-    const int row1 = (y_line +  3) & mask;
-    const int row2 = (y_line +  2) & mask;
-    const int row3 = (y_line +  5) & mask;
+    const int row0 = (y_line + offset[0]) & mask;
+    const int row1 = (y_line + offset[1]) & mask;
+    const int row2 = (y_line + offset[2]) & mask;
+    const int row3 = (y_line + offset[3]) & mask;
     const int size = 1 << size_log2;
     const int width = FFMAX(size, SWS_BLOCK_SIZE);
     const int base = iter->x & ~(SWS_BLOCK_SIZE - 1) & (size - 1);
-- 
2.49.1


>From b56c8bcf5abb46a8430d5750ffb57b0907bfe3fa Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 1 Dec 2025 16:18:03 +0100
Subject: [PATCH 3/6] swscale/ops_tmpl_float: actually skip allocation for
 !size_log2 case

This check was wrong; 1 << 0 = 1. The intent was to skip allocating a 1x1
matrix by assuming it is a constant 0.5. But as written, the check never
actually executed.

This did not affect the runtime performance, nor did it leak memory; but it
did mean we didn't hit the intended `assert`.
---
 libswscale/ops_tmpl_float.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
index 78b3a9c4fc..c64ae4188f 100644
--- a/libswscale/ops_tmpl_float.c
+++ b/libswscale/ops_tmpl_float.c
@@ -44,7 +44,7 @@
 DECL_SETUP(setup_dither)
 {
     const int size = 1 << op->dither.size_log2;
-    if (!size) {
+    if (size == 1) {
         /* We special case this value */
         av_assert1(!av_cmp_q(op->dither.matrix[0], av_make_q(1, 2)));
         out->ptr = NULL;
-- 
2.49.1


>From 58998a1509160f83b7dc9057f0909d6d690968c8 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 3 Dec 2025 18:19:04 +0100
Subject: [PATCH 4/6] swscale/x86/ops_float: remove special case for 2x2 matrix

This is an exceptionally unlikely (in fact, currently impossible) case to
actually hit, and not worth micro-optimizing for. More specifically, having
this special case prevents me from easily adding per-row offsets.
---
 libswscale/x86/ops.c         | 20 +++++++++++---------
 libswscale/x86/ops_float.asm | 25 ++++++++-----------------
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 97bee93f5b..ecbd79564e 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -184,18 +184,20 @@ static int setup_shift(const SwsOp *op, SwsOpPriv *out)
         .setup = ff_sws_setup_q,                                               
 \
     );
 
-/* 2x2 matrix fits inside SwsOpPriv directly; save an indirect in this case */
-static_assert(sizeof(SwsOpPriv) >= sizeof(float[2][2]), "2x2 dither matrix too 
large");
 static int setup_dither(const SwsOp *op, SwsOpPriv *out)
 {
-    const int size = 1 << op->dither.size_log2;
-    float *matrix = out->f32;
-    if (size > 2) {
-        matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix));
-        if (!matrix)
-            return AVERROR(ENOMEM);
+    /* 1x1 matrix / single constant */
+    if (!op->dither.size_log2) {
+        const AVRational k = op->dither.matrix[0];
+        out->f32[0] = (float) k.num / k.den;
+        return 0;
     }
 
+    const int size = 1 << op->dither.size_log2;
+    float *matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix));
+    if (!matrix)
+        return AVERROR(ENOMEM);
+
     for (int i = 0; i < size * size; i++)
         matrix[i] = (float) op->dither.matrix[i].num / 
op->dither.matrix[i].den;
 
@@ -206,7 +208,7 @@ static int setup_dither(const SwsOp *op, SwsOpPriv *out)
     DECL_COMMON_PATTERNS(F32, dither##SIZE##EXT,                               
 \
         .op    = SWS_OP_DITHER,                                                
 \
         .setup = setup_dither,                                                 
 \
-        .free  = (1 << SIZE) > 2 ? av_free : NULL,                             
 \
+        .free  = (SIZE) ? av_free : NULL,                                      
 \
         .dither_size = SIZE,                                                   
 \
     );
 
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index 6bdbd1b74e..ef08212fd6 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -183,7 +183,9 @@ IF W,   mulps mw2, m8
         lea tmp0q, %2
         and tmp0q, (1 << %1) - 1
         shl tmp0q, %1+2
-%if %1 == 2
+%if %1 == 1
+        vbroadcastsd   %4, [%3 + tmp0q]
+%elif %1 == 2
         VBROADCASTI128 %4, [%3 + tmp0q]
 %else
         mova %4, [%3 + tmp0q]
@@ -209,23 +211,12 @@ op dither%1
         %define DY DX
         %define DZ DX
         %define DW DX
-%elif %1 == 1
-        ; 2x2 matrix, only sign of y matters
-        mov tmp0d, yd
-        and tmp0d, 1
-        shl tmp0d, 3
-    %if X || Z
-        ; dither matrix is stored directly in the private data
-        vbroadcastsd DX, [implq + SwsOpImpl.priv + tmp0q]
-    %endif
-    %if Y || W
-        xor tmp0d, 8
-        vbroadcastsd DY, [implq + SwsOpImpl.priv + tmp0q]
-    %endif
-        %define DZ DX
-        %define DW DY
 %else
-        ; matrix is at least 4x4, load all four channels with custom offset
+        ; load all four channels with custom offset
+        ;
+        ; note that for 2x2, we would only need to look at the sign of `y`, but
+        ; this special case is ignored for simplicity reasons (and because
+        ; the current upstream format code never generates matrices that small)
     %if (4 << %1) > mmsize
         %define DX2 m12
         %define DY2 m13
-- 
2.49.1


>From 958c5e31f12e280482252957f7e2bf4cfff19f4b Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 3 Dec 2025 18:39:58 +0100
Subject: [PATCH 5/6] swscale/x86/ops: over-allocate dither matrix

I want to change the way offsets are calculated inside the dither asm;
and the cleanest way to solve that problem is to just over-allocate the
entire dither matrix based on the maximum offset range we expect.
---
 libswscale/x86/ops.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index ecbd79564e..134ac67986 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -194,13 +194,25 @@ static int setup_dither(const SwsOp *op, SwsOpPriv *out)
     }
 
     const int size = 1 << op->dither.size_log2;
-    float *matrix = out->ptr = av_mallocz(size * size * sizeof(*matrix));
+    int max_offset = 0;
+    for (int i = 0; i < 4; i++) {
+        const int offset = op->dither.y_offset[i] & (size - 1);
+        max_offset = FFMAX(max_offset, offset);
+    }
+
+    /* Allocate extra rows to allow over-reading for row offsets */
+    const int stride = size * sizeof(float);
+    const int num_rows = size + max_offset;
+    float *matrix = out->ptr = av_mallocz(num_rows * stride);
     if (!matrix)
         return AVERROR(ENOMEM);
 
     for (int i = 0; i < size * size; i++)
         matrix[i] = (float) op->dither.matrix[i].num / 
op->dither.matrix[i].den;
 
+    for (int i = size; i < num_rows; i++)
+        memcpy(&matrix[i * size], &matrix[(i - size) * size], stride);
+
     return 0;
 }
 
-- 
2.49.1


>From 49cd7150e3c912b3d244fb2218976be5a396e2ac Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Wed, 3 Dec 2025 19:12:37 +0100
Subject: [PATCH 6/6] swscale/x86/ops_float: store and load per row dither
 offset directly

Instead of computing y + N with a hard-coded index offset, calculate the
relative offset as a 16-bit integer in C and add that to the pointer directly.
Since we no longer mask the resulting combined address, this may result in
overread, but that's fine since we over-provisioned the array in the previous
commit.
---
 libswscale/x86/ops.c          |  7 +++++++
 libswscale/x86/ops_common.asm |  3 +++
 libswscale/x86/ops_float.asm  | 19 +++++++++++--------
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 134ac67986..afcef29a6e 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -213,6 +213,13 @@ static int setup_dither(const SwsOp *op, SwsOpPriv *out)
     for (int i = size; i < num_rows; i++)
         memcpy(&matrix[i * size], &matrix[(i - size) * size], stride);
 
+    /* Store relative pointer offset to each row inside extra space */
+    static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers 
not supported");
+    assert(max_offset * stride <= UINT16_MAX);
+    uint16_t *offset = &out->u16[4];
+    for (int i = 0; i < 4; i++)
+        offset[i] = (op->dither.y_offset[i] & (size - 1)) * stride;
+
     return 0;
 }
 
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
index 3c9154584a..e04ee70b56 100644
--- a/libswscale/x86/ops_common.asm
+++ b/libswscale/x86/ops_common.asm
@@ -245,6 +245,9 @@ endstruc
 %define tmp0d   r4d
 %define tmp1d   r5d
 
+%define tmp0w   r4w
+%define tmp1w   r5w
+
 ; Registers for plane pointers; put at the end (and in ascending plane order)
 ; so that we can avoid reserving them when not necessary
 %define out0q   r6q
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index ef08212fd6..2863085a8e 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -179,10 +179,8 @@ IF W,   mulps mw2, m8
         CONTINUE tmp0q
 %endmacro
 
-%macro load_dither_row 5 ; size_log2, y, addr, out, out2
-        lea tmp0q, %2
-        and tmp0q, (1 << %1) - 1
-        shl tmp0q, %1+2
+%macro load_dither_row 5 ; size_log2, comp_idx, addr, out, out2
+        mov tmp0w, [implq + SwsOpImpl.priv + (4 + %2) * 2] ; priv.u16[4 + i]
 %if %1 == 1
         vbroadcastsd   %4, [%3 + tmp0q]
 %elif %1 == 2
@@ -225,6 +223,11 @@ op dither%1
     %endif
         ; dither matrix is stored indirectly at the private data address
         mov tmp1q, [implq + SwsOpImpl.priv]
+        ; add y offset
+        mov tmp0d, yd
+        and tmp0d, (1 << %1) - 1
+        shl tmp0d, %1 + 2 ; * sizeof(float)
+        add tmp1q, tmp0q
     %if (4 << %1) > 2 * mmsize
         ; need to add in x offset
         mov tmp0d, bxd
@@ -232,10 +235,10 @@ op dither%1
         and tmp0d, (4 << %1) - 1
         add tmp1q, tmp0q
     %endif
-IF X,   load_dither_row %1, [yd + 0], tmp1q, DX, DX2
-IF Y,   load_dither_row %1, [yd + 3], tmp1q, DY, DY2
-IF Z,   load_dither_row %1, [yd + 2], tmp1q, DZ, DZ2
-IF W,   load_dither_row %1, [yd + 5], tmp1q, DW, DW2
+IF X,   load_dither_row %1, 0, tmp1q, DX, DX2
+IF Y,   load_dither_row %1, 1, tmp1q, DY, DY2
+IF Z,   load_dither_row %1, 2, tmp1q, DZ, DZ2
+IF W,   load_dither_row %1, 3, tmp1q, DW, DW2
 %endif
         LOAD_CONT tmp0q
 IF X,   addps mx, DX
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to