ops_float: store and load per row dither offset directly

Niklas Haas via ffmpeg-cvslog Mon, 15 Dec 2025 06:33:06 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 3f7e3cedb58cb963a5cb90e81b7654d993b842cf
Author:     Niklas Haas <[email protected]>
AuthorDate: Wed Dec 3 19:12:37 2025 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Mon Dec 15 14:31:58 2025 +0000

    swscale/x86/ops_float: store and load per row dither offset directly
    
    Instead of computing y + N with a hard-coded index offset, calculate the
    relative offset as a 16-bit integer in C and add that to the pointer directly.
    Since we no longer mask the resulting combined address, this may result in
    overread, but that's fine since we over-provisioned the array in the previous
    commit.
---
 libswscale/x86/ops.c          |  7 +++++++
 libswscale/x86/ops_common.asm |  3 +++
 libswscale/x86/ops_float.asm  | 19 +++++++++++--------
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 1d8a2e77da..bc61266588 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -215,6 +215,13 @@ static int setup_dither(const SwsOp *op, SwsOpPriv *out)
 
     memcpy(&matrix[size * size], matrix, max_offset * stride);
 
+    /* Store relative pointer offset to each row inside extra space */
+    static_assert(sizeof(out->ptr) <= sizeof(uint16_t[4]), ">8 byte pointers 
not supported");
+    assert(max_offset * stride <= UINT16_MAX);
+    uint16_t *offset = &out->u16[4];
+    for (int i = 0; i < 4; i++)
+        offset[i] = (op->dither.y_offset[i] & (size - 1)) * stride;
+
     return 0;
 }
 
diff --git a/libswscale/x86/ops_common.asm b/libswscale/x86/ops_common.asm
index 3c9154584a..e04ee70b56 100644
--- a/libswscale/x86/ops_common.asm
+++ b/libswscale/x86/ops_common.asm
@@ -245,6 +245,9 @@ endstruc
 %define tmp0d   r4d
 %define tmp1d   r5d
 
+%define tmp0w   r4w
+%define tmp1w   r5w
+
 ; Registers for plane pointers; put at the end (and in ascending plane order)
 ; so that we can avoid reserving them when not necessary
 %define out0q   r6q
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index ef08212fd6..2863085a8e 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -179,10 +179,8 @@ IF W,   mulps mw2, m8
         CONTINUE tmp0q
 %endmacro
 
-%macro load_dither_row 5 ; size_log2, y, addr, out, out2
-        lea tmp0q, %2
-        and tmp0q, (1 << %1) - 1
-        shl tmp0q, %1+2
+%macro load_dither_row 5 ; size_log2, comp_idx, addr, out, out2
+        mov tmp0w, [implq + SwsOpImpl.priv + (4 + %2) * 2] ; priv.u16[4 + i]
 %if %1 == 1
         vbroadcastsd   %4, [%3 + tmp0q]
 %elif %1 == 2
@@ -225,6 +223,11 @@ op dither%1
     %endif
         ; dither matrix is stored indirectly at the private data address
         mov tmp1q, [implq + SwsOpImpl.priv]
+        ; add y offset
+        mov tmp0d, yd
+        and tmp0d, (1 << %1) - 1
+        shl tmp0d, %1 + 2 ; * sizeof(float)
+        add tmp1q, tmp0q
     %if (4 << %1) > 2 * mmsize
         ; need to add in x offset
         mov tmp0d, bxd
@@ -232,10 +235,10 @@ op dither%1
         and tmp0d, (4 << %1) - 1
         add tmp1q, tmp0q
     %endif
-IF X,   load_dither_row %1, [yd + 0], tmp1q, DX, DX2
-IF Y,   load_dither_row %1, [yd + 3], tmp1q, DY, DY2
-IF Z,   load_dither_row %1, [yd + 2], tmp1q, DZ, DZ2
-IF W,   load_dither_row %1, [yd + 5], tmp1q, DW, DW2
+IF X,   load_dither_row %1, 0, tmp1q, DX, DX2
+IF Y,   load_dither_row %1, 1, tmp1q, DY, DY2
+IF Z,   load_dither_row %1, 2, tmp1q, DZ, DZ2
+IF W,   load_dither_row %1, 3, tmp1q, DW, DW2
 %endif
         LOAD_CONT tmp0q
 IF X,   addps mx, DX

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 06/11: swscale/x86/ops_float: store and load per row dither offset directly

Reply via email to