PR #21045 opened by averne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21045 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21045.patch
Also fix dequant for 4:2:2 subsampling. From 1982add48595db4891b16131928b9eb25fb85e2f Mon Sep 17 00:00:00 2001 From: averne <[email protected]> Date: Sat, 29 Nov 2025 17:26:51 +0100 Subject: [PATCH 1/2] vulkan/prores: fix dequantization for 4:2:2 subsampling Bug introduced in d00f41f due to an oversight. --- libavcodec/vulkan/prores_idct.comp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/prores_idct.comp index 05ba8e4967..5d0d41cfa5 100644 --- a/libavcodec/vulkan/prores_idct.comp +++ b/libavcodec/vulkan/prores_idct.comp @@ -127,7 +127,7 @@ void main(void) uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma; /* Table 15 */ - uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> 4)]; + uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> (4 - chroma_shift))]; int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx; [[unroll]] for (uint i = 0; i < 8; ++i) { -- 2.49.1 From 1c5bb1b12da142ae111b35565420ffd1ccc9a029 Mon Sep 17 00:00:00 2001 From: averne <[email protected]> Date: Sat, 29 Nov 2025 17:25:17 +0100 Subject: [PATCH 2/2] vulkan/prores: normalize coefficients during IDCT This allows increased internal precision. In addition, we can introduce an offset to the DC coefficient during the second IDCT step, to remove a per-element addition in the output codepath. Finally, by processing columns first we can remove the barrier after loading coefficients. 
Signed-off-by: averne <[email protected]> --- libavcodec/vulkan/prores_idct.comp | 57 +++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/prores_idct.comp index 5d0d41cfa5..5eef61e57a 100644 --- a/libavcodec/vulkan/prores_idct.comp +++ b/libavcodec/vulkan/prores_idct.comp @@ -37,19 +37,27 @@ void put_px(uint tex_idx, ivec2 pos, uint v) #endif } -const float idct_8x8_scales[] = { - 0.353553390593274f, // cos(4 * pi/16) / 2 - 0.490392640201615f, // cos(1 * pi/16) / 2 - 0.461939766255643f, // cos(2 * pi/16) / 2 - 0.415734806151273f, // cos(3 * pi/16) / 2 - 0.353553390593274f, // cos(4 * pi/16) / 2 - 0.277785116509801f, // cos(5 * pi/16) / 2 - 0.191341716182545f, // cos(6 * pi/16) / 2 - 0.097545161008064f, // cos(7 * pi/16) / 2 +const float idct_scale[64] = { + 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, + 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, + 0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, + 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362, + 0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, + 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343, + 0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, + 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822, + 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, + 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, + 0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, + 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924, + 0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, + 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 
0.0186644585125857, + 0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, + 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, }; /* 7.4 Inverse Transform */ -void idct(uint block, uint offset, uint stride) +void idct8(uint block, uint offset, uint stride) { float t0, t1, t2, t3, t4, t5, t6, t7, u8; float u0, u1, u2, u3, u4, u5, u6, u7; @@ -117,6 +125,12 @@ void main(void) uint chroma_shift = comp != 0 ? log2_chroma_w : 0; bool act = gid.x < mb_width << (4 - chroma_shift); + /** + * Normalize coefficients to [-1, 1] for increased precision during the iDCT. + * DCT coeffs have the range of a 12-bit signed integer (7.4 Inverse Transform). + */ + const float norm = 1.0f / (1 << 11); + /* Coalesced load of DCT coeffs in shared memory, inverse quantization */ if (act) { /** @@ -131,28 +145,31 @@ void main(void) int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx; [[unroll]] for (uint i = 0; i < 8; ++i) { + uint cidx = (i << 3) + idx; int c = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16); - float v = float(c * qscale * int(qmat[(i << 3) + idx])); - blocks[block][i * 9 + idx] = v * idct_8x8_scales[idx] * idct_8x8_scales[i]; + float v = float(c * qscale * int(qmat[cidx])) * norm; + blocks[block][i * 9 + idx] = v * idct_scale[cidx]; } } - /* Row-wise iDCT */ - barrier(); - idct(block, idx * 9, 1); - /* Column-wise iDCT */ + idct8(block, idx, 9); barrier(); - idct(block, idx, 9); - float fact = 1.0f / (1 << (12 - depth)), off = 1 << (depth - 1); + /* Remap [-1, 1] to [0, 2] to remove a per-element addition in the output loop */ + blocks[block][idx * 9] += 1.0f; + + /* Row-wise iDCT */ + idct8(block, idx * 9, 1); + barrier(); + + float fact = 1 << (depth - 1); int maxv = (1 << depth) - 1; /* 7.5.1 Color Component Samples. 
Rescale, clamp and write back to global memory */ - barrier(); if (act) { [[unroll]] for (uint i = 0; i < 8; ++i) { - float v = round(blocks[block][i * 9 + idx] * fact + off); + float v = round(blocks[block][i * 9 + idx] * fact); put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv)); } } -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
