This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 6eced881885201e4834cea8d6aee4ce114d7fc9a Author: Lynne <[email protected]> AuthorDate: Thu Dec 18 13:30:30 2025 +0100 Commit: Lynne <[email protected]> CommitDate: Mon Dec 22 19:46:26 2025 +0100 vulkan: merge ProRes and ProRes RAW iDCTs This cleans up the code a bit, and reduces binary size. --- libavcodec/vulkan/Makefile | 4 +- libavcodec/vulkan/{prores_idct.comp => dct.comp} | 93 +++++------------------- libavcodec/vulkan/prores_idct.comp | 84 --------------------- libavcodec/vulkan/prores_raw_idct.comp | 82 --------------------- libavcodec/vulkan_prores.c | 4 + libavcodec/vulkan_prores_raw.c | 5 ++ 6 files changed, 29 insertions(+), 243 deletions(-) diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index 78d511a90e..35e96c506d 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -14,11 +14,11 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \ vulkan/ffv1_common.o vulkan/ffv1_reset.o \ vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o -OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \ +OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \ vulkan/prores_raw_decode.o \ vulkan/prores_raw_idct.o -OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o \ +OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \ vulkan/prores_vld.o \ vulkan/prores_idct.o diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/dct.comp similarity index 60% copy from libavcodec/vulkan/prores_idct.comp copy to libavcodec/vulkan/dct.comp index 5eef61e57a..34c6ad128f 100644 --- a/libavcodec/vulkan/prores_idct.comp +++ b/libavcodec/vulkan/dct.comp @@ -1,4 +1,7 @@ /* + * Copyright (c) 2025 Lynne <[email protected]> + * Copyright (c) 2016 Nathan Egge <[email protected]> + * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or @@ -16,26 +19,24 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -/* Two macroblocks, padded to avoid bank conflicts */ -shared float blocks[4*2][8*(8+1)]; +/** + * Orthonormal inverse 8-point Type-II DCT based on the Chen factorization[1]. + * 1D with scale factors moved up front. + * This computes an n-point Type-II DCT by first computing an n/2-point Type-II DCT + * of the even indexed inputs and an n/2-point Type-IV DST of the odd indexed inputs, + * and then combining them using a "butterfly" operation. + * + * [1] W.H. Chen, C. Smith, and S. Fralick, + * "A Fast Computational Algorithm for the Discrete Cosine Transform", + * IEEE Transactions on Communications, Vol. 25, No. 9, pp 1004-1009, Sept. 1977 + */ -uint get_px(uint tex_idx, ivec2 pos) -{ -#ifndef INTERLACED - return imageLoad(dst[tex_idx], pos).x; -#else - return imageLoad(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field)).x; +#ifndef NB_COMPONENTS +#define NB_COMPONENTS 1 #endif -} -void put_px(uint tex_idx, ivec2 pos, uint v) -{ -#ifndef INTERLACED - imageStore(dst[tex_idx], pos, uvec4(v)); -#else - imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v)); -#endif -} +/* Padded by 1 row to avoid bank conflicts */ +shared float blocks[NB_BLOCKS][NB_COMPONENTS*8*(8 + 1)]; const float idct_scale[64] = { 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, @@ -56,7 +57,6 @@ const float idct_scale[64] = { 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, }; -/* 7.4 Inverse Transform */ void idct8(uint block, uint offset, uint stride) { float t0, t1, t2, t3, t4, t5, t6, t7, u8; @@ -117,60 +117,3 @@ void idct8(uint block, uint offset, uint stride) blocks[block][6*stride + offset] = u6; blocks[block][7*stride + offset] = u7; } - -void main(void) -{ - uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID; - uint comp = gid.z, block = (lid.y << 2) | (lid.x >> 3), idx = lid.x & 0x7; - uint chroma_shift = comp != 0 ? log2_chroma_w : 0; - bool act = gid.x < mb_width << (4 - chroma_shift); - - /** - * Normalize coefficients to [-1, 1] for increased precision during the iDCT. - * DCT coeffs have the range of a 12-bit signed integer (7.4 Inverse Transform). - */ - const float norm = 1.0f / (1 << 11); - - /* Coalesced load of DCT coeffs in shared memory, inverse quantization */ - if (act) { - /** - * According to the VK spec indexing an array in push constant memory with - * a non-dynamically uniform value is illegal ($15.9.1 in v1.4.326), - * so copy the whole matrix locally. - */ - uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma; - - /* Table 15 */ - uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> (4 - chroma_shift))]; - int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx; - - [[unroll]] for (uint i = 0; i < 8; ++i) { - uint cidx = (i << 3) + idx; - int c = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16); - float v = float(c * qscale * int(qmat[cidx])) * norm; - blocks[block][i * 9 + idx] = v * idct_scale[cidx]; - } - } - - /* Column-wise iDCT */ - idct8(block, idx, 9); - barrier(); - - /* Remap [-1, 1] to [0, 2] to remove a per-element addition in the output loop */ - blocks[block][idx * 9] += 1.0f; - - /* Row-wise iDCT */ - idct8(block, idx * 9, 1); - barrier(); - - float fact = 1 << (depth - 1); - int maxv = (1 << depth) - 1; - - /* 7.5.1 Color Component Samples. Rescale, clamp and write back to global memory */ - if (act) { - [[unroll]] for (uint i = 0; i < 8; ++i) { - float v = round(blocks[block][i * 9 + idx] * fact); - put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv)); - } - } -} diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/prores_idct.comp index 5eef61e57a..25431d61c1 100644 --- a/libavcodec/vulkan/prores_idct.comp +++ b/libavcodec/vulkan/prores_idct.comp @@ -16,9 +16,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -/* Two macroblocks, padded to avoid bank conflicts */ -shared float blocks[4*2][8*(8+1)]; - uint get_px(uint tex_idx, ivec2 pos) { #ifndef INTERLACED @@ -37,87 +34,6 @@ void put_px(uint tex_idx, ivec2 pos, uint v) #endif } -const float idct_scale[64] = { - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, - 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362, - 0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, - 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343, - 0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, - 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822, - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, - 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924, - 0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, - 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857, - 0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, - 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, -}; - -/* 7.4 Inverse Transform */ -void idct8(uint block, uint offset, uint stride) -{ - float t0, t1, t2, t3, t4, t5, t6, t7, u8; - float u0, u1, u2, u3, u4, u5, u6, u7; - - /* Input */ - t0 = blocks[block][0*stride + offset]; - u4 = blocks[block][1*stride + offset]; - t2 = blocks[block][2*stride + offset]; - u6 = blocks[block][3*stride + offset]; - t1 = blocks[block][4*stride + offset]; - u5 = blocks[block][5*stride + offset]; - t3 = blocks[block][6*stride + offset]; - u7 = blocks[block][7*stride + offset]; - - /* Embedded scaled inverse 4-point Type-II DCT */ - u0 = t0 + t1; - u1 = t0 - t1; - u3 = t2 + t3; - u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; - t0 = u0 + u3; - t3 = u0 - u3; - t1 = u1 + u2; - t2 = u1 - u2; - - /* Embedded scaled inverse 4-point Type-IV DST */ - t5 = u5 + u6; - t6 = u5 - u6; - t7 = u4 + u7; - t4 = u4 - u7; - u7 = t7 + t5; - u5 = (t7 - t5)*(1.4142135623730950488016887242097f); - u8 = (t4 + t6)*(1.8477590650225735122563663787936f); - u4 = u8 - t4*(1.0823922002923939687994464107328f); - u6 = u8 - t6*(2.6131259297527530557132863468544f); - t7 = u7; - t6 = t7 - u6; - t5 = t6 + u5; - t4 = t5 - u4; - - /* Butterflies */ - u0 = t0 + t7; - u7 = t0 - t7; - u6 = t1 + t6; - u1 = t1 - t6; - u2 = t2 + t5; - u5 = t2 - t5; - u4 = t3 + t4; - u3 = t3 - t4; - - /* Output */ - blocks[block][0*stride + offset] = u0; - blocks[block][1*stride + offset] = u1; - blocks[block][2*stride + offset] = u2; - blocks[block][3*stride + offset] = u3; - blocks[block][4*stride + offset] = u4; - blocks[block][5*stride + offset] = u5; - blocks[block][6*stride + offset] = u6; - blocks[block][7*stride + offset] = u7; -} - void main(void) { uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID; diff --git a/libavcodec/vulkan/prores_raw_idct.comp b/libavcodec/vulkan/prores_raw_idct.comp index 29ddf3b9e8..01aad98330 100644 --- a/libavcodec/vulkan/prores_raw_idct.comp +++ b/libavcodec/vulkan/prores_raw_idct.comp @@ -24,8 +24,6 @@ #define BLOCK_ID (gl_LocalInvocationID.y) #define ROW_ID (gl_LocalInvocationID.x) -shared float blocks[16][4*64]; - const ivec2 scan[64] = { ivec2( 0, 0), ivec2( 4, 0), ivec2( 0, 2), ivec2( 4, 2), ivec2( 0, 8), ivec2( 4, 8), ivec2( 6, 8), ivec2( 2, 10), @@ -45,86 +43,6 @@ const ivec2 scan[64] = { ivec2(12, 12), ivec2( 8, 14), ivec2(10, 14), ivec2(14, 14), }; -const float idct_scale[64] = { - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293, - 0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362, - 0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541, - 0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343, - 0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362, - 0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822, - 0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199, - 0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679, - 0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109, - 0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924, - 0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021, - 0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857, - 0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822, - 0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892, -}; - -void idct8(uint block, uint offset, uint stride) -{ - float t0, t1, t2, t3, t4, t5, t6, t7, u8; - float u0, u1, u2, u3, u4, u5, u6, u7; - - /* Input */ - t0 = blocks[block][0*stride + offset]; - u4 = blocks[block][1*stride + offset]; - t2 = blocks[block][2*stride + offset]; - u6 = blocks[block][3*stride + offset]; - t1 = blocks[block][4*stride + offset]; - u5 = blocks[block][5*stride + offset]; - t3 = blocks[block][6*stride + offset]; - u7 = blocks[block][7*stride + offset]; - - /* Embedded scaled inverse 4-point Type-II DCT */ - u0 = t0 + t1; - u1 = t0 - t1; - u3 = t2 + t3; - u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3; - t0 = u0 + u3; - t3 = u0 - u3; - t1 = u1 + u2; - t2 = u1 - u2; - - /* Embedded scaled inverse 4-point Type-IV DST */ - t5 = u5 + u6; - t6 = u5 - u6; - t7 = u4 + u7; - t4 = u4 - u7; - u7 = t7 + t5; - u5 = (t7 - t5)*(1.4142135623730950488016887242097f); - u8 = (t4 + t6)*(1.8477590650225735122563663787936f); - u4 = u8 - t4*(1.0823922002923939687994464107328f); - u6 = u8 - t6*(2.6131259297527530557132863468544f); - t7 = u7; - t6 = t7 - u6; - t5 = t6 + u5; - t4 = t5 - u4; - - /* Butterflies */ - u0 = t0 + t7; - u7 = t0 - t7; - u6 = t1 + t6; - u1 = t1 - t6; - u2 = t2 + t5; - u5 = t2 - t5; - u4 = t3 + t4; - u3 = t3 - t4; - - /* Output */ - blocks[block][0*stride + offset] = u0; - blocks[block][1*stride + offset] = u1; - blocks[block][2*stride + offset] = u2; - blocks[block][3*stride + offset] = u3; - blocks[block][4*stride + offset] = u4; - blocks[block][5*stride + offset] = u5; - blocks[block][6*stride + offset] = u6; - blocks[block][7*stride + offset] = u7; -} - void main(void) { const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c index bc0364609c..afea8857e8 100644 --- a/libavcodec/vulkan_prores.c +++ b/libavcodec/vulkan_prores.c @@ -24,6 +24,7 @@ #include "libavutil/vulkan_spirv.h" extern const char *ff_source_common_comp; +extern const char *ff_source_dct_comp; extern const char *ff_source_prores_vld_comp; extern const char *ff_source_prores_idct_comp; @@ -511,6 +512,9 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, }; RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0)); + GLSLC(0, #define NB_BLOCKS 4*2); + GLSLD(ff_source_dct_comp); + GLSLD(ff_source_prores_idct_comp); RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c index 82e8f3ad16..0a844156fb 100644 --- a/libavcodec/vulkan_prores_raw.c +++ b/libavcodec/vulkan_prores_raw.c @@ -26,6 +26,7 @@ #include "libavutil/mem.h" extern const char *ff_source_common_comp; +extern const char *ff_source_dct_comp; extern const char *ff_source_prores_raw_decode_comp; extern const char *ff_source_prores_raw_idct_comp; @@ -385,6 +386,10 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, RET(add_common_data(avctx, s, shd, 0)); + GLSLC(0, #define NB_BLOCKS 16); + GLSLC(0, #define NB_COMPONENTS 4); + GLSLD(ff_source_dct_comp); + GLSLD(ff_source_prores_raw_idct_comp); RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main", _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
