PR #21203 opened by averne
URL:       https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21203
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21203.patch
This introduces more aggressive bitstream caching and stores the codebook/scan tables in shared memory, to reduce the overhead of hitting global memory. It gives a nice speedup on AMD, and a less significant one on NVIDIA/Intel. For a 4K, 422p10 file:

- AMD 6700XT: 18% (249 vs 211 fps)
- NVIDIA RTX 3050: 4% (98 vs 94 fps)
- Intel Tiger Lake GT2: 12% (38 vs 34 fps)

(A small host-side model of the new cache indexing follows patch 2 below.)

From 540834ad991af1144d4b6d8d34737eff488fa303 Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Sun, 14 Dec 2025 23:01:45 +0100
Subject: [PATCH 1/3] lavc/vulkan/common: allow configurable bitstream caching
 in shared memory

---
 libavcodec/vulkan/common.comp | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
index d50e629f06..1e34c9bab2 100644
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@@ -229,13 +229,14 @@ struct GetBitContext {
         gb.bits_valid += 32;                                                \
     }
 #else /* GET_BITS_SMEM */
-shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z];
+shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z*GET_BITS_SMEM];
 
-#define FILL_SMEM()                                                         \
-    {                                                                       \
-        u32vec4buf ptr = u32vec4buf(gb.buf);                                \
-        gb_storage[gl_LocalInvocationIndex] = ptr[0].v;                     \
-        gb.cur_smem_pos = 0;                                                \
+#define FILL_SMEM()                                                         \
+    {                                                                       \
+        u32vec4buf ptr = u32vec4buf(gb.buf);                                \
+        [[unroll]] for (uint i = 0; i < GET_BITS_SMEM; ++i)                 \
+            gb_storage[gl_LocalInvocationIndex*GET_BITS_SMEM+i] = ptr[i].v; \
+        gb.cur_smem_pos = 0;                                                \
     }
 
 #define LOAD64()                                                            \
@@ -251,15 +252,15 @@ shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize
             FILL_SMEM();                                                    \
     }
 
-#define RELOAD32()                                                          \
-    {                                                                       \
-        if (gb.cur_smem_pos >= 4)                                           \
-            FILL_SMEM();                                                    \
-        uint v = gb_storage[gl_LocalInvocationIndex][gb.cur_smem_pos];      \
-        gb.buf += 4;                                                        \
-        gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits;  \
-        gb.bits_valid += 32;                                                \
-        gb.cur_smem_pos += 1;                                               \
+#define RELOAD32()                                                          \
+    {                                                                       \
+        if (gb.cur_smem_pos >= 4*GET_BITS_SMEM)                             \
+            FILL_SMEM();                                                    \
+        uint v = gb_storage[gl_LocalInvocationIndex*GET_BITS_SMEM + gb.cur_smem_pos/4][gb.cur_smem_pos%4]; \
+        gb.buf += 4;                                                        \
+        gb.bits = uint64_t(reverse4(v)) << (32 - gb.bits_valid) | gb.bits;  \
+        gb.bits_valid += 32;                                                \
+        gb.cur_smem_pos += 1;                                               \
     }
 
 #endif /* GET_BITS_SMEM */
--
2.49.1

From 04f0ab1992e5b019a9bb4e6f0c9b6130339cc575 Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Sun, 14 Dec 2025 23:05:07 +0100
Subject: [PATCH 2/3] vulkan/prores: increase bitstream caching

Now caches 64 B of bitstream data when the reader hits the refill codepath.
---
 libavcodec/vulkan_prores.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index 338c09d46f..da10f93548 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -405,7 +405,7 @@ static int init_shader(AVCodecContext *avctx, FFVulkanContext *s,
                            local_size >> 16 & 0xff,
                            local_size >>  8 & 0xff,
                            local_size >>  0 & 0xff, 0));
-    av_bprintf(&shd->src, "#define GET_BITS_SMEM\n");
+    av_bprintf(&shd->src, "#define GET_BITS_SMEM %d\n", 4);
 
     if (interlaced)
         av_bprintf(&shd->src, "#define INTERLACED\n");
--
2.49.1
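A note on the layout introduced by patch 1: each invocation owns GET_BITS_SMEM consecutive u32vec4 slots of gb_storage, and gb.cur_smem_pos counts 32-bit words within those slots, so word pos lives in vector pos/4, component pos%4. The following is a minimal host-side C model of just that indexing arithmetic; fill_smem, load_word and WG_SIZE are names made up for the sketch, not FFmpeg code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GET_BITS_SMEM 4   /* u32vec4 slots cached per invocation (patch 2) */
#define WG_SIZE       8   /* hypothetical workgroup size for this model */

typedef struct { uint32_t c[4]; } u32vec4;

/* Stand-in for the shader's shared gb_storage[] array */
static u32vec4 gb_storage[WG_SIZE * GET_BITS_SMEM];

/* FILL_SMEM: invocation 'lane' copies GET_BITS_SMEM vectors from 'src' */
static void fill_smem(unsigned lane, const uint32_t *src)
{
    for (unsigned i = 0; i < GET_BITS_SMEM; i++)
        for (unsigned j = 0; j < 4; j++)
            gb_storage[lane * GET_BITS_SMEM + i].c[j] = src[4 * i + j];
}

/* RELOAD32's load: word 'pos' of lane's cache, pos in [0, 4*GET_BITS_SMEM) */
static uint32_t load_word(unsigned lane, unsigned pos)
{
    assert(pos < 4 * GET_BITS_SMEM);
    return gb_storage[lane * GET_BITS_SMEM + pos / 4].c[pos % 4];
}

int main(void)
{
    uint32_t words[4 * GET_BITS_SMEM];
    for (unsigned i = 0; i < 4 * GET_BITS_SMEM; i++)
        words[i] = 0x1000 + i;

    fill_smem(3, words);                         /* refill lane 3's cache */
    for (unsigned pos = 0; pos < 4 * GET_BITS_SMEM; pos++)
        assert(load_word(3, pos) == words[pos]); /* words return in order */
    puts("cache indexing OK");
    return 0;
}

With GET_BITS_SMEM = 4, as patch 2 sets below, each refill caches 4 * 16 B = 64 B per invocation.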
From d33674642084a857b348cc8a256ad5cfd4b67e42 Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Sun, 14 Dec 2025 23:13:11 +0100
Subject: [PATCH 3/3] vulkan/prores: copy constant tables to shared memory

The shader needs ~3 loads per DCT coefficient. This data was not observed
to be kept efficiently in the upper cache levels; loading it explicitly
into shared memory fixes that.
---
 libavcodec/vulkan/prores_vld.comp | 106 +++++++++++++++++-------
 1 file changed, 59 insertions(+), 47 deletions(-)

diff --git a/libavcodec/vulkan/prores_vld.comp b/libavcodec/vulkan/prores_vld.comp
index 298a5baf4c..30d5dcb04d 100644
--- a/libavcodec/vulkan/prores_vld.comp
+++ b/libavcodec/vulkan/prores_vld.comp
@@ -19,6 +19,58 @@
 #define U8(x)  (uint8_t (x))
 #define U16(x) (uint16_t(x))
 
+/**
+ * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
+ * According to the SMPTE document, abs(prev_dc_diff) should be used
+ * to index the table, duplicating the entries removes the abs operation.
+ */
+const uint16_t k_dc_codebook[] = { U16(0x100),
+                                   U16(0x210), U16(0x210),
+                                   U16(0x321), U16(0x321),
+                                   U16(0x430), U16(0x430), };
+
+/* Table 10 */
+const uint16_t k_ac_run_codebook [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
+                                        U16(0x100), U16(0x211), U16(0x211), U16(0x211),
+                                        U16(0x211), U16(0x210), U16(0x210), U16(0x210),
+                                        U16(0x210), U16(0x210), U16(0x210), U16(0x320), };
+/* Table 11 */
+const uint16_t k_ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
+                                         U16(0x210), U16(0x210), U16(0x210), U16(0x210),
+                                         U16(0x320) };
+
+#ifndef INTERLACED
+    /* Figure 4, encoded as (x << 0) | (y << 4) */
+    const uint8_t k_scan_tbl[] = {
+        U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
+        U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
+        U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
+        U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
+        U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
+        U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
+        U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
+        U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
+    };
+#else
+    /* Figure 5 */
+    const uint8_t k_scan_tbl[] = {
+        U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
+        U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
+        U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
+        U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
+        U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
+        U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
+        U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
+        U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
+    };
+#endif
+
+shared uint16_t dc_codebook      [k_dc_codebook      .length()],
+                ac_run_codebook  [k_ac_run_codebook  .length()],
+                ac_level_codebook[k_ac_level_codebook.length()];
+
+shared uint8_t scan_tbl[k_scan_tbl.length()];
+
 void put_px(uint tex_idx, ivec2 pos, uint v)
 {
 #ifndef INTERLACED
@@ -72,16 +124,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
         uint c = to_signed(decode_codeword(gb, 0x650));
         put_px(gid.z, base_pos, c);
 
-        /**
-         * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8)
-         * According to the SMPTE document, abs(prev_dc_diff) should be used
-         * to index the table, duplicating the entries removes the abs operation.
-         */
-        const uint16_t dc_codebook[] = { U16(0x100),
-                                         U16(0x210), U16(0x210),
-                                         U16(0x321), U16(0x321),
-                                         U16(0x430), U16(0x430), };
-
         uint cw = 5, prev_dc_diff = 0;
         for (int i = 1; i < num_blocks; ++i) {
             cw = decode_codeword(gb, dc_codebook[min(cw, 6)]);
@@ -95,43 +137,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count)
 
     /* 7.1.1.4 AC Coefficients */
     {
-        /* Table 10 */
-        const uint16_t ac_run_codebook [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101),
-                                              U16(0x100), U16(0x211), U16(0x211), U16(0x211),
-                                              U16(0x211), U16(0x210), U16(0x210), U16(0x210),
-                                              U16(0x210), U16(0x210), U16(0x210), U16(0x320), };
-
-        /* Table 11 */
-        const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100),
-                                               U16(0x210), U16(0x210), U16(0x210), U16(0x210),
-                                               U16(0x320) };
-
-#ifndef INTERLACED
-        /* Figure 4, encoded as (x << 0) | (y << 4) */
-        const uint8_t scan_tbl[] = {
-            U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13),
-            U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33),
-            U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16),
-            U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37),
-            U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52),
-            U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54),
-            U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56),
-            U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77),
-        };
-#else
-        /* Figure 5 */
-        const uint8_t scan_tbl[] = {
-            U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31),
-            U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33),
-            U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61),
-            U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73),
-            U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25),
-            U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45),
-            U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65),
-            U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77),
-        };
-#endif
-
         uint block_mask = num_blocks - 1;
         uint block_shift = findLSB(num_blocks);
@@ -276,6 +281,13 @@ void main(void)
     if (left_bits(gb) == 0)
         return;
 
+    /* Copy constant tables to shared memory */
+    dc_codebook       = k_dc_codebook;
+    ac_run_codebook   = k_ac_run_codebook;
+    ac_level_codebook = k_ac_level_codebook;
+
+    scan_tbl = k_scan_tbl;
+
     /**
      * 4 ProRes Frame Structure
      * ProRes tiles pictures into a grid of slices, whose size is determined
--
2.49.1
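As a quick cross-check of the packed encodings used by the tables in patch 3, here is a small standalone C sketch. The field names follow the comments in the diff, and treating the literal 0x650 passed to decode_codeword() as using the same three-nibble packing as the codebook entries is an assumption.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Codebook entries pack three nibbles:
     * (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8).
     * 0x650 is the spec used for the first DC coefficient; assuming the
     * same packing, it unpacks as follows. */
    uint16_t spec = 0x650;
    unsigned last_rice_q = spec      & 0xf;
    unsigned k           = spec >> 4 & 0xf;
    unsigned kexp        = spec >> 8 & 0xf;
    assert(last_rice_q == 0 && k == 5 && kexp == 6);

    /* Scan table entries pack coordinates as (x << 0) | (y << 4):
     * e.g. the Figure 4 entry 0x31 decodes to x = 1, y = 3. */
    uint8_t entry = 0x31;
    unsigned x = entry & 0xf, y = entry >> 4;
    assert(x == 1 && y == 3);

    printf("0x%03x -> last_rice_q=%u k=%u kexp=%u; 0x%02x -> (x=%u, y=%u)\n",
           spec, last_rice_q, k, kexp, entry, x, y);
    return 0;
}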
