This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit b9078c0939c9c8840fed1e569eead4f2cef7eaeb Author: averne <[email protected]> AuthorDate: Sun Dec 14 23:13:11 2025 +0100 Commit: Lynne <[email protected]> CommitDate: Mon Dec 15 12:29:00 2025 +0000 vulkan/prores: copy constant tables to shared memory The shader needs ~3 loads per DCT coeff. This data was not observed to get efficiently stored in the upper cache levels; loading it explicitly in shared memory fixes that. Also reduce code size by moving the bitstream initialization outside of the switch/case. --- libavcodec/vulkan/prores_vld.comp | 120 +++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 52 deletions(-) diff --git a/libavcodec/vulkan/prores_vld.comp b/libavcodec/vulkan/prores_vld.comp index 298a5baf4c..4b486fe2b4 100644 --- a/libavcodec/vulkan/prores_vld.comp +++ b/libavcodec/vulkan/prores_vld.comp @@ -19,6 +19,58 @@ #define U8(x) (uint8_t (x)) #define U16(x) (uint16_t(x)) +/** + * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8) + * According to the SMPTE document, abs(prev_dc_diff) should be used + * to index the table, duplicating the entries removes the abs operation. 
+ */ +const uint16_t k_dc_codebook[] = { U16(0x100), + U16(0x210), U16(0x210), + U16(0x321), U16(0x321), + U16(0x430), U16(0x430), }; + +/* Table 10 */ +const uint16_t k_ac_run_codebook [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101), + U16(0x100), U16(0x211), U16(0x211), U16(0x211), + U16(0x211), U16(0x210), U16(0x210), U16(0x210), + U16(0x210), U16(0x210), U16(0x210), U16(0x320), }; +/* Table 11 */ +const uint16_t k_ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100), + U16(0x210), U16(0x210), U16(0x210), U16(0x210), + U16(0x320) }; + +#ifndef INTERLACED + /* Figure 4, encoded as (x << 0) | (y << 4) */ + const uint8_t k_scan_tbl[] = { + U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13), + U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33), + U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16), + U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37), + U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52), + U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54), + U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56), + U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77), + }; +#else + /* Figure 5 */ + const uint8_t k_scan_tbl[] = { + U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31), + U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33), + U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61), + U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73), + U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25), + U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45), + U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65), + U8(0x56), U8(0x47), U8(0x57), 
U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77), + }; +#endif + +shared uint16_t dc_codebook [k_dc_codebook .length()], + ac_run_codebook [k_ac_run_codebook .length()], + ac_level_codebook[k_ac_level_codebook.length()]; + +shared uint8_t scan_tbl[k_scan_tbl.length()]; + void put_px(uint tex_idx, ivec2 pos, uint v) { #ifndef INTERLACED @@ -72,16 +124,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count) uint c = to_signed(decode_codeword(gb, 0x650)); put_px(gid.z, base_pos, c); - /** - * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or kexp + 1) << 8) - * According to the SMPTE document, abs(prev_dc_diff) should be used - * to index the table, duplicating the entries removes the abs operation. - */ - const uint16_t dc_codebook[] = { U16(0x100), - U16(0x210), U16(0x210), - U16(0x321), U16(0x321), - U16(0x430), U16(0x430), }; - uint cw = 5, prev_dc_diff = 0; for (int i = 1; i < num_blocks; ++i) { cw = decode_codeword(gb, dc_codebook[min(cw, 6)]); @@ -95,43 +137,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint mb_count) /* 7.1.1.4 AC Coefficients */ { - /* Table 10 */ - const uint16_t ac_run_codebook [] = { U16(0x102), U16(0x102), U16(0x101), U16(0x101), - U16(0x100), U16(0x211), U16(0x211), U16(0x211), - U16(0x211), U16(0x210), U16(0x210), U16(0x210), - U16(0x210), U16(0x210), U16(0x210), U16(0x320), }; - - /* Table 11 */ - const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), U16(0x100), - U16(0x210), U16(0x210), U16(0x210), U16(0x210), - U16(0x320) }; - -#ifndef INTERLACED - /* Figure 4, encoded as (x << 0) | (y << 4) */ - const uint8_t scan_tbl[] = { - U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), U8(0x13), - U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), U8(0x33), - U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), U8(0x16), - U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), U8(0x37), - U8(0x40), 
U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), U8(0x52), - U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), U8(0x54), - U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), U8(0x56), - U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), U8(0x77), - }; -#else - /* Figure 5 */ - const uint8_t scan_tbl[] = { - U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), U8(0x31), - U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), U8(0x33), - U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), U8(0x61), - U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), U8(0x73), - U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), U8(0x25), - U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), U8(0x45), - U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), U8(0x65), - U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), U8(0x77), - }; -#endif - uint block_mask = num_blocks - 1; uint block_shift = findLSB(num_blocks); @@ -251,22 +256,26 @@ void main(void) a_size = slice_size - hdr_size - y_size - u_size - v_size; - GetBitContext gb; + bs += hdr_size; + int bs_size = 0; switch (gid.z) { case 0: - init_get_bits(gb, u8buf(bs + hdr_size), int(y_size)); + bs_size = int(y_size); break; case 1: - init_get_bits(gb, u8buf(bs + hdr_size + y_size), int(u_size)); + bs_size = int(u_size), bs += y_size; break; case 2: - init_get_bits(gb, u8buf(bs + hdr_size + y_size + u_size), int(v_size)); + bs_size = int(v_size), bs += y_size + u_size; break; case 3: - init_get_bits(gb, u8buf(bs + hdr_size + y_size + u_size + v_size), int(a_size)); + bs_size = int(a_size), bs += y_size + u_size + v_size; break; } + GetBitContext gb; + init_get_bits(gb, bs, bs_size); + /** * Support for the grayscale "extension" in the prores_aw encoder. 
* According to the spec, entropy coded data should never be empty, @@ -276,6 +285,13 @@ void main(void) if (left_bits(gb) == 0) return; + /* Copy constant tables to local memory */ + dc_codebook = k_dc_codebook; + ac_run_codebook = k_ac_run_codebook; + ac_level_codebook = k_ac_level_codebook; + + scan_tbl = k_scan_tbl; + /** * 4 ProRes Frame Structure * ProRes tiles pictures into a grid of slices, whose size is determined _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
