prores: copy constant tables to shared memory

averne via ffmpeg-cvslog Mon, 15 Dec 2025 04:30:19 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit b9078c0939c9c8840fed1e569eead4f2cef7eaeb
Author:     averne <[email protected]>
AuthorDate: Sun Dec 14 23:13:11 2025 +0100
Commit:     Lynne <[email protected]>
CommitDate: Mon Dec 15 12:29:00 2025 +0000

    vulkan/prores: copy constant tables to shared memory
    
    The shader needs ~3 loads per DCT coeff.
    This data was not observed to get efficiently stored
    in the upper cache levels; loading it explicitly into
    shared memory fixes that.
    
    Also reduce code size by moving the bitstream
    initialization outside of the switch/case.
---
 libavcodec/vulkan/prores_vld.comp | 120 +++++++++++++++++++++-----------------
 1 file changed, 68 insertions(+), 52 deletions(-)

diff --git a/libavcodec/vulkan/prores_vld.comp 
b/libavcodec/vulkan/prores_vld.comp
index 298a5baf4c..4b486fe2b4 100644
--- a/libavcodec/vulkan/prores_vld.comp
+++ b/libavcodec/vulkan/prores_vld.comp
@@ -19,6 +19,58 @@
 #define U8(x)  (uint8_t (x))
 #define U16(x) (uint16_t(x))
 
+/**
+ * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | ((kexp or 
kexp + 1) << 8)
+ * According to the SMPTE document, abs(prev_dc_diff) should be used
+ * to index the table, duplicating the entries removes the abs operation.
+ */
+const uint16_t k_dc_codebook[] = { U16(0x100),
+                                   U16(0x210), U16(0x210),
+                                   U16(0x321), U16(0x321),
+                                   U16(0x430), U16(0x430), };
+
+/* Table 10 */
+const uint16_t k_ac_run_codebook  [] = { U16(0x102), U16(0x102), U16(0x101), 
U16(0x101),
+                                         U16(0x100), U16(0x211), U16(0x211), 
U16(0x211),
+                                         U16(0x211), U16(0x210), U16(0x210), 
U16(0x210),
+                                         U16(0x210), U16(0x210), U16(0x210), 
U16(0x320), };
+/* Table 11 */
+const uint16_t k_ac_level_codebook[] = { U16(0x202), U16(0x101), U16(0x102), 
U16(0x100),
+                                         U16(0x210), U16(0x210), U16(0x210), 
U16(0x210),
+                                         U16(0x320) };
+
+#ifndef INTERLACED
+    /* Figure 4, encoded as (x << 0) | (y << 4) */
+    const uint8_t k_scan_tbl[] = {
+        U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), U8(0x12), 
U8(0x13),
+        U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), U8(0x32), 
U8(0x33),
+        U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), U8(0x07), 
U8(0x16),
+        U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), U8(0x36), 
U8(0x37),
+        U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), U8(0x43), 
U8(0x52),
+        U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), U8(0x45), 
U8(0x54),
+        U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), U8(0x47), 
U8(0x56),
+        U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), U8(0x76), 
U8(0x77),
+    };
+#else
+    /* Figure 5 */
+    const uint8_t k_scan_tbl[] = {
+        U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), U8(0x21), 
U8(0x31),
+        U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), U8(0x23), 
U8(0x33),
+        U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), U8(0x70), 
U8(0x61),
+        U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), U8(0x63), 
U8(0x73),
+        U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), U8(0x34), 
U8(0x25),
+        U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), U8(0x54), 
U8(0x45),
+        U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), U8(0x74), 
U8(0x65),
+        U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), U8(0x67), 
U8(0x77),
+    };
+#endif
+
+shared uint16_t dc_codebook      [k_dc_codebook      .length()],
+                ac_run_codebook  [k_ac_run_codebook  .length()],
+                ac_level_codebook[k_ac_level_codebook.length()];
+
+shared uint8_t  scan_tbl[k_scan_tbl.length()];
+
 void put_px(uint tex_idx, ivec2 pos, uint v)
 {
 #ifndef INTERLACED
@@ -72,16 +124,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint 
mb_count)
         uint c = to_signed(decode_codeword(gb, 0x650));
         put_px(gid.z, base_pos, c);
 
-        /**
-         * Table 9, encoded as (last_rice_q << 0) | (krice or kexp << 4) | 
((kexp or kexp + 1) << 8)
-         * According to the SMPTE document, abs(prev_dc_diff) should be used
-         * to index the table, duplicating the entries removes the abs 
operation.
-         */
-        const uint16_t dc_codebook[] = { U16(0x100),
-                                         U16(0x210), U16(0x210),
-                                         U16(0x321), U16(0x321),
-                                         U16(0x430), U16(0x430), };
-
         uint cw = 5, prev_dc_diff = 0;
         for (int i = 1; i < num_blocks; ++i) {
             cw = decode_codeword(gb, dc_codebook[min(cw, 6)]);
@@ -95,43 +137,6 @@ void decode_comp(in GetBitContext gb, uvec2 mb_pos, uint 
mb_count)
 
     /* 7.1.1.4 AC Coefficients */
     {
-        /* Table 10 */
-        const uint16_t ac_run_codebook  [] = { U16(0x102), U16(0x102), 
U16(0x101), U16(0x101),
-                                               U16(0x100), U16(0x211), 
U16(0x211), U16(0x211),
-                                               U16(0x211), U16(0x210), 
U16(0x210), U16(0x210),
-                                               U16(0x210), U16(0x210), 
U16(0x210), U16(0x320), };
-
-        /* Table 11 */
-        const uint16_t ac_level_codebook[] = { U16(0x202), U16(0x101), 
U16(0x102), U16(0x100),
-                                               U16(0x210), U16(0x210), 
U16(0x210), U16(0x210),
-                                               U16(0x320) };
-
-#ifndef INTERLACED
-        /* Figure 4, encoded as (x << 0) | (y << 4) */
-        const uint8_t scan_tbl[] = {
-            U8(0x00), U8(0x01), U8(0x10), U8(0x11), U8(0x02), U8(0x03), 
U8(0x12), U8(0x13),
-            U8(0x20), U8(0x21), U8(0x30), U8(0x31), U8(0x22), U8(0x23), 
U8(0x32), U8(0x33),
-            U8(0x04), U8(0x05), U8(0x14), U8(0x24), U8(0x15), U8(0x06), 
U8(0x07), U8(0x16),
-            U8(0x25), U8(0x34), U8(0x35), U8(0x26), U8(0x17), U8(0x27), 
U8(0x36), U8(0x37),
-            U8(0x40), U8(0x41), U8(0x50), U8(0x60), U8(0x51), U8(0x42), 
U8(0x43), U8(0x52),
-            U8(0x61), U8(0x70), U8(0x71), U8(0x62), U8(0x53), U8(0x44), 
U8(0x45), U8(0x54),
-            U8(0x63), U8(0x72), U8(0x73), U8(0x64), U8(0x55), U8(0x46), 
U8(0x47), U8(0x56),
-            U8(0x65), U8(0x74), U8(0x75), U8(0x66), U8(0x57), U8(0x67), 
U8(0x76), U8(0x77),
-        };
-#else
-        /* Figure 5 */
-        const uint8_t scan_tbl[] = {
-            U8(0x00), U8(0x10), U8(0x01), U8(0x11), U8(0x20), U8(0x30), 
U8(0x21), U8(0x31),
-            U8(0x02), U8(0x12), U8(0x03), U8(0x13), U8(0x22), U8(0x32), 
U8(0x23), U8(0x33),
-            U8(0x40), U8(0x50), U8(0x41), U8(0x42), U8(0x51), U8(0x60), 
U8(0x70), U8(0x61),
-            U8(0x52), U8(0x43), U8(0x53), U8(0x62), U8(0x71), U8(0x72), 
U8(0x63), U8(0x73),
-            U8(0x04), U8(0x14), U8(0x05), U8(0x06), U8(0x15), U8(0x24), 
U8(0x34), U8(0x25),
-            U8(0x16), U8(0x07), U8(0x17), U8(0x26), U8(0x35), U8(0x44), 
U8(0x54), U8(0x45),
-            U8(0x36), U8(0x27), U8(0x37), U8(0x46), U8(0x55), U8(0x64), 
U8(0x74), U8(0x65),
-            U8(0x56), U8(0x47), U8(0x57), U8(0x66), U8(0x75), U8(0x76), 
U8(0x67), U8(0x77),
-        };
-#endif
-
         uint block_mask  = num_blocks - 1;
         uint block_shift = findLSB(num_blocks);
 
@@ -251,22 +256,26 @@ void main(void)
 
     a_size = slice_size - hdr_size - y_size - u_size - v_size;
 
-    GetBitContext gb;
+    bs += hdr_size;
+    int bs_size = 0;
     switch (gid.z) {
         case 0:
-            init_get_bits(gb, u8buf(bs + hdr_size),                            
int(y_size));
+            bs_size = int(y_size);
             break;
         case 1:
-            init_get_bits(gb, u8buf(bs + hdr_size + y_size),                   
int(u_size));
+            bs_size = int(u_size), bs += y_size;
             break;
         case 2:
-            init_get_bits(gb, u8buf(bs + hdr_size + y_size + u_size),          
int(v_size));
+            bs_size = int(v_size), bs += y_size + u_size;
             break;
         case 3:
-            init_get_bits(gb, u8buf(bs + hdr_size + y_size + u_size + v_size), 
int(a_size));
+            bs_size = int(a_size), bs += y_size + u_size + v_size;
             break;
     }
 
+    GetBitContext gb;
+    init_get_bits(gb, bs, bs_size);
+
     /**
      * Support for the grayscale "extension" in the prores_aw encoder.
      * According to the spec, entropy coded data should never be empty,
@@ -276,6 +285,13 @@ void main(void)
     if (left_bits(gb) == 0)
         return;
 
+    /* Copy constant tables to local memory */
+    dc_codebook       = k_dc_codebook;
+    ac_run_codebook   = k_ac_run_codebook;
+    ac_level_codebook = k_ac_level_codebook;
+
+    scan_tbl = k_scan_tbl;
+
     /**
      * 4 ProRes Frame Structure
      * ProRes tiles pictures into a grid of slices, whose size is determined

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/03: vulkan/prores: copy constant tables to shared memory

Reply via email to