[FFmpeg-devel] [PR] vulkan/prores: fix for Apple M-series (PR #21502)

averne via ffmpeg-devel Sat, 17 Jan 2026 07:42:43 -0800

PR #21502 opened by averne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21502
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21502.patch


This fixes an assumption that Vulkan implementations provide more push constant
memory than the spec mandates.
With this, the prores vulkan decoder runs on Apple M-series.

Quick benchmarking (on my M3 Pro machine) shows performance on par with
Apple's ASIC decoder exposed by videotoolbox:
- 155 fps on a 4k, 422p10 sample (videotoolbox: 161)
- 184 fps on a 4k, 444p12 sample (videotoolbox: 144)
- 83 fps on a 5700x3040, 422p10 sample (videotoolbox: 81)


>From c4aa639f9c9dd6e93a823bd8b73072558c2db64c Mon Sep 17 00:00:00 2001
From: averne <[email protected]>
Date: Tue, 23 Dec 2025 14:32:49 +0100
Subject: [PATCH] vulkan/prores: reduce push constants size

The VK spec only mandates 128 bytes of push constant memory,
and some platforms don't actually implement more.  This moves
the quantization matrices to the per-frame buffer.
---
 libavcodec/vulkan/prores_idct.comp.glsl | 19 +++++--------
 libavcodec/vulkan/prores_vld.comp.glsl  |  3 ---
 libavcodec/vulkan_prores.c              | 36 ++++++++++++++++---------
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/libavcodec/vulkan/prores_idct.comp.glsl 
b/libavcodec/vulkan/prores_idct.comp.glsl
index ee9eddf19d..800a93db66 100644
--- a/libavcodec/vulkan/prores_idct.comp.glsl
+++ b/libavcodec/vulkan/prores_idct.comp.glsl
@@ -28,7 +28,10 @@ layout (constant_id = 0) const bool interlaced = false;
 layout (set = 0, binding = 0) readonly buffer quant_idx_buf {
     uint8_t quant_idx[];
 };
-layout (set = 0, binding = 1) uniform uimage2D dst[];
+layout (set = 0, binding = 1) readonly buffer qmat_buf {
+    uint8_t qmat[];
+};
+layout (set = 0, binding = 2) uniform uimage2D dst[];
 
 layout (push_constant, scalar) uniform pushConstants {
    u8buf    slice_data;
@@ -45,9 +48,6 @@ layout (push_constant, scalar) uniform pushConstants {
    uint8_t  depth;
    uint8_t  alpha_info;
    uint8_t  bottom_field;
-
-   uint8_t  qmat_luma  [8*8];
-   uint8_t  qmat_chroma[8*8];
 };
 
 uint get_px(uint tex_idx, ivec2 pos)
@@ -79,21 +79,14 @@ void main(void)
 
     /* Coalesced load of DCT coeffs in shared memory, inverse quantization */
     if (act) {
-        /**
-         * According to the VK spec indexing an array in push constant memory 
with
-         * a non-dynamically uniform value is illegal ($15.9.1 in v1.4.326),
-         * so copy the whole matrix locally.
-         */
-        uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;
-
         /* Table 15 */
         uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> (4 - 
chroma_shift))];
-        int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
+        int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx, mat = int(gid.z != 
0) << 6;
 
         [[unroll]] for (uint i = 0; i < 8; ++i) {
             uint cidx = (i << 3) + idx;
             int   c = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + 
i))), 16);
-            float v = float(c * qscale * int(qmat[cidx])) * norm;
+            float v = float(c * qscale * int(qmat[mat + cidx])) * norm;
             blocks[block][i * 9 + idx] = v * idct_scale[cidx];
         }
     }
diff --git a/libavcodec/vulkan/prores_vld.comp.glsl 
b/libavcodec/vulkan/prores_vld.comp.glsl
index a22f7fed77..85b4dbdd61 100644
--- a/libavcodec/vulkan/prores_vld.comp.glsl
+++ b/libavcodec/vulkan/prores_vld.comp.glsl
@@ -48,9 +48,6 @@ layout (push_constant, scalar) uniform pushConstants {
    uint8_t  depth;
    uint8_t  alpha_info;
    uint8_t  bottom_field;
-
-   uint8_t  qmat_luma  [8*8];
-   uint8_t  qmat_chroma[8*8];
 };
 
 /**
diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index 019637f6ee..c7451c20fd 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -42,8 +42,8 @@ typedef struct ProresVulkanDecodePicture {
     uint32_t bitstream_size;
     uint32_t slice_num;
 
-    uint32_t slice_offsets_sz,  mb_params_sz;
-    uint32_t slice_offsets_off, mb_params_off;
+    uint32_t slice_offsets_sz,  qmat_sz,  mb_params_sz;
+    uint32_t slice_offsets_off, qmat_off, mb_params_off;
 } ProresVulkanDecodePicture;
 
 typedef struct ProresVulkanDecodeContext {
@@ -68,9 +68,6 @@ typedef struct ProresVkParameters {
     uint8_t  depth;
     uint8_t  alpha_info;
     uint8_t  bottom_field;
-
-    uint8_t  qmat_luma  [64];
-    uint8_t  qmat_chroma[64];
 } ProresVkParameters;
 
 static int vk_prores_start_frame(AVCodecContext          *avctx,
@@ -88,10 +85,13 @@ static int vk_prores_start_frame(AVCodecContext          
*avctx,
     int err;
 
     pp->slice_offsets_sz = (pr->slice_count + 1) * sizeof(uint32_t);
+    pp->qmat_sz          = sizeof(pr->qmat_luma) + sizeof(pr->qmat_chroma);
     pp->mb_params_sz     = pr->mb_width * pr->mb_height * sizeof(uint8_t);
 
     pp->slice_offsets_off = 0;
-    pp->mb_params_off     = FFALIGN(pp->slice_offsets_off + 
pp->slice_offsets_sz,
+    pp->qmat_off          = FFALIGN(pp->slice_offsets_off + 
pp->slice_offsets_sz,
+                                    
ctx->s.props.properties.limits.minStorageBufferOffsetAlignment);
+    pp->mb_params_off     = FFALIGN(pp->qmat_off + pp->qmat_sz,
                                     
ctx->s.props.properties.limits.minStorageBufferOffsetAlignment);
 
     /* Host map the input slices data if supported */
@@ -198,8 +198,10 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
         .bottom_field     = pr->first_field ^ (pr->frame_type == 1),
     };
 
-    memcpy(pd.qmat_luma,   pr->qmat_luma,   sizeof(pd.qmat_luma  ));
-    memcpy(pd.qmat_chroma, pr->qmat_chroma, sizeof(pd.qmat_chroma));
+    memcpy(metadata->mapped_mem + pp->qmat_off,
+           pr->qmat_luma,   sizeof(pr->qmat_luma));
+    memcpy(metadata->mapped_mem + pp->qmat_off + sizeof(pr->qmat_luma),
+           pr->qmat_chroma, sizeof(pr->qmat_chroma));
 
     FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
     RET(ff_vk_exec_start(&ctx->s, exec));
@@ -230,7 +232,6 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
                             VK_ACCESS_2_TRANSFER_WRITE_BIT,
                             VK_IMAGE_LAYOUT_GENERAL,
                             VK_QUEUE_FAMILY_IGNORED);
-
         vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
             .sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
             .pBufferMemoryBarriers    = buf_bar,
@@ -261,7 +262,6 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
-
     ff_vk_buf_barrier(buf_bar[nb_buf_bar++], metadata,
                       ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
                       COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
@@ -310,7 +310,6 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
                         VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
-
     ff_vk_buf_barrier(buf_bar[nb_buf_bar++], metadata,
                       COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
                       COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR,
@@ -331,9 +330,15 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
                                     pp->mb_params_off,
                                     pp->mb_params_sz,
                                     VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &pv->idct,
+                                    0, 1, 0,
+                                    metadata,
+                                    pp->qmat_off,
+                                    pp->qmat_sz,
+                                    VK_FORMAT_UNDEFINED);
     ff_vk_shader_update_img_array(&ctx->s, exec, &pv->idct,
                                   f, vp->view.out,
-                                  0, 1,
+                                  0, 2,
                                   VK_IMAGE_LAYOUT_GENERAL,
                                   VK_NULL_HANDLE);
 
@@ -433,6 +438,11 @@ static int init_idct_shader(AVCodecContext *avctx, 
FFVulkanContext *s,
             .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
             .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
         },
+        {
+            .name        = "qmat_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
         {
             .name       = "dst",
             .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@@ -440,7 +450,7 @@ static int init_idct_shader(AVCodecContext *avctx, 
FFVulkanContext *s,
             .elems      = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0));
+    RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 3, 0, 0));
 
     RET(ff_vk_shader_link(s, shd,
                           ff_prores_idct_comp_spv_data,
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] vulkan/prores: fix for Apple M-series (PR #21502)

Reply via email to