PR #21319 opened by Lynne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21319
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21319.patch

A macro and a few other changes.


>From 08c31b1d55395f5d23eb555689ee41156b08c6cf Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Sun, 28 Dec 2025 19:04:27 +0100
Subject: [PATCH 01/10] hwcontext_vulkan: enable subgroup extended types

Add shaderSubgroupExtendedTypes to the device features copied by
device_features_copy_needed(), so that types such as int16_t can be
used in subgroup operations.
---
 libavutil/hwcontext_vulkan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index aa5f72e7f2..bb767f6c96 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -305,6 +305,7 @@ static void 
device_features_copy_needed(VulkanDeviceFeatures *dst, VulkanDeviceF
     COPY_VAL(vulkan_1_2.vulkanMemoryModelDeviceScope);
     COPY_VAL(vulkan_1_2.uniformBufferStandardLayout);
     COPY_VAL(vulkan_1_2.runtimeDescriptorArray);
+    COPY_VAL(vulkan_1_2.shaderSubgroupExtendedTypes);
 
     COPY_VAL(vulkan_1_3.dynamicRendering);
     COPY_VAL(vulkan_1_3.maintenance4);
-- 
2.49.1


>From 99cba5a342406d84be78269d3d5eda2d3ad1997c Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Fri, 19 Dec 2025 23:49:43 +0000
Subject: [PATCH 02/10] vulkan: use HOST_CACHED memory flag only if such a heap
 exists

NVK does not offer one, so our code failed to allocate memory.
---
 libavcodec/ffv1enc_vulkan.c  | 5 ++---
 libavcodec/vulkan_encode.c   | 2 +-
 libavutil/hwcontext_vulkan.c | 2 +-
 libavutil/vulkan.c           | 4 ++++
 libavutil/vulkan.h           | 2 ++
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 86521af6c5..1dc6aa8e90 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -365,9 +365,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
                                 NULL, maxsize,
                                 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                                 (maxsize < fv->max_heap_size ?
-                                 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : 0x0) |
-                                (!(fv->s.extensions & 
FF_VK_EXT_EXTERNAL_HOST_MEMORY) ?
-                                 VK_MEMORY_PROPERTY_HOST_CACHED_BIT : 0x0)));
+                                 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT :
+                                 fv->s.host_cached_flag)));
     out_data_buf = (FFVkBuffer *)fd->out_data_ref->data;
     ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1);
 
diff --git a/libavcodec/vulkan_encode.c b/libavcodec/vulkan_encode.c
index 7b534ffa30..5b84ad9db7 100644
--- a/libavcodec/vulkan_encode.c
+++ b/libavcodec/vulkan_encode.c
@@ -182,7 +182,7 @@ static int vulkan_encode_issue(AVCodecContext *avctx,
                                   VK_BUFFER_USAGE_VIDEO_ENCODE_DST_BIT_KHR,
                                   &ctx->profile_list, max_pkt_size,
                                   VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                  VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
+                                  ctx->s.host_cached_flag);
     if (err < 0)
         return err;
 
diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
index bb767f6c96..313359a4af 100644
--- a/libavutil/hwcontext_vulkan.c
+++ b/libavutil/hwcontext_vulkan.c
@@ -4338,7 +4338,7 @@ static int get_plane_buf(AVHWFramesContext *hwfc, 
AVBufferRef **dst,
     err = ff_vk_get_pooled_buffer(&p->vkctx, &fp->tmp, dst, buf_usage,
                                   NULL, buf_offset,
                                   VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                  VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
+                                  p->vkctx.host_cached_flag);
     if (err < 0)
         return err;
 
diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c
index 7858e002ed..d4ac1544d1 100644
--- a/libavutil/vulkan.c
+++ b/libavutil/vulkan.c
@@ -212,6 +212,10 @@ int ff_vk_load_props(FFVulkanContext *s)
     vk->GetPhysicalDeviceMemoryProperties(s->hwctx->phys_dev, &s->mprops);
     vk->GetPhysicalDeviceFeatures2(s->hwctx->phys_dev, &s->feats);
 
+    for (int i = 0; i < s->mprops.memoryTypeCount; i++)
+        s->host_cached_flag |= s->mprops.memoryTypes[i].propertyFlags &
+                               VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+
     load_enabled_qfs(s);
 
     if (s->qf_props)
diff --git a/libavutil/vulkan.h b/libavutil/vulkan.h
index 29116bcb2c..d42bf514fe 100644
--- a/libavutil/vulkan.h
+++ b/libavutil/vulkan.h
@@ -301,6 +301,8 @@ typedef struct FFVulkanContext {
     VkPhysicalDeviceVulkan12Features feats_12;
     VkPhysicalDeviceFeatures2 feats;
 
+    VkMemoryPropertyFlagBits host_cached_flag;
+
     AVBufferRef           *device_ref;
     AVHWDeviceContext     *device;
     AVVulkanDeviceContext *hwctx;
-- 
2.49.1


>From c99bfc4ee6e98b608d090be48fc34e314b589590 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Thu, 25 Dec 2025 00:18:13 +0100
Subject: [PATCH 03/10] vulkan_functions: add vkCmdDispatchBase

It's useful for multi-stage operations.
---
 libavutil/vulkan_functions.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h
index d2e3c77bb8..9aed48aab3 100644
--- a/libavutil/vulkan_functions.h
+++ b/libavutil/vulkan_functions.h
@@ -115,6 +115,7 @@ typedef uint64_t FFVulkanExtensions;
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              EndCommandBuffer)              
          \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              FreeCommandBuffers)            
          \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdDispatch)                   
          \
+    MACRO(1, 1, FF_VK_EXT_NO_FLAG,              CmdDispatchBase)               
                        \
                                                                                
          \
     /* Queue */                                                                
          \
     MACRO(1, 1, FF_VK_EXT_NO_FLAG,              GetDeviceQueue)                
          \
-- 
2.49.1


>From a48f37083a19ee7bacfe81f2273643b27d5d01a7 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 23 Dec 2025 19:03:45 +0100
Subject: [PATCH 04/10] vulkan: add ff_vk_buf_barrier()

This is a shorthand way of writing buffer barrier structures.
---
 libavutil/vulkan.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/libavutil/vulkan.h b/libavutil/vulkan.h
index d42bf514fe..115e9fc940 100644
--- a/libavutil/vulkan.h
+++ b/libavutil/vulkan.h
@@ -507,6 +507,25 @@ int ff_vk_create_imageviews(FFVulkanContext *s, 
FFVkExecContext *e,
                             VkImageView views[AV_NUM_DATA_POINTERS],
                             AVFrame *f, enum FFVkShaderRepFormat rep_fmt);
 
+#define ff_vk_buf_barrier(dst, vkb, s_stage, s_access, s_access2,              
\
+                          d_stage, d_access, d_access2, offs, bsz)             
\
+    do {                                                                       
\
+        dst = (VkBufferMemoryBarrier2) {                                       
\
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,                
\
+            .srcStageMask = VK_PIPELINE_STAGE_2_ ##s_stage,                    
\
+            .srcAccessMask = VK_ACCESS_2_ ##s_access |                         
\
+                             VK_ACCESS_2_ ##s_access2,                         
\
+            .dstStageMask = VK_PIPELINE_STAGE_2_ ##d_stage,                    
\
+            .dstAccessMask = VK_ACCESS_2_ ##d_access |                         
\
+                             VK_ACCESS_2_ ##d_access2,                         
\
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,                    
\
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,                    
\
+            .buffer = vkb->buf,                                                
\
+            .offset = offs,                                                    
\
+            .size = bsz                                                        
\
+        };                                                                     
\
+    } while(0)
+
 void ff_vk_frame_barrier(FFVulkanContext *s, FFVkExecContext *e,
                          AVFrame *pic, VkImageMemoryBarrier2 *bar, int *nb_bar,
                          VkPipelineStageFlags2 src_stage,
-- 
2.49.1


>From 2dbf1e1f7e1ad8923ddcba8cb40d8b54f6191909 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 23 Dec 2025 19:04:37 +0100
Subject: [PATCH 05/10] vulkan_ffv1: use ff_vk_buf_barrier()

---
 libavcodec/vulkan_ffv1.c | 90 +++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 48 deletions(-)

diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c
index 168871d5d9..7766d67511 100644
--- a/libavcodec/vulkan_ffv1.c
+++ b/libavcodec/vulkan_ffv1.c
@@ -366,21 +366,20 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
     RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &fp->slice_offset_buf, 1, 0));
     fp->slice_offset_buf = NULL;
 
-    /* Entry barrier for the slice state */
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = slice_state->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask = slice_state->access,
-        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = slice_state->buf,
-        .offset = 0,
-        .size = fp->slice_data_size*f->slice_count,
-    };
-
+    /* Entry barrier for the slice state (not preserved between frames) */
+    if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY))
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                          ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                              SHADER_STORAGE_WRITE_BIT,
+                          0, fp->slice_data_size*f->slice_count);
+    else
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                              SHADER_STORAGE_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                              SHADER_STORAGE_WRITE_BIT,
+                          0, fp->slice_data_size*f->slice_count);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -388,8 +387,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
         .pBufferMemoryBarriers = buf_bar,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
-    slice_state->stage = buf_bar[0].dstStageMask;
-    slice_state->access = buf_bar[0].dstAccessMask;
     nb_buf_bar = 0;
     nb_img_bar = 0;
 
@@ -496,18 +493,23 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
                                    0, sizeof(pd_reset), &pd_reset);
 
     /* Sync between setup and reset shaders */
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = slice_state->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask = slice_state->access,
-        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = slice_state->buf,
-        .offset = 0,
-        .size = fp->slice_data_size*f->slice_count,
-    };
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                          SHADER_STORAGE_WRITE_BIT,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
+                      0, fp->slice_data_size*f->slice_count);
+    /* Probability data barrier */
+    if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY))
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                          ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, 
NONE_KHR,
+                          fp->slice_data_size*f->slice_count, VK_WHOLE_SIZE);
+    else
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                              SHADER_STORAGE_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, 
NONE_KHR,
+                          fp->slice_data_size*f->slice_count, VK_WHOLE_SIZE);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -515,8 +517,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
         .pBufferMemoryBarriers = buf_bar,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
-    slice_state->stage = buf_bar[0].dstStageMask;
-    slice_state->access = buf_bar[0].dstAccessMask;
     nb_buf_bar = 0;
     nb_img_bar = 0;
 
@@ -552,21 +552,17 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
                                    VK_SHADER_STAGE_COMPUTE_BIT,
                                    0, sizeof(pd), &pd);
 
-    /* Sync between reset and decode shaders */
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = slice_state->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask = slice_state->access,
-        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = slice_state->buf,
-        .offset = fp->slice_data_size*f->slice_count,
-        .size = f->slice_count*(fp->slice_state_size - fp->slice_data_size),
-    };
-
+    /* Sync probabilities between reset and decode shaders */
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                          SHADER_STORAGE_WRITE_BIT,
+                      0, fp->slice_data_size*f->slice_count);
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                          SHADER_STORAGE_WRITE_BIT,
+                      fp->slice_data_size*f->slice_count, VK_WHOLE_SIZE);
     /* Input frame barrier */
     ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar,
                         VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
@@ -590,8 +586,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx)
         .pBufferMemoryBarriers = buf_bar,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
-    slice_state->stage = buf_bar[0].dstStageMask;
-    slice_state->access = buf_bar[0].dstAccessMask;
     nb_img_bar = 0;
     nb_buf_bar = 0;
 
-- 
2.49.1


>From dfe7656dbeeb246e0bb5de90b98c740dcde9cd41 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 23 Dec 2025 19:05:14 +0100
Subject: [PATCH 06/10] nlmeans_vulkan: use ff_vk_buf_barrier()

---
 libavfilter/vf_nlmeans_vulkan.c | 181 ++++++++++++--------------------
 1 file changed, 67 insertions(+), 114 deletions(-)

diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index b69e8ac0a2..7a765d9f31 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -740,8 +740,6 @@ static int denoise_pass(NLMeansVulkanContext *s, 
FFVkExecContext *exec,
 {
     FFVulkanContext *vkctx = &s->vkctx;
     FFVulkanFunctions *vk = &vkctx->vkfn;
-    VkBufferMemoryBarrier2 buf_bar[2];
-    int nb_buf_bar = 0;
 
     DenoisePushData pd = {
         { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] },
@@ -761,26 +759,17 @@ static int denoise_pass(NLMeansVulkanContext *s, 
FFVkExecContext *exec,
                                    VK_SHADER_STAGE_COMPUTE_BIT,
                                    0, sizeof(pd), &pd);
 
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = ws_vk->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask = ws_vk->access,
-        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = ws_vk->buf,
-        .size = ws_vk->size,
-        .offset = 0,
-    };
-
+    VkBufferMemoryBarrier2 buf_bar;
+    ff_vk_buf_barrier(buf_bar, ws_vk,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                          SHADER_STORAGE_WRITE_BIT,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
+                      0, VK_WHOLE_SIZE);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
             .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-            .pBufferMemoryBarriers = buf_bar,
-            .bufferMemoryBarrierCount = nb_buf_bar,
+            .pBufferMemoryBarriers = &buf_bar,
+            .bufferMemoryBarrierCount = 1,
         });
-    ws_vk->stage = buf_bar[0].dstStageMask;
-    ws_vk->access = buf_bar[0].dstAccessMask;
 
     /* End of denoise pass */
     vk->CmdDispatch(exec->buf,
@@ -924,20 +913,14 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink 
*link, AVFrame *in)
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
 
-    nb_buf_bar = 0;
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = ws_vk->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
-        .srcAccessMask = ws_vk->access,
-        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = ws_vk->buf,
-        .size = ws_vk->size,
-        .offset = 0,
-    };
-
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], ws_vk,
+                      ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                      TRANSFER_BIT,     TRANSFER_WRITE_BIT, NONE_KHR,
+                      0, VK_WHOLE_SIZE);
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk,
+                      ALL_COMMANDS_BIT,   NONE_KHR, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR,
+                      0, VK_WHOLE_SIZE);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
             .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
             .pImageMemoryBarriers = img_bar,
@@ -945,8 +928,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, 
AVFrame *in)
             .pBufferMemoryBarriers = buf_bar,
             .bufferMemoryBarrierCount = nb_buf_bar,
         });
-    ws_vk->stage = buf_bar[0].dstStageMask;
-    ws_vk->access = buf_bar[0].dstAccessMask;
+    nb_buf_bar = 0;
+    nb_img_bar = 0;
 
     /* Buffer zeroing */
     vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
@@ -976,10 +959,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink 
*link, AVFrame *in)
                                         ws_vk, ws_size * s-> opts.t, ws_size * 
s-> opts.t,
                                         VK_FORMAT_UNDEFINED));
 
+    VkPipelineStageFlagBits2 ws_stage = VK_PIPELINE_STAGE_2_TRANSFER_BIT;
+    VkAccessFlagBits2 ws_access = VK_ACCESS_2_TRANSFER_WRITE_BIT;
     do {
         int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, 
s->opts.t);
-
-        /* Integral pipeline */
         IntegralPushData pd = {
             { plane_widths[0], plane_widths[1], plane_widths[2], 
plane_widths[3] },
             { plane_heights[0], plane_heights[1], plane_heights[2], 
plane_heights[3] },
@@ -993,55 +976,68 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink 
*link, AVFrame *in)
             desc->nb_components,
         };
 
-        ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical);
-        ff_vk_shader_update_push_const(vkctx, exec, &s->shd_vertical,
-                                       VK_SHADER_STAGE_COMPUTE_BIT,
-                                       0, sizeof(pd), &pd);
-
-        nb_buf_bar = 0;
-        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = integral_vk->stage,
-            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = integral_vk->access,
-            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = integral_vk->buf,
-            .size = integral_vk->size,
-            .offset = 0,
-        };
+        /* Vertical pass */
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, 
NONE_KHR,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, 
NONE_KHR,
+                          0, VK_WHOLE_SIZE);
         vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
             .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
             .pBufferMemoryBarriers = buf_bar,
             .bufferMemoryBarrierCount = nb_buf_bar,
         });
-        integral_vk->stage = buf_bar[0].dstStageMask;
-        integral_vk->access = buf_bar[0].dstAccessMask;
+        nb_buf_bar = 0;
 
-        /* End of vertical pass */
+        ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical);
+        ff_vk_shader_update_push_const(vkctx, exec, &s->shd_vertical,
+                                       VK_SHADER_STAGE_COMPUTE_BIT,
+                                       0, sizeof(pd), &pd);
         vk->CmdDispatch(exec->buf,
-                        FFALIGN(vkctx->output_width, 
s->shd_vertical.lg_size[0])/s->shd_vertical.lg_size[0],
+                        FFALIGN(vkctx->output_width, 
s->shd_vertical.lg_size[0]) /
+                            s->shd_vertical.lg_size[0],
                         desc->nb_components,
                         wg_invoc);
 
+        /* Horizontal pass */
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, 
NONE_KHR,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                              SHADER_STORAGE_WRITE_BIT,
+                          0, VK_WHOLE_SIZE);
+        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .pBufferMemoryBarriers = buf_bar,
+            .bufferMemoryBarrierCount = nb_buf_bar,
+        });
+        nb_buf_bar = 0;
+
         ff_vk_exec_bind_shader(vkctx, exec, &s->shd_horizontal);
         ff_vk_shader_update_push_const(vkctx, exec, &s->shd_horizontal,
                                        VK_SHADER_STAGE_COMPUTE_BIT,
                                        0, sizeof(pd), &pd);
+        vk->CmdDispatch(exec->buf,
+                        FFALIGN(vkctx->output_height, 
s->shd_horizontal.lg_size[0]) /
+                            s->shd_horizontal.lg_size[0],
+                        desc->nb_components,
+                        wg_invoc);
 
-        nb_buf_bar = 0;
+        /* Weights pass */
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT,
+                                              SHADER_STORAGE_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, 
NONE_KHR,
+                          0, VK_WHOLE_SIZE);
         buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
             .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = integral_vk->stage,
+            .srcStageMask = ws_stage,
             .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = integral_vk->access,
+            .srcAccessMask = ws_access,
             .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                                 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
             .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
             .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = integral_vk->buf,
-            .size = integral_vk->size,
+            .buffer = ws_vk->buf,
+            .size = ws_vk->size,
             .offset = 0,
         };
         vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
@@ -1049,16 +1045,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink 
*link, AVFrame *in)
             .pBufferMemoryBarriers = buf_bar,
             .bufferMemoryBarrierCount = nb_buf_bar,
         });
-        integral_vk->stage = buf_bar[0].dstStageMask;
-        integral_vk->access = buf_bar[0].dstAccessMask;
+        nb_buf_bar = 0;
+        ws_stage = buf_bar[1].dstStageMask;
+        ws_access = buf_bar[1].dstAccessMask;
 
-        /* End of horizontal pass */
-        vk->CmdDispatch(exec->buf,
-                        FFALIGN(vkctx->output_height, 
s->shd_horizontal.lg_size[0])/s->shd_horizontal.lg_size[0],
-                        desc->nb_components,
-                        wg_invoc);
-
-        /* Weights pipeline */
         WeightsPushData wpd = {
             { plane_widths[0], plane_widths[1], plane_widths[2], 
plane_widths[3] },
             { plane_heights[0], plane_heights[1], plane_heights[2], 
plane_heights[3] },
@@ -1075,52 +1065,15 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink 
*link, AVFrame *in)
             ws_count,
             desc->nb_components,
         };
-
         ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights);
         ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights,
                                         VK_SHADER_STAGE_COMPUTE_BIT,
                                         0, sizeof(wpd), &wpd);
-
-        nb_buf_bar = 0;
-        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = integral_vk->stage,
-            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = integral_vk->access,
-            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = integral_vk->buf,
-            .size = integral_vk->size,
-            .offset = 0,
-        };
-        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = ws_vk->stage,
-            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = ws_vk->access,
-            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                                VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = ws_vk->buf,
-            .size = ws_vk->size,
-            .offset = 0,
-        };
-        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
-            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-            .pBufferMemoryBarriers = buf_bar,
-            .bufferMemoryBarrierCount = nb_buf_bar,
-        });
-        integral_vk->stage = buf_bar[0].dstStageMask;
-        integral_vk->access = buf_bar[0].dstAccessMask;
-        ws_vk->stage = buf_bar[1].dstStageMask;
-        ws_vk->access = buf_bar[1].dstAccessMask;
-
-        /* End of weights pass */
         vk->CmdDispatch(exec->buf,
-                        FFALIGN(vkctx->output_width, 
s->shd_weights.lg_size[0])/s->shd_weights.lg_size[0],
-                        FFALIGN(vkctx->output_height, 
s->shd_weights.lg_size[1])/s->shd_weights.lg_size[1],
+                        FFALIGN(vkctx->output_width, 
s->shd_weights.lg_size[0]) /
+                            s->shd_weights.lg_size[0],
+                        FFALIGN(vkctx->output_height, 
s->shd_weights.lg_size[1]) /
+                            s->shd_weights.lg_size[1],
                         wg_invoc * desc->nb_components);
 
         offsets_dispatched += wg_invoc * TYPE_ELEMS;
-- 
2.49.1


>From e6f09619ec4d35384d3035faf59ca2f2f660ea79 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Wed, 24 Dec 2025 01:08:53 +0100
Subject: [PATCH 07/10] ffv1enc_vulkan: use ff_vk_buf_barrier()

---
 libavcodec/ffv1enc_vulkan.c | 220 +++++++++++++++---------------------
 1 file changed, 93 insertions(+), 127 deletions(-)

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 1dc6aa8e90..3f3da6bbae 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -414,41 +414,16 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
                                   VK_NULL_HANDLE);
 
     /* Add a buffer barrier between previous and current frame */
-    if (!f->key_frame) {
-        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = slice_data_buf->stage,
-            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = slice_data_buf->access,
-            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                             VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = slice_data_buf->buf,
-            .size = VK_WHOLE_SIZE,
-            .offset = 0,
-        };
-    }
-
-    if (fv->optimize_rct) {
-        RET(run_rct_search(avctx, exec,
-                           src, src_views,
-                           slice_data_buf, slice_data_size));
-
-        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = slice_data_buf->stage,
-            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = slice_data_buf->access,
-            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = slice_data_buf->buf,
-            .size = slice_data_size*f->slice_count,
-            .offset = 0,
-        };
-    }
-
+    if (!f->key_frame)
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                          ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          0, slice_data_size*f->slice_count);
+    else
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          0, slice_data_size*f->slice_count);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -457,9 +432,23 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
     nb_img_bar = 0;
-    if (nb_buf_bar) {
-        slice_data_buf->stage = buf_bar[0].dstStageMask;
-        slice_data_buf->access = buf_bar[0].dstAccessMask;
+    nb_buf_bar = 0;
+
+    if (fv->optimize_rct) {
+        RET(run_rct_search(avctx, exec,
+                           src, src_views,
+                           slice_data_buf, slice_data_size));
+
+        /* Make sure the writes are visible to the setup shader */
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          0, slice_data_size*f->slice_count);
+        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+            .pBufferMemoryBarriers = buf_bar,
+            .bufferMemoryBarrierCount = nb_buf_bar,
+        });
         nb_buf_bar = 0;
     }
 
@@ -526,87 +515,78 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
                                }));
     }
 
-    /* Setup shader modified the slice data buffer */
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = slice_data_buf->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask = slice_data_buf->access,
-        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = slice_data_buf->buf,
-        .size = slice_data_size*f->slice_count,
-        .offset = 0,
+    /* Sync between setup and reset shaders */
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                      COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT,
+                      COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR,
+                      0, slice_data_size*f->slice_count);
+    /* Prepare the probabilities */
+    if (!f->key_frame)
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                          ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                          COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                          slice_data_size*f->slice_count, VK_WHOLE_SIZE);
+    else
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, 
SHADER_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                          slice_data_size*f->slice_count, VK_WHOLE_SIZE);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pBufferMemoryBarriers = buf_bar,
+        .bufferMemoryBarrierCount = nb_buf_bar,
+    });
+    nb_buf_bar = 0;
+
+    /* Run reset shader */
+    FFv1VkResetParameters pd_reset;
+    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->reset,
+                                    1, 0, 0,
+                                    slice_data_buf,
+                                    0, slice_data_size*f->slice_count,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->reset);
+    pd_reset = (FFv1VkResetParameters) {
+        .slice_state = slice_data_buf->address + f->slice_count*256,
+        .plane_state_size = plane_state_size,
+        .codec_planes = f->plane_count,
+        .key_frame = f->key_frame,
     };
+    for (int i = 0; i < f->quant_table_count; i++)
+        pd_reset.context_count[i] = f->context_count[i];
 
-    if (f->key_frame || f->version > 3) {
-        FFv1VkResetParameters pd_reset;
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->reset,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd_reset), &pd_reset);
+    vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices,
+                    f->plane_count);
 
-        ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->reset,
-                                        1, 0, 0,
-                                        slice_data_buf,
-                                        0, slice_data_size*f->slice_count,
-                                        VK_FORMAT_UNDEFINED);
+    /* Sync between reset and encode shaders */
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                      COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT,
+                      0, slice_data_size*f->slice_count);
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
+                      COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT,
+                      slice_data_size*f->slice_count, VK_WHOLE_SIZE);
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], results_data_buf,
+                      ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                      0, VK_WHOLE_SIZE);
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf,
+                      ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                      0, VK_WHOLE_SIZE);
 
-        /* Run setup shader */
-        ff_vk_exec_bind_shader(&fv->s, exec, &fv->reset);
-        pd_reset = (FFv1VkResetParameters) {
-            .slice_state = slice_data_buf->address + f->slice_count*256,
-            .plane_state_size = plane_state_size,
-            .codec_planes = f->plane_count,
-            .key_frame = f->key_frame,
-        };
-        for (int i = 0; i < f->quant_table_count; i++)
-            pd_reset.context_count[i] = f->context_count[i];
-
-        ff_vk_shader_update_push_const(&fv->s, exec, &fv->reset,
-                                       VK_SHADER_STAGE_COMPUTE_BIT,
-                                       0, sizeof(pd_reset), &pd_reset);
-
-        /* Sync between setup and reset shaders */
-        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
-            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-            .pBufferMemoryBarriers = buf_bar,
-            .bufferMemoryBarrierCount = nb_buf_bar,
-        });
-        slice_data_buf->stage = buf_bar[0].dstStageMask;
-        slice_data_buf->access = buf_bar[0].dstAccessMask;
-        nb_buf_bar = 0;
-
-        vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices,
-                        f->plane_count);
-    }
-
-    /* If the reset shader ran, insert a barrier now. */
-    if (f->key_frame || f->version > 3) {
-        /* Reset shader modified the slice data buffer */
-        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-            .srcStageMask = slice_data_buf->stage,
-            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-            .srcAccessMask = slice_data_buf->access,
-            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
-                             VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = slice_data_buf->buf,
-            .size = slice_data_buf->size - slice_data_size*f->slice_count,
-            .offset = slice_data_size*f->slice_count,
-        };
-    }
-
-    if (fv->is_rgb) {
+    if (fv->is_rgb)
         ff_vk_frame_barrier(&fv->s, exec, tmp, img_bar, &nb_img_bar,
                             VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                             VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                             VK_ACCESS_SHADER_READ_BIT | 
VK_ACCESS_SHADER_WRITE_BIT,
                             VK_IMAGE_LAYOUT_GENERAL,
                             VK_QUEUE_FAMILY_IGNORED);
-    }
 
-    /* Final barrier before encoding */
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -615,11 +595,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext 
*avctx,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
     nb_img_bar = 0;
-    if (nb_buf_bar) {
-        slice_data_buf->stage = buf_bar[0].dstStageMask;
-        slice_data_buf->access = buf_bar[0].dstAccessMask;
-        nb_buf_bar = 0;
-    }
+    nb_buf_bar = 0;
 
     /* Main encode shader */
     ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->enc,
@@ -705,25 +681,15 @@ static int transfer_slices(AVCodecContext *avctx,
     mapped_ref = NULL; /* Ownership passed */
 
     /* Ensure the output buffer is finished */
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask = out_data_buf->stage,
-        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
-        .srcAccessMask = out_data_buf->access,
-        .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer = out_data_buf->buf,
-        .size = VK_WHOLE_SIZE,
-        .offset = 0,
-    };
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf,
+                      COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                      TRANSFER_BIT, TRANSFER_READ_BIT, NONE_KHR,
+                      0, VK_WHOLE_SIZE);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pBufferMemoryBarriers = buf_bar,
         .bufferMemoryBarrierCount = nb_buf_bar,
     });
-    out_data_buf->stage = buf_bar[0].dstStageMask;
-    out_data_buf->access = buf_bar[0].dstAccessMask;
     nb_buf_bar = 0;
 
     for (int i = 0; i < nb_regions; i++)
-- 
2.49.1


>From 2226b5d0386c3ca7239220cb1e9afbf0c305d625 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Wed, 24 Dec 2025 01:27:59 +0100
Subject: [PATCH 08/10] vulkan_prores: use ff_vk_buf_barrier()

---
 libavcodec/vulkan_prores.c | 44 +++++++++++---------------------------
 1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index afea8857e8..7e7c2ace9c 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -250,27 +250,17 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
 
     /* Input barrier, or synchronization between clear and vld shader */
     ff_vk_frame_barrier(&ctx->s, exec, f, img_bar, &nb_img_bar,
-                        pr->first_field ? VK_PIPELINE_STAGE_2_CLEAR_BIT : 
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        pr->first_field ? VK_PIPELINE_STAGE_2_CLEAR_BIT :
+                                          VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
 
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask        = metadata->stage,
-        .dstStageMask        = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask       = metadata->access,
-        .dstAccessMask       = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer              = metadata->buf,
-        .offset              = pp->slice_offsets_sz,
-        .size                = pp->mb_params_sz,
-    };
-    metadata->stage  = buf_bar[0].dstStageMask;
-    metadata->access = buf_bar[0].dstAccessMask;
-
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], metadata,
+                      ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                      pp->slice_offsets_sz, pp->mb_params_sz);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pBufferMemoryBarriers    = buf_bar,
@@ -302,7 +292,8 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
                                    VK_SHADER_STAGE_COMPUTE_BIT,
                                    0, sizeof(pd), &pd);
 
-    vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->slice_count / pr->mb_height, 
3), AV_CEIL_RSHIFT(pr->mb_height, 3),
+    vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->slice_count / pr->mb_height, 
3),
+                    AV_CEIL_RSHIFT(pr->mb_height, 3),
                     3 + !!pr->alpha_info);
 
     /* Synchronize vld and idct shaders */
@@ -313,21 +304,10 @@ static int vk_prores_end_frame(AVCodecContext *avctx)
                         VK_IMAGE_LAYOUT_GENERAL,
                         VK_QUEUE_FAMILY_IGNORED);
 
-    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
-        .sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
-        .srcStageMask        = metadata->stage,
-        .dstStageMask        = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-        .srcAccessMask       = metadata->access,
-        .dstAccessMask       = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
-        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-        .buffer              = metadata->buf,
-        .offset              = pp->slice_offsets_sz,
-        .size                = pp->mb_params_sz,
-    };
-    metadata->stage  = buf_bar[0].dstStageMask;
-    metadata->access = buf_bar[0].dstAccessMask;
-
+    ff_vk_buf_barrier(buf_bar[nb_buf_bar++], metadata,
+                      COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
+                      COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR,
+                      pp->slice_offsets_sz, pp->mb_params_sz);
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pBufferMemoryBarriers    = buf_bar,
-- 
2.49.1


>From 16e217541b4fec616b52b95e082f77513433be15 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Tue, 23 Dec 2025 19:08:04 +0100
Subject: [PATCH 09/10] vulkan: remove FFVkBuffer.stage and access

Keeping global state for every buffer is unnecessary and possibly
suboptimal.
---
 libavutil/vulkan.c | 2 --
 libavutil/vulkan.h | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c
index d4ac1544d1..33d7e8aace 100644
--- a/libavutil/vulkan.c
+++ b/libavutil/vulkan.c
@@ -1309,8 +1309,6 @@ int ff_vk_get_pooled_buffer(FFVulkanContext *ctx, 
AVBufferPool **buf_pool,
         return AVERROR(ENOMEM);
 
     data = (FFVkBuffer *)ref->data;
-    data->stage = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
-    data->access = VK_ACCESS_2_NONE;
 
     if (data->size >= size)
         return 0;
diff --git a/libavutil/vulkan.h b/libavutil/vulkan.h
index 115e9fc940..cde2876e46 100644
--- a/libavutil/vulkan.h
+++ b/libavutil/vulkan.h
@@ -91,10 +91,6 @@ typedef struct FFVkBuffer {
     size_t size;
     VkDeviceAddress address;
 
-    /* Local use only */
-    VkPipelineStageFlags2 stage;
-    VkAccessFlags2 access;
-
     /* Only valid when allocated via ff_vk_get_pooled_buffer with HOST_VISIBLE 
or
      * via ff_vk_host_map_buffer */
     uint8_t *mapped_mem;
-- 
2.49.1


>From 5a7e16ce2df5b9bcf6bde0fedbec39cbcf7f1f36 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Wed, 24 Dec 2025 04:10:39 +0100
Subject: [PATCH 10/10] prores_raw_idct: use the same prores_idct method for
 copying coeffs

This saves 2 barriers.
---
 libavcodec/vulkan/prores_raw_idct.comp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/libavcodec/vulkan/prores_raw_idct.comp 
b/libavcodec/vulkan/prores_raw_idct.comp
index ffd71d1d73..c9850d17d7 100644
--- a/libavcodec/vulkan/prores_raw_idct.comp
+++ b/libavcodec/vulkan/prores_raw_idct.comp
@@ -63,30 +63,32 @@ void main(void)
     uint8_t qmat_buf[64] = qmat;
 
     [[unroll]]
-    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) {
-        int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + 
scan[i])[0]);
+    for (uint y = 0; y < 8; y++) {
+        uint block_off = y*8 + ROW_ID;
+        int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + 
scan[block_off])[0]);
         float vf = float(sign_extend(v, 16)) / 32768.0;
-        vf *= qmat_buf[i] * qscale;
-        blocks[BLOCK_ID][COMP_ID*64 + i] = (vf / (64*4.56)) *
-                                           idct_scale[i];
+        vf *= qmat_buf[block_off] * qscale;
+        blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID] = (vf / (64*4.56)) *
+                                                      idct_scale[block_off];
     }
 
+    /* Column-wise iDCT */
+    idct8(BLOCK_ID, COMP_ID*72 + ROW_ID, 9);
     barrier();
-    idct8(BLOCK_ID, COMP_ID*64 + ROW_ID*8, 1);
 
-    blocks[BLOCK_ID][COMP_ID*64 + ROW_ID] += 0.5;
+    blocks[BLOCK_ID][COMP_ID*72 + ROW_ID * 9] += 0.5f;
 
+    /* Row-wise iDCT */
+    idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1);
     barrier();
-    idct8(BLOCK_ID, COMP_ID*64 + ROW_ID, 8);
 
-    barrier();
     [[unroll]]
-    for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) {
-        int v = int(round(blocks[BLOCK_ID][COMP_ID*64 + i]*4095.0));
+    for (uint y = 0; y < 8; y++) {
+        int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0));
         v = clamp(v, 0, 4095);
         v <<= 4;
         imageStore(dst,
-                   offs + 2*ivec2(BLOCK_ID*8 + (i & 7), i >> 3),
+                   offs + 2*ivec2(BLOCK_ID*8 + ROW_ID, y),
                    ivec4(v));
     }
 }
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to