PR #21319 opened by Lynne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21319 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21319.patch
A macro and a few other changes. >From 08c31b1d55395f5d23eb555689ee41156b08c6cf Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Sun, 28 Dec 2025 19:04:27 +0100 Subject: [PATCH 01/10] hwcontext_vulkan: enable subgroup extended types Like, of course I want to use int16_t in subgroups, what a stupid question was that? --- libavutil/hwcontext_vulkan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c index aa5f72e7f2..bb767f6c96 100644 --- a/libavutil/hwcontext_vulkan.c +++ b/libavutil/hwcontext_vulkan.c @@ -305,6 +305,7 @@ static void device_features_copy_needed(VulkanDeviceFeatures *dst, VulkanDeviceF COPY_VAL(vulkan_1_2.vulkanMemoryModelDeviceScope); COPY_VAL(vulkan_1_2.uniformBufferStandardLayout); COPY_VAL(vulkan_1_2.runtimeDescriptorArray); + COPY_VAL(vulkan_1_2.shaderSubgroupExtendedTypes); COPY_VAL(vulkan_1_3.dynamicRendering); COPY_VAL(vulkan_1_3.maintenance4); -- 2.49.1 >From 99cba5a342406d84be78269d3d5eda2d3ad1997c Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Fri, 19 Dec 2025 23:49:43 +0000 Subject: [PATCH 02/10] vulkan: use HOST_CACHED memory flag only if such a heap exists NVK does not offer such, so our code failed to allocate memory. --- libavcodec/ffv1enc_vulkan.c | 5 ++--- libavcodec/vulkan_encode.c | 2 +- libavutil/hwcontext_vulkan.c | 2 +- libavutil/vulkan.c | 4 ++++ libavutil/vulkan.h | 2 ++ 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index 86521af6c5..1dc6aa8e90 100644 --- a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -365,9 +365,8 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, NULL, maxsize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | (maxsize < fv->max_heap_size ? - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : 0x0) | - (!(fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) ? 
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT : 0x0))); + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT : + fv->s.host_cached_flag))); out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1); diff --git a/libavcodec/vulkan_encode.c b/libavcodec/vulkan_encode.c index 7b534ffa30..5b84ad9db7 100644 --- a/libavcodec/vulkan_encode.c +++ b/libavcodec/vulkan_encode.c @@ -182,7 +182,7 @@ static int vulkan_encode_issue(AVCodecContext *avctx, VK_BUFFER_USAGE_VIDEO_ENCODE_DST_BIT_KHR, &ctx->profile_list, max_pkt_size, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT); + ctx->s.host_cached_flag); if (err < 0) return err; diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c index bb767f6c96..313359a4af 100644 --- a/libavutil/hwcontext_vulkan.c +++ b/libavutil/hwcontext_vulkan.c @@ -4338,7 +4338,7 @@ static int get_plane_buf(AVHWFramesContext *hwfc, AVBufferRef **dst, err = ff_vk_get_pooled_buffer(&p->vkctx, &fp->tmp, dst, buf_usage, NULL, buf_offset, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_CACHED_BIT); + p->vkctx.host_cached_flag); if (err < 0) return err; diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c index 7858e002ed..d4ac1544d1 100644 --- a/libavutil/vulkan.c +++ b/libavutil/vulkan.c @@ -212,6 +212,10 @@ int ff_vk_load_props(FFVulkanContext *s) vk->GetPhysicalDeviceMemoryProperties(s->hwctx->phys_dev, &s->mprops); vk->GetPhysicalDeviceFeatures2(s->hwctx->phys_dev, &s->feats); + for (int i = 0; i < s->mprops.memoryTypeCount; i++) + s->host_cached_flag |= s->mprops.memoryTypes[i].propertyFlags & + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + load_enabled_qfs(s); if (s->qf_props) diff --git a/libavutil/vulkan.h b/libavutil/vulkan.h index 29116bcb2c..d42bf514fe 100644 --- a/libavutil/vulkan.h +++ b/libavutil/vulkan.h @@ -301,6 +301,8 @@ typedef struct FFVulkanContext { VkPhysicalDeviceVulkan12Features feats_12; VkPhysicalDeviceFeatures2 feats; + VkMemoryPropertyFlagBits 
host_cached_flag; + AVBufferRef *device_ref; AVHWDeviceContext *device; AVVulkanDeviceContext *hwctx; -- 2.49.1 >From c99bfc4ee6e98b608d090be48fc34e314b589590 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Thu, 25 Dec 2025 00:18:13 +0100 Subject: [PATCH 03/10] vulkan_functions: add vkCmdDispatchBase It's useful for multi-stage operations. --- libavutil/vulkan_functions.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libavutil/vulkan_functions.h b/libavutil/vulkan_functions.h index d2e3c77bb8..9aed48aab3 100644 --- a/libavutil/vulkan_functions.h +++ b/libavutil/vulkan_functions.h @@ -115,6 +115,7 @@ typedef uint64_t FFVulkanExtensions; MACRO(1, 1, FF_VK_EXT_NO_FLAG, EndCommandBuffer) \ MACRO(1, 1, FF_VK_EXT_NO_FLAG, FreeCommandBuffers) \ MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdDispatch) \ + MACRO(1, 1, FF_VK_EXT_NO_FLAG, CmdDispatchBase) \ \ /* Queue */ \ MACRO(1, 1, FF_VK_EXT_NO_FLAG, GetDeviceQueue) \ -- 2.49.1 >From a48f37083a19ee7bacfe81f2273643b27d5d01a7 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Tue, 23 Dec 2025 19:03:45 +0100 Subject: [PATCH 04/10] vulkan: add ff_vk_buf_barrier() This is a shorthand way of writing buffer barrier structures. 
--- libavutil/vulkan.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libavutil/vulkan.h b/libavutil/vulkan.h index d42bf514fe..115e9fc940 100644 --- a/libavutil/vulkan.h +++ b/libavutil/vulkan.h @@ -507,6 +507,25 @@ int ff_vk_create_imageviews(FFVulkanContext *s, FFVkExecContext *e, VkImageView views[AV_NUM_DATA_POINTERS], AVFrame *f, enum FFVkShaderRepFormat rep_fmt); +#define ff_vk_buf_barrier(dst, vkb, s_stage, s_access, s_access2, \ + d_stage, d_access, d_access2, offs, bsz) \ + do { \ + dst = (VkBufferMemoryBarrier2) { \ + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, \ + .srcStageMask = VK_PIPELINE_STAGE_2_ ##s_stage, \ + .srcAccessMask = VK_ACCESS_2_ ##s_access | \ + VK_ACCESS_2_ ##s_access2, \ + .dstStageMask = VK_PIPELINE_STAGE_2_ ##d_stage, \ + .dstAccessMask = VK_ACCESS_2_ ##d_access | \ + VK_ACCESS_2_ ##d_access2, \ + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, \ + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, \ + .buffer = vkb->buf, \ + .offset = offs, \ + .size = bsz \ + }; \ + } while(0) + void ff_vk_frame_barrier(FFVulkanContext *s, FFVkExecContext *e, AVFrame *pic, VkImageMemoryBarrier2 *bar, int *nb_bar, VkPipelineStageFlags2 src_stage, -- 2.49.1 >From 2dbf1e1f7e1ad8923ddcba8cb40d8b54f6191909 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Tue, 23 Dec 2025 19:04:37 +0100 Subject: [PATCH 05/10] vulkan_ffv1: use ff_vk_buf_barrier() --- libavcodec/vulkan_ffv1.c | 90 +++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/libavcodec/vulkan_ffv1.c b/libavcodec/vulkan_ffv1.c index 168871d5d9..7766d67511 100644 --- a/libavcodec/vulkan_ffv1.c +++ b/libavcodec/vulkan_ffv1.c @@ -366,21 +366,20 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &fp->slice_offset_buf, 1, 0)); fp->slice_offset_buf = NULL; - /* Entry barrier for the slice state */ - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = 
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_state->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_state->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_state->buf, - .offset = 0, - .size = fp->slice_data_size*f->slice_count, - }; - + /* Entry barrier for the slice state (not preserved between frames) */ + if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY)) + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + 0, fp->slice_data_size*f->slice_count); + else + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + 0, fp->slice_data_size*f->slice_count); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pImageMemoryBarriers = img_bar, @@ -388,8 +387,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - slice_state->stage = buf_bar[0].dstStageMask; - slice_state->access = buf_bar[0].dstAccessMask; nb_buf_bar = 0; nb_img_bar = 0; @@ -496,18 +493,23 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) 0, sizeof(pd_reset), &pd_reset); /* Sync between setup and reset shaders */ - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_state->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_state->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - 
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_state->buf, - .offset = 0, - .size = fp->slice_data_size*f->slice_count, - }; + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR, + 0, fp->slice_data_size*f->slice_count); + /* Probability data barrier */ + if (!(f->picture.f->flags & AV_FRAME_FLAG_KEY)) + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR, + fp->slice_data_size*f->slice_count, VK_WHOLE_SIZE); + else + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR, + fp->slice_data_size*f->slice_count, VK_WHOLE_SIZE); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pImageMemoryBarriers = img_bar, @@ -515,8 +517,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - slice_state->stage = buf_bar[0].dstStageMask; - slice_state->access = buf_bar[0].dstAccessMask; nb_buf_bar = 0; nb_img_bar = 0; @@ -552,21 +552,17 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); - /* Sync between reset and decode shaders */ - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_state->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_state->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_state->buf, - .offset = 
fp->slice_data_size*f->slice_count, - .size = f->slice_count*(fp->slice_state_size - fp->slice_data_size), - }; - + /* Sync probabilities between reset and decode shaders */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + 0, fp->slice_data_size*f->slice_count); + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_state, + COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + fp->slice_data_size*f->slice_count, VK_WHOLE_SIZE); /* Input frame barrier */ ff_vk_frame_barrier(&ctx->s, exec, f->picture.f, img_bar, &nb_img_bar, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, @@ -590,8 +586,6 @@ static int vk_ffv1_end_frame(AVCodecContext *avctx) .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - slice_state->stage = buf_bar[0].dstStageMask; - slice_state->access = buf_bar[0].dstAccessMask; nb_img_bar = 0; nb_buf_bar = 0; -- 2.49.1 >From dfe7656dbeeb246e0bb5de90b98c740dcde9cd41 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Tue, 23 Dec 2025 19:05:14 +0100 Subject: [PATCH 06/10] nlmeans_vulkan: use ff_vk_buf_barrier() --- libavfilter/vf_nlmeans_vulkan.c | 181 ++++++++++++-------------------- 1 file changed, 67 insertions(+), 114 deletions(-) diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c index b69e8ac0a2..7a765d9f31 100644 --- a/libavfilter/vf_nlmeans_vulkan.c +++ b/libavfilter/vf_nlmeans_vulkan.c @@ -740,8 +740,6 @@ static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec, { FFVulkanContext *vkctx = &s->vkctx; FFVulkanFunctions *vk = &vkctx->vkfn; - VkBufferMemoryBarrier2 buf_bar[2]; - int nb_buf_bar = 0; DenoisePushData pd = { { comp_offs[0], comp_offs[1], comp_offs[2], comp_offs[3] }, @@ -761,26 +759,17 @@ static int denoise_pass(NLMeansVulkanContext *s, FFVkExecContext *exec, 
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = ws_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = ws_vk->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = ws_vk->buf, - .size = ws_vk->size, - .offset = 0, - }; - + VkBufferMemoryBarrier2 buf_bar; + ff_vk_buf_barrier(buf_bar, ws_vk, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = nb_buf_bar, + .pBufferMemoryBarriers = &buf_bar, + .bufferMemoryBarrierCount = 1, }); - ws_vk->stage = buf_bar[0].dstStageMask; - ws_vk->access = buf_bar[0].dstAccessMask; /* End of denoise pass */ vk->CmdDispatch(exec->buf, @@ -924,20 +913,14 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); - nb_buf_bar = 0; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = ws_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, - .srcAccessMask = ws_vk->access, - .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = ws_vk->buf, - .size = ws_vk->size, - .offset = 0, - }; - + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], ws_vk, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + TRANSFER_BIT, TRANSFER_WRITE_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + 
COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pImageMemoryBarriers = img_bar, @@ -945,8 +928,8 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - ws_vk->stage = buf_bar[0].dstStageMask; - ws_vk->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + nb_img_bar = 0; /* Buffer zeroing */ vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0); @@ -976,10 +959,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) ws_vk, ws_size * s-> opts.t, ws_size * s-> opts.t, VK_FORMAT_UNDEFINED)); + VkPipelineStageFlagBits2 ws_stage = VK_PIPELINE_STAGE_2_TRANSFER_BIT; + VkAccessFlagBits2 ws_access = VK_ACCESS_2_TRANSFER_WRITE_BIT; do { int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t); - - /* Integral pipeline */ IntegralPushData pd = { { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] }, { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] }, @@ -993,55 +976,68 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) desc->nb_components, }; - ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical); - ff_vk_shader_update_push_const(vkctx, exec, &s->shd_vertical, - VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(pd), &pd); - - nb_buf_bar = 0; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = integral_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = integral_vk->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = integral_vk->buf, - .size = integral_vk->size, - .offset = 0, - }; + /* Vertical pass */ + 
ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - integral_vk->stage = buf_bar[0].dstStageMask; - integral_vk->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; - /* End of vertical pass */ + ff_vk_exec_bind_shader(vkctx, exec, &s->shd_vertical); + ff_vk_shader_update_push_const(vkctx, exec, &s->shd_vertical, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd), &pd); vk->CmdDispatch(exec->buf, - FFALIGN(vkctx->output_width, s->shd_vertical.lg_size[0])/s->shd_vertical.lg_size[0], + FFALIGN(vkctx->output_width, s->shd_vertical.lg_size[0]) / + s->shd_vertical.lg_size[0], desc->nb_components, wg_invoc); + /* Horizontal pass */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk, + COMPUTE_SHADER_BIT, SHADER_STORAGE_WRITE_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + 0, VK_WHOLE_SIZE); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + ff_vk_exec_bind_shader(vkctx, exec, &s->shd_horizontal); ff_vk_shader_update_push_const(vkctx, exec, &s->shd_horizontal, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); + vk->CmdDispatch(exec->buf, + FFALIGN(vkctx->output_height, s->shd_horizontal.lg_size[0]) / + s->shd_horizontal.lg_size[0], + desc->nb_components, + wg_invoc); - nb_buf_bar = 0; + /* Weights pass */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], integral_vk, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, + SHADER_STORAGE_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_STORAGE_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { 
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = integral_vk->stage, + .srcStageMask = ws_stage, .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = integral_vk->access, + .srcAccessMask = ws_access, .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = integral_vk->buf, - .size = integral_vk->size, + .buffer = ws_vk->buf, + .size = ws_vk->size, .offset = 0, }; vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { @@ -1049,16 +1045,10 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - integral_vk->stage = buf_bar[0].dstStageMask; - integral_vk->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + ws_stage = buf_bar[1].dstStageMask; + ws_access = buf_bar[1].dstAccessMask; - /* End of horizontal pass */ - vk->CmdDispatch(exec->buf, - FFALIGN(vkctx->output_height, s->shd_horizontal.lg_size[0])/s->shd_horizontal.lg_size[0], - desc->nb_components, - wg_invoc); - - /* Weights pipeline */ WeightsPushData wpd = { { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] }, { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] }, @@ -1075,52 +1065,15 @@ static int nlmeans_vulkan_filter_frame(AVFilterLink *link, AVFrame *in) ws_count, desc->nb_components, }; - ff_vk_exec_bind_shader(vkctx, exec, &s->shd_weights); ff_vk_shader_update_push_const(vkctx, exec, &s->shd_weights, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(wpd), &wpd); - - nb_buf_bar = 0; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = integral_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = integral_vk->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, - 
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = integral_vk->buf, - .size = integral_vk->size, - .offset = 0, - }; - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = ws_vk->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = ws_vk->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = ws_vk->buf, - .size = ws_vk->size, - .offset = 0, - }; - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = nb_buf_bar, - }); - integral_vk->stage = buf_bar[0].dstStageMask; - integral_vk->access = buf_bar[0].dstAccessMask; - ws_vk->stage = buf_bar[1].dstStageMask; - ws_vk->access = buf_bar[1].dstAccessMask; - - /* End of weights pass */ vk->CmdDispatch(exec->buf, - FFALIGN(vkctx->output_width, s->shd_weights.lg_size[0])/s->shd_weights.lg_size[0], - FFALIGN(vkctx->output_height, s->shd_weights.lg_size[1])/s->shd_weights.lg_size[1], + FFALIGN(vkctx->output_width, s->shd_weights.lg_size[0]) / + s->shd_weights.lg_size[0], + FFALIGN(vkctx->output_height, s->shd_weights.lg_size[1]) / + s->shd_weights.lg_size[1], wg_invoc * desc->nb_components); offsets_dispatched += wg_invoc * TYPE_ELEMS; -- 2.49.1 >From e6f09619ec4d35384d3035faf59ca2f2f660ea79 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Wed, 24 Dec 2025 01:08:53 +0100 Subject: [PATCH 07/10] ffv1enc_vulkan: use ff_vk_buf_barrier() --- libavcodec/ffv1enc_vulkan.c | 220 +++++++++++++++--------------------- 1 file changed, 93 insertions(+), 127 deletions(-) diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index 1dc6aa8e90..3f3da6bbae 100644 --- 
a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -414,41 +414,16 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, VK_NULL_HANDLE); /* Add a buffer barrier between previous and current frame */ - if (!f->key_frame) { - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_data_buf->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_data_buf->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_data_buf->buf, - .size = VK_WHOLE_SIZE, - .offset = 0, - }; - } - - if (fv->optimize_rct) { - RET(run_rct_search(avctx, exec, - src, src_views, - slice_data_buf, slice_data_size)); - - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_data_buf->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_data_buf->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_data_buf->buf, - .size = slice_data_size*f->slice_count, - .offset = 0, - }; - } - + if (!f->key_frame) + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + 0, slice_data_size*f->slice_count); + else + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + 0, slice_data_size*f->slice_count); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pImageMemoryBarriers = img_bar, @@ -457,9 +432,23 @@ 
static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .bufferMemoryBarrierCount = nb_buf_bar, }); nb_img_bar = 0; - if (nb_buf_bar) { - slice_data_buf->stage = buf_bar[0].dstStageMask; - slice_data_buf->access = buf_bar[0].dstAccessMask; + nb_buf_bar = 0; + + if (fv->optimize_rct) { + RET(run_rct_search(avctx, exec, + src, src_views, + slice_data_buf, slice_data_size)); + + /* Make sure the writes are visible to the setup shader */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + 0, slice_data_size*f->slice_count); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); nb_buf_bar = 0; } @@ -526,87 +515,78 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, })); } - /* Setup shader modified the slice data buffer */ - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_data_buf->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_data_buf->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_data_buf->buf, - .size = slice_data_size*f->slice_count, - .offset = 0, + /* Sync between setup and reset shaders */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR, + 0, slice_data_size*f->slice_count); + /* Prepare the probabilities */ + if (!f->key_frame) + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, 
NONE_KHR, + slice_data_size*f->slice_count, VK_WHOLE_SIZE); + else + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + slice_data_size*f->slice_count, VK_WHOLE_SIZE); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + + /* Run reset shader */ + FFv1VkResetParameters pd_reset; + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->reset, + 1, 0, 0, + slice_data_buf, + 0, slice_data_size*f->slice_count, + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(&fv->s, exec, &fv->reset); + pd_reset = (FFv1VkResetParameters) { + .slice_state = slice_data_buf->address + f->slice_count*256, + .plane_state_size = plane_state_size, + .codec_planes = f->plane_count, + .key_frame = f->key_frame, }; + for (int i = 0; i < f->quant_table_count; i++) + pd_reset.context_count[i] = f->context_count[i]; - if (f->key_frame || f->version > 3) { - FFv1VkResetParameters pd_reset; + ff_vk_shader_update_push_const(&fv->s, exec, &fv->reset, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(pd_reset), &pd_reset); + vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, + f->plane_count); - ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->reset, - 1, 0, 0, - slice_data_buf, - 0, slice_data_size*f->slice_count, - VK_FORMAT_UNDEFINED); + /* Sync between reset and encode shaders */ + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + 0, slice_data_size*f->slice_count); + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT, + slice_data_size*f->slice_count, VK_WHOLE_SIZE); + 
ff_vk_buf_barrier(buf_bar[nb_buf_bar++], results_data_buf, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); - /* Run setup shader */ - ff_vk_exec_bind_shader(&fv->s, exec, &fv->reset); - pd_reset = (FFv1VkResetParameters) { - .slice_state = slice_data_buf->address + f->slice_count*256, - .plane_state_size = plane_state_size, - .codec_planes = f->plane_count, - .key_frame = f->key_frame, - }; - for (int i = 0; i < f->quant_table_count; i++) - pd_reset.context_count[i] = f->context_count[i]; - - ff_vk_shader_update_push_const(&fv->s, exec, &fv->reset, - VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(pd_reset), &pd_reset); - - /* Sync between setup and reset shaders */ - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = nb_buf_bar, - }); - slice_data_buf->stage = buf_bar[0].dstStageMask; - slice_data_buf->access = buf_bar[0].dstAccessMask; - nb_buf_bar = 0; - - vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, - f->plane_count); - } - - /* If the reset shader ran, insert a barrier now. 
*/ - if (f->key_frame || f->version > 3) { - /* Reset shader modified the slice data buffer */ - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = slice_data_buf->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = slice_data_buf->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT | - VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = slice_data_buf->buf, - .size = slice_data_buf->size - slice_data_size*f->slice_count, - .offset = slice_data_size*f->slice_count, - }; - } - - if (fv->is_rgb) { + if (fv->is_rgb) ff_vk_frame_barrier(&fv->s, exec, tmp, img_bar, &nb_img_bar, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); - } - /* Final barrier before encoding */ vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pImageMemoryBarriers = img_bar, @@ -615,11 +595,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .bufferMemoryBarrierCount = nb_buf_bar, }); nb_img_bar = 0; - if (nb_buf_bar) { - slice_data_buf->stage = buf_bar[0].dstStageMask; - slice_data_buf->access = buf_bar[0].dstAccessMask; - nb_buf_bar = 0; - } + nb_buf_bar = 0; /* Main encode shader */ ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->enc, @@ -705,25 +681,15 @@ static int transfer_slices(AVCodecContext *avctx, mapped_ref = NULL; /* Ownership passed */ /* Ensure the output buffer is finished */ - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = out_data_buf->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT, - .srcAccessMask = out_data_buf->access, - .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT, - 
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = out_data_buf->buf, - .size = VK_WHOLE_SIZE, - .offset = 0, - }; + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + TRANSFER_BIT, TRANSFER_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pBufferMemoryBarriers = buf_bar, .bufferMemoryBarrierCount = nb_buf_bar, }); - out_data_buf->stage = buf_bar[0].dstStageMask; - out_data_buf->access = buf_bar[0].dstAccessMask; nb_buf_bar = 0; for (int i = 0; i < nb_regions; i++) -- 2.49.1 >From 2226b5d0386c3ca7239220cb1e9afbf0c305d625 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Wed, 24 Dec 2025 01:27:59 +0100 Subject: [PATCH 08/10] vulkan_prores: use ff_vk_buf_barrier() --- libavcodec/vulkan_prores.c | 44 +++++++++++--------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c index afea8857e8..7e7c2ace9c 100644 --- a/libavcodec/vulkan_prores.c +++ b/libavcodec/vulkan_prores.c @@ -250,27 +250,17 @@ static int vk_prores_end_frame(AVCodecContext *avctx) /* Input barrier, or synchronization between clear and vld shader */ ff_vk_frame_barrier(&ctx->s, exec, f, img_bar, &nb_img_bar, - pr->first_field ? VK_PIPELINE_STAGE_2_CLEAR_BIT : VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + pr->first_field ? 
VK_PIPELINE_STAGE_2_CLEAR_BIT : + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = metadata->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = metadata->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = metadata->buf, - .offset = pp->slice_offsets_sz, - .size = pp->mb_params_sz, - }; - metadata->stage = buf_bar[0].dstStageMask; - metadata->access = buf_bar[0].dstAccessMask; - + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], metadata, + ALL_COMMANDS_BIT, NONE_KHR, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + pp->slice_offsets_sz, pp->mb_params_sz); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pBufferMemoryBarriers = buf_bar, @@ -302,7 +292,8 @@ static int vk_prores_end_frame(AVCodecContext *avctx) VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pd), &pd); - vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->slice_count / pr->mb_height, 3), AV_CEIL_RSHIFT(pr->mb_height, 3), + vk->CmdDispatch(exec->buf, AV_CEIL_RSHIFT(pr->slice_count / pr->mb_height, 3), + AV_CEIL_RSHIFT(pr->mb_height, 3), 3 + !!pr->alpha_info); /* Synchronize vld and idct shaders */ @@ -313,21 +304,10 @@ static int vk_prores_end_frame(AVCodecContext *avctx) VK_IMAGE_LAYOUT_GENERAL, VK_QUEUE_FAMILY_IGNORED); - buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) { - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2, - .srcStageMask = metadata->stage, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = metadata->access, - .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - 
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = metadata->buf, - .offset = pp->slice_offsets_sz, - .size = pp->mb_params_sz, - }; - metadata->stage = buf_bar[0].dstStageMask; - metadata->access = buf_bar[0].dstAccessMask; - + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], metadata, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR, + pp->slice_offsets_sz, pp->mb_params_sz); vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, .pBufferMemoryBarriers = buf_bar, -- 2.49.1 >From 16e217541b4fec616b52b95e082f77513433be15 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Tue, 23 Dec 2025 19:08:04 +0100 Subject: [PATCH 09/10] vulkan: remove FFVkBuffer.stage and access Keeping global state for every buffer is unnecessary and possibly suboptimal. --- libavutil/vulkan.c | 2 -- libavutil/vulkan.h | 4 ---- 2 files changed, 6 deletions(-) diff --git a/libavutil/vulkan.c b/libavutil/vulkan.c index d4ac1544d1..33d7e8aace 100644 --- a/libavutil/vulkan.c +++ b/libavutil/vulkan.c @@ -1309,8 +1309,6 @@ int ff_vk_get_pooled_buffer(FFVulkanContext *ctx, AVBufferPool **buf_pool, return AVERROR(ENOMEM); data = (FFVkBuffer *)ref->data; - data->stage = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; - data->access = VK_ACCESS_2_NONE; if (data->size >= size) return 0; diff --git a/libavutil/vulkan.h b/libavutil/vulkan.h index 115e9fc940..cde2876e46 100644 --- a/libavutil/vulkan.h +++ b/libavutil/vulkan.h @@ -91,10 +91,6 @@ typedef struct FFVkBuffer { size_t size; VkDeviceAddress address; - /* Local use only */ - VkPipelineStageFlags2 stage; - VkAccessFlags2 access; - /* Only valid when allocated via ff_vk_get_pooled_buffer with HOST_VISIBLE or * via ff_vk_host_map_buffer */ uint8_t *mapped_mem; -- 2.49.1 >From 5a7e16ce2df5b9bcf6bde0fedbec39cbcf7f1f36 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Wed, 24 Dec 2025 04:10:39 +0100 Subject: [PATCH 10/10] prores_raw_idct: use
the same prores_idct method for copying coeffs This saves 2 barriers. --- libavcodec/vulkan/prores_raw_idct.comp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/libavcodec/vulkan/prores_raw_idct.comp b/libavcodec/vulkan/prores_raw_idct.comp index ffd71d1d73..c9850d17d7 100644 --- a/libavcodec/vulkan/prores_raw_idct.comp +++ b/libavcodec/vulkan/prores_raw_idct.comp @@ -63,30 +63,32 @@ void main(void) uint8_t qmat_buf[64] = qmat; [[unroll]] - for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) { - int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + scan[i])[0]); + for (uint y = 0; y < 8; y++) { + uint block_off = y*8 + ROW_ID; + int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + scan[block_off])[0]); float vf = float(sign_extend(v, 16)) / 32768.0; - vf *= qmat_buf[i] * qscale; - blocks[BLOCK_ID][COMP_ID*64 + i] = (vf / (64*4.56)) * - idct_scale[i]; + vf *= qmat_buf[block_off] * qscale; + blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID] = (vf / (64*4.56)) * + idct_scale[block_off]; } + /* Column-wise iDCT */ + idct8(BLOCK_ID, COMP_ID*72 + ROW_ID, 9); barrier(); - idct8(BLOCK_ID, COMP_ID*64 + ROW_ID*8, 1); - blocks[BLOCK_ID][COMP_ID*64 + ROW_ID] += 0.5; + blocks[BLOCK_ID][COMP_ID*72 + ROW_ID * 9] += 0.5f; + /* Row-wise iDCT */ + idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1); barrier(); - idct8(BLOCK_ID, COMP_ID*64 + ROW_ID, 8); - barrier(); [[unroll]] - for (uint i = gl_LocalInvocationID.x; i < 64; i += gl_WorkGroupSize.x) { - int v = int(round(blocks[BLOCK_ID][COMP_ID*64 + i]*4095.0)); + for (uint y = 0; y < 8; y++) { + int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0)); v = clamp(v, 0, 4095); v <<= 4; imageStore(dst, - offs + 2*ivec2(BLOCK_ID*8 + (i & 7), i >> 3), + offs + 2*ivec2(BLOCK_ID*8 + ROW_ID, y), ivec4(v)); } } -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
