PR #21514 opened by Lynne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21514 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21514.patch
The main reason this was written was due to Nvidia. Nvidia always has a fickle upload path, and seemed to have a shortcut for the host image upload path. This seems to have been patched out of recent driver versions. This upload path relies on the driver keeping the same layout, down to the stride for the images. Which is an assumption that's not portable. Rather than relying on this fickle upload path, what we'd like when we want pure bandwidth is to decouple uploads to a separate queue, and let the GPU pull the data from RAM via uploads. It'll be slower with a single-threaded decoder, but currently all of our compute-based decoders and the decoders that sit underneath them support frame threading. >From 2b309836266b969ec0f4f6d7feefdf4c54e7dcc1 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Fri, 16 Jan 2026 16:09:05 +0100 Subject: [PATCH 1/3] vulkan: remove IS_WITHIN macro This is the more correct GLSL solution. --- libavcodec/vulkan/common.comp | 3 --- libavcodec/vulkan/dpx_unpack.comp.glsl | 2 +- libavfilter/vulkan/avgblur.comp.glsl | 3 +-- libavfilter/vulkan/bwdif.comp.glsl | 4 ++-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp index 8a658f8524..1ec9ae7e7c 100644 --- a/libavcodec/vulkan/common.comp +++ b/libavcodec/vulkan/common.comp @@ -107,9 +107,6 @@ layout(buffer_reference, buffer_reference_align = 8) buffer u64buf { #define ceil_rshift(a, b) \ (-((-(a)) >> (b))) -#define IS_WITHIN(v1, v2) \ - ((v1.x < v2.x) && (v1.y < v2.y)) - /* TODO: optimize */ uint align(uint src, uint a) { diff --git a/libavcodec/vulkan/dpx_unpack.comp.glsl b/libavcodec/vulkan/dpx_unpack.comp.glsl index 93fda6142d..3850cbf3e9 100644 --- a/libavcodec/vulkan/dpx_unpack.comp.glsl +++ b/libavcodec/vulkan/dpx_unpack.comp.glsl @@ -91,7 +91,7 @@ i16vec4 parse_packed_in_32(ivec2 pos, int stride) void main(void) { ivec2 pos = ivec2(gl_GlobalInvocationID.xy); - if (!IS_WITHIN(pos, imageSize(dst[0]))) + 
if (any(greaterThanEqual(pos, imageSize(dst[0])))) return; i16vec4 p; diff --git a/libavfilter/vulkan/avgblur.comp.glsl b/libavfilter/vulkan/avgblur.comp.glsl index e7a476b98c..b53ec4092c 100644 --- a/libavfilter/vulkan/avgblur.comp.glsl +++ b/libavfilter/vulkan/avgblur.comp.glsl @@ -40,9 +40,8 @@ void main() { const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); -#define IS_WITHIN(v1, v2) ((v1.x < v2.x) && (v1.y < v2.y)) ivec2 size = imageSize(output_img[nonuniformEXT(gl_LocalInvocationID.z)]); - if (!IS_WITHIN(pos, size)) + if (any(greaterThanEqual(pos, size))) return; if ((planes & (1 << gl_LocalInvocationID.z)) == 0) { diff --git a/libavfilter/vulkan/bwdif.comp.glsl b/libavfilter/vulkan/bwdif.comp.glsl index 043ded0d24..fb18af3915 100644 --- a/libavfilter/vulkan/bwdif.comp.glsl +++ b/libavfilter/vulkan/bwdif.comp.glsl @@ -151,8 +151,8 @@ void main() bool filter_field = ((pos.y ^ parity) & 1) == 1; bool is_intra = filter_field && (current_field == 0); -#define IS_WITHIN(v1, v2) ((v1.x < v2.x) && (v1.y < v2.y)) - if (!IS_WITHIN(pos, imageSize(dst[nonuniformEXT(gl_LocalInvocationID.z)]))) { + ivec2 size = imageSize(dst[nonuniformEXT(gl_LocalInvocationID.z)]); + if (any(greaterThanEqual(pos, size))) { return; } else if (is_intra) { process_plane_intra(pos); -- 2.52.0 >From a6e641ab6e08bf8a794e256e5082335794fc6b68 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Mon, 19 Jan 2026 11:33:02 +0100 Subject: [PATCH 2/3] vulkan_decode: do not align single-plane images to subsampling Unlike multiplane images, single-plane images do not need to be aligned to chroma width. Saves a bit of memory. 
--- libavcodec/vulkan_decode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c index 5ed963eacc..9ab8d45aa9 100644 --- a/libavcodec/vulkan_decode.c +++ b/libavcodec/vulkan_decode.c @@ -1150,7 +1150,10 @@ int ff_vk_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) if (err < 0) return err; + frames_ctx->format = AV_PIX_FMT_VULKAN; frames_ctx->sw_format = avctx->sw_pix_fmt; + frames_ctx->width = avctx->coded_width; + frames_ctx->height = avctx->coded_height; if (!DECODER_IS_SDR(avctx->codec_id)) { prof = av_mallocz(sizeof(FFVulkanDecodeProfileData)); @@ -1166,6 +1169,9 @@ int ff_vk_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) return err; } + const AVPixFmtDescriptor *pdesc = av_pix_fmt_desc_get(frames_ctx->sw_format); + frames_ctx->width = FFALIGN(frames_ctx->width, 1 << pdesc->log2_chroma_w); + frames_ctx->height = FFALIGN(frames_ctx->height, 1 << pdesc->log2_chroma_h); frames_ctx->user_opaque = prof; frames_ctx->free = free_profile_data; @@ -1211,11 +1217,6 @@ int ff_vk_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) } } - const AVPixFmtDescriptor *pdesc = av_pix_fmt_desc_get(frames_ctx->sw_format); - frames_ctx->width = FFALIGN(avctx->coded_width, 1 << pdesc->log2_chroma_w); - frames_ctx->height = FFALIGN(avctx->coded_height, 1 << pdesc->log2_chroma_h); - frames_ctx->format = AV_PIX_FMT_VULKAN; - hwfc->tiling = VK_IMAGE_TILING_OPTIMAL; hwfc->usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT | -- 2.52.0 >From f0d2e7de30ff8d9b7b0b1c8210337fc849eb82ef Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Mon, 19 Jan 2026 11:51:30 +0100 Subject: [PATCH 3/3] vulkan_dpx: remove host image upload path The main reason this was written was due to Nvidia. Nvidia always has a fickle upload path, and seemed to have a shortcut for the host image upload path. 
This seems to have been patched out of recent driver versions. This upload path relies on the driver keeping the same layout, down to the stride for the images. Which is an assumption that's not portable. Rather than relying on this fickle upload path, what we'd like when we want pure bandwidth is to decouple uploads to a separate queue, and let the GPU pull the data from RAM via uploads. It'll be slower with a single-threaded decoder, but currently all of our compute-based decoders and the decoders that sit underneath them support frame threading. --- libavcodec/vulkan_dpx.c | 103 ---------------------------------------- 1 file changed, 103 deletions(-) diff --git a/libavcodec/vulkan_dpx.c b/libavcodec/vulkan_dpx.c index cf53a0f4df..17f91c6ce4 100644 --- a/libavcodec/vulkan_dpx.c +++ b/libavcodec/vulkan_dpx.c @@ -54,106 +54,6 @@ typedef struct DecodePushData { int shift; } DecodePushData; -static int host_upload_image(AVCodecContext *avctx, - FFVulkanDecodeContext *dec, DPXDecContext *dpx, - const uint8_t *src, uint32_t size) -{ - int err; - VkImage temp; - - FFVulkanDecodeShared *ctx = dec->shared_ctx; - DPXVulkanDecodeContext *dxv = ctx->sd_ctx; - VkPhysicalDeviceLimits *limits = &ctx->s.props.properties.limits; - FFVulkanFunctions *vk = &ctx->s.vkfn; - - DPXVulkanDecodePicture *pp = dpx->hwaccel_picture_private; - FFVulkanDecodePicture *vp = &pp->vp; - - int unpack = (avctx->bits_per_raw_sample == 12 && !dpx->packing) || - avctx->bits_per_raw_sample == 10; - if (unpack) - return 0; - - VkImageCreateInfo create_info = { - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = VK_IMAGE_TYPE_2D, - .format = avctx->bits_per_raw_sample == 8 ? VK_FORMAT_R8_UINT : - avctx->bits_per_raw_sample == 32 ? 
VK_FORMAT_R32_UINT : - VK_FORMAT_R16_UINT, - .extent.width = dpx->frame->width*dpx->components, - .extent.height = dpx->frame->height, - .extent.depth = 1, - .mipLevels = 1, - .arrayLayers = 1, - .tiling = VK_IMAGE_TILING_LINEAR, - .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_HOST_TRANSFER_BIT_EXT, - .samples = VK_SAMPLE_COUNT_1_BIT, - .pQueueFamilyIndices = &ctx->qf[0].idx, - .queueFamilyIndexCount = 1, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - }; - - if (create_info.extent.width >= limits->maxImageDimension2D || - create_info.extent.height >= limits->maxImageDimension2D) - return 0; - - vk->CreateImage(ctx->s.hwctx->act_dev, &create_info, ctx->s.hwctx->alloc, - &temp); - - err = ff_vk_get_pooled_buffer(&ctx->s, &dxv->frame_data_pool, - &vp->slices_buf, - VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, - NULL, size, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - if (err < 0) - return err; - - FFVkBuffer *vkb = (FFVkBuffer *)vp->slices_buf->data; - VkBindImageMemoryInfo bind_info = { - .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, - .image = temp, - .memory = vkb->mem, - }; - vk->BindImageMemory2(ctx->s.hwctx->act_dev, 1, &bind_info); - - VkHostImageLayoutTransitionInfo layout_change = { - .sType = VK_STRUCTURE_TYPE_HOST_IMAGE_LAYOUT_TRANSITION_INFO, - .image = temp, - .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .newLayout = VK_IMAGE_LAYOUT_GENERAL, - .subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .subresourceRange.layerCount = 1, - .subresourceRange.levelCount = 1, - }; - vk->TransitionImageLayoutEXT(ctx->s.hwctx->act_dev, 1, &layout_change); - - VkMemoryToImageCopy copy_region = { - .sType = VK_STRUCTURE_TYPE_MEMORY_TO_IMAGE_COPY, - .pHostPointer = src, - .imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .imageSubresource.layerCount = 1, - .imageExtent = (VkExtent3D){ dpx->frame->width*dpx->components, - 
dpx->frame->height, - 1 }, - }; - VkCopyMemoryToImageInfo copy_info = { - .sType = VK_STRUCTURE_TYPE_COPY_MEMORY_TO_IMAGE_INFO, - .flags = VK_HOST_IMAGE_COPY_MEMCPY_EXT, - .dstImage = temp, - .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, - .regionCount = 1, - .pRegions = &copy_region, - }; - vk->CopyMemoryToImageEXT(ctx->s.hwctx->act_dev, &copy_info); - - vk->DestroyImage(ctx->s.hwctx->act_dev, temp, ctx->s.hwctx->alloc); - - return 0; -} - static int vk_dpx_start_frame(AVCodecContext *avctx, const AVBufferRef *buffer_ref, av_unused const uint8_t *buffer, @@ -167,9 +67,6 @@ static int vk_dpx_start_frame(AVCodecContext *avctx, DPXVulkanDecodePicture *pp = dpx->hwaccel_picture_private; FFVulkanDecodePicture *vp = &pp->vp; - if (ctx->s.extensions & FF_VK_EXT_HOST_IMAGE_COPY) - host_upload_image(avctx, dec, dpx, buffer, size); - /* Host map the frame data if supported */ if (!vp->slices_buf && ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
