[FFmpeg-devel] [PR] vulkan_dpx: remove host image upload path (PR #21514)

Lynne via ffmpeg-devel Mon, 19 Jan 2026 03:00:05 -0800

PR #21514 opened by Lynne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21514
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21514.patch


The main reason this path was written was Nvidia. Nvidia has always
had a fickle upload path, and seemed to have a shortcut for the
host image upload path. That shortcut seems to have been patched out
of recent driver versions.

This upload path relies on the driver keeping the same layout,
down to the stride of the images, which is an assumption that is
not portable.

Rather than relying on this fickle upload path, what we'd like when
we want pure bandwidth is to decouple uploads to a separate queue,
and let the GPU pull the data from RAM via uploads.

It'll be slower with a single-threaded decoder, but currently all
of our compute-based decoders and the decoders that sit underneath
them support frame threading.


From 2b309836266b969ec0f4f6d7feefdf4c54e7dcc1 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Fri, 16 Jan 2026 16:09:05 +0100
Subject: [PATCH 1/3] vulkan: remove IS_WITHIN macro

This is the more correct GLSL solution.
---
 libavcodec/vulkan/common.comp          | 3 ---
 libavcodec/vulkan/dpx_unpack.comp.glsl | 2 +-
 libavfilter/vulkan/avgblur.comp.glsl   | 3 +--
 libavfilter/vulkan/bwdif.comp.glsl     | 4 ++--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/libavcodec/vulkan/common.comp b/libavcodec/vulkan/common.comp
index 8a658f8524..1ec9ae7e7c 100644
--- a/libavcodec/vulkan/common.comp
+++ b/libavcodec/vulkan/common.comp
@@ -107,9 +107,6 @@ layout(buffer_reference, buffer_reference_align = 8) buffer 
u64buf {
 #define ceil_rshift(a, b) \
     (-((-(a)) >> (b)))
 
-#define IS_WITHIN(v1, v2) \
-    ((v1.x < v2.x) && (v1.y < v2.y))
-
 /* TODO: optimize */
 uint align(uint src, uint a)
 {
diff --git a/libavcodec/vulkan/dpx_unpack.comp.glsl 
b/libavcodec/vulkan/dpx_unpack.comp.glsl
index 93fda6142d..3850cbf3e9 100644
--- a/libavcodec/vulkan/dpx_unpack.comp.glsl
+++ b/libavcodec/vulkan/dpx_unpack.comp.glsl
@@ -91,7 +91,7 @@ i16vec4 parse_packed_in_32(ivec2 pos, int stride)
 void main(void)
 {
     ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
-    if (!IS_WITHIN(pos, imageSize(dst[0])))
+    if (any(greaterThanEqual(pos, imageSize(dst[0]))))
         return;
 
     i16vec4 p;
diff --git a/libavfilter/vulkan/avgblur.comp.glsl 
b/libavfilter/vulkan/avgblur.comp.glsl
index e7a476b98c..b53ec4092c 100644
--- a/libavfilter/vulkan/avgblur.comp.glsl
+++ b/libavfilter/vulkan/avgblur.comp.glsl
@@ -40,9 +40,8 @@ void main()
 {
     const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
 
-#define IS_WITHIN(v1, v2) ((v1.x < v2.x) && (v1.y < v2.y))
     ivec2 size = imageSize(output_img[nonuniformEXT(gl_LocalInvocationID.z)]);
-    if (!IS_WITHIN(pos, size))
+    if (any(greaterThanEqual(pos, size)))
         return;
 
     if ((planes & (1 << gl_LocalInvocationID.z)) == 0) {
diff --git a/libavfilter/vulkan/bwdif.comp.glsl 
b/libavfilter/vulkan/bwdif.comp.glsl
index 043ded0d24..fb18af3915 100644
--- a/libavfilter/vulkan/bwdif.comp.glsl
+++ b/libavfilter/vulkan/bwdif.comp.glsl
@@ -151,8 +151,8 @@ void main()
     bool filter_field = ((pos.y ^ parity) & 1) == 1;
     bool is_intra = filter_field && (current_field == 0);
 
-#define IS_WITHIN(v1, v2) ((v1.x < v2.x) && (v1.y < v2.y))
-    if (!IS_WITHIN(pos, 
imageSize(dst[nonuniformEXT(gl_LocalInvocationID.z)]))) {
+    ivec2 size = imageSize(dst[nonuniformEXT(gl_LocalInvocationID.z)]);
+    if (any(greaterThanEqual(pos, size))) {
         return;
     } else if (is_intra) {
         process_plane_intra(pos);
-- 
2.52.0


From a6e641ab6e08bf8a794e256e5082335794fc6b68 Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Mon, 19 Jan 2026 11:33:02 +0100
Subject: [PATCH 2/3] vulkan_decode: do not align single-plane images to
 subsampling

Unlike multiplane images, single-plane images do not need their
dimensions aligned to the chroma subsampling.
This saves a bit of memory.
---
 libavcodec/vulkan_decode.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/libavcodec/vulkan_decode.c b/libavcodec/vulkan_decode.c
index 5ed963eacc..9ab8d45aa9 100644
--- a/libavcodec/vulkan_decode.c
+++ b/libavcodec/vulkan_decode.c
@@ -1150,7 +1150,10 @@ int ff_vk_frame_params(AVCodecContext *avctx, 
AVBufferRef *hw_frames_ctx)
     if (err < 0)
         return err;
 
+    frames_ctx->format = AV_PIX_FMT_VULKAN;
     frames_ctx->sw_format = avctx->sw_pix_fmt;
+    frames_ctx->width  = avctx->coded_width;
+    frames_ctx->height = avctx->coded_height;
 
     if (!DECODER_IS_SDR(avctx->codec_id)) {
         prof = av_mallocz(sizeof(FFVulkanDecodeProfileData));
@@ -1166,6 +1169,9 @@ int ff_vk_frame_params(AVCodecContext *avctx, AVBufferRef 
*hw_frames_ctx)
             return err;
         }
 
+        const AVPixFmtDescriptor *pdesc = 
av_pix_fmt_desc_get(frames_ctx->sw_format);
+        frames_ctx->width       = FFALIGN(frames_ctx->width, 1 << 
pdesc->log2_chroma_w);
+        frames_ctx->height      = FFALIGN(frames_ctx->height, 1 << 
pdesc->log2_chroma_h);
         frames_ctx->user_opaque = prof;
         frames_ctx->free        = free_profile_data;
 
@@ -1211,11 +1217,6 @@ int ff_vk_frame_params(AVCodecContext *avctx, 
AVBufferRef *hw_frames_ctx)
         }
     }
 
-    const AVPixFmtDescriptor *pdesc = 
av_pix_fmt_desc_get(frames_ctx->sw_format);
-    frames_ctx->width  = FFALIGN(avctx->coded_width, 1 << 
pdesc->log2_chroma_w);
-    frames_ctx->height = FFALIGN(avctx->coded_height, 1 << 
pdesc->log2_chroma_h);
-    frames_ctx->format = AV_PIX_FMT_VULKAN;
-
     hwfc->tiling = VK_IMAGE_TILING_OPTIMAL;
     hwfc->usage  = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
                    VK_IMAGE_USAGE_STORAGE_BIT      |
-- 
2.52.0


From f0d2e7de30ff8d9b7b0b1c8210337fc849eb82ef Mon Sep 17 00:00:00 2001
From: Lynne <[email protected]>
Date: Mon, 19 Jan 2026 11:51:30 +0100
Subject: [PATCH 3/3] vulkan_dpx: remove host image upload path

The main reason this path was written was Nvidia. Nvidia has always
had a fickle upload path, and seemed to have a shortcut for the
host image upload path. That shortcut seems to have been patched out
of recent driver versions.

This upload path relies on the driver keeping the same layout,
down to the stride of the images, which is an assumption that is
not portable.

Rather than relying on this fickle upload path, what we'd like when
we want pure bandwidth is to decouple uploads to a separate queue,
and let the GPU pull the data from RAM via uploads.

It'll be slower with a single-threaded decoder, but currently all
of our compute-based decoders and the decoders that sit underneath
them support frame threading.
---
 libavcodec/vulkan_dpx.c | 103 ----------------------------------------
 1 file changed, 103 deletions(-)

diff --git a/libavcodec/vulkan_dpx.c b/libavcodec/vulkan_dpx.c
index cf53a0f4df..17f91c6ce4 100644
--- a/libavcodec/vulkan_dpx.c
+++ b/libavcodec/vulkan_dpx.c
@@ -54,106 +54,6 @@ typedef struct DecodePushData {
     int shift;
 } DecodePushData;
 
-static int host_upload_image(AVCodecContext *avctx,
-                             FFVulkanDecodeContext *dec, DPXDecContext *dpx,
-                             const uint8_t *src, uint32_t size)
-{
-    int err;
-    VkImage temp;
-
-    FFVulkanDecodeShared *ctx = dec->shared_ctx;
-    DPXVulkanDecodeContext *dxv = ctx->sd_ctx;
-    VkPhysicalDeviceLimits *limits = &ctx->s.props.properties.limits;
-    FFVulkanFunctions *vk = &ctx->s.vkfn;
-
-    DPXVulkanDecodePicture *pp = dpx->hwaccel_picture_private;
-    FFVulkanDecodePicture *vp = &pp->vp;
-
-    int unpack = (avctx->bits_per_raw_sample == 12 && !dpx->packing) ||
-                 avctx->bits_per_raw_sample == 10;
-    if (unpack)
-        return 0;
-
-    VkImageCreateInfo create_info = {
-        .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
-        .imageType = VK_IMAGE_TYPE_2D,
-        .format = avctx->bits_per_raw_sample == 8 ? VK_FORMAT_R8_UINT :
-                  avctx->bits_per_raw_sample == 32 ? VK_FORMAT_R32_UINT :
-                                                     VK_FORMAT_R16_UINT,
-        .extent.width = dpx->frame->width*dpx->components,
-        .extent.height = dpx->frame->height,
-        .extent.depth = 1,
-        .mipLevels = 1,
-        .arrayLayers = 1,
-        .tiling = VK_IMAGE_TILING_LINEAR,
-        .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
-        .usage = VK_IMAGE_USAGE_STORAGE_BIT | 
VK_IMAGE_USAGE_HOST_TRANSFER_BIT_EXT,
-        .samples = VK_SAMPLE_COUNT_1_BIT,
-        .pQueueFamilyIndices = &ctx->qf[0].idx,
-        .queueFamilyIndexCount = 1,
-        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-    };
-
-    if (create_info.extent.width >= limits->maxImageDimension2D ||
-        create_info.extent.height >= limits->maxImageDimension2D)
-        return 0;
-
-    vk->CreateImage(ctx->s.hwctx->act_dev, &create_info, ctx->s.hwctx->alloc,
-                    &temp);
-
-    err = ff_vk_get_pooled_buffer(&ctx->s, &dxv->frame_data_pool,
-                                  &vp->slices_buf,
-                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
-                                      
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
-                                  NULL, size,
-                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
-                                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-    if (err < 0)
-        return err;
-
-    FFVkBuffer *vkb = (FFVkBuffer *)vp->slices_buf->data;
-    VkBindImageMemoryInfo bind_info = {
-        .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
-        .image = temp,
-        .memory = vkb->mem,
-    };
-    vk->BindImageMemory2(ctx->s.hwctx->act_dev, 1, &bind_info);
-
-    VkHostImageLayoutTransitionInfo layout_change = {
-        .sType = VK_STRUCTURE_TYPE_HOST_IMAGE_LAYOUT_TRANSITION_INFO,
-        .image = temp,
-        .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
-        .newLayout = VK_IMAGE_LAYOUT_GENERAL,
-        .subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
-        .subresourceRange.layerCount = 1,
-        .subresourceRange.levelCount = 1,
-    };
-    vk->TransitionImageLayoutEXT(ctx->s.hwctx->act_dev, 1, &layout_change);
-
-    VkMemoryToImageCopy copy_region = {
-        .sType = VK_STRUCTURE_TYPE_MEMORY_TO_IMAGE_COPY,
-        .pHostPointer = src,
-        .imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
-        .imageSubresource.layerCount = 1,
-        .imageExtent = (VkExtent3D){ dpx->frame->width*dpx->components,
-                                     dpx->frame->height,
-                                     1 },
-    };
-    VkCopyMemoryToImageInfo copy_info = {
-        .sType = VK_STRUCTURE_TYPE_COPY_MEMORY_TO_IMAGE_INFO,
-        .flags = VK_HOST_IMAGE_COPY_MEMCPY_EXT,
-        .dstImage = temp,
-        .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
-        .regionCount = 1,
-        .pRegions = &copy_region,
-    };
-    vk->CopyMemoryToImageEXT(ctx->s.hwctx->act_dev, &copy_info);
-
-    vk->DestroyImage(ctx->s.hwctx->act_dev, temp, ctx->s.hwctx->alloc);
-
-    return 0;
-}
-
 static int vk_dpx_start_frame(AVCodecContext          *avctx,
                               const AVBufferRef       *buffer_ref,
                               av_unused const uint8_t *buffer,
@@ -167,9 +67,6 @@ static int vk_dpx_start_frame(AVCodecContext          *avctx,
     DPXVulkanDecodePicture *pp = dpx->hwaccel_picture_private;
     FFVulkanDecodePicture *vp = &pp->vp;
 
-    if (ctx->s.extensions & FF_VK_EXT_HOST_IMAGE_COPY)
-        host_upload_image(avctx, dec, dpx, buffer, size);
-
     /* Host map the frame data if supported */
     if (!vp->slices_buf &&
         ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] vulkan_dpx: remove host image upload path (PR #21514)

Reply via email to