This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit d040f5d9857af76d3cad3ae679c66e0377ef728f Author: Lynne <[email protected]> AuthorDate: Fri Jun 12 13:32:52 2026 +0900 Commit: Lynne <[email protected]> CommitDate: Thu Jul 2 16:45:08 2026 +0900 ffv1enc_vulkan: gather slices on the GPU, drop the transfer copy Uses the gather shader from the APV encoder to copy the packet data on the GPU with a second pass. Replace all of that with a seg_gather compute dispatch in the same submission as the encode. It packs the per-slice slots tight and in order into a contiguous host-visible buffer, so the final packet is just the sum of the per-slice sizes and the gathered buffer is handed straight to pkt->buf with no extra submit, no copy, no CPU involvement. The slots are sized with the version-4 worst-case formula (the realistic lossless bound, ~17x tighter than the version-3 budget this encoder never approaches; the emitted bitstream version is unchanged) and rounded down to a 16-byte multiple so the gather reads them with aligned wide loads. Removes the transfer queue family, its exec pool, and the buf_regions scratch array. --- libavcodec/ffv1enc_vulkan.c | 291 +++++++++++++++++++++++--------------------- libavcodec/vulkan/Makefile | 3 +- 2 files changed, 156 insertions(+), 138 deletions(-) diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c index 2c83e6869e..a3c67e7f43 100644 --- a/libavcodec/ffv1enc_vulkan.c +++ b/libavcodec/ffv1enc_vulkan.c @@ -42,6 +42,7 @@ typedef struct VulkanEncodeFFv1FrameData { /* Output data */ AVBufferRef *out_data_ref; + AVBufferRef *compacted_data_ref; /* Copied from the source */ int64_t pts; @@ -61,10 +62,6 @@ typedef struct VulkanEncodeFFv1Context { AVVulkanDeviceQueueFamily *qf; FFVkExecPool exec_pool; - AVVulkanDeviceQueueFamily *transfer_qf; - FFVkExecPool transfer_exec_pool; - - VkBufferCopy *buf_regions; VulkanEncodeFFv1FrameData *exec_ctx_info; int in_flight; int async_depth; @@ -76,6 +73,7 @@ typedef struct VulkanEncodeFFv1Context { FFVulkanShader setup; FFVulkanShader reset; FFVulkanShader enc; + FFVulkanShader gather; /* Constant read-only buffers */ FFVkBuffer consts_buf; @@ -93,6 +91,9 @@ typedef struct VulkanEncodeFFv1Context { /* Output data buffer */ AVBufferPool *out_data_pool; + /* Gathered (contiguous) output buffer pool */ + AVBufferPool *compacted_data_pool; + /* Intermediate frame pool */ AVBufferRef *intermediate_frames_ref; @@ -108,6 +109,12 @@ typedef struct VulkanEncodeFFv1Context { int chunks; } VulkanEncodeFFv1Context; +typedef struct SegGatherPushData { + VkDeviceAddress sparse; + VkDeviceAddress compacted; + uint32_t slot_size; +} SegGatherPushData; + extern const char *ff_source_common_comp; extern const char *ff_source_rangecoder_comp; extern const char *ff_source_ffv1_vlc_comp; @@ -156,6 +163,25 @@ extern const unsigned int ff_ffv1_enc_bayer_comp_spv_len; extern const unsigned char ff_ffv1_enc_bayer_golomb_comp_spv_data[]; extern const unsigned int ff_ffv1_enc_bayer_golomb_comp_spv_len; +extern const unsigned char ff_seg_gather_comp_spv_data[]; +extern const unsigned int ff_seg_gather_comp_spv_len; + +/* Size output slots with the v4 worst case; v3's ~(2*bits+5) bytes/sample runs + * to gigabytes on large frames. The bitstream version is unaffected. */ +static size_t ffv1_vk_buffer_size(AVCodecContext *avctx) +{ + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFV1Context *f = &fv->ctx; + int version = f->version; + size_t size; + + f->version = 4; + size = ff_ffv1_encode_buffer_size(avctx); + f->version = version; + + return size; +} + static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec, AVFrame *enc_in, VkImageView *enc_in_views, FFVkBuffer *slice_data_buf, uint32_t slice_data_size, @@ -275,6 +301,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, /* Output data */ size_t maxsize; FFVkBuffer *out_data_buf; + FFVkBuffer *compacted_buf; int has_inter = avctx->gop_size > 1; uint32_t context_count = f->context_count[f->context_model]; @@ -342,20 +369,17 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, } /* Output buffer size */ - maxsize = ff_ffv1_encode_buffer_size(avctx); + maxsize = ffv1_vk_buffer_size(avctx); maxsize = FFMIN(maxsize, fv->s.props_11.maxMemoryAllocationSize); - /* Allocate output buffer */ + /* Sparse per-slice output: written by encode, read by gather, never by the + * CPU, so device-local unless it won't fit in VRAM. */ VkMemoryPropertyFlagBits out_buf_flags; - if (maxsize < fv->max_heap_size) { + if (maxsize < fv->max_heap_size) out_buf_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - /* If we can't map host memory, we can't let the GPU copy its buffer. */ - if (!(fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)) - out_buf_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - } else { + else out_buf_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | fv->s.host_cached_flag; - } RET(ff_vk_get_pooled_buffer(&fv->s, &fv->out_data_pool, &fd->out_data_ref, @@ -365,6 +389,16 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, NULL, maxsize, out_buf_flags)); out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; + /* Contiguous gathered output, read back by the CPU. */ + RET(ff_vk_get_pooled_buffer(&fv->s, &fv->compacted_data_pool, + &fd->compacted_data_ref, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT, + NULL, maxsize, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + fv->s.host_cached_flag)); + compacted_buf = (FFVkBuffer *)fd->compacted_data_ref->data; + /* Image views */ AVFrame *src = (AVFrame *)pict; VkImageView src_views[AV_NUM_DATA_POINTERS]; @@ -397,7 +431,9 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, .sar[1] = pict->sample_aspect_ratio.den, .pic_mode = !(pict->flags & AV_FRAME_FLAG_INTERLACED) ? 3 : !(pict->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 2 : 1, - .slice_size_max = out_data_buf->size / f->slice_count, + + /* 16-byte aligned so the gather can use wide loads */ + .slice_size_max = (out_data_buf->size / f->slice_count) & ~(size_t)15, .max_pixels_per_slice = fv->max_pixels_per_slice, }; @@ -428,6 +464,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, ff_vk_exec_add_dep_buf(&fv->s, exec, &slice_data_ref, 1, has_inter); ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 1); + ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->compacted_data_ref, 1, 1); if (f->remap_mode) { ff_vk_exec_add_dep_buf(&fv->s, exec, &remap_data_ref, 1, 0); remap_data_ref = NULL; @@ -662,6 +699,40 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx, 0, sizeof(FFv1ShaderParams), &pd); vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1); + /* Gather the per-slice slots into one contiguous host-visible buffer, + * in the same submission (no separate transfer pass). */ + FFVkBuffer *results_buf = &fv->results_buf; + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); + ff_vk_buf_barrier(buf_bar[nb_buf_bar++], results_buf, + COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, + COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR, + 0, VK_WHOLE_SIZE); + vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .pBufferMemoryBarriers = buf_bar, + .bufferMemoryBarrierCount = nb_buf_bar, + }); + nb_buf_bar = 0; + + SegGatherPushData gather_pd = { + .sparse = out_data_buf->address, + .compacted = compacted_buf->address, + .slot_size = (uint32_t)((out_data_buf->size / f->slice_count) & ~(size_t)15), + }; + ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->gather, 0, 0, 0, + &fv->results_buf, + fd->idx*f->max_slice_count*sizeof(uint32_t), + f->slice_count*sizeof(uint32_t), + VK_FORMAT_UNDEFINED); + ff_vk_exec_bind_shader(&fv->s, exec, &fv->gather); + ff_vk_shader_update_push_const(&fv->s, exec, &fv->gather, + VK_SHADER_STAGE_COMPUTE_BIT, + 0, sizeof(gather_pd), &gather_pd); + vk->CmdDispatch(exec->buf, f->slice_count, 1, 1); + /* Submit */ err = ff_vk_exec_submit(&fv->s, exec); if (err < 0) @@ -681,87 +752,27 @@ fail: return err; } -static int transfer_slices(AVCodecContext *avctx, - VkBufferCopy *buf_regions, int nb_regions, - VulkanEncodeFFv1FrameData *fd, - uint8_t *dst, AVBufferRef *dst_ref) +/* Return the gathered-output buffer to its pool when the packet is freed. */ +static void ffv1_vk_packet_free(void *opaque, uint8_t *data) { - int err; - VulkanEncodeFFv1Context *fv = avctx->priv_data; - FFVulkanFunctions *vk = &fv->s.vkfn; - FFVkExecContext *exec; - - FFVkBuffer *out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; - - AVBufferRef *mapped_ref; - FFVkBuffer *mapped_buf; - - VkBufferMemoryBarrier2 buf_bar[8]; - int nb_buf_bar = 0; - - err = ff_vk_host_map_buffer(&fv->s, &mapped_ref, dst, dst_ref, - VK_BUFFER_USAGE_TRANSFER_DST_BIT); - if (err < 0) - return err; - - mapped_buf = (FFVkBuffer *)mapped_ref->data; - - /* Transfer the slices */ - exec = ff_vk_exec_get(&fv->s, &fv->transfer_exec_pool); - ff_vk_exec_start(&fv->s, exec); - - ff_vk_exec_add_dep_buf(&fv->s, exec, &fd->out_data_ref, 1, 0); - fd->out_data_ref = NULL; /* Ownership passed */ - - ff_vk_exec_add_dep_buf(&fv->s, exec, &mapped_ref, 1, 0); - mapped_ref = NULL; /* Ownership passed */ - - /* Ensure the output buffer is finished */ - ff_vk_buf_barrier(buf_bar[nb_buf_bar++], out_data_buf, - COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR, - TRANSFER_BIT, TRANSFER_READ_BIT, NONE_KHR, - 0, VK_WHOLE_SIZE); - vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .pBufferMemoryBarriers = buf_bar, - .bufferMemoryBarrierCount = nb_buf_bar, - }); - nb_buf_bar = 0; - - for (int i = 0; i < nb_regions; i++) - buf_regions[i].dstOffset += mapped_buf->virtual_offset; - - vk->CmdCopyBuffer(exec->buf, - out_data_buf->buf, mapped_buf->buf, - nb_regions, buf_regions); - - /* Submit */ - err = ff_vk_exec_submit(&fv->s, exec); - if (err < 0) - return err; - - /* We need the encoded data immediately */ - ff_vk_exec_wait(&fv->s, exec); - - return 0; + AVBufferRef *buf_ref = opaque; + av_buffer_unref(&buf_ref); } static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, AVPacket *pkt) { - int err; VulkanEncodeFFv1Context *fv = avctx->priv_data; FFV1Context *f = &fv->ctx; FFVulkanFunctions *vk = &fv->s.vkfn; VulkanEncodeFFv1FrameData *fd = exec->opaque; - FFVkBuffer *out_data_buf = (FFVkBuffer *)fd->out_data_ref->data; - uint32_t slice_size_max = out_data_buf->size / f->slice_count; + FFVkBuffer *compacted_buf = (FFVkBuffer *)fd->compacted_data_ref->data; - /* Make sure encoding's done */ + /* Make sure the encode + gather submission is done */ ff_vk_exec_wait(&fv->s, exec); - /* Invalidate slice/output data if needed */ + /* Invalidate the per-slice sizes if needed */ uint32_t rb_off = fd->idx*f->max_slice_count*sizeof(uint32_t); if (!(fv->results_buf.flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { VkMappedMemoryRange invalidate_data = { @@ -774,25 +785,39 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, 1, &invalidate_data); } - /* Calculate final size */ + /* The gather packed the slices tight and in order; sum their lengths. */ pkt->size = 0; uint8_t *rb = fv->results_buf.mapped_mem + rb_off; for (int i = 0; i < f->slice_count; i++) { uint32_t sl_len = AV_RN32(rb + i*4); av_log(avctx, AV_LOG_DEBUG, "Slice %i size = %u\n", i, sl_len); - - fv->buf_regions[i] = (VkBufferCopy) { - .srcOffset = i*slice_size_max, - .dstOffset = pkt->size, - .size = sl_len, - }; pkt->size += sl_len; } av_log(avctx, AV_LOG_VERBOSE, "Encoded data: %iMiB\n", pkt->size / (1024*1024)); - /* Allocate packet */ - if ((err = ff_get_encode_buffer(avctx, pkt, pkt->size, 0)) < 0) - return err; + /* Invalidate the gathered bitstream if needed */ + if (!(compacted_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { + VkMappedMemoryRange invalidate_data = { + .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, + .memory = compacted_buf->mem, + .offset = 0, + .size = VK_WHOLE_SIZE, + }; + vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev, + 1, &invalidate_data); + } + + /* Hand the gathered buffer to the packet with no copy: pkt->buf references + * the pooled Vulkan buffer, returned to its pool when the packet is freed. */ + pkt->buf = av_buffer_create(compacted_buf->mapped_mem, compacted_buf->size, + ffv1_vk_packet_free, fd->compacted_data_ref, 0); + if (!pkt->buf) { + av_buffer_unref(&fd->out_data_ref); + av_buffer_unref(&fd->compacted_data_ref); + return AVERROR(ENOMEM); + } + fd->compacted_data_ref = NULL; /* ownership passed to pkt->buf */ + pkt->data = compacted_buf->mapped_mem; pkt->pts = fd->pts; pkt->dts = fd->pts; @@ -805,34 +830,6 @@ static int get_packet(AVCodecContext *avctx, FFVkExecContext *exec, fd->frame_opaque_ref = NULL; } - /* Try using host mapped memory transfers first */ - if (fv->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY) { - err = transfer_slices(avctx, fv->buf_regions, f->slice_count, fd, - pkt->data, pkt->buf); - if (err >= 0) - return err; - } - - /* Invalidate slice/output data if needed */ - if (!(out_data_buf->flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { - VkMappedMemoryRange invalidate_data = { - .sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, - .memory = out_data_buf->mem, - .offset = 0, - .size = VK_WHOLE_SIZE, - }; - vk->InvalidateMappedMemoryRanges(fv->s.hwctx->act_dev, - 1, &invalidate_data); - } - - /* Copy each slice */ - for (int i = 0; i < f->slice_count; i++) { - VkBufferCopy *region = &fv->buf_regions[i]; - memcpy(pkt->data + region->dstOffset, - out_data_buf->mapped_mem + region->srcOffset, - region->size); - } - av_buffer_unref(&fd->out_data_ref); return 0; @@ -1260,6 +1257,36 @@ fail: return err; } +static int init_gather_shader(AVCodecContext *avctx) +{ + int err; + VulkanEncodeFFv1Context *fv = avctx->priv_data; + FFVulkanShader *shd = &fv->gather; + + ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL, + (uint32_t []) { 256, 1, 1 }, 0); + + ff_vk_shader_add_push_const(shd, 0, sizeof(SegGatherPushData), + VK_SHADER_STAGE_COMPUTE_BIT); + + const FFVulkanDescriptorSetBinding desc_set[] = { + { /* sizes_buf */ + .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + .stages = VK_SHADER_STAGE_COMPUTE_BIT, + }, + }; + ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 1, 0, 0); + + RET(ff_vk_shader_link(&fv->s, shd, + ff_seg_gather_comp_spv_data, + ff_seg_gather_comp_spv_len, "main")); + + RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd)); + +fail: + return err; +} + static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) { int err; @@ -1384,7 +1411,7 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) } fv->max_heap_size = max_heap_size; - maxsize = ff_ffv1_encode_buffer_size(avctx); + maxsize = ffv1_vk_buffer_size(avctx); if (maxsize > fv->s.props_11.maxMemoryAllocationSize) { av_log(avctx, AV_LOG_WARNING, "Encoding buffer size (%zu) larger " "than maximum device allocation (%"PRIu64"), clipping\n", @@ -1415,18 +1442,6 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) if (err < 0) return err; - fv->transfer_qf = ff_vk_qf_find(&fv->s, VK_QUEUE_TRANSFER_BIT, 0); - if (!fv->transfer_qf) { - av_log(avctx, AV_LOG_ERROR, "Device has no transfer queues!\n"); - return err; - } - - err = ff_vk_exec_pool_init(&fv->s, fv->transfer_qf, &fv->transfer_exec_pool, - 1, - 0, 0, 0, NULL); - if (err < 0) - return err; - /* Detect the special RGB coding mode */ fv->is_rgb = !(f->colorspace == 0 && avctx->sw_pix_fmt != AV_PIX_FMT_YA8) && !(avctx->sw_pix_fmt == AV_PIX_FMT_YA8); @@ -1499,6 +1514,11 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) if (err < 0) return err; + /* Gather shader */ + err = init_gather_shader(avctx); + if (err < 0) + return err; + /* Constant data */ err = ff_ffv1_vk_init_consts(&fv->s, &fv->consts_buf, f); if (err < 0) @@ -1542,10 +1562,6 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx) for (int i = 0; i < fv->async_depth; i++) fv->exec_pool.contexts[i].opaque = &fv->exec_ctx_info[i]; - fv->buf_regions = av_malloc_array(f->max_slice_count, sizeof(*fv->buf_regions)); - if (!fv->buf_regions) - return AVERROR(ENOMEM); - /* Buffers */ RET(ff_vk_create_buf(&fv->s, &fv->results_buf, fv->async_depth*f->max_slice_count*sizeof(uint32_t), @@ -1564,9 +1580,9 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) VulkanEncodeFFv1Context *fv = avctx->priv_data; ff_vk_exec_pool_free(&fv->s, &fv->exec_pool); - ff_vk_exec_pool_free(&fv->s, &fv->transfer_exec_pool); ff_vk_shader_free(&fv->s, &fv->enc); + ff_vk_shader_free(&fv->s, &fv->gather); ff_vk_shader_free(&fv->s, &fv->reset); ff_vk_shader_free(&fv->s, &fv->setup); ff_vk_shader_free(&fv->s, &fv->remap); @@ -1577,6 +1593,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) for (int i = 0; i < fv->async_depth; i++) { VulkanEncodeFFv1FrameData *fd = &fv->exec_ctx_info[i]; av_buffer_unref(&fd->out_data_ref); + av_buffer_unref(&fd->compacted_data_ref); av_buffer_unref(&fd->frame_opaque_ref); } } @@ -1585,6 +1602,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) av_buffer_unref(&fv->intermediate_frames_ref); av_buffer_pool_uninit(&fv->out_data_pool); + av_buffer_pool_uninit(&fv->compacted_data_pool); av_buffer_unref(&fv->keyframe_slice_data_ref); av_buffer_pool_uninit(&fv->slice_data_pool); @@ -1594,7 +1612,6 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx) ff_vk_free_buf(&fv->s, &fv->consts_buf); - av_free(fv->buf_regions); av_frame_free(&fv->frame); ff_vk_uninit(&fv->s); diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile index ec2f773382..48afd639f4 100644 --- a/libavcodec/vulkan/Makefile +++ b/libavcodec/vulkan/Makefile @@ -21,7 +21,8 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \ vulkan/ffv1_enc_rgb_float_golomb.comp.spv.o \ vulkan/ffv1_enc_sort32.comp.spv.o \ vulkan/ffv1_enc_bayer.comp.spv.o \ - vulkan/ffv1_enc_bayer_golomb.comp.spv.o + vulkan/ffv1_enc_bayer_golomb.comp.spv.o \ + vulkan/seg_gather.comp.spv.o OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \ vulkan/ffv1_dec_reset.comp.spv.o \ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
