---
 libavcodec/ffv1enc_vulkan.c            | 215 ++++++++++++++++++++++++-
 libavcodec/vulkan/Makefile             |   2 +-
 libavcodec/vulkan/ffv1_enc_setup.comp  |   6 +-
 libavcodec/vulkan/ffv1_rct_search.comp | 154 ++++++++++++++++++
 4 files changed, 372 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/vulkan/ffv1_rct_search.comp

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 5de16d5b02..d9e12f5fae 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -74,6 +74,7 @@ typedef struct VulkanEncodeFFv1Context {
     size_t max_heap_size;
 
     FFVulkanShader setup;
+    FFVulkanShader rct_search;
     FFVulkanShader reset;
     FFVulkanShader enc;
 
@@ -101,6 +102,7 @@ typedef struct VulkanEncodeFFv1Context {
     int num_h_slices;
     int num_v_slices;
     int force_pcm;
+    int optimize_rct;
 
     int is_rgb;
     int ppi;
@@ -112,6 +114,7 @@ extern const char *ff_source_rangecoder_comp;
 extern const char *ff_source_ffv1_vlc_comp;
 extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_reset_comp;
+extern const char *ff_source_ffv1_rct_search_comp;
 extern const char *ff_source_ffv1_enc_setup_comp;
 extern const char *ff_source_ffv1_enc_comp;
 
@@ -147,7 +150,8 @@ typedef struct FFv1VkParameters {
     uint8_t ec;
     uint8_t ppi;
     uint8_t chunks;
-    uint8_t padding[4];
+    uint8_t rct_search;
+    uint8_t padding[3];
 } FFv1VkParameters;
 
 static void add_push_data(FFVulkanShader *shd)
@@ -184,12 +188,81 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    uint8_t ec;                                                   );
     GLSLC(1,    uint8_t ppi;                                                  );
     GLSLC(1,    uint8_t chunks;                                               );
-    GLSLC(1,    uint8_t padding[4];                                           );
+    GLSLC(1,    uint8_t rct_search;                                           );
+    GLSLC(1,    uint8_t padding[3];                                           );
     GLSLC(0, };                                                               );
     ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 }
 
+/* Push constants of the RCT search shader. Must stay in sync with the GLSL
+ * pushConstants block emitted by init_rct_search_shader(). */
+typedef struct FFv1VkRCTSearchParameters {
+    int fmt_lut[4];
+    int rct_offset;
+    uint8_t planar_rgb;
+    uint8_t transparency;
+    uint8_t key_frame;
+    uint8_t force_pcm;
+    uint8_t version;
+    uint8_t micro_version;
+    uint8_t padding[2];
+} FFv1VkRCTSearchParameters;
+
+/* Record the RCT search dispatch into exec: bind the slice data buffer and the
+ * input image planes, upload the push constants and launch one workgroup per
+ * slice (num_h_slices x num_v_slices). Always returns 0. */
+static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec,
+                          AVFrame *enc_in, VkImageView *enc_in_views,
+                          FFVkBuffer *slice_data_buf, uint32_t slice_data_size)
+{
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
+    FFVulkanFunctions *vk = &fv->s.vkfn;
+    AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data;
+    FFv1VkRCTSearchParameters pd;
+
+    /* Update descriptors */
+    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct_search,
+                                    0, 0, 0,
+                                    slice_data_buf,
+                                    0, slice_data_size*f->slice_count,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct_search,
+                                  enc_in, enc_in_views,
+                                  0, 1,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct_search);
+
+    pd = (FFv1VkRCTSearchParameters) {
+        .rct_offset = 1 << f->bits_per_raw_sample,
+        .planar_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) &&
+                      (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1),
+        .transparency = f->transparency,
+        .key_frame = f->key_frame,
+        .force_pcm = fv->force_pcm,
+        .version = f->version,
+        .micro_version = f->micro_version,
+    };
+
+    if (avctx->sw_pix_fmt == AV_PIX_FMT_GBRP10 ||
+        avctx->sw_pix_fmt == AV_PIX_FMT_GBRP12 ||
+        avctx->sw_pix_fmt == AV_PIX_FMT_GBRP14)
+        memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int));
+    else
+        ff_vk_set_perm(avctx->sw_pix_fmt, pd.fmt_lut, 1);
+
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct_search,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
+
+    return 0;
+}
+
 static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                            FFVkExecContext *exec,
                                            const AVFrame *pict)
@@ -366,6 +439,28 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         };
     }
 
+    if (fv->optimize_rct) {
+        RET(run_rct_search(avctx, exec,
+                           src, src_views,
+                           slice_data_buf, slice_data_size));
+
+        /* The shader that consumes the search result (init_slice() takes the
+         * slice context as inout) reads it back as well as writing it */
+        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .srcStageMask = slice_data_buf->stage,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = slice_data_buf->access,
+            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                             VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = slice_data_buf->buf,
+            .size = slice_data_size*f->slice_count,
+            .offset = 0,
+        };
+    }
+
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
             .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
             .pImageMemoryBarriers = img_bar,
@@ -412,6 +507,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .ec = f->ec,
         .ppi = fv->ppi,
         .chunks = fv->chunks,
+        .rct_search = fv->optimize_rct,
     };
 
     /* For some reason the C FFv1 encoder/decoder treats these differently */
@@ -920,6 +1016,106 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd)
     GLSLD(ff_source_ffv1_common_comp);
 }
 
+/* Build, compile and register the RCT search compute shader (fv->rct_search).
+ * Returns err as left by the last executed RET() step. */
+static int init_rct_search_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFVulkanShader *shd = &fv->rct_search;
+    FFVulkanDescriptorSetBinding *desc_set;
+
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+
+    RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct_search",
+                          VK_SHADER_STAGE_COMPUTE_BIT,
+                          (const char *[]) { "GL_EXT_buffer_reference",
+                                             "GL_EXT_buffer_reference2",
+                                             "GL_EXT_null_initializer" }, 3,
+                          32, 32, 1,
+                          0));
+
+    /* Common codec header */
+    GLSLD(ff_source_common_comp);
+
+    /* Must mirror FFv1VkRCTSearchParameters byte for byte (scalar layout) */
+    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {            );
+    GLSLC(1,    ivec4 fmt_lut;                                                );
+    GLSLC(1,    int rct_offset;                                               );
+    GLSLC(1,    uint8_t planar_rgb;                                           );
+    GLSLC(1,    uint8_t transparency;                                         );
+    GLSLC(1,    uint8_t key_frame;                                            );
+    GLSLC(1,    uint8_t force_pcm;                                            );
+    GLSLC(1,    uint8_t version;                                              );
+    GLSLC(1,    uint8_t micro_version;                                        );
+    GLSLC(1,    uint8_t padding[2];                                           );
+    GLSLC(0, };                                                               );
+    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTSearchParameters),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
+    av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
+
+    /* Never used */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "rangecoder_static_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "uint8_t zero_one_state[512];",
+        },
+        {
+            .name        = "quant_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]"
+                           "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 1));
+
+    define_shared_code(avctx, shd);
+
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "slice_data_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "SliceContext slice_ctx[1024];",
+        },
+        {
+            .name       = "src",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format,
+                                               fv->rep_fmt),
+            .elems      = av_pix_fmt_count_planes(fv->s.frames->sw_format),
+            .mem_quali  = "readonly",
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0));
+
+    GLSLD(ff_source_ffv1_rct_search_comp);
+
+    RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main",
+                            &spv_opaque));
+    RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
+
+fail:
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
 static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
 {
     int err;
@@ -1417,6 +1613,17 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
     if (!fv->is_rgb && f->bits_per_raw_sample > 8)
         fv->rep_fmt = FF_VK_REP_INT;
 
+    /* Init rct search shader */
+    fv->optimize_rct = fv->is_rgb && f->version >= 4 &&
+                       !fv->force_pcm && fv->optimize_rct;
+    if (fv->optimize_rct) {
+        err = init_rct_search_shader(avctx, spv);
+        if (err < 0) {
+            spv->uninit(&spv);
+            return err;
+        }
+    }
+
     /* Init setup shader */
     err = init_setup_shader(avctx, spv);
     if (err < 0) {
@@ -1528,6 +1735,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     ff_vk_shader_free(&fv->s, &fv->enc);
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
+    ff_vk_shader_free(&fv->s, &fv->rct_search);
 
     if (fv->exec_ctx_info) {
         for (int i = 0; i < fv->async_depth; i++) {
@@ -1591,6 +1799,9 @@ static const AVOption vulkan_encode_ffv1_options[] = {
     { "force_pcm", "Code all slices with no prediction", OFFSET(force_pcm), AV_OPT_TYPE_BOOL,
             { .i64 = 0 }, 0, 1, VE },
 
+    { "rct_search", "Run a search for RCT parameters (level 4 only)", OFFSET(optimize_rct), AV_OPT_TYPE_BOOL,
+            { .i64 = 1 }, 0, 1, VE },
+
     { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
             { .i64 = 1 }, 1, INT_MAX, VE },
 
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 4bbcb38c6a..729cb4f15c 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -7,7 +7,7 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/common.o \
                                        vulkan/rangecoder.o vulkan/ffv1_vlc.o \
                                        vulkan/ffv1_common.o vulkan/ffv1_reset.o \
                                        vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
-                                       vulkan/ffv1_enc.o
+                                       vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o
 
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/common.o \
                                        vulkan/rangecoder.o vulkan/ffv1_vlc.o \
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
index 6f21e47523..5f8e6704b0 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -22,7 +22,7 @@
 
 uint8_t state[CONTEXT_SIZE];
 
-void init_slice(out SliceContext sc, const uint slice_idx)
+void init_slice(inout SliceContext sc, const uint slice_idx)
 {
     /* Set coordinates */
     uvec2 img_size = imageSize(src[0]);
@@ -37,11 +37,13 @@ void init_slice(out SliceContext sc, const uint slice_idx)
 
     sc.slice_pos = ivec2(sxs, sys);
     sc.slice_dim = ivec2(sxe - sxs, sye - sys);
-    sc.slice_rct_coef = ivec2(1, 1);
     sc.slice_coding_mode = int(force_pcm == 1);
     sc.slice_reset_contexts = sc.slice_coding_mode == 1;
     sc.quant_table_idx = u8vec3(context_model);
 
+    if ((rct_search == 0) || (sc.slice_coding_mode == 1))
+        sc.slice_rct_coef = ivec2(1, 1);
+
     rac_init(sc.c,
              OFFBUF(u8buf, out_data, slice_idx * slice_size_max),
              slice_size_max);
diff --git a/libavcodec/vulkan/ffv1_rct_search.comp b/libavcodec/vulkan/ffv1_rct_search.comp
new file mode 100644
index 0000000000..055bde46c4
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_rct_search.comp
@@ -0,0 +1,154 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <d...@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Load the pixel at pos and return its components reordered through fmt_lut */
+ivec3 load_components(ivec2 pos)
+{
+    ivec3 pix = ivec3(imageLoad(src[0], pos));
+    if (planar_rgb != 0) {
+        for (int i = 1; i < 3; i++)
+            pix[i] = int(imageLoad(src[i], pos)[0]);
+    }
+
+    return ivec3(pix[fmt_lut[0]], pix[fmt_lut[1]], pix[fmt_lut[2]]);
+}
+
+#define NUM_CHECKS 15
+const ivec2 rct_y_coeff[NUM_CHECKS] = {
+    ivec2(0, 0), // 4G
+
+    ivec2(0, 1), // 3G + B
+    ivec2(1, 0), // R + 3G
+    ivec2(1, 1), // R + 2G + B
+
+    ivec2(0, 2), // 2G + 2B
+    ivec2(2, 0), // 2R + 2G
+    ivec2(2, 2), // 2R + 2B
+
+    ivec2(0, 3), // 1G + 3B
+    ivec2(3, 0), // 3R + 1G
+
+    ivec2(0, 4), // 4B
+    ivec2(4, 0), // 4R
+
+    ivec2(1, 2), // R + G + 2B
+    ivec2(2, 1), // 2R + G + B
+
+    ivec2(3, 1), // 3R + B
+    ivec2(1, 3), // R + 3B
+};
+
+/* Transformed samples of the workgroup. Entries are stored at [x + 1][y + 1]
+ * so that the left/top neighbour lookups in get_dist() stay inside the array */
+shared ivec3 pix_buf[gl_WorkGroupSize.x + 1][gl_WorkGroupSize.y + 1] = { };
+
+/* Apply the colour transform to one pixel using the coefficient pair rct_coef */
+ivec3 transform_sample(ivec3 pix, ivec2 rct_coef)
+{
+    pix.b -= pix.g;
+    pix.r -= pix.g;
+    pix.g += (pix.r*rct_coef.x + pix.b*rct_coef.y) >> 2;
+    pix.b += rct_offset;
+    pix.r += rct_offset;
+    return pix;
+}
+
+/* Predict cur from its left, top-left and top neighbours in pix_buf and fold
+ * the absolute per-component errors into one value with mid_pred() */
+uint get_dist(ivec3 cur)
+{
+    ivec3 LL = pix_buf[gl_LocalInvocationID.x + 0][gl_LocalInvocationID.y + 1];
+    ivec3 TL = pix_buf[gl_LocalInvocationID.x + 0][gl_LocalInvocationID.y + 0];
+    ivec3 TT = pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 0];
+
+    ivec3 pred = ivec3(predict(LL.r, ivec2(TL.r, TT.r)),
+                       predict(LL.g, ivec2(TL.g, TT.g)),
+                       predict(LL.b, ivec2(TL.b, TT.b)));
+
+    uvec3 c = abs(pred - cur);
+    return mid_pred(c.r, c.g, c.b);
+}
+
+shared uint score_cols[gl_WorkGroupSize.y] = { };
+shared uint score_mode[16] = { };
+
+/* Add the prediction error of the pixel at pos to score_mode[] for every
+ * candidate in rct_y_coeff[].
+ * NOTE(review): memoryBarrierShared() does not synchronise invocations, so the
+ * neighbour samples read by get_dist() may not have been written yet. */
+void process(ivec2 pos)
+{
+    ivec3 pix = load_components(pos);
+
+    for (int i = 0; i < NUM_CHECKS; i++) {
+        ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]);
+        pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = tx_pix;
+        memoryBarrierShared();
+
+        uint dist = get_dist(tx_pix);
+        atomicAdd(score_mode[i], dist);
+    }
+}
+
+/* Score every pixel of the slice handled by this workgroup, then have
+ * invocation (0, 0) store the lowest-scoring coefficients in sc */
+void coeff_search(inout SliceContext sc)
+{
+    uvec2 img_size = imageSize(src[0]);
+    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
+                           gl_NumWorkGroups.x, 0);
+    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
+                           gl_NumWorkGroups.x, 0);
+    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
+                           gl_NumWorkGroups.y, 0);
+    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
+                           gl_NumWorkGroups.y, 0);
+
+    for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += gl_WorkGroupSize.y) {
+        for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += gl_WorkGroupSize.x) {
+            process(ivec2(x, y));
+        }
+    }
+
+    /* Every invocation must have finished accumulating before the read below */
+    barrier();
+
+    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
+        uint min_score = 0xFFFFFFFF;
+        uint min_idx = 3;
+        for (int i = 0; i < NUM_CHECKS; i++) {
+            if (score_mode[i] < min_score) {
+                min_score = score_mode[i];
+                min_idx = i;
+            }
+        }
+        sc.slice_rct_coef = rct_y_coeff[min_idx];
+    }
+}
+
+void main(void)
+{
+    if (force_pcm == 1)
+        return;
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    coeff_search(slice_ctx[slice_idx]);
+}
-- 
2.49.0.395.g12beb8f557c

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".