---
 libavcodec/ffv1enc_vulkan.c            | 204 ++++++++++++++++++++++++-
 libavcodec/vulkan/Makefile             |   2 +-
 libavcodec/vulkan/ffv1_enc_setup.comp  |   6 +-
 libavcodec/vulkan/ffv1_rct_search.comp | 139 +++++++++++++++++
 4 files changed, 346 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/vulkan/ffv1_rct_search.comp

diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 5de16d5b02..d9e12f5fae 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -74,6 +74,7 @@ typedef struct VulkanEncodeFFv1Context {
     size_t max_heap_size;
 
     FFVulkanShader setup;
+    FFVulkanShader rct_search;
     FFVulkanShader reset;
     FFVulkanShader enc;
 
@@ -101,6 +102,7 @@ typedef struct VulkanEncodeFFv1Context {
     int num_h_slices;
     int num_v_slices;
     int force_pcm;
+    int optimize_rct;
 
     int is_rgb;
     int ppi;
@@ -112,6 +114,7 @@ extern const char *ff_source_rangecoder_comp;
 extern const char *ff_source_ffv1_vlc_comp;
 extern const char *ff_source_ffv1_common_comp;
 extern const char *ff_source_ffv1_reset_comp;
+extern const char *ff_source_ffv1_rct_search_comp;
 extern const char *ff_source_ffv1_enc_setup_comp;
 extern const char *ff_source_ffv1_enc_comp;
 
@@ -147,7 +150,8 @@ typedef struct FFv1VkParameters {
     uint8_t ec;
     uint8_t ppi;
     uint8_t chunks;
-    uint8_t padding[4];
+    uint8_t rct_search;
+    uint8_t padding[3];
 } FFv1VkParameters;
 
 static void add_push_data(FFVulkanShader *shd)
@@ -184,12 +188,76 @@ static void add_push_data(FFVulkanShader *shd)
     GLSLC(1,    uint8_t ec;                                                   );
     GLSLC(1,    uint8_t ppi;                                                  );
     GLSLC(1,    uint8_t chunks;                                               );
-    GLSLC(1,    uint8_t padding[4];                                           );
+    GLSLC(1,    uint8_t rct_search;                                           );
+    GLSLC(1,    uint8_t padding[3];                                           );
     GLSLC(0, };                                                               );
     ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkParameters),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 }
 
+typedef struct FFv1VkRCTSearchParameters { /* push constants for the rct_search shader */
+    int fmt_lut[4];        /* component permutation applied to each loaded pixel */
+    int rct_offset;        /* filled with 1 << bits_per_raw_sample */
+    uint8_t planar_rgb;    /* non-zero: components are read from separate images */
+    uint8_t transparency;
+    uint8_t key_frame;
+    uint8_t force_pcm;     /* the shader returns immediately when this is 1 */
+    uint8_t version;
+    uint8_t micro_version;
+    uint8_t padding[2];
+} FFv1VkRCTSearchParameters;
+
+/* Record the RCT coefficient search (one workgroup per slice) into exec. Always returns 0. */
+static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec,
+                          AVFrame *enc_in, VkImageView *enc_in_views,
+                          FFVkBuffer *slice_data_buf, uint32_t slice_data_size)
+{
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
+    FFVulkanFunctions *vk = &fv->s.vkfn;
+    AVHWFramesContext *src_hwfc = (AVHWFramesContext *)enc_in->hw_frames_ctx->data;
+    FFv1VkRCTSearchParameters pd;
+
+    /* Update descriptors: the slice context buffer and the source image planes */
+    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->rct_search, 0, 0, 0,
+                                    slice_data_buf, 0, slice_data_size*f->slice_count,
+                                    VK_FORMAT_UNDEFINED);
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->rct_search,
+                                  enc_in, enc_in_views, 0, 1,
+                                  VK_IMAGE_LAYOUT_GENERAL, VK_NULL_HANDLE);
+
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->rct_search);
+
+    /* Members not named below (fmt_lut, padding) start out zeroed; fmt_lut is set next */
+    pd = (FFv1VkRCTSearchParameters) {
+        .rct_offset = 1 << f->bits_per_raw_sample,
+        .planar_rgb = ff_vk_mt_is_np_rgb(src_hwfc->sw_format) &&
+                      (ff_vk_count_images((AVVkFrame *)enc_in->data[0]) > 1),
+        .transparency = f->transparency,
+        .key_frame = f->key_frame,
+        .force_pcm = fv->force_pcm,
+        .version = f->version,
+        .micro_version = f->micro_version,
+    };
+
+    /* GBRP10/12/14 get a fixed component order; every other format uses ff_vk_set_perm() */
+    if (avctx->sw_pix_fmt == AV_PIX_FMT_GBRP10 ||
+        avctx->sw_pix_fmt == AV_PIX_FMT_GBRP12 ||
+        avctx->sw_pix_fmt == AV_PIX_FMT_GBRP14)
+        memcpy(pd.fmt_lut, (int [4]) { 2, 1, 0, 3 }, 4*sizeof(int));
+    else
+        ff_vk_set_perm(avctx->sw_pix_fmt, pd.fmt_lut, 1);
+
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->rct_search,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    /* Grid is num_h_slices x num_v_slices workgroups */
+    vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
+
+    return 0;
+}
+
 static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                            FFVkExecContext *exec,
                                            const AVFrame *pict)
@@ -366,6 +434,25 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         };
     }
 
+    if (fv->optimize_rct) {
+        RET(run_rct_search(avctx, exec,
+                           src, src_views,
+                           slice_data_buf, slice_data_size));
+
+        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
+            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
+            .srcStageMask = slice_data_buf->stage,
+            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+            .srcAccessMask = slice_data_buf->access,
+            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .buffer = slice_data_buf->buf,
+            .size = slice_data_size*f->slice_count,
+            .offset = 0,
+        };
+    }
+
     vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .pImageMemoryBarriers = img_bar,
@@ -412,6 +499,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .ec = f->ec,
         .ppi = fv->ppi,
         .chunks = fv->chunks,
+        .rct_search = fv->optimize_rct,
     };
 
     /* For some reason the C FFv1 encoder/decoder treats these differently */
@@ -920,6 +1008,103 @@ static void define_shared_code(AVCodecContext *avctx, FFVulkanShader *shd)
     GLSLD(ff_source_ffv1_common_comp);
 }
 
+/* Build, compile and register the RCT coefficient search compute shader (fv->rct_search). */
+static int init_rct_search_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFVulkanShader *shd = &fv->rct_search;
+    FFVulkanDescriptorSetBinding *desc_set;
+
+    uint8_t *spv_data;
+    size_t spv_len;
+    void *spv_opaque = NULL;
+
+    RET(ff_vk_shader_init(&fv->s, shd, "ffv1_rct_search",
+                          VK_SHADER_STAGE_COMPUTE_BIT,
+                          (const char *[]) { "GL_EXT_buffer_reference",
+                                             "GL_EXT_buffer_reference2",
+                                             "GL_EXT_null_initializer" }, 3,
+                          32, 32, 1, 0));
+
+    /* Common codec header */
+    GLSLD(ff_source_common_comp);
+
+    /* Must mirror FFv1VkRCTSearchParameters member for member, padding included */
+    GLSLC(0, layout(push_constant, scalar) uniform pushConstants {             );
+    GLSLC(1,    ivec4 fmt_lut;                                                 );
+    GLSLC(1,    int rct_offset;                                                );
+    GLSLC(1,    uint8_t planar_rgb;                                            );
+    GLSLC(1,    uint8_t transparency;                                          );
+    GLSLC(1,    uint8_t key_frame;                                             );
+    GLSLC(1,    uint8_t force_pcm;                                             );
+    GLSLC(1,    uint8_t version;                                               );
+    GLSLC(1,    uint8_t micro_version;                                         );
+    GLSLC(1,    uint8_t padding[2];                                            );
+    GLSLC(0, };                                                                );
+    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1VkRCTSearchParameters),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLES %i\n", MAX_QUANT_TABLES);
+    av_bprintf(&shd->src, "#define MAX_CONTEXT_INPUTS %i\n", MAX_CONTEXT_INPUTS);
+    av_bprintf(&shd->src, "#define MAX_QUANT_TABLE_SIZE %i\n", MAX_QUANT_TABLE_SIZE);
+
+    /* Never used */
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "rangecoder_static_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "uint8_t zero_one_state[512];",
+        },
+        {
+            .name        = "quant_buf",
+            .type        = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .mem_layout  = "scalar",
+            .buf_content = "int16_t quant_table[MAX_QUANT_TABLES]"
+                           "[MAX_CONTEXT_INPUTS][MAX_QUANT_TABLE_SIZE];",
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 1, 1));
+
+    define_shared_code(avctx, shd);
+
+    desc_set = (FFVulkanDescriptorSetBinding []) {
+        {
+            .name        = "slice_data_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+            .buf_content = "SliceContext slice_ctx[1024];",
+        },
+        {
+            .name       = "src",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .dimensions = 2,
+            .mem_layout = ff_vk_shader_rep_fmt(fv->s.frames->sw_format,
+                                               fv->rep_fmt),
+            .elems      = av_pix_fmt_count_planes(fv->s.frames->sw_format),
+            .mem_quali  = "readonly",
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    RET(ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 2, 0, 0));
+
+    GLSLD(ff_source_ffv1_rct_search_comp);
+
+    RET(spv->compile_shader(&fv->s, spv, shd, &spv_data, &spv_len, "main",
+                            &spv_opaque));
+    RET(ff_vk_shader_link(&fv->s, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
+
+fail:
+    if (spv_opaque)
+        spv->free_shader(spv, &spv_opaque);
+
+    return err;
+}
+
 static int init_setup_shader(AVCodecContext *avctx, FFVkSPIRVCompiler *spv)
 {
     int err;
@@ -1417,6 +1602,17 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
     if (!fv->is_rgb && f->bits_per_raw_sample > 8)
         fv->rep_fmt = FF_VK_REP_INT;
 
+    /* Init rct search shader */
+    fv->optimize_rct = fv->is_rgb && f->version >= 4 &&
+                       !fv->force_pcm && fv->optimize_rct;
+    if (fv->optimize_rct) {
+        err = init_rct_search_shader(avctx, spv);
+        if (err < 0) {
+            spv->uninit(&spv);
+            return err;
+        }
+    }
+
     /* Init setup shader */
     err = init_setup_shader(avctx, spv);
     if (err < 0) {
@@ -1528,6 +1724,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     ff_vk_shader_free(&fv->s, &fv->enc);
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
+    ff_vk_shader_free(&fv->s, &fv->rct_search);
 
     if (fv->exec_ctx_info) {
         for (int i = 0; i < fv->async_depth; i++) {
@@ -1591,6 +1788,9 @@ static const AVOption vulkan_encode_ffv1_options[] = {
     { "force_pcm", "Code all slices with no prediction", OFFSET(force_pcm), AV_OPT_TYPE_BOOL,
             { .i64 = 0 }, 0, 1, VE },
 
+    { "rct_search", "Run a search for RCT parameters (level 4 only)", OFFSET(optimize_rct), AV_OPT_TYPE_BOOL,
+            { .i64 = 1 }, 0, 1, VE },
+
     { "async_depth", "Internal parallelization depth", OFFSET(async_depth), AV_OPT_TYPE_INT,
             { .i64 = 1 }, 1, INT_MAX, VE },
 
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 4bbcb38c6a..729cb4f15c 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -7,7 +7,7 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER)  +=  vulkan/common.o \
                                        vulkan/rangecoder.o vulkan/ffv1_vlc.o \
                                        vulkan/ffv1_common.o vulkan/ffv1_reset.o \
                                        vulkan/ffv1_enc_rct.o vulkan/ffv1_enc_setup.o \
-                                       vulkan/ffv1_enc.o
+                                       vulkan/ffv1_rct_search.o vulkan/ffv1_enc.o
 
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
                                        vulkan/rangecoder.o vulkan/ffv1_vlc.o \
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp b/libavcodec/vulkan/ffv1_enc_setup.comp
index 6f21e47523..5f8e6704b0 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp
@@ -22,7 +22,7 @@
 
 uint8_t state[CONTEXT_SIZE];
 
-void init_slice(out SliceContext sc, const uint slice_idx)
+void init_slice(inout SliceContext sc, const uint slice_idx)
 {
     /* Set coordinates */
     uvec2 img_size = imageSize(src[0]);
@@ -37,11 +37,13 @@ void init_slice(out SliceContext sc, const uint slice_idx)
 
     sc.slice_pos = ivec2(sxs, sys);
     sc.slice_dim = ivec2(sxe - sxs, sye - sys);
-    sc.slice_rct_coef = ivec2(1, 1);
     sc.slice_coding_mode = int(force_pcm == 1);
     sc.slice_reset_contexts = sc.slice_coding_mode == 1;
     sc.quant_table_idx = u8vec3(context_model);
 
+    if ((rct_search == 0) || (sc.slice_coding_mode == 1))
+        sc.slice_rct_coef = ivec2(1, 1);
+
     rac_init(sc.c,
              OFFBUF(u8buf, out_data, slice_idx * slice_size_max),
              slice_size_max);
diff --git a/libavcodec/vulkan/ffv1_rct_search.comp b/libavcodec/vulkan/ffv1_rct_search.comp
new file mode 100644
index 0000000000..055bde46c4
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_rct_search.comp
@@ -0,0 +1,139 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2024 Lynne <d...@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ivec3 load_components(ivec2 pos)
+{
+    /* Start from image 0; when planar_rgb is 0 it supplies all three components. */
+    ivec3 px = ivec3(imageLoad(src[0], pos));
+    if (planar_rgb != 0) { /* planar: components 1 and 2 live in their own images */
+        px.y = int(imageLoad(src[1], pos).x);
+        px.z = int(imageLoad(src[2], pos).x);
+    }
+    return ivec3(px[fmt_lut[0]], px[fmt_lut[1]], px[fmt_lut[2]]);
+}
+
+#define NUM_CHECKS 15 /* number of candidate coefficient pairs listed below */
+const ivec2 rct_y_coeff[NUM_CHECKS] = { /* (x, y): quarter-weights of (R - G) and (B - G) added to G */
+    ivec2(0, 0), //      4G
+
+    ivec2(0, 1), //      3G +  B
+    ivec2(1, 0), //  R + 3G
+    ivec2(1, 1), //  R + 2G + B
+
+    ivec2(0, 2), //      2G + 2B
+    ivec2(2, 0), // 2R + 2G
+    ivec2(2, 2), // 2R      + 2B
+
+    ivec2(0, 3), //      1G + 3B
+    ivec2(3, 0), // 3R + 1G
+
+    ivec2(0, 4), //           4B
+    ivec2(4, 0), // 4R
+
+    ivec2(1, 2), //  R +  G + 2B
+    ivec2(2, 1), // 2R +  G +  B
+
+    ivec2(3, 1), // 3R      +  B
+    ivec2(1, 3), //  R      + 3B
+};
+
+shared ivec3 pix_buf[gl_WorkGroupSize.x + 1][gl_WorkGroupSize.y + 1] = { }; /* transformed pixels; each invocation writes [x + 1][y + 1] */
+
+ivec3 transform_sample(ivec3 pix, ivec2 rct_coef)
+{
+    /* Blue and red become differences against green; green absorbs a weighted mix. */
+    int db = pix.b - pix.g;
+    int dr = pix.r - pix.g;
+    int mixed = pix.g + ((dr*rct_coef.x + db*rct_coef.y) >> 2);
+
+    return ivec3(dr + rct_offset, mixed, db + rct_offset);
+}
+
+uint get_dist(ivec3 cur)
+{
+    /* This invocation's own slot is [x + 1][y + 1], so these are its three neighbours. */
+    uvec2 id = gl_LocalInvocationID.xy;
+    ivec3 left     = pix_buf[id.x + 0][id.y + 1];
+    ivec3 top_left = pix_buf[id.x + 0][id.y + 0];
+    ivec3 top      = pix_buf[id.x + 1][id.y + 0];
+
+    uvec3 err = abs(ivec3(predict(left.r, ivec2(top_left.r, top.r)),
+                          predict(left.g, ivec2(top_left.g, top.g)),
+                          predict(left.b, ivec2(top_left.b, top.b))) - cur);
+    return mid_pred(err.r, err.g, err.b);
+}
+
+shared uint score_cols[gl_WorkGroupSize.y] = { }; /* NOTE(review): not referenced anywhere in this shader file */
+shared uint score_mode[16] = { }; /* accumulated distance per candidate; NUM_CHECKS entries are used */
+
+/* Add this pixel's distance score to score_mode[] for every candidate coefficient pair. */
+void process(ivec2 pos)
+{
+    ivec3 pix = load_components(pos);
+    for (int i = 0; i < NUM_CHECKS; i++) {
+        ivec3 tx_pix = transform_sample(pix, rct_y_coeff[i]);
+        pix_buf[gl_LocalInvocationID.x + 1][gl_LocalInvocationID.y + 1] = tx_pix;
+        /* NOTE(review): orders memory only; neighbours read by get_dist() may be stale */
+        memoryBarrierShared();
+        uint dist = get_dist(tx_pix);
+        atomicAdd(score_mode[i], dist);
+    }
+}
+
+/* Score every candidate over this workgroup's slice and store the lowest-scoring one in sc. */
+void coeff_search(inout SliceContext sc)
+{
+    uvec2 img_size = imageSize(src[0]);
+    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
+                           gl_NumWorkGroups.x, 0);
+    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
+                           gl_NumWorkGroups.x, 0);
+    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
+                           gl_NumWorkGroups.y, 0);
+    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
+                           gl_NumWorkGroups.y, 0);
+
+    for (uint y = sys + gl_LocalInvocationID.y; y < sye; y += gl_WorkGroupSize.y)
+        for (uint x = sxs + gl_LocalInvocationID.x; x < sxe; x += gl_WorkGroupSize.x)
+            process(ivec2(x, y));
+    memoryBarrierShared();
+    barrier(); /* every invocation must finish accumulating before score_mode[] is reduced */
+    if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
+        uint min_score = 0xFFFFFFFF;
+        uint min_idx = 3;
+        for (int i = 0; i < NUM_CHECKS; i++) {
+            if (score_mode[i] < min_score) {
+                min_score = score_mode[i];
+                min_idx = i;
+            }
+        }
+        sc.slice_rct_coef = rct_y_coeff[min_idx];
+    }
+}
+
+void main(void)
+{
+    if (force_pcm == 1)
+        return; /* no search is run when PCM coding is forced */
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    coeff_search(slice_ctx[slice_idx]); /* each workgroup handles the slice at its own grid position */
+}
-- 
2.49.0.395.g12beb8f557c
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to