This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit d66552e676317308473ddb5049ad837d89e3742d
Author:     Lynne <[email protected]>
AuthorDate: Tue May 26 11:38:29 2026 +0900
Commit:     Lynne <[email protected]>
CommitDate: Sat May 30 12:10:01 2026 +0900

    vulkan/ffv1: add 32-bit float RGB encoding and a rice + remap path
    
    This implements 32-bit float RGB encoding and makes the Vulkan implementation
    on-par with the C implementation.
    
    Sponsored-by: Sovereign Tech Fund
---
 libavcodec/ffv1_vulkan.h                           |   1 +
 libavcodec/ffv1enc_vulkan.c                        | 153 +++++++++++++++++++--
 libavcodec/vulkan/Makefile                         |   4 +-
 libavcodec/vulkan/ffv1_common.glsl                 |   1 +
 libavcodec/vulkan/ffv1_enc.comp.glsl               |  27 +++-
 ...mp.glsl => ffv1_enc_rgb_float_golomb.comp.glsl} |   2 +
 libavcodec/vulkan/ffv1_enc_setup.comp.glsl         | 127 ++++++++++++++++-
 libavcodec/vulkan/ffv1_enc_sort32.comp.glsl        | 153 +++++++++++++++++++++
 8 files changed, 444 insertions(+), 24 deletions(-)

diff --git a/libavcodec/ffv1_vulkan.h b/libavcodec/ffv1_vulkan.h
index 9a206afaca..d6ae0f3fee 100644
--- a/libavcodec/ffv1_vulkan.h
+++ b/libavcodec/ffv1_vulkan.h
@@ -48,6 +48,7 @@ typedef struct FFv1ShaderParams {
     int sar[2];
     int pic_mode;
     uint32_t slice_size_max;
+    uint32_t max_pixels_per_slice;
 } FFv1ShaderParams;
 
 #endif /* AVCODEC_FFV1_VULKAN_H */
diff --git a/libavcodec/ffv1enc_vulkan.c b/libavcodec/ffv1enc_vulkan.c
index 92d46f7ddf..7c22ced785 100644
--- a/libavcodec/ffv1enc_vulkan.c
+++ b/libavcodec/ffv1enc_vulkan.c
@@ -72,6 +72,7 @@ typedef struct VulkanEncodeFFv1Context {
 
     FFVulkanShader rct_search;
     FFVulkanShader remap;
+    FFVulkanShader sort32;
     FFVulkanShader setup;
     FFVulkanShader reset;
     FFVulkanShader enc;
@@ -101,6 +102,8 @@ typedef struct VulkanEncodeFFv1Context {
     int optimize_rct;
 
     int is_rgb;
+    int is_float32;
+    uint32_t max_pixels_per_slice;
     int ppi;
     int chunks;
 } VulkanEncodeFFv1Context;
@@ -141,6 +144,12 @@ extern const unsigned int ff_ffv1_enc_remap_comp_spv_len;
 extern const unsigned char ff_ffv1_enc_rgb_float_comp_spv_data[];
 extern const unsigned int ff_ffv1_enc_rgb_float_comp_spv_len;
 
+extern const unsigned char ff_ffv1_enc_rgb_float_golomb_comp_spv_data[];
+extern const unsigned int ff_ffv1_enc_rgb_float_golomb_comp_spv_len;
+
+extern const unsigned char ff_ffv1_enc_sort32_comp_spv_data[];
+extern const unsigned int ff_ffv1_enc_sort32_comp_spv_len;
+
 static int run_rct_search(AVCodecContext *avctx, FFVkExecContext *exec,
                           AVFrame *enc_in, VkImageView *enc_in_views,
                           FFVkBuffer *slice_data_buf, uint32_t slice_data_size,
@@ -203,6 +212,37 @@ static int run_remap(AVCodecContext *avctx, FFVkExecContext *exec,
     return 0;
 }
 
+static int run_sort32(AVCodecContext *avctx, FFVkExecContext *exec,
+                      AVFrame *enc_in, VkImageView *enc_in_views,
+                      FFVkBuffer *units_buf, uint32_t units_size,
+                      FFv1ShaderParams *pd)
+{
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFV1Context *f = &fv->ctx;
+    FFVulkanFunctions *vk = &fv->s.vkfn;
+
+    /* Update descriptors */
+    ff_vk_shader_update_img_array(&fv->s, exec, &fv->sort32,
+                                  enc_in, enc_in_views,
+                                  1, 1,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&fv->s, exec, &fv->sort32,
+                                    1, 2, 0,
+                                    units_buf,
+                                    0, units_size*f->slice_count,
+                                    VK_FORMAT_UNDEFINED);
+
+    ff_vk_exec_bind_shader(&fv->s, exec, &fv->sort32);
+    ff_vk_shader_update_push_const(&fv->s, exec, &fv->sort32,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(FFv1ShaderParams), pd);
+
+    vk->CmdDispatch(exec->buf, fv->ctx.num_h_slices, fv->ctx.num_v_slices, 1);
+
+    return 0;
+}
+
 static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                                            FFVkExecContext *exec,
                                            const AVFrame *pict)
@@ -279,15 +319,19 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     slice_data_buf = (FFVkBuffer *)slice_data_ref->data;
 
     if (f->remap_mode) {
-        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fv->s.frames->sw_format);
-        remap_data_size = 4*(1 << desc->comp[0].depth)*sizeof(uint32_t);
+        if (fv->is_float32) {
+            /* Per (slice, plane): [units : max_pixels*2 uints] + [bitmap : max_pixels uints]. */
+            remap_data_size = 4*fv->max_pixels_per_slice*3*sizeof(uint32_t);
+        } else {
+            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fv->s.frames->sw_format);
+            remap_data_size = 4*(1 << desc->comp[0].depth)*sizeof(uint32_t);
+        }
 
         RET(ff_vk_get_pooled_buffer(&fv->s, &fv->remap_data_pool,
                                     &remap_data_ref,
                                     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                                     NULL, remap_data_size*f->slice_count,
                                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
-
         remap_data_buf = (FFVkBuffer *)remap_data_ref->data;
     }
 
@@ -348,6 +392,7 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
         .pic_mode = !(pict->flags & AV_FRAME_FLAG_INTERLACED) ? 3 :
                     !(pict->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 2 : 1,
         .slice_size_max = out_data_buf->size / f->slice_count,
+        .max_pixels_per_slice = fv->max_pixels_per_slice,
     };
 
     for (int i = 0; i < f->quant_table_count; i++) {
@@ -420,8 +465,13 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
     }
 
     if (f->remap_mode) {
-        RET(run_remap(avctx, exec, src, src_views,
-                      remap_data_buf, remap_data_size, &pd));
+        if (fv->is_float32) {
+            RET(run_sort32(avctx, exec, src, src_views,
+                           remap_data_buf, remap_data_size, &pd));
+        } else {
+            RET(run_remap(avctx, exec, src, src_views,
+                          remap_data_buf, remap_data_size, &pd));
+        }
 
         /* Make sure the writes are visible to the setup shader */
         ff_vk_buf_barrier(buf_bar[nb_buf_bar++], remap_data_buf,
@@ -519,6 +569,14 @@ static int vulkan_encode_ffv1_submit_frame(AVCodecContext *avctx,
                       COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
                       COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT,
                       0, slice_data_size*f->slice_count);
+
+    /* Setup writes the per-pixel compact_idx (or compact_idx-of-value)
+     * back into the remap buffer; the encode shader reads it. */
+    if (f->remap_mode)
+        ff_vk_buf_barrier(buf_bar[nb_buf_bar++], remap_data_buf,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, SHADER_WRITE_BIT,
+                          COMPUTE_SHADER_BIT, SHADER_READ_BIT, NONE_KHR,
+                          0, remap_data_size*f->slice_count);
     if (f->key_frame || fv->force_pcm)
         ff_vk_buf_barrier(buf_bar[nb_buf_bar++], slice_data_buf,
                           COMPUTE_SHADER_BIT, SHADER_WRITE_BIT, NONE_KHR,
@@ -906,6 +964,54 @@ fail:
     return err;
 }
 
+static int init_sort32_shader(AVCodecContext *avctx, VkSpecializationInfo *sl)
+{
+    int err;
+    VulkanEncodeFFv1Context *fv = avctx->priv_data;
+    FFVulkanShader *shd = &fv->sort32;
+
+    uint32_t wg_x = FFMIN(fv->max_pixels_per_slice, 256);
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
+                      (uint32_t []) { wg_x, 1, 1 }, 0);
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(FFv1ShaderParams),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    const FFVulkanDescriptorSetBinding desc_set_const[] = {
+        { /* rangecoder_buf */
+            .type   = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set_const, 1, 1, 0);
+
+    const FFVulkanDescriptorSetBinding desc_set[] = {
+        { /* slice_data_buf */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        { /* src */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = av_pix_fmt_count_planes(fv->s.frames->sw_format),
+        },
+        { /* units */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(&fv->s, shd, desc_set, 3, 0, 0);
+
+    RET(ff_vk_shader_link(&fv->s, shd,
+                          ff_ffv1_enc_sort32_comp_spv_data,
+                          ff_ffv1_enc_sort32_comp_spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(&fv->s, &fv->exec_pool, shd));
+
+fail:
+    return err;
+}
+
 static int init_remap_shader(AVCodecContext *avctx, VkSpecializationInfo *sl)
 {
     int err;
@@ -1105,9 +1211,14 @@ static int init_encode_shader(AVCodecContext *avctx, VkSpecializationInfo *sl)
                                     4 + fv->is_rgb + !!f->remap_mode, 0, 0);
 
     if (f->remap_mode) {
-        ff_vk_shader_link(&fv->s, shd,
-                          ff_ffv1_enc_rgb_float_comp_spv_data,
-                          ff_ffv1_enc_rgb_float_comp_spv_len, "main");
+        if (fv->ctx.ac == AC_GOLOMB_RICE)
+            ff_vk_shader_link(&fv->s, shd,
+                              ff_ffv1_enc_rgb_float_golomb_comp_spv_data,
+                              ff_ffv1_enc_rgb_float_golomb_comp_spv_len, "main");
+        else
+            ff_vk_shader_link(&fv->s, shd,
+                              ff_ffv1_enc_rgb_float_comp_spv_data,
+                              ff_ffv1_enc_rgb_float_comp_spv_len, "main");
     } else if (fv->ctx.ac == AC_GOLOMB_RICE) {
         if (fv->is_rgb)
             ff_vk_shader_link(&fv->s, shd,
@@ -1304,6 +1415,26 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
     fv->is_rgb = !(f->colorspace == 0 && avctx->sw_pix_fmt != AV_PIX_FMT_YA8) &&
                  !(avctx->sw_pix_fmt == AV_PIX_FMT_YA8);
 
+    fv->is_float32 = (avctx->sw_pix_fmt == AV_PIX_FMT_GBRPF32 ||
+                      avctx->sw_pix_fmt == AV_PIX_FMT_GBRAPF32);
+
+    if (fv->is_float32) {
+        /* Compute the worst-case slice geometry. With version >= 4 the slice
+         * boundaries are computed via slice_coord() which rounds up, so any
+         * single slice has at most ceil(width/num_h_slices) * ceil(height/num_v_slices)
+         * pixels. */
+        uint32_t mw = (avctx->width  + f->num_h_slices - 1) / f->num_h_slices;
+        uint32_t mh = (avctx->height + f->num_v_slices - 1) / f->num_v_slices;
+        /* Round up to next pow2 for bitonic sort */
+        uint32_t n = 1;
+        uint32_t pn = mw*mh;
+        while (n < pn)
+            n <<= 1;
+        if (n < 2)
+            n = 2;
+        fv->max_pixels_per_slice = n;
+    }
+
     /* Init rct search shader */
     fv->optimize_rct = fv->is_rgb && f->version >= 4 &&
                        !fv->force_pcm && fv->optimize_rct;
@@ -1325,7 +1456,10 @@ static av_cold int vulkan_encode_ffv1_init(AVCodecContext *avctx)
     }
 
     if (f->remap_mode) {
-        err = init_remap_shader(avctx, sl);
+        if (fv->is_float32)
+            err = init_sort32_shader(avctx, sl);
+        else
+            err = init_remap_shader(avctx, sl);
         if (err < 0)
             return err;
     }
@@ -1420,6 +1554,7 @@ static av_cold int vulkan_encode_ffv1_close(AVCodecContext *avctx)
     ff_vk_shader_free(&fv->s, &fv->reset);
     ff_vk_shader_free(&fv->s, &fv->setup);
     ff_vk_shader_free(&fv->s, &fv->remap);
+    ff_vk_shader_free(&fv->s, &fv->sort32);
     ff_vk_shader_free(&fv->s, &fv->rct_search);
 
     if (fv->exec_ctx_info) {
diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index c6817967c7..f86931727d 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -13,7 +13,9 @@ OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \
                                       vulkan/ffv1_enc_rgb_golomb.comp.spv.o \
                                       vulkan/ffv1_enc_rct_search.comp.spv.o \
                                       vulkan/ffv1_enc_remap.comp.spv.o \
-                                      vulkan/ffv1_enc_rgb_float.comp.spv.o
+                                      vulkan/ffv1_enc_rgb_float.comp.spv.o \
+                                      vulkan/ffv1_enc_rgb_float_golomb.comp.spv.o \
+                                      vulkan/ffv1_enc_sort32.comp.spv.o
 
 OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL) += vulkan/ffv1_dec_setup.comp.spv.o \
                                       vulkan/ffv1_dec_reset.comp.spv.o \
diff --git a/libavcodec/vulkan/ffv1_common.glsl b/libavcodec/vulkan/ffv1_common.glsl
index 8580a0777f..3d3b6753c6 100644
--- a/libavcodec/vulkan/ffv1_common.glsl
+++ b/libavcodec/vulkan/ffv1_common.glsl
@@ -75,6 +75,7 @@ layout (push_constant, scalar) uniform pushConstants {
     ivec2 sar;
     int pic_mode;
     uint slice_size_max;
+    uint max_pixels_per_slice;
 };
 
 #include "rangecoder.glsl"
diff --git a/libavcodec/vulkan/ffv1_enc.comp.glsl b/libavcodec/vulkan/ffv1_enc.comp.glsl
index 90ce8293b9..1c30e91828 100644
--- a/libavcodec/vulkan/ffv1_enc.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc.comp.glsl
@@ -40,8 +40,8 @@ layout (set = 1, binding = 1, scalar) writeonly buffer slice_results_buf {
  * denormals before we get to look at them. */
 layout (set = 1, binding = 3) uniform uimage2D src[];
 #ifdef FLOAT
-layout (set = 1, binding = 5) readonly buffer fltmap_buf {
-    uint fltmap[][4][65536];
+layout (set = 1, binding = 5, scalar) readonly buffer fltmap_buf {
+    uint fltmap[];
 };
 #endif
 
@@ -239,11 +239,24 @@ ivec4 load_components(uint slice_idx, in SliceContext sc, ivec2 pos)
 {
     ivec4 pix;
 #ifdef FLOAT
-    /* Source view is r16_uint so imageLoad returns the raw fp16 bit pattern
-     * in .x; no conversion is performed and denormals survive. */
-    for (int i = 0; i < color_planes; i++) {
-        uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu;
-        pix[i] = int(fltmap[slice_idx][i][iv]);
+    if (c_bits >= 32) {
+        /* 32-bit float: per-pixel-position bitmap lookup. The bitmap region
+         * follows the units region in the same buffer. */
+        ivec2 rel = pos - sc.slice_pos;
+        uint pixel_idx = uint(rel.x + sc.slice_dim.x*rel.y);
+        uint plane_stride = max_pixels_per_slice*3u;
+        for (int i = 0; i < color_planes; i++) {
+            uint base = (slice_idx*4u + uint(i))*plane_stride
+                        + max_pixels_per_slice*2u;
+            pix[i] = int(fltmap[base + pixel_idx]);
+        }
+    } else {
+        /* 16-bit float: value-indexed lookup. Source view is r16_uint so
+         * imageLoad returns the raw fp16 bit pattern in .x. */
+        for (int i = 0; i < color_planes; i++) {
+            uint iv = imageLoad(src[i], pos)[0] & 0xFFFFu;
+            pix[i] = int(fltmap[(slice_idx*4u + uint(i))*65536u + iv]);
+        }
     }
 #else
     pix = ivec4(imageLoad(src[0], pos));
diff --git a/libavcodec/vulkan/ffv1_enc_rgb_float.comp.glsl b/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl
similarity index 96%
copy from libavcodec/vulkan/ffv1_enc_rgb_float.comp.glsl
copy to libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl
index c66440601a..e4535eb08f 100644
--- a/libavcodec/vulkan/ffv1_enc_rgb_float.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_rgb_float_golomb.comp.glsl
@@ -26,6 +26,8 @@
 
 layout (set = 1, binding = 4) uniform uimage2D tmp;
 
+#define PB_UNALIGNED
+#define GOLOMB
 #define FLOAT
 #define RGB
 #include "ffv1_enc.comp.glsl"
diff --git a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
index 53a8d7f13f..f1db2aed8a 100644
--- a/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
+++ b/libavcodec/vulkan/ffv1_enc_setup.comp.glsl
@@ -23,13 +23,13 @@
 #pragma shader_stage(compute)
 #extension GL_GOOGLE_include_directive : require
 
-#define NB_CONTEXTS 2
+#define NB_CONTEXTS 6
 #define FULL_RENORM
 #include "common.glsl"
 #include "ffv1_common.glsl"
 
-layout (set = 1, binding = 1) buffer fltmap_buf {
-    uint fltmap[][4][65536];
+layout (set = 1, binding = 1, scalar) buffer fltmap_buf {
+    uint fltmap[];
 };
 
 void init_slice(inout SliceContext sc, uint slice_idx)
@@ -81,6 +81,7 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
     const int flip = (remap_mode == 2) ? 0x7FFF : 0;
 
     for (int p = 0; p < color_planes; p++) {
+        const uint base = (slice_idx*4u + uint(p))*65536u;
         uint j = 0;
         uint lu = 0;
         int run = 0;
@@ -90,15 +91,15 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
 
         put_usymbol(0, 0);
 
-        for (int i = 0; i < NB_CONTEXTS; i++)
+        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
             rc_state[i] = uint8_t(128);
 
         int cnt = 0;
         for (int i = 0; i < rct_offset; i++) {
             int ri = i ^ (((i & 0x8000) != 0) ? 0 : flip);
-            uint u = uint(fltmap[slice_idx][p][ri] != 0);
+            uint u = uint(fltmap[base + uint(ri)] != 0u);
 
-            fltmap[slice_idx][p][ri] = uint16_t(j);
+            fltmap[base + uint(ri)] = j;
             j += u;
 
             if (lu == u) {
@@ -117,6 +118,115 @@ void encode_histogram_remap(uint slice_idx, inout SliceContext sc)
     }
 }
 
+/* The 32-bit float remap uses 6 contexts: state[lu][category][bit] with
+ * lu = 0,1 and category = 0 (run/step-1), 1 (delta, unused here), 2 (mul). */
+#define CTX_F32(lu, cat) ((uint(lu)*3u + uint(cat))*CONTEXT_SIZE)
+
+void encode_float32_remap(uint slice_idx, inout SliceContext sc)
+{
+    const uint slice_w = uint(sc.slice_dim.x);
+    const uint slice_h = uint(sc.slice_dim.y);
+    const uint pixel_num = slice_w * slice_h;
+    const uint plane_stride = max_pixels_per_slice*3u;
+
+    for (int p = 0; p < color_planes; p++) {
+        /* Layout: per (slice, plane) we have units (max_pixels*8 bytes)
+         * followed by bitmap (max_pixels*4 bytes). The units region is
+         * read-only here, the bitmap region is written. */
+        const uint plane_base = (slice_idx*4u + uint(p))*plane_stride;
+        const uint bitmap_base = plane_base + max_pixels_per_slice*2u;
+
+        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
+            rc_state[i] = uint8_t(128);
+
+        put_usymbol(1, CTX_F32(0, 0));
+
+        for (int i = 0; i < NB_CONTEXTS*CONTEXT_SIZE; i++)
+            rc_state[i] = uint8_t(128);
+
+        /* last_val is the last unique value (or 0xFFFFFFFF as the "before
+         * any value" sentinel, this lets step = val - last_val give val+1
+         * for the first emission via unsigned wraparound). */
+        uint last_val = 0xFFFFFFFFu;
+        uint lu = 0;
+        uint run = 0;
+        int ci = -1;
+        bool emit_first_mul = true;
+
+        for (uint i = 0; i < pixel_num; i++) {
+            uint u_val = fltmap[plane_base + 2u*i + 0u];
+            uint u_ndx = fltmap[plane_base + 2u*i + 1u];
+
+            /* Duplicate of the previous unique value? Reuse ci. */
+            if (i > 0u && last_val == u_val) {
+                fltmap[bitmap_base + u_ndx] = uint(ci);
+                continue;
+            }
+
+            uint step = u_val - last_val;
+
+            if (lu == 0u) {
+                put_usymbol(step - 1u, CTX_F32(0, 0));
+
+                if (emit_first_mul) {
+                    put_usymbol(1, CTX_F32(0, 2));
+                    emit_first_mul = false;
+                }
+
+                last_val = u_val;
+                if (step == 1u) {
+                    lu = 1;
+                    run = 0;
+                }
+            } else {
+                if (step == 1u) {
+                    run++;
+                    last_val = u_val;
+                } else {
+                    if (run > 0u) {
+                        put_usymbol(run, CTX_F32(1, 0));
+                        put_usymbol(0, CTX_F32(1, 0));
+                        last_val += 2u;
+                    } else {
+                        put_usymbol(0, CTX_F32(1, 0));
+                        last_val += 1u;
+                    }
+                    lu = 0;
+                    run = 0;
+
+                    step = u_val - last_val;
+                    put_usymbol(step - 1u, CTX_F32(0, 0));
+
+                    last_val = u_val;
+                    if (step == 1u) {
+                        lu = 1;
+                        run = 0;
+                    }
+                }
+            }
+
+            ci++;
+            fltmap[bitmap_base + u_ndx] = uint(ci);
+        }
+
+        if (lu == 1u) {
+            if (run > 0u) {
+                put_usymbol(run, CTX_F32(1, 0));
+                put_usymbol(0, CTX_F32(1, 0));
+                last_val += 2u;
+            } else {
+                put_usymbol(0, CTX_F32(1, 0));
+                last_val += 1u;
+            }
+        }
+
+        if (last_val != 0xFFFFFFFFu)
+            put_usymbol(0xFFFFFFFFu - last_val, CTX_F32(0, 0));
+
+        sc.remap_count[p] = ci + 1;
+    }
+}
+
 void write_slice_header(uint slice_idx, inout SliceContext sc)
 {
     [[unroll]]
@@ -149,7 +259,10 @@ void write_slice_header(uint slice_idx, inout SliceContext sc)
 
         if (remap_mode != 0) {
             put_usymbol(remap_mode, 0);
-            encode_histogram_remap(slice_idx, sc);
+            if (c_bits >= 32)
+                encode_float32_remap(slice_idx, sc);
+            else
+                encode_histogram_remap(slice_idx, sc);
         }
     }
 }
diff --git a/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl b/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl
new file mode 100644
index 0000000000..4d40d94577
--- /dev/null
+++ b/libavcodec/vulkan/ffv1_enc_sort32.comp.glsl
@@ -0,0 +1,153 @@
+/*
+ * FFv1 codec
+ *
+ * Copyright (c) 2026 Lynne <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+#extension GL_GOOGLE_include_directive : require
+
+#define SB_QUALI readonly
+#include "common.glsl"
+#include "ffv1_common.glsl"
+
+layout (set = 1, binding = 1) uniform uimage2D src[];
+
+layout (set = 1, binding = 2, scalar) buffer fltmap_buf {
+    uint fltmap[];
+};
+
+/* The shared fltmap_buf is laid out per (slice, plane) as a
+ * max_pixels_per_slice*3 uint block, where the first
+ * max_pixels_per_slice*2 entries hold interleaved (val, ndx) pairs and
+ * the trailing [max_pixels_per_slice] entries are the bitmap region used
+ * by the setup/encode shaders. Padding past pixel_num is the sentinel
+ * (UINT32_MAX, UINT32_MAX) so it sorts at the end. */
+
+/* Per-workgroup bitonic-sort buffer. Limits a slice's pow2 size; large
+ * slices fall back to working in global memory */
+shared u32vec2 smem[8192];
+
+void main(void)
+{
+    const uint slice_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
+    uvec2 img_size = imageSize(src[0]);
+
+    uint sxs = slice_coord(img_size.x, gl_WorkGroupID.x + 0,
+                           gl_NumWorkGroups.x, 0);
+    uint sxe = slice_coord(img_size.x, gl_WorkGroupID.x + 1,
+                           gl_NumWorkGroups.x, 0);
+    uint sys = slice_coord(img_size.y, gl_WorkGroupID.y + 0,
+                           gl_NumWorkGroups.y, 0);
+    uint sye = slice_coord(img_size.y, gl_WorkGroupID.y + 1,
+                           gl_NumWorkGroups.y, 0);
+
+    uint slice_w = sxe - sxs;
+    uint slice_h = sye - sys;
+    uint pixel_num = slice_w * slice_h;
+
+    /* Round up to next pow2 for bitonic sort */
+    uint N = 1;
+    while (N < pixel_num)
+        N <<= 1;
+    N = max(N, 2);
+    if (N > max_pixels_per_slice)
+        N = max_pixels_per_slice;
+
+    const uint plane_stride = max_pixels_per_slice*3u;
+    const bool use_smem = N <= 8192u;
+
+    for (int p = 0; p < color_planes; p++) {
+        uint base = (slice_idx*4u + uint(p))*plane_stride;
+
+        /* Load pixels */
+        for (uint i = gl_LocalInvocationIndex; i < N;
+             i += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
+            uint v, ndx;
+            if (i < pixel_num) {
+                uint y = i / slice_w;
+                uint x = i - y*slice_w;
+                v = imageLoad(src[p], ivec2(sxs + x, sys + y))[0];
+                if (remap_mode == 2)
+                    v = ((v & 0x80000000u) != 0u) ? v : (v ^ 0x7FFFFFFFu);
+                ndx = i;
+            } else {
+                v = 0xFFFFFFFFu;
+                ndx = 0xFFFFFFFFu;
+            }
+            if (use_smem) {
+                smem[i] = u32vec2(v, ndx);
+            } else {
+                fltmap[base + 2u*i + 0u] = v;
+                fltmap[base + 2u*i + 1u] = ndx;
+            }
+        }
+        barrier();
+        if (!use_smem) memoryBarrierBuffer();
+
+        /* Bitonic sort of the (val, ndx) pairs. */
+        for (uint k = 2; k <= N; k <<= 1) {
+            for (uint j = k >> 1; j > 0; j >>= 1) {
+                for (uint i = gl_LocalInvocationIndex; i < N;
+                     i += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
+                    uint partner = i ^ j;
+                    if (partner > i) {
+                        bool ascending = (i & k) == 0;
+                        u32vec2 a, b;
+                        if (use_smem) {
+                            a = smem[i];
+                            b = smem[partner];
+                        } else {
+                            a = u32vec2(fltmap[base + 2u*i + 0u],
+                                        fltmap[base + 2u*i + 1u]);
+                            b = u32vec2(fltmap[base + 2u*partner + 0u],
+                                        fltmap[base + 2u*partner + 1u]);
+                        }
+                        bool a_gt_b = (a.x > b.x) ||
+                                      (a.x == b.x && a.y > b.y);
+                        if (a_gt_b == ascending) {
+                            if (use_smem) {
+                                smem[i] = b;
+                                smem[partner] = a;
+                            } else {
+                                fltmap[base + 2u*i + 0u] = b.x;
+                                fltmap[base + 2u*i + 1u] = b.y;
+                                fltmap[base + 2u*partner + 0u] = a.x;
+                                fltmap[base + 2u*partner + 1u] = a.y;
+                            }
+                        }
+                    }
+                }
+                barrier();
+                if (!use_smem) memoryBarrierBuffer();
+            }
+        }
+
+        /* Write sorted pairs back to global */
+        if (use_smem) {
+            for (uint i = gl_LocalInvocationIndex; i < N;
+                 i += gl_WorkGroupSize.x * gl_WorkGroupSize.y) {
+                u32vec2 u = smem[i];
+                fltmap[base + 2u*i + 0u] = u.x;
+                fltmap[base + 2u*i + 1u] = u.y;
+            }
+            barrier();
+        }
+    }
+}

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to