This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 2e25da3121ce5942847bc23bf8c950fc75cbb4b1
Author:     Lynne <[email protected]>
AuthorDate: Thu May 14 17:09:50 2026 +0900
Commit:     Lynne <[email protected]>
CommitDate: Fri May 22 14:06:01 2026 +0900

    vf_nlmeans_vulkan: port to compile-time SPIR-V generation
---
 configure                                          |   2 +-
 libavfilter/vf_nlmeans_vulkan.c                    | 552 ++++-----------------
 libavfilter/vulkan/Makefile                        |   4 +
 ...avgblur.comp.glsl => nlmeans_denoise.comp.glsl} |  65 ++-
 libavfilter/vulkan/nlmeans_horizontal.comp.glsl    | 104 ++++
 libavfilter/vulkan/nlmeans_vertical.comp.glsl      | 122 +++++
 libavfilter/vulkan/nlmeans_weights.comp.glsl       | 144 ++++++
 7 files changed, 518 insertions(+), 475 deletions(-)

diff --git a/configure b/configure
index 3ba8723856..18fe7271aa 100755
--- a/configure
+++ b/configure
@@ -4227,7 +4227,7 @@ mptestsrc_filter_deps="gpl"
 msad_filter_select="scene_sad"
 negate_filter_deps="lut_filter"
 nlmeans_opencl_filter_deps="opencl"
-nlmeans_vulkan_filter_deps="vulkan spirv_library"
+nlmeans_vulkan_filter_deps="vulkan spirv_compiler"
 nnedi_filter_deps="gpl"
 ocr_filter_deps="libtesseract"
 ocv_filter_deps="libopencv"
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index c1430707b7..902c072669 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -19,19 +19,24 @@
  */
 
 #include "libavutil/mem.h"
-#include "libavutil/random_seed.h"
-#include "libavutil/vulkan_spirv.h"
 #include "libavutil/opt.h"
 #include "vulkan_filter.h"
 
 #include "filters.h"
 #include "video.h"
 
-#define TYPE_NAME  "vec4"
+extern const unsigned char ff_nlmeans_horizontal_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_horizontal_comp_spv_len;
+extern const unsigned char ff_nlmeans_vertical_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_vertical_comp_spv_len;
+extern const unsigned char ff_nlmeans_weights_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_weights_comp_spv_len;
+extern const unsigned char ff_nlmeans_denoise_comp_spv_data[];
+extern const unsigned int  ff_nlmeans_denoise_comp_spv_len;
+
+/* Must be kept in sync with the definitions in the nlmeans_* shaders */
 #define TYPE_ELEMS 4
 #define TYPE_SIZE  (TYPE_ELEMS*4)
-#define TYPE_BLOCK_ELEMS 16
-#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
 #define WG_SIZE 32
 
 typedef struct NLMeansVulkanContext {
@@ -80,210 +85,60 @@ typedef struct IntegralPushData {
     uint32_t nb_components;
 } IntegralPushData;
 
-static void shared_shd_def(FFVulkanShader *shd) {
-    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require                     
);
-    GLSLC(0,                                                                  
);
-    GLSLF(0, #define DTYPE %s                                                 
,TYPE_NAME);
-    GLSLF(0, #define T_ALIGN %i                                               
,TYPE_SIZE);
-    GLSLF(0, #define T_BLOCK_ELEMS %i                                         
,TYPE_BLOCK_ELEMS);
-    GLSLF(0, #define T_BLOCK_ALIGN %i                                         
,TYPE_BLOCK_SIZE);
-    GLSLC(0,                                                                  
);
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer 
DataBuffer {  );
-    GLSLC(1,     DTYPE v[];                                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, struct Block {                                                   
);
-    GLSLC(1,     DTYPE data[T_BLOCK_ELEMS];                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_BLOCK_ALIGN) 
buffer BlockBuffer {  );
-    GLSLC(1,     Block v[];                                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {            
);
-    GLSLC(1,     uvec4 width;                                                 
);
-    GLSLC(1,     uvec4 height;                                                
);
-    GLSLC(1,     vec4 strength;                                               
);
-    GLSLC(1,     uvec4 comp_off;                                              
);
-    GLSLC(1,     uvec4 comp_plane;                                            
);
-    GLSLC(1,     DataBuffer integral_base;                                    
);
-    GLSLC(1,     uint64_t integral_size;                                      
);
-    GLSLC(1,     uint64_t int_stride;                                         
);
-    GLSLC(1,     uint xyoffs_start;                                           
);
-    GLSLC(1,     uint nb_components;                                          
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0,                                                                  
);
-
-    ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
-                                VK_SHADER_STAGE_COMPUTE_BIT);
-}
-
 static av_cold int init_integral_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
                                           FFVulkanShader *shd_horizontal,
                                           FFVulkanShader *shd_vertical,
-                                          FFVkSPIRVCompiler *spv,
-                                          const AVPixFmtDescriptor *desc, int planes)
+                                          int planes)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
     FFVulkanShader *shd;
-    FFVulkanDescriptorSetBinding *desc_set;
 
+    /* Horizontal pass */
     shd = shd_horizontal;
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_horizontal",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, 1, 1,
-                          0));
-    shared_shd_def(shd);
-
-    GLSLC(0,                                                                   
  );
-    GLSLC(0, void main()                                                       
  );
-    GLSLC(0, {                                                                 
  );
-    GLSLC(1,     uint64_t offset;                                              
  );
-    GLSLC(1,     DataBuffer dst;                                               
  );
-    GLSLC(1,     BlockBuffer b_dst;                                            
  );
-    GLSLC(1,     Block block;                                                  
  );
-    GLSLC(1,     DTYPE s2;                                                     
  );
-    GLSLC(1,     DTYPE prefix_sum;                                             
  );
-    GLSLC(1,     ivec2 pos;                                                    
  );
-    GLSLC(1,     int k;                                                        
  );
-    GLSLC(1,     int o;                                                        
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DataBuffer integral_data;                                     
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint c_plane;                                                 
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint comp_idx = uint(gl_WorkGroupID.y);                       
  );
-    GLSLC(1,     uint invoc_idx = uint(gl_WorkGroupID.z);                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     if (strength[comp_idx] == 0.0)                                
  );
-    GLSLC(2,         return;                                                   
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     offset = integral_size * (invoc_idx * nb_components + 
comp_idx); );
-    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset); 
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     c_plane = comp_plane[comp_idx];                               
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     pos.y = int(gl_GlobalInvocationID.x);                         
  );
-    GLSLC(1,     if (pos.y < height[c_plane]) {                                
  );
-    GLSLC(2,         prefix_sum = DTYPE(0);                                    
  );
-    GLSLC(2,         offset = int_stride * uint64_t(pos.y);                    
  );
-    GLSLC(2,         b_dst = BlockBuffer(uint64_t(integral_data) + offset);    
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(2,         for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) {    
  );
-    GLSLC(3,             block = b_dst.v[k];                                   
  );
-    GLSLC(3,             for (o = 0; o < T_BLOCK_ELEMS; o++) {                 
  );
-    GLSLC(4,                 s2 = block.data[o];                               
  );
-    GLSLC(4,                 block.data[o] = s2 + prefix_sum;                  
  );
-    GLSLC(4,                 prefix_sum += s2;                                 
  );
-    GLSLC(3,             }                                                     
  );
-    GLSLC(3,             b_dst.v[k] = block;                                   
  );
-    GLSLC(2,         }                                                         
  );
-    GLSLC(1,     }                                                             
  );
-    GLSLC(0, }                                                                 
  );
-
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, 1, 1 }, 0);
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_horizontal_comp_spv_data,
+                          ff_nlmeans_horizontal_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
+    /* Vertical pass */
     shd = shd_vertical;
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_vertical",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, 1, 1,
-                          0));
-    shared_shd_def(shd);
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name       = "input_img",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, FF_VK_REP_FLOAT),
-            .mem_quali  = "readonly",
-            .dimensions = 2,
-            .elems      = planes,
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, 1, 1 }, 0);
+
+    ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    const FFVulkanDescriptorSetBinding desc_set_img[] = {
+        { /* input_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 0, 0));
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "xyoffsets_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "ivec2 xyoffsets[];",
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_img, 1, 0, 0);
+
+    const FFVulkanDescriptorSetBinding desc_set_xyoffsets[] = {
+        { /* xyoffsets_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
-
-    GLSLC(0,                                                                   
  );
-    GLSLC(0, void main()                                                       
  );
-    GLSLC(0, {                                                                 
  );
-    GLSLC(1,     uint64_t offset;                                              
  );
-    GLSLC(1,     DataBuffer dst;                                               
  );
-    GLSLC(1,     float s1;                                                     
  );
-    GLSLC(1,     DTYPE s2;                                                     
  );
-    GLSLC(1,     DTYPE prefix_sum;                                             
  );
-    GLSLC(1,     uvec2 size;                                                   
  );
-    GLSLC(1,     ivec2 pos;                                                    
  );
-    GLSLC(1,     ivec2 pos_off;                                                
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DataBuffer integral_data;                                     
  );
-    GLSLF(1,     ivec2 offs[%i];                                               
  ,TYPE_ELEMS);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint c_off;                                                   
  );
-    GLSLC(1,     uint c_plane;                                                 
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint comp_idx = uint(gl_WorkGroupID.y);                       
  );
-    GLSLC(1,     uint invoc_idx = uint(gl_WorkGroupID.z);                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     if (strength[comp_idx] == 0.0)                                
  );
-    GLSLC(2,         return;                                                   
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     offset = integral_size * (invoc_idx * nb_components + 
comp_idx); );
-    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset); 
  );
-    for (int i = 0; i < TYPE_ELEMS; i++)
-        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];       
  ,i,TYPE_ELEMS,i);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     c_off = comp_off[comp_idx];                                   
  );
-    GLSLC(1,     c_plane = comp_plane[comp_idx];                               
  );
-    GLSLC(1,     size = imageSize(input_img[c_plane]);                         
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     pos.x = int(gl_GlobalInvocationID.x);                         
  );
-    GLSLC(1,     if (pos.x < width[c_plane]) {                                 
  );
-    GLSLC(2,         prefix_sum = DTYPE(0);                                    
  );
-    GLSLC(2,         for (pos.y = 0; pos.y < height[c_plane]; pos.y++) {       
  );
-    GLSLC(3,             offset = int_stride * uint64_t(pos.y);                
  );
-    GLSLC(3,             dst = DataBuffer(uint64_t(integral_data) + offset);   
  );
-    GLSLC(4,             s1 = imageLoad(input_img[c_plane], pos)[c_off];       
  );
-    for (int i = 0; i < TYPE_ELEMS; i++) {
-        GLSLF(4,         pos_off = pos + offs[%i];                             
  ,i);
-        GLSLC(4,         if (!IS_WITHIN(uvec2(pos_off), size))                 
  );
-        GLSLF(5,             s2[%i] = s1;                                      
  ,i);
-        GLSLC(4,         else                                                  
  );
-        GLSLF(5,             s2[%i] = imageLoad(input_img[c_plane], 
pos_off)[c_off]; ,i);
-    }
-    GLSLC(4,             s2 = (s1 - s2) * (s1 - s2);                           
  );
-    GLSLC(3,             dst.v[pos.x] = s2 + prefix_sum;                       
  );
-    GLSLC(3,             prefix_sum += s2;                                     
  );
-    GLSLC(2,         }                                                         
  );
-    GLSLC(1,     }                                                             
  );
-    GLSLC(0, }                                                                 
  );
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_xyoffsets, 1, 1, 0);
 
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_vertical_comp_spv_data,
+                          ff_nlmeans_vertical_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
     return err;
 }
 
@@ -305,172 +160,48 @@ typedef struct WeightsPushData {
 } WeightsPushData;
 
 static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
-                                         FFVulkanShader *shd,
-                                         FFVkSPIRVCompiler *spv,
-                                         const AVPixFmtDescriptor *desc,
-                                         int planes)
+                                         FFVulkanShader *shd, int planes)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
-    FFVulkanDescriptorSetBinding *desc_set;
-
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, WG_SIZE, 1,
-                          0));
-
-    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require                     
);
-    GLSLC(0,                                                                  
);
-    GLSLF(0, #define DTYPE %s                                                 
,TYPE_NAME);
-    GLSLF(0, #define T_ALIGN %i                                               
,TYPE_SIZE);
-    GLSLC(0,                                                                  
);
-    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer 
DataBuffer {  );
-    GLSLC(1,     DTYPE v[];                                                   
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {            
);
-    GLSLC(1,     uvec4 width;                                                 
);
-    GLSLC(1,     uvec4 height;                                                
);
-    GLSLC(1,     uvec4 ws_offset;                                             
);
-    GLSLC(1,     uvec4 ws_stride;                                             
);
-    GLSLC(1,     ivec4 patch_size;                                            
);
-    GLSLC(1,     vec4 strength;                                               
);
-    GLSLC(1,     uvec4 comp_off;                                              
);
-    GLSLC(1,     uvec4 comp_plane;                                            
);
-    GLSLC(1,     DataBuffer integral_base;                                    
);
-    GLSLC(1,     uint64_t integral_size;                                      
);
-    GLSLC(1,     uint64_t int_stride;                                         
);
-    GLSLC(1,     uint xyoffs_start;                                           
);
-    GLSLC(1,     uint ws_count;                                               
);
-    GLSLC(1,     uint nb_components;                                          
);
-    GLSLC(0, };                                                               
);
-    GLSLC(0,                                                                  
);
+
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, WG_SIZE, 1 }, 0);
 
     ff_vk_shader_add_push_const(shd, 0, sizeof(WeightsPushData),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name       = "input_img",
-            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, FF_VK_REP_FLOAT),
-            .mem_quali  = "readonly",
-            .dimensions = 2,
-            .elems      = planes,
-            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+    const FFVulkanDescriptorSetBinding desc_set[] = {
+        { /* input_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
-        {
-            .name        = "weights_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float weights[];",
+        { /* weights_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
-        {
-            .name        = "sums_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float sums[];",
+        { /* sums_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0));
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "xyoffsets_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "ivec2 xyoffsets[];",
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0);
+
+    const FFVulkanDescriptorSetBinding desc_set_xyoffsets[] = {
+        { /* xyoffsets_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
-
-    GLSLC(0,                                                                   
  );
-    GLSLC(0, void main()                                                       
  );
-    GLSLC(0, {                                                                 
  );
-    GLSLC(1,     uint64_t offset;                                              
  );
-    GLSLC(1,     DataBuffer dst;                                               
  );
-    GLSLC(1,     uvec2 size;                                                   
  );
-    GLSLC(1,     ivec2 pos;                                                    
  );
-    GLSLC(1,     ivec2 pos_off;                                                
  );
-    GLSLC(1,     int p;                                                        
  );
-    GLSLC(1,     float s;                                                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DataBuffer integral_data;                                     
  );
-    GLSLF(1,     ivec2 offs[%i];                                               
  ,TYPE_ELEMS);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     uint c_off;                                                   
  );
-    GLSLC(1,     uint c_plane;                                                 
  );
-    GLSLC(1,     uint ws_off;                                                  
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     pos = ivec2(gl_GlobalInvocationID.xy);                        
  );
-    GLSLC(1,     uint comp_idx = uint(gl_WorkGroupID.z) %% nb_components;      
  );
-    GLSLC(1,     uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components;      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     c_off = comp_off[comp_idx];                                   
  );
-    GLSLC(1,     c_plane = comp_plane[comp_idx];                               
  );
-    GLSLC(1,     p = patch_size[comp_idx];                                     
  );
-    GLSLC(1,     s = strength[comp_idx];                                       
  );
-    GLSLC(1,     if (s == 0.0 || pos.x < p || pos.y < p || pos.x >= 
width[c_plane] - p || pos.y >= height[c_plane] - p) );
-    GLSLC(2,         return;                                                   
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     offset = integral_size * (invoc_idx * nb_components + 
comp_idx); );
-    GLSLC(1,     integral_data = DataBuffer(uint64_t(integral_base) + offset); 
  );
-    for (int i = 0; i < TYPE_ELEMS; i++)
-        GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i];       
  ,i,TYPE_ELEMS,i);
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * 
ws_stride[comp_idx] + pos.x; );
-    GLSLC(1,     size = imageSize(input_img[c_plane]);                         
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DTYPE a;                                                      
  );
-    GLSLC(1,     DTYPE b;                                                      
  );
-    GLSLC(1,     DTYPE c;                                                      
  );
-    GLSLC(1,     DTYPE d;                                                      
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,     DTYPE patch_diff;                                             
  );
-    GLSLC(1,     vec4 src;                                                     
  );
-    GLSLC(1,     vec4 w;                                                       
  );
-    GLSLC(1,     float w_sum;                                                  
  );
-    GLSLC(1,     float sum;                                                    
  );
-    GLSLC(0,                                                                   
  );
-    for (int i = 0; i < 4; i++) {
-        GLSLF(1,     pos_off = pos + offs[%i];                                 
  ,i);
-        GLSLC(1,     if (!IS_WITHIN(uvec2(pos_off), size))                     
  );
-        GLSLF(2,         src[%i] = imageLoad(input_img[c_plane], pos)[c_off];  
  ,i);
-        GLSLC(1,     else                                                      
  );
-        GLSLF(2,         src[%i] = imageLoad(input_img[c_plane], 
pos_off)[c_off]; ,i);
-    }
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,         offset = int_stride * uint64_t(pos.y - p);                
  );
-    GLSLC(1,         dst = DataBuffer(uint64_t(integral_data) + offset);       
  );
-    GLSLC(1,         a = dst.v[pos.x - p];                                     
  );
-    GLSLC(1,         c = dst.v[pos.x + p];                                     
  );
-    GLSLC(1,         offset = int_stride * uint64_t(pos.y + p);                
  );
-    GLSLC(1,         dst = DataBuffer(uint64_t(integral_data) + offset);       
  );
-    GLSLC(1,         b = dst.v[pos.x - p];                                     
  );
-    GLSLC(1,         d = dst.v[pos.x + p];                                     
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,         patch_diff = d + a - b - c;                               
  );
-    GLSLC(1,         w = exp(patch_diff * s);                                  
  );
-    GLSLC(1,         w_sum = w[0] + w[1] + w[2] + w[3];                        
  );
-    GLSLC(1,         sum = dot(w, src * 255);                                  
  );
-    GLSLC(0,                                                                   
  );
-    GLSLC(1,         weights[ws_off] += w_sum;                                 
  );
-    GLSLC(1,         sums[ws_off] += sum;                                      
  );
-    GLSLC(0, }                                                                 
  );
-
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_xyoffsets, 1, 1, 0);
+
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_weights_comp_spv_data,
+                          ff_nlmeans_weights_comp_spv_len, "main"));
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
     return err;
 }
 
@@ -485,121 +216,49 @@ typedef struct DenoisePushData {
 } DenoisePushData;
 
 static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool 
*exec,
-                                         FFVulkanShader *shd, 
FFVkSPIRVCompiler *spv,
-                                         const AVPixFmtDescriptor *desc, int 
planes)
+                                         FFVulkanShader *shd, int planes)
 {
     int err;
-    uint8_t *spv_data;
-    size_t spv_len;
-    void *spv_opaque = NULL;
-    FFVulkanDescriptorSetBinding *desc_set;
-    RET(ff_vk_shader_init(vkctx, shd, "nlmeans_denoise",
-                          VK_SHADER_STAGE_COMPUTE_BIT,
-                          (const char *[]) { "GL_EXT_buffer_reference",
-                                             "GL_EXT_buffer_reference2" }, 2,
-                          WG_SIZE, WG_SIZE, 1,
-                          0));
-
-    GLSLC(0, layout(push_constant, std430) uniform pushConstants {        );
-    GLSLC(1,    uvec4 comp_off;                                           );
-    GLSLC(1,    uvec4 comp_plane;                                         );
-    GLSLC(1,    uvec4 ws_offset;                                          );
-    GLSLC(1,    uvec4 ws_stride;                                          );
-    GLSLC(1,    uint32_t ws_count;                                        );
-    GLSLC(1,    uint32_t t;                                               );
-    GLSLC(1,    uint32_t nb_components;                                   );
-    GLSLC(0, };                                                           );
+    /* Denoise stage setup: shader, push constants, two descriptor sets, link, register. */
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { WG_SIZE, WG_SIZE, 1 }, 0); /* presumably the workgroup size - TODO confirm */
 
     ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData),
                                 VK_SHADER_STAGE_COMPUTE_BIT);
 
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "input_img",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout  = ff_vk_shader_rep_fmt(vkctx->input_format, 
FF_VK_REP_FLOAT),
-            .mem_quali   = "readonly",
-            .dimensions  = 2,
-            .elems       = planes,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+    const FFVulkanDescriptorSetBinding desc_set_img[] = { /* first set: one storage image per plane, input then output */
+        { /* input_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
-        {
-            .name        = "output_img",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
-            .mem_layout  = ff_vk_shader_rep_fmt(vkctx->output_format, 
FF_VK_REP_FLOAT),
-            .mem_quali   = "writeonly",
-            .dimensions  = 2,
-            .elems       = planes,
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        { /* output_img */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems  = planes,
         },
     };
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
-
-    desc_set = (FFVulkanDescriptorSetBinding []) {
-        {
-            .name        = "weights_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float weights[];",
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_img, 2, 0, 0); /* NOTE(review): result unchecked; the removed code used RET() here - confirm this cannot fail */
+    /* Second set: two storage buffers, weights then sums. */
+    const FFVulkanDescriptorSetBinding desc_set_ws[] = {
+        { /* weights_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
-        {
-            .name        = "sums_buffer",
-            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .mem_quali   = "readonly",
-            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
-            .buf_content = "float sums[];",
+        { /* sums_buffer */
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
         },
     };
+    ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_ws, 2, 0, 0); /* NOTE(review): result unchecked; the removed code used RET() here - confirm this cannot fail */
 
-    RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
-
-    GLSLC(0, void main()                                                      
);
-    GLSLC(0, {                                                                
);
-    GLSLC(1,     const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);           
);
-    GLSLC(1,     const uint plane = uint(gl_WorkGroupID.z);                   
);
-    GLSLC(1,     const uvec2 size = imageSize(output_img[plane]);             
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     uint c_off;                                                  
);
-    GLSLC(1,     uint c_plane;                                                
);
-    GLSLC(1,     uint ws_off;                                                 
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     float w_sum;                                                 
);
-    GLSLC(1,     float sum;                                                   
);
-    GLSLC(1,     vec4 src;                                                    
);
-    GLSLC(1,     vec4 r;                                                      
);
-    GLSLC(1,     uint invoc_idx;                                              
);
-    GLSLC(1,     uint comp_idx;                                               
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     if (!IS_WITHIN(pos, size))                                   
);
-    GLSLC(2,         return;                                                  
);
-    GLSLC(0,                                                                  
);
-    GLSLC(1,     src = imageLoad(input_img[plane], pos);                      
);
-    GLSLC(1,     for (comp_idx = 0; comp_idx < nb_components; comp_idx++) {   
);
-    GLSLC(2,         if (plane == comp_plane[comp_idx]) {                     
);
-    GLSLC(3,             w_sum = 0.0;                                         
);
-    GLSLC(3,             sum = 0.0;                                           
);
-    GLSLC(3,             for (invoc_idx = 0; invoc_idx < t; invoc_idx++) {    
);
-    GLSLC(4,                 ws_off = ws_count * invoc_idx + 
ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
-    GLSLC(4,                 w_sum += weights[ws_off];                        
);
-    GLSLC(4,                 sum += sums[ws_off];                             
);
-    GLSLC(3,             }                                                    
);
-    GLSLC(3,             c_off = comp_off[comp_idx];                          
);
-    GLSLC(3,             r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 
255; );
-    GLSLC(2,         }                                                        
);
-    GLSLC(1,     }                                                            
);
-    GLSLC(1,     imageStore(output_img[plane], pos, r);                       
);
-    GLSLC(0, }                                                                
);
-
-    RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", 
&spv_opaque));
-    RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+    RET(ff_vk_shader_link(vkctx, shd,
+                          ff_nlmeans_denoise_comp_spv_data,
+                          ff_nlmeans_denoise_comp_spv_len, "main")); /* SPIR-V blob and length come from external symbols */
 
     RET(ff_vk_shader_register_exec(vkctx, exec, shd));
 
 fail:
-    if (spv_opaque)
-        spv->free_shader(spv, &spv_opaque);
-
     return err;
 }
 
@@ -610,15 +269,9 @@ static av_cold int init_filter(AVFilterContext *ctx)
     NLMeansVulkanContext *s = ctx->priv;
     FFVulkanContext *vkctx = &s->vkctx;
     const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
-    FFVkSPIRVCompiler *spv = NULL;
     int *offsets_buf;
     int offsets_dispatched = 0, nb_dispatches = 0;
 
-    const AVPixFmtDescriptor *desc;
-    desc = av_pix_fmt_desc_get(vkctx->output_format);
-    if (!desc)
-        return AVERROR(EINVAL);
-
     if (!(s->opts.r & 1)) {
         s->opts.r |= 1;
         av_log(ctx, AV_LOG_WARNING, "Research size should be odd, setting to 
%i",
@@ -682,12 +335,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
 
     s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / 
TYPE_ELEMS));
 
-    spv = ff_vk_spirv_init();
-    if (!spv) {
-        av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
-        return AVERROR_EXTERNAL;
-    }
-
     s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
     if (!s->qf) {
         av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n");
@@ -698,11 +345,11 @@ static av_cold int init_filter(AVFilterContext *ctx)
     RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, 1, 0, 0, 0, NULL));
 
     RET(init_integral_pipeline(vkctx, &s->e, &s->shd_horizontal, 
&s->shd_vertical,
-                               spv, desc, planes));
+                               planes));
 
-    RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, spv, desc, 
planes));
+    RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, planes));
 
-    RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, spv, desc, 
planes));
+    RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, planes));
 
     RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0], 
&s->shd_vertical,
                                         1, 0, 0,
@@ -726,9 +373,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
     s->initialized = 1;
 
 fail:
-    if (spv)
-        spv->uninit(&spv);
-
     return err;
 }
 
diff --git a/libavfilter/vulkan/Makefile b/libavfilter/vulkan/Makefile
index 6d25cf8a50..cd303e535e 100644
--- a/libavfilter/vulkan/Makefile
+++ b/libavfilter/vulkan/Makefile
@@ -15,3 +15,7 @@ OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += 
vulkan/transpose.comp.spv.o
 OBJS-$(CONFIG_V360_VULKAN_FILTER) += vulkan/v360.comp.spv.o
 OBJS-$(CONFIG_INTERLACE_VULKAN_FILTER) += vulkan/interlace.comp.spv.o
 OBJS-$(CONFIG_XFADE_VULKAN_FILTER) += vulkan/xfade.comp.spv.o
+OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER) += vulkan/nlmeans_horizontal.comp.spv.o \
+                                        vulkan/nlmeans_vertical.comp.spv.o \
+                                        vulkan/nlmeans_weights.comp.spv.o \
+                                        vulkan/nlmeans_denoise.comp.spv.o
diff --git a/libavfilter/vulkan/avgblur.comp.glsl 
b/libavfilter/vulkan/nlmeans_denoise.comp.glsl
similarity index 51%
copy from libavfilter/vulkan/avgblur.comp.glsl
copy to libavfilter/vulkan/nlmeans_denoise.comp.glsl
index 4cfd4f433d..974c09318f 100644
--- a/libavfilter/vulkan/avgblur.comp.glsl
+++ b/libavfilter/vulkan/nlmeans_denoise.comp.glsl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2026 Lynne <[email protected]>
+ * Copyright (c) Lynne
  *
  * This file is part of FFmpeg.
  *
@@ -23,39 +23,64 @@
 #extension GL_EXT_shader_image_load_formatted : require
 #extension GL_EXT_scalar_block_layout : require
 #extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
 
 layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
 
+layout (push_constant, scalar) uniform pushConstants { /* presumably mirrors DenoisePushData on the host - TODO confirm */
+    uvec4 comp_off;         /* texel lane written for each component */
+    uvec4 comp_plane;       /* plane index compared against the current plane */
+    uvec4 ws_offset;        /* per-component start index into weights[]/sums[] */
+    uvec4 ws_stride;        /* per-component row stride, counted in floats */
+    uint32_t ws_count;      /* floats between consecutive invocation slots */
+    uint32_t t;             /* number of invocation slots summed in main() */
+    uint32_t nb_components; /* how many entries of the uvec4 tables are used */
+};
+
 layout (set = 0, binding = 0) uniform readonly  image2D input_img[];
 layout (set = 0, binding = 1) uniform writeonly image2D output_img[];
 
-layout (push_constant, scalar) uniform pushConstants {
-    vec4 filter_norm;
-    ivec2 filter_len;
-    uint planes;
+layout (set = 1, binding = 0, scalar) readonly buffer weights_buffer { /* accumulated weights, read-only in this shader */
+    float weights[];
+};
+
+layout (set = 1, binding = 1, scalar) readonly buffer sums_buffer { /* accumulated weighted samples, read-only in this shader */
+    float sums[];
 };
 
 void main()
 {
     const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
+    const uint plane = uint(gl_WorkGroupID.z);       /* one workgroup layer in z per plane */
+    const ivec2 size = imageSize(output_img[plane]);
+    /* Scratch variables for the per-component normalisation below. */
+    uint c_off;
+    uint c_plane; /* declared but not used in this function */
+    uint ws_off;
+
+    float w_sum;
+    float sum;
+    vec4 src;
+    vec4 r; /* NOTE(review): no initialiser; lanes not assigned in the loop below are stored as they are */
+    uint invoc_idx;
+    uint comp_idx;
 
-    ivec2 size = imageSize(output_img[nonuniformEXT(gl_LocalInvocationID.z)]);
     if (any(greaterThanEqual(pos, size)))
         return;
 
-    if ((planes & (1 << gl_LocalInvocationID.z)) == 0) {
-        imageStore(output_img[gl_LocalInvocationID.z], pos,
-                   imageLoad(input_img[nonuniformEXT(gl_LocalInvocationID.z)],
-                             pos));
-        return;
+    src = imageLoad(input_img[plane], pos);
+    for (comp_idx = 0; comp_idx < nb_components; comp_idx++) {
+        if (plane == comp_plane[comp_idx]) { /* only components stored in this plane */
+            w_sum = 0.0;
+            sum = 0.0;
+            for (invoc_idx = 0; invoc_idx < t; invoc_idx++) { /* add up the partial results of every slot */
+                ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * 
ws_stride[comp_idx] + pos.x;
+                w_sum += weights[ws_off];
+                sum += sums[ws_off];
+            }
+            c_off = comp_off[comp_idx];
+            r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 255; /* source pixel counts with weight 1; result rescaled by 1/255 */
+        }
     }
-
-    vec4 sum = vec4(0);
-    for (int y = -filter_len.y; y <= filter_len.y; y++)
-        for (int x = -filter_len.x; x <= filter_len.x; x++)
-            sum += imageLoad(input_img[nonuniformEXT(gl_LocalInvocationID.z)],
-                             pos + ivec2(x, y));
-
-    imageStore(output_img[nonuniformEXT(gl_LocalInvocationID.z)],
-               pos, sum * filter_norm);
+    imageStore(output_img[plane], pos, r);
 }
diff --git a/libavfilter/vulkan/nlmeans_horizontal.comp.glsl 
b/libavfilter/vulkan/nlmeans_horizontal.comp.glsl
new file mode 100644
index 0000000000..d1bd62ccb1
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_horizontal.comp.glsl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4          /* element type of the integral data: four floats */
+#define T_ALIGN 16          /* byte size of one DTYPE (4 x 4-byte float) */
+#define T_BLOCK_ELEMS 16    /* DTYPE entries per Block */
+#define T_BLOCK_ALIGN 256   /* T_BLOCK_ELEMS * T_ALIGN bytes */
+/* Workgroup size is taken from specialization constants 253, 254 and 255. */
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+/* Integral data addressed as single DTYPE entries through a buffer reference. */
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer 
DataBuffer {
+    DTYPE v[];
+};
+/* T_BLOCK_ELEMS consecutive entries; main() loads and stores them as one unit. */
+struct Block {
+    DTYPE data[T_BLOCK_ELEMS];
+};
+/* Block-granular buffer reference; main() builds one from a row's address (b_dst). */
+layout (buffer_reference, buffer_reference_align = T_BLOCK_ALIGN, scalar) 
buffer BlockBuffer {
+    Block v[];
+};
+/* Push constants; presumably mirror a host-side struct in vf_nlmeans_vulkan.c - TODO confirm. */
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 width;              /* indexed by plane */
+    uvec4 height;             /* indexed by plane */
+    vec4 strength;            /* indexed by component; 0.0 makes main() return early */
+    uvec4 comp_off;           /* not read by this shader */
+    uvec4 comp_plane;         /* plane index of each component */
+    DataBuffer integral_base; /* address all integral images are offset from */
+    uint64_t integral_size;   /* bytes per (invocation slot, component) integral image */
+    uint64_t int_stride;      /* bytes between rows of an integral image */
+    uint xyoffs_start;        /* not read by this shader */
+    uint nb_components;       /* components per invocation slot */
+};
+
+void main() /* in-place inclusive prefix sum along one row of an integral image */
+{
+    uint64_t offset;
+    BlockBuffer b_dst;
+    Block block;
+    DTYPE s2;
+    DTYPE prefix_sum;
+    ivec2 pos; /* only pos.y is used here */
+    int k;
+    int o;
+
+    DataBuffer integral_data;
+
+    uint c_plane;
+    /* Workgroup y selects the component, workgroup z the invocation slot. */
+    uint comp_idx = uint(gl_WorkGroupID.y);
+    uint invoc_idx = uint(gl_WorkGroupID.z);
+    /* Components with zero strength are skipped entirely. */
+    if (strength[comp_idx] == 0.0)
+        return;
+    /* Each (slot, component) pair owns integral_size bytes after integral_base. */
+    offset = integral_size * (invoc_idx * nb_components + comp_idx);
+    integral_data = DataBuffer(uint64_t(integral_base) + offset);
+
+    c_plane = comp_plane[comp_idx];
+    /* One invocation per row: the global x id is used as the row index. */
+    pos.y = int(gl_GlobalInvocationID.x);
+    if (pos.y < height[c_plane]) {
+        prefix_sum = DTYPE(0);
+        offset = int_stride * uint64_t(pos.y);
+        b_dst = BlockBuffer(uint64_t(integral_data) + offset);
+        /* Whole Blocks only, so the last one may pass width[c_plane]; assumes padded rows - TODO confirm. */
+        for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) {
+            block = b_dst.v[k];
+            for (o = 0; o < T_BLOCK_ELEMS; o++) {
+                s2 = block.data[o];
+                block.data[o] = s2 + prefix_sum; /* sum up to and including this entry */
+                prefix_sum += s2;
+            }
+            b_dst.v[k] = block; /* write the summed Block back in place */
+        }
+    }
+}
diff --git a/libavfilter/vulkan/nlmeans_vertical.comp.glsl 
b/libavfilter/vulkan/nlmeans_vertical.comp.glsl
new file mode 100644
index 0000000000..d5842f4a16
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_vertical.comp.glsl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4          /* element type of the integral data: four floats */
+#define T_ALIGN 16          /* byte size of one DTYPE (4 x 4-byte float) */
+#define T_BLOCK_ELEMS 16    /* DTYPE entries per Block */
+#define T_BLOCK_ALIGN 256   /* T_BLOCK_ELEMS * T_ALIGN bytes */
+#define TYPE_ELEMS 4        /* offsets per invocation slot, one per DTYPE lane */
+/* Workgroup size is taken from specialization constants 253, 254 and 255. */
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+/* Integral data addressed as single DTYPE entries through a buffer reference. */
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer 
DataBuffer {
+    DTYPE v[];
+};
+/* Block and BlockBuffer are declared here but not referenced by main() in this file. */
+struct Block {
+    DTYPE data[T_BLOCK_ELEMS];
+};
+
+layout (buffer_reference, buffer_reference_align = T_BLOCK_ALIGN, scalar) 
buffer BlockBuffer {
+    Block v[];
+};
+/* Push constants; presumably mirror a host-side struct in vf_nlmeans_vulkan.c - TODO confirm. */
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 width;              /* indexed by plane */
+    uvec4 height;             /* indexed by plane */
+    vec4 strength;            /* indexed by component; 0.0 makes main() return early */
+    uvec4 comp_off;           /* texel lane read for each component */
+    uvec4 comp_plane;         /* plane index of each component */
+    DataBuffer integral_base; /* address all integral images are offset from */
+    uint64_t integral_size;   /* bytes per (invocation slot, component) integral image */
+    uint64_t int_stride;      /* bytes between rows of an integral image */
+    uint xyoffs_start;        /* first xyoffsets[] entry used by this dispatch */
+    uint nb_components;       /* components per invocation slot */
+};
+/* One read-only image per plane. */
+layout (set = 0, binding = 0) uniform readonly image2D input_img[];
+/* (x, y) displacements; main() reads TYPE_ELEMS consecutive entries per slot. */
+layout (set = 1, binding = 0, scalar) readonly buffer xyoffsets_buffer {
+    ivec2 xyoffsets[];
+};
+
+void main() /* per column: running sum of squared differences for TYPE_ELEMS offsets */
+{
+    uint64_t offset;
+    DataBuffer dst;
+    float s1;
+    DTYPE s2;
+    DTYPE prefix_sum;
+    uvec2 size;
+    ivec2 pos;
+    ivec2 pos_off;
+
+    DataBuffer integral_data;
+    ivec2 offs[TYPE_ELEMS];
+
+    uint c_off;
+    uint c_plane;
+    /* Workgroup y selects the component, workgroup z the invocation slot. */
+    uint comp_idx = uint(gl_WorkGroupID.y);
+    uint invoc_idx = uint(gl_WorkGroupID.z);
+    /* Components with zero strength are skipped entirely. */
+    if (strength[comp_idx] == 0.0)
+        return;
+    /* Locate this slot's integral image and fetch its TYPE_ELEMS offsets. */
+    offset = integral_size * (invoc_idx * nb_components + comp_idx);
+    integral_data = DataBuffer(uint64_t(integral_base) + offset);
+    for (uint i = 0; i < TYPE_ELEMS; i++)
+        offs[i] = xyoffsets[xyoffs_start + TYPE_ELEMS*invoc_idx + i];
+
+    c_off = comp_off[comp_idx];
+    c_plane = comp_plane[comp_idx];
+    size = imageSize(input_img[c_plane]);
+    /* One invocation per column: the global x id is used as the column index. */
+    pos.x = int(gl_GlobalInvocationID.x);
+    if (pos.x < width[c_plane]) {
+        prefix_sum = DTYPE(0);
+        for (pos.y = 0; pos.y < height[c_plane]; pos.y++) {
+            offset = int_stride * uint64_t(pos.y);
+            dst = DataBuffer(uint64_t(integral_data) + offset);
+            s1 = imageLoad(input_img[c_plane], pos)[c_off]; /* sample at the current pixel */
+            for (int i = 0; i < TYPE_ELEMS; i++) {
+                pos_off = pos + offs[i];
+                if (any(greaterThanEqual(uvec2(pos_off), size))) /* unsigned compare: negative coordinates become large values and fail too */
+                    s2[i] = s1; /* out of range: the squared difference below becomes 0 */
+                else
+                    s2[i] = imageLoad(input_img[c_plane], pos_off)[c_off];
+            }
+            s2 = (s1 - s2) * (s1 - s2); /* squared difference, one lane per offset */
+            dst.v[pos.x] = s2 + prefix_sum; /* column sum up to and including this row */
+            prefix_sum += s2;
+        }
+    }
+}
diff --git a/libavfilter/vulkan/nlmeans_weights.comp.glsl 
b/libavfilter/vulkan/nlmeans_weights.comp.glsl
new file mode 100644
index 0000000000..24c918bd0a
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_weights.comp.glsl
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4      /* element type of the integral data: four floats */
+#define T_ALIGN 16      /* byte size of one DTYPE (4 x 4-byte float) */
+#define TYPE_ELEMS 4    /* offsets per invocation slot, one per DTYPE lane */
+/* Workgroup size is taken from specialization constants 253, 254 and 255. */
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) 
in;
+/* Integral data addressed as single DTYPE entries through a buffer reference. */
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer 
DataBuffer {
+    DTYPE v[];
+};
+/* Push constants; presumably mirror a host-side struct in vf_nlmeans_vulkan.c - TODO confirm. */
+layout (push_constant, scalar) uniform pushConstants {
+    uvec4 width;              /* indexed by plane */
+    uvec4 height;             /* indexed by plane */
+    uvec4 ws_offset;          /* per-component start index into weights[]/sums[] */
+    uvec4 ws_stride;          /* per-component row stride, counted in floats */
+    ivec4 patch_size;         /* per-component p: half-extent of the window read in main() */
+    vec4 strength;            /* per-component factor applied before exp(); 0.0 disables */
+    uvec4 comp_off;           /* texel lane read for each component */
+    uvec4 comp_plane;         /* plane index of each component */
+    DataBuffer integral_base; /* address all integral images are offset from */
+    uint64_t integral_size;   /* bytes per (invocation slot, component) integral image */
+    uint64_t int_stride;      /* bytes between rows of an integral image */
+    uint xyoffs_start;        /* first xyoffsets[] entry used by this dispatch */
+    uint ws_count;            /* floats between consecutive invocation slots */
+    uint nb_components;       /* components per invocation slot */
+};
+/* One read-only image per plane. */
+layout (set = 0, binding = 0) uniform readonly image2D input_img[];
+/* Accumulators that main() updates with "+=". */
+layout (set = 0, binding = 1, scalar) buffer weights_buffer {
+    float weights[];
+};
+
+layout (set = 0, binding = 2, scalar) buffer sums_buffer {
+    float sums[];
+};
+/* (x, y) displacements; main() reads TYPE_ELEMS consecutive entries per slot. */
+layout (set = 1, binding = 0, scalar) readonly buffer xyoffsets_buffer {
+    ivec2 xyoffsets[];
+};
+
+void main() /* per pixel: add patch weights and weighted samples for TYPE_ELEMS offsets */
+{
+    uint64_t offset;
+    DataBuffer dst;
+    uvec2 size;
+    ivec2 pos;
+    ivec2 pos_off;
+    int p;
+    float s;
+
+    DataBuffer integral_data;
+    ivec2 offs[TYPE_ELEMS];
+
+    uint c_off;
+    uint c_plane;
+    uint ws_off;
+    /* Workgroup z packs two indices: component = z % nb_components, slot = z / nb_components. */
+    pos = ivec2(gl_GlobalInvocationID.xy);
+    uint comp_idx = uint(gl_WorkGroupID.z) % nb_components;
+    uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components;
+
+    c_off = comp_off[comp_idx];
+    c_plane = comp_plane[comp_idx];
+    p = patch_size[comp_idx];
+    s = strength[comp_idx];
+    if (s == 0.0 || pos.x < p || pos.y < p || pos.x >= width[c_plane] - p || 
pos.y >= height[c_plane] - p)
+        return; /* disabled component, or pixel closer than p to an edge */
+    /* NOTE(review): width/height are unsigned; confirm "dim - p" cannot wrap when dim < p. */
+    offset = integral_size * (invoc_idx * nb_components + comp_idx);
+    integral_data = DataBuffer(uint64_t(integral_base) + offset);
+    for (uint i = 0; i < TYPE_ELEMS; i++)
+        offs[i] = xyoffsets[xyoffs_start + TYPE_ELEMS*invoc_idx + i];
+    /* Index of this pixel's accumulator in weights[]/sums[]. */
+    ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * 
ws_stride[comp_idx] + pos.x;
+    size = imageSize(input_img[c_plane]);
+
+    DTYPE a;
+    DTYPE b;
+    DTYPE c;
+    DTYPE d;
+
+    DTYPE patch_diff;
+    vec4 src;
+    vec4 w;
+    float w_sum;
+    float sum;
+    /* Gather the sample at each offset; out-of-range offsets fall back to the pixel itself. */
+    for (int i = 0; i < 4; i++) { /* 4 equals TYPE_ELEMS */
+        pos_off = pos + offs[i];
+        if (any(greaterThanEqual(uvec2(pos_off), size))) /* unsigned compare: negative coordinates become large values and fail too */
+            src[i] = imageLoad(input_img[c_plane], pos)[c_off];
+        else
+            src[i] = imageLoad(input_img[c_plane], pos_off)[c_off];
+    }
+    /* Read the integral data at the four corners of the window pos -/+ p. */
+    offset = int_stride * uint64_t(pos.y - p);
+    dst = DataBuffer(uint64_t(integral_data) + offset);
+    a = dst.v[pos.x - p]; /* (x - p, y - p) */
+    c = dst.v[pos.x + p]; /* (x + p, y - p) */
+    offset = int_stride * uint64_t(pos.y + p);
+    dst = DataBuffer(uint64_t(integral_data) + offset);
+    b = dst.v[pos.x - p]; /* (x - p, y + p) */
+    d = dst.v[pos.x + p]; /* (x + p, y + p) */
+    /* Combine the corners into a window sum, then derive one weight per offset lane. */
+    patch_diff = d + a - b - c;
+    w = exp(patch_diff * s);
+    w_sum = w[0] + w[1] + w[2] + w[3];
+    sum = dot(w, src * 255);
+    /* Plain read-modify-write: relies on ws_off being unique per invocation - TODO confirm. */
+    weights[ws_off] += w_sum;
+    sums[ws_off] += sum;
+}

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to