[FFmpeg-cvslog] [ffmpeg] 02/06: vulkan: merge ProRes and ProRes RAW iDCTs

Lynne via ffmpeg-cvslog Mon, 22 Dec 2025 10:47:58 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 6eced881885201e4834cea8d6aee4ce114d7fc9a
Author:     Lynne <[email protected]>
AuthorDate: Thu Dec 18 13:30:30 2025 +0100
Commit:     Lynne <[email protected]>
CommitDate: Mon Dec 22 19:46:26 2025 +0100

    vulkan: merge ProRes and ProRes RAW iDCTs
    
    This cleans up the code a bit, and reduces binary size.
---
 libavcodec/vulkan/Makefile                       |  4 +-
 libavcodec/vulkan/{prores_idct.comp => dct.comp} | 93 +++++-------------------
 libavcodec/vulkan/prores_idct.comp               | 84 ---------------------
 libavcodec/vulkan/prores_raw_idct.comp           | 82 ---------------------
 libavcodec/vulkan_prores.c                       |  4 +
 libavcodec/vulkan_prores_raw.c                   |  5 ++
 6 files changed, 29 insertions(+), 243 deletions(-)

diff --git a/libavcodec/vulkan/Makefile b/libavcodec/vulkan/Makefile
index 78d511a90e..35e96c506d 100644
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -14,11 +14,11 @@ OBJS-$(CONFIG_FFV1_VULKAN_HWACCEL)  +=  vulkan/common.o \
                                        vulkan/ffv1_common.o vulkan/ffv1_reset.o \
                                        vulkan/ffv1_dec_setup.o vulkan/ffv1_dec.o
 
-OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o \
+OBJS-$(CONFIG_PRORES_RAW_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \
                                             vulkan/prores_raw_decode.o \
                                             vulkan/prores_raw_idct.o
 
-OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o \
+OBJS-$(CONFIG_PRORES_VULKAN_HWACCEL) += vulkan/common.o vulkan/dct.o \
                                         vulkan/prores_vld.o \
                                         vulkan/prores_idct.o
 
diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/dct.comp
similarity index 60%
copy from libavcodec/vulkan/prores_idct.comp
copy to libavcodec/vulkan/dct.comp
index 5eef61e57a..34c6ad128f 100644
--- a/libavcodec/vulkan/prores_idct.comp
+++ b/libavcodec/vulkan/dct.comp
@@ -1,4 +1,7 @@
 /*
+ * Copyright (c) 2025 Lynne <[email protected]>
+ * Copyright (c) 2016 Nathan Egge <[email protected]>
+ *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
@@ -16,26 +19,24 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/* Two macroblocks, padded to avoid bank conflicts */
-shared float blocks[4*2][8*(8+1)];
+/**
+ * Orthonormal inverse 8-point Type-II DCT based on the Chen factorization[1].
+ * 1D with scale factors moved up front.
+ * This computes an n-point Type-II DCT by first computing an n/2-point Type-II DCT
+ * of the even indexed inputs and an n/2-point Type-IV DST of the odd indexed inputs,
+ * and then combining them using a "butterfly" operation.
+ *
+ * [1] W.H. Chen, C. Smith, and S. Fralick,
+ * "A Fast Computational Algorithm for the Discrete Cosine Transform",
+ * IEEE Transactions on Communications, Vol. 25, No. 9, pp 1004-1009, Sept. 1977
+ */
 
-uint get_px(uint tex_idx, ivec2 pos)
-{
-#ifndef INTERLACED
-    return imageLoad(dst[tex_idx], pos).x;
-#else
-    return imageLoad(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field)).x;
+#ifndef NB_COMPONENTS
+#define NB_COMPONENTS 1
 #endif
-}
 
-void put_px(uint tex_idx, ivec2 pos, uint v)
-{
-#ifndef INTERLACED
-    imageStore(dst[tex_idx], pos, uvec4(v));
-#else
-    imageStore(dst[tex_idx], ivec2(pos.x, (pos.y << 1) + bottom_field), uvec4(v));
-#endif
-}
+/* Padded by 1 row to avoid bank conflicts */
+shared float blocks[NB_BLOCKS][NB_COMPONENTS*8*(8 + 1)];
 
 const float idct_scale[64] = {
     0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
@@ -56,7 +57,6 @@ const float idct_scale[64] = {
     0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
 };
 
-/* 7.4 Inverse Transform */
 void idct8(uint block, uint offset, uint stride)
 {
     float t0, t1, t2, t3, t4, t5, t6, t7, u8;
@@ -117,60 +117,3 @@ void idct8(uint block, uint offset, uint stride)
     blocks[block][6*stride + offset] = u6;
     blocks[block][7*stride + offset] = u7;
 }
-
-void main(void)
-{
-    uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID;
-    uint comp = gid.z, block = (lid.y << 2) | (lid.x >> 3), idx = lid.x & 0x7;
-    uint chroma_shift = comp != 0 ? log2_chroma_w : 0;
-    bool act = gid.x < mb_width << (4 - chroma_shift);
-
-    /**
-     * Normalize coefficients to [-1, 1] for increased precision during the iDCT.
-     * DCT coeffs have the range of a 12-bit signed integer (7.4 Inverse Transform).
-     */
-    const float norm = 1.0f / (1 << 11);
-
-    /* Coalesced load of DCT coeffs in shared memory, inverse quantization */
-    if (act) {
-        /**
-         * According to the VK spec indexing an array in push constant memory with
-         * a non-dynamically uniform value is illegal ($15.9.1 in v1.4.326),
-         * so copy the whole matrix locally.
-         */
-        uint8_t[64] qmat = comp == 0 ? qmat_luma : qmat_chroma;
-
-        /* Table 15 */
-        uint8_t qidx = quant_idx[(gid.y >> 1) * mb_width + (gid.x >> (4 - chroma_shift))];
-        int qscale = qidx > 128 ? (qidx - 96) << 2 : qidx;
-
-        [[unroll]] for (uint i = 0; i < 8; ++i) {
-            uint cidx = (i << 3) + idx;
-            int   c = sign_extend(int(get_px(comp, ivec2(gid.x, (gid.y << 3) + i))), 16);
-            float v = float(c * qscale * int(qmat[cidx])) * norm;
-            blocks[block][i * 9 + idx] = v * idct_scale[cidx];
-        }
-    }
-
-    /* Column-wise iDCT */
-    idct8(block, idx, 9);
-    barrier();
-
-    /* Remap [-1, 1] to [0, 2] to remove a per-element addition in the output loop */
-    blocks[block][idx * 9] += 1.0f;
-
-    /* Row-wise iDCT */
-    idct8(block, idx * 9, 1);
-    barrier();
-
-    float fact = 1 << (depth - 1);
-    int maxv = (1 << depth) - 1;
-
-    /* 7.5.1 Color Component Samples. Rescale, clamp and write back to global memory */
-    if (act) {
-        [[unroll]] for (uint i = 0; i < 8; ++i) {
-            float v = round(blocks[block][i * 9 + idx] * fact);
-            put_px(comp, ivec2(gid.x, (gid.y << 3) + i), clamp(int(v), 0, maxv));
-        }
-    }
-}
diff --git a/libavcodec/vulkan/prores_idct.comp b/libavcodec/vulkan/prores_idct.comp
index 5eef61e57a..25431d61c1 100644
--- a/libavcodec/vulkan/prores_idct.comp
+++ b/libavcodec/vulkan/prores_idct.comp
@@ -16,9 +16,6 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/* Two macroblocks, padded to avoid bank conflicts */
-shared float blocks[4*2][8*(8+1)];
-
 uint get_px(uint tex_idx, ivec2 pos)
 {
 #ifndef INTERLACED
@@ -37,87 +34,6 @@ void put_px(uint tex_idx, ivec2 pos, uint v)
 #endif
 }
 
-const float idct_scale[64] = {
-    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
-    0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
-    0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293,
-    0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
-    0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541,
-    0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
-    0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362,
-    0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
-    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
-    0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
-    0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109,
-    0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
-    0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021,
-    0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
-    0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822,
-    0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
-};
-
-/* 7.4 Inverse Transform */
-void idct8(uint block, uint offset, uint stride)
-{
-    float t0, t1, t2, t3, t4, t5, t6, t7, u8;
-    float u0, u1, u2, u3, u4, u5, u6, u7;
-
-    /* Input */
-    t0 = blocks[block][0*stride + offset];
-    u4 = blocks[block][1*stride + offset];
-    t2 = blocks[block][2*stride + offset];
-    u6 = blocks[block][3*stride + offset];
-    t1 = blocks[block][4*stride + offset];
-    u5 = blocks[block][5*stride + offset];
-    t3 = blocks[block][6*stride + offset];
-    u7 = blocks[block][7*stride + offset];
-
-    /* Embedded scaled inverse 4-point Type-II DCT */
-    u0 = t0 + t1;
-    u1 = t0 - t1;
-    u3 = t2 + t3;
-    u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3;
-    t0 = u0 + u3;
-    t3 = u0 - u3;
-    t1 = u1 + u2;
-    t2 = u1 - u2;
-
-    /* Embedded scaled inverse 4-point Type-IV DST */
-    t5 = u5 + u6;
-    t6 = u5 - u6;
-    t7 = u4 + u7;
-    t4 = u4 - u7;
-    u7 = t7 + t5;
-    u5 = (t7 - t5)*(1.4142135623730950488016887242097f);
-    u8 = (t4 + t6)*(1.8477590650225735122563663787936f);
-    u4 = u8 - t4*(1.0823922002923939687994464107328f);
-    u6 = u8 - t6*(2.6131259297527530557132863468544f);
-    t7 = u7;
-    t6 = t7 - u6;
-    t5 = t6 + u5;
-    t4 = t5 - u4;
-
-    /* Butterflies */
-    u0 = t0 + t7;
-    u7 = t0 - t7;
-    u6 = t1 + t6;
-    u1 = t1 - t6;
-    u2 = t2 + t5;
-    u5 = t2 - t5;
-    u4 = t3 + t4;
-    u3 = t3 - t4;
-
-    /* Output */
-    blocks[block][0*stride + offset] = u0;
-    blocks[block][1*stride + offset] = u1;
-    blocks[block][2*stride + offset] = u2;
-    blocks[block][3*stride + offset] = u3;
-    blocks[block][4*stride + offset] = u4;
-    blocks[block][5*stride + offset] = u5;
-    blocks[block][6*stride + offset] = u6;
-    blocks[block][7*stride + offset] = u7;
-}
-
 void main(void)
 {
     uvec3 gid = gl_GlobalInvocationID, lid = gl_LocalInvocationID;
diff --git a/libavcodec/vulkan/prores_raw_idct.comp b/libavcodec/vulkan/prores_raw_idct.comp
index 29ddf3b9e8..01aad98330 100644
--- a/libavcodec/vulkan/prores_raw_idct.comp
+++ b/libavcodec/vulkan/prores_raw_idct.comp
@@ -24,8 +24,6 @@
 #define BLOCK_ID (gl_LocalInvocationID.y)
 #define ROW_ID (gl_LocalInvocationID.x)
 
-shared float blocks[16][4*64];
-
 const ivec2 scan[64] = {
     ivec2( 0,  0), ivec2( 4,  0), ivec2( 0,  2), ivec2( 4,  2),
     ivec2( 0,  8), ivec2( 4,  8), ivec2( 6,  8), ivec2( 2, 10),
@@ -45,86 +43,6 @@ const ivec2 scan[64] = {
     ivec2(12, 12), ivec2( 8, 14), ivec2(10, 14), ivec2(14, 14),
 };
 
-const float idct_scale[64] = {
-    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
-    0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
-    0.1733799806652684, 0.2404849415639108, 0.2265318615882219, 0.2038732892122293,
-    0.1733799806652684, 0.1362237766939547, 0.0938325693794663, 0.0478354290456362,
-    0.1633203706095471, 0.2265318615882219, 0.2133883476483184, 0.1920444391778541,
-    0.1633203706095471, 0.1283199917898342, 0.0883883476483185, 0.0450599888754343,
-    0.1469844503024199, 0.2038732892122293, 0.1920444391778541, 0.1728354290456362,
-    0.1469844503024199, 0.1154849415639109, 0.0795474112858021, 0.0405529186026822,
-    0.1250000000000000, 0.1733799806652684, 0.1633203706095471, 0.1469844503024199,
-    0.1250000000000000, 0.0982118697983878, 0.0676495125182746, 0.0344874224103679,
-    0.0982118697983878, 0.1362237766939547, 0.1283199917898342, 0.1154849415639109,
-    0.0982118697983878, 0.0771645709543638, 0.0531518809229535, 0.0270965939155924,
-    0.0676495125182746, 0.0938325693794663, 0.0883883476483185, 0.0795474112858021,
-    0.0676495125182746, 0.0531518809229535, 0.0366116523516816, 0.0186644585125857,
-    0.0344874224103679, 0.0478354290456362, 0.0450599888754343, 0.0405529186026822,
-    0.0344874224103679, 0.0270965939155924, 0.0186644585125857, 0.0095150584360892,
-};
-
-void idct8(uint block, uint offset, uint stride)
-{
-    float t0, t1, t2, t3, t4, t5, t6, t7, u8;
-    float u0, u1, u2, u3, u4, u5, u6, u7;
-
-    /* Input */
-    t0 = blocks[block][0*stride + offset];
-    u4 = blocks[block][1*stride + offset];
-    t2 = blocks[block][2*stride + offset];
-    u6 = blocks[block][3*stride + offset];
-    t1 = blocks[block][4*stride + offset];
-    u5 = blocks[block][5*stride + offset];
-    t3 = blocks[block][6*stride + offset];
-    u7 = blocks[block][7*stride + offset];
-
-    /* Embedded scaled inverse 4-point Type-II DCT */
-    u0 = t0 + t1;
-    u1 = t0 - t1;
-    u3 = t2 + t3;
-    u2 = (t2 - t3)*(1.4142135623730950488016887242097f) - u3;
-    t0 = u0 + u3;
-    t3 = u0 - u3;
-    t1 = u1 + u2;
-    t2 = u1 - u2;
-
-    /* Embedded scaled inverse 4-point Type-IV DST */
-    t5 = u5 + u6;
-    t6 = u5 - u6;
-    t7 = u4 + u7;
-    t4 = u4 - u7;
-    u7 = t7 + t5;
-    u5 = (t7 - t5)*(1.4142135623730950488016887242097f);
-    u8 = (t4 + t6)*(1.8477590650225735122563663787936f);
-    u4 = u8 - t4*(1.0823922002923939687994464107328f);
-    u6 = u8 - t6*(2.6131259297527530557132863468544f);
-    t7 = u7;
-    t6 = t7 - u6;
-    t5 = t6 + u5;
-    t4 = t5 - u4;
-
-    /* Butterflies */
-    u0 = t0 + t7;
-    u7 = t0 - t7;
-    u6 = t1 + t6;
-    u1 = t1 - t6;
-    u2 = t2 + t5;
-    u5 = t2 - t5;
-    u4 = t3 + t4;
-    u3 = t3 - t4;
-
-    /* Output */
-    blocks[block][0*stride + offset] = u0;
-    blocks[block][1*stride + offset] = u1;
-    blocks[block][2*stride + offset] = u2;
-    blocks[block][3*stride + offset] = u3;
-    blocks[block][4*stride + offset] = u4;
-    blocks[block][5*stride + offset] = u5;
-    blocks[block][6*stride + offset] = u6;
-    blocks[block][7*stride + offset] = u7;
-}
-
 void main(void)
 {
     const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
diff --git a/libavcodec/vulkan_prores.c b/libavcodec/vulkan_prores.c
index bc0364609c..afea8857e8 100644
--- a/libavcodec/vulkan_prores.c
+++ b/libavcodec/vulkan_prores.c
@@ -24,6 +24,7 @@
 #include "libavutil/vulkan_spirv.h"
 
 extern const char *ff_source_common_comp;
+extern const char *ff_source_dct_comp;
 extern const char *ff_source_prores_vld_comp;
 extern const char *ff_source_prores_idct_comp;
 
@@ -511,6 +512,9 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
     };
     RET(ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0));
 
+    GLSLC(0, #define NB_BLOCKS 4*2);
+    GLSLD(ff_source_dct_comp);
+
     GLSLD(ff_source_prores_idct_comp);
 
     RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",
diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c
index 82e8f3ad16..0a844156fb 100644
--- a/libavcodec/vulkan_prores_raw.c
+++ b/libavcodec/vulkan_prores_raw.c
@@ -26,6 +26,7 @@
 #include "libavutil/mem.h"
 
 extern const char *ff_source_common_comp;
+extern const char *ff_source_dct_comp;
 extern const char *ff_source_prores_raw_decode_comp;
 extern const char *ff_source_prores_raw_idct_comp;
 
@@ -385,6 +386,10 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
 
     RET(add_common_data(avctx, s, shd, 0));
 
+    GLSLC(0, #define NB_BLOCKS 16);
+    GLSLC(0, #define NB_COMPONENTS 4);
+    GLSLD(ff_source_dct_comp);
+
     GLSLD(ff_source_prores_raw_idct_comp);
 
     RET(spv->compile_shader(s, spv, shd, &spv_data, &spv_len, "main",

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/06: vulkan: merge ProRes and ProRes RAW iDCTs

Reply via email to