PR #23101 opened by Lynne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23101 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23101.patch
There were two things wrong with the decoder: the tiles were incorrectly indexed (for non-aligned widths, you got bitstream desyncs). The first commit fixes that in a straightforward way. And the dequantization was indeed incomplete. ProRes RAW bakes in an 8-point curve, which is required to bake in both the camera's native delinearization curve, and any other curve the manufacturer wants. The last patch addresses that for both Vulkan and C. This math in the C version caused overflows. The reference decoder is float for a reason. Hence, I had to switch to the 32-bit in -> 12-bit out simple iDCT. Anyhow, with this, the output of both C and Vulkan matches the reference implementation, and support for Apple's decoder (which is much slower than ours, HAHAHA!) could also be added. >From 0aa96233ceb3f4465a89ee2bf10f696742ccd665 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Fri, 15 May 2026 02:30:08 +0900 Subject: [PATCH 1/4] vulkan/common: fix LOAD64 again duh, gb.buf is incremented in the loop and I missed that. ugh. --- libavcodec/vulkan/common.glsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/vulkan/common.glsl b/libavcodec/vulkan/common.glsl index 0ff9b45b7d..9f1393bbef 100644 --- a/libavcodec/vulkan/common.glsl +++ b/libavcodec/vulkan/common.glsl @@ -289,7 +289,8 @@ shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize gb.bits = 0; \ gb.bits_valid = 0; \ u8buf ptr = u8buf(gb.buf); \ - for (uint i = 0; i < ((4 - uint(gb.buf)) & 3); ++i) { \ + uint prefix = (4 - uint(gb.buf)) & 3; \ + for (uint i = 0; i < prefix; ++i) { \ gb.bits |= uint64_t(ptr[i].v) << (56 - i * 8); \ gb.bits_valid += 8; \ gb.buf += 1; \ -- 2.52.0 >From 081a04e351000f52376507ee93304b8072ebcacd Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Fri, 15 May 2026 02:46:11 +0900 Subject: [PATCH 2/4] prores_raw: fix tile alignment issues Reverse engineered the decoder a bit more. All tiles are always 16x1. 
The issue is that at the edges, tiles don't have the same width. Instead, the first tile that starts to clip is half, and then the next tile after that is also half the previous tile's width. --- libavcodec/prores_raw.c | 66 +++++++++++-------- libavcodec/prores_raw.h | 3 +- libavcodec/vulkan/prores_raw_decode.comp.glsl | 9 +-- libavcodec/vulkan/prores_raw_idct.comp.glsl | 14 ++-- libavcodec/vulkan_prores_raw.c | 5 +- 5 files changed, 52 insertions(+), 45 deletions(-) diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c index c1c05fd959..314386f339 100644 --- a/libavcodec/prores_raw.c +++ b/libavcodec/prores_raw.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/avassert.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem_internal.h" #include "libavutil/mem.h" @@ -131,11 +132,10 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile, uint16_t *dst = (uint16_t *)(frame->data[0] + tile->y*frame->linesize[0] + 2*tile->x); int idx; - const int w = FFMIN(s->tw, avctx->width - tile->x) / 2; - const int nb_blocks = w / 8; - const int log2_nb_blocks = 31 - ff_clz(nb_blocks); - const int block_mask = (1 << log2_nb_blocks) - 1; - const int nb_codes = 64 * nb_blocks; + const int log2_nb_blocks = tile->log2_nb_blocks; + const int nb_blocks = 1 << log2_nb_blocks; + const int block_mask = nb_blocks - 1; + const int nb_codes = 64 * nb_blocks; LOCAL_ALIGNED_32(int16_t, block, [64*16]); @@ -426,15 +426,13 @@ static int decode_frame(AVCodecContext *avctx, ff_permute_scantable(s->qmat, s->prodsp.idct_permutation, qmat); - s->nb_tw = (w + 15) >> 4; + int tw16 = (w + 15) >> 4; + s->nb_tw = (tw16 >> align) + av_popcount(~(-1 * (1 << align)) & tw16); s->nb_th = (h + 15) >> 4; - s->nb_tw = (s->nb_tw >> align) + av_popcount(~(-1 * (1 << align)) & s->nb_tw); s->nb_tiles = s->nb_tw * s->nb_th; av_log(avctx, AV_LOG_DEBUG, "%dx%d | nb_tiles: %d\n", s->nb_tw, s->nb_th, s->nb_tiles); - s->tw 
= s->version == 0 ? 128 : 256; s->th = 16; - av_log(avctx, AV_LOG_DEBUG, "tile_size: %dx%d\n", s->tw, s->th); av_fast_mallocz(&s->tiles, &s->tiles_size, s->nb_tiles * sizeof(*s->tiles)); if (!s->tiles) @@ -443,29 +441,43 @@ static int decode_frame(AVCodecContext *avctx, if (bytestream2_get_bytes_left(&gb) < s->nb_tiles * 2) return AVERROR_INVALIDDATA; - /* Read tile data offsets */ + /* + * Tiles form a nb_tw x nb_th grid over the 16-aligned coded frame, but the + * columns are not uniform width: each row's width (in 16-px units) is split + * greedily into power-of-two-wide tiles, 2^align down to 2^0, so the right + * edge is covered by progressively narrower tiles rather than one clamped + * one. A tile is (1 << log2_nb_blocks) blocks wide (block = 16 px), 16 tall. + */ int offset = bytestream2_tell(&gb) + s->nb_tiles * 2; - for (int n = 0; n < s->nb_tiles; n++) { - TileContext *tile = &s->tiles[n]; + int n = 0; + for (int ty = 0; ty < s->nb_th; ty++) { + unsigned tx = 0; + int rem = tw16; + for (int e = align; rem > 0; e--) { + int unit = 1 << e; + while (unit <= rem) { + TileContext *tile = &s->tiles[n++]; + int size = bytestream2_get_be16(&gb); - int size = bytestream2_get_be16(&gb); - if (offset >= avpkt->size) - return AVERROR_INVALIDDATA; - if (size >= avpkt->size) - return AVERROR_INVALIDDATA; - if (offset > avpkt->size - size) - return AVERROR_INVALIDDATA; + if (offset >= avpkt->size) + return AVERROR_INVALIDDATA; + if (size >= avpkt->size) + return AVERROR_INVALIDDATA; + if (offset > avpkt->size - size) + return AVERROR_INVALIDDATA; - bytestream2_init(&tile->gb, avpkt->data + offset, size); + bytestream2_init(&tile->gb, avpkt->data + offset, size); + tile->x = tx * 16; + tile->y = ty * s->th; + tile->log2_nb_blocks = e; + offset += size; - tile->y = (n / s->nb_tw) * s->th; - tile->x = (n % s->nb_tw) * s->tw; - - if (avctx->width - tile->x < 16) - return AVERROR_PATCHWELCOME; - - offset += size; + tx += unit; + rem -= unit; + } + } } + av_assert1(n == 
s->nb_tiles); ret = ff_thread_get_buffer(avctx, frame, 0); if (ret < 0) diff --git a/libavcodec/prores_raw.h b/libavcodec/prores_raw.h index 3ac8068dd5..23b55661e4 100644 --- a/libavcodec/prores_raw.h +++ b/libavcodec/prores_raw.h @@ -33,6 +33,7 @@ typedef struct TileContext { GetByteContext gb; unsigned x, y; + int log2_nb_blocks; } TileContext; typedef struct ProResRAWContext { @@ -42,7 +43,7 @@ typedef struct ProResRAWContext { TileContext *tiles; unsigned int tiles_size; int nb_tiles; - int tw, th; + int th; int nb_tw, nb_th; enum AVPixelFormat pix_fmt; diff --git a/libavcodec/vulkan/prores_raw_decode.comp.glsl b/libavcodec/vulkan/prores_raw_decode.comp.glsl index c1ab920e27..92859d59d0 100644 --- a/libavcodec/vulkan/prores_raw_decode.comp.glsl +++ b/libavcodec/vulkan/prores_raw_decode.comp.glsl @@ -30,6 +30,7 @@ struct TileData { ivec2 pos; uint offset; uint size; + uint log2_nb_blocks; }; layout (set = 0, binding = 0, r16ui) uniform writeonly uimage2D dst; @@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { layout (push_constant, scalar) uniform pushConstants { u8buf pkt_data; - ivec2 tile_size; }; #define COMP_ID (gl_LocalInvocationID.y) @@ -215,10 +215,6 @@ void main(void) const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; TileData td = tile_data[tile_idx]; - int width = imageSize(dst).x; - if (expectEXT(td.pos.x >= width, false)) - return; - uint64_t pkt_offset = uint64_t(pkt_data) + td.offset; u8vec2buf hdr_data = u8vec2buf(pkt_offset); int header_len = hdr_data[0].v.x >> 3; @@ -232,8 +228,7 @@ void main(void) return; const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1); - const int w = min(tile_size.x, width - td.pos.x) >> 1; - const int nb_blocks = w >> 3; + const int nb_blocks = 1 << td.log2_nb_blocks; const ivec4 comp_offset = ivec4(size[2] + size[1] + size[3], size[2], diff --git a/libavcodec/vulkan/prores_raw_idct.comp.glsl b/libavcodec/vulkan/prores_raw_idct.comp.glsl index 
15af6d5a3f..ea16272558 100644 --- a/libavcodec/vulkan/prores_raw_idct.comp.glsl +++ b/libavcodec/vulkan/prores_raw_idct.comp.glsl @@ -30,6 +30,7 @@ struct TileData { ivec2 pos; uint offset; uint size; + uint log2_nb_blocks; }; layout (set = 0, binding = 0, r16ui) uniform uimage2D dst; @@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { layout (push_constant, scalar) uniform pushConstants { u8buf pkt_data; - ivec2 tile_size; uint8_t qmat[64]; }; @@ -73,17 +73,12 @@ void main(void) const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x; TileData td = tile_data[tile_idx]; - int width = imageSize(dst).x; - if (expectEXT(td.pos.x >= width, false)) - return; - uint64_t pkt_offset = uint64_t(pkt_data) + td.offset; u8vec2buf hdr_data = u8vec2buf(pkt_offset); int qscale = pack16(hdr_data[0].v.yx); const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1); - const uint w = min(tile_size.x, width - td.pos.x) >> 1; - const uint nb_blocks = w >> 3; + const uint nb_blocks = 1 << td.log2_nb_blocks; /* Copy push-constant qmat into shared memory for fast non-uniform access */ if (gl_LocalInvocationIndex < 64) @@ -110,6 +105,11 @@ void main(void) idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1); barrier(); + /* Narrow tiles use fewer than the workgroup's block rows; the surplus + * rows carry no data and must not be written. No barrier follows. 
*/ + if (BLOCK_ID >= nb_blocks) + return; + [[unroll]] for (uint y = 0; y < 8; y++) { int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0)); diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c index 392b74a863..953b67d592 100644 --- a/libavcodec/vulkan_prores_raw.c +++ b/libavcodec/vulkan_prores_raw.c @@ -51,7 +51,6 @@ typedef struct ProResRAWVulkanDecodeContext { typedef struct DecodePushData { VkDeviceAddress pkt_data; - int32_t tile_size[2]; uint8_t qmat[64]; } DecodePushData; @@ -59,6 +58,7 @@ typedef struct TileData { int32_t pos[2]; uint32_t offset; uint32_t size; + uint32_t log2_nb_blocks; } TileData; static int vk_prores_raw_start_frame(AVCodecContext *avctx, @@ -118,6 +118,7 @@ static int vk_prores_raw_decode_slice(AVCodecContext *avctx, td[pp->nb_tiles].pos[0] = prr->tiles[pp->nb_tiles].x; td[pp->nb_tiles].pos[1] = prr->tiles[pp->nb_tiles].y; td[pp->nb_tiles].size = size; + td[pp->nb_tiles].log2_nb_blocks = prr->tiles[pp->nb_tiles].log2_nb_blocks; if (vp->slices_buf && slices_buf->host_ref) { td[pp->nb_tiles].offset = data - slices_buf->mapped_mem; @@ -229,8 +230,6 @@ static int vk_prores_raw_end_frame(AVCodecContext *avctx) /* Update push data */ DecodePushData pd_decode = (DecodePushData) { .pkt_data = slices_buf->address, - .tile_size[0] = prr->tw, - .tile_size[1] = prr->th, }; memcpy(pd_decode.qmat, prr->qmat, 64); ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader, -- 2.52.0 >From 83c2277637cb35410239fbe92d2c6e913c0aa36c Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Thu, 14 May 2026 22:50:36 +0900 Subject: [PATCH 3/4] prores_raw: parse the linearization curve from the bitstream After an extended Ghidra session, it turns out that the camera/recorder bakes a custom curve that *has* to be applied. It contains both the camera's inverse transfer curve, plus whatever else the camera applied. It could (and does) contain quantization refinements. 
And it's used to switch between low and high quality encoding by boosting coeffs (thus acting as an additional dequant curve). --- libavcodec/prores_raw.c | 11 +++++++++-- libavcodec/prores_raw.h | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c index 314386f339..5465e3495c 100644 --- a/libavcodec/prores_raw.c +++ b/libavcodec/prores_raw.c @@ -420,8 +420,15 @@ static int decode_frame(AVCodecContext *avctx, bytestream2_get_buffer(&gb_hdr, qmat, 64); if ((flags >> 4) & 1) { - bytestream2_skip(&gb_hdr, 2); - bytestream2_skip(&gb_hdr, 2 * 7); + /* 8-point 16-bit control points, defining the combined linearization + * curve (inv. transfer fn + encoder-defined shaping) */ + for (int i = 0; i < 8; i++) + s->lin_curve[i] = bytestream2_get_be16(&gb_hdr); + } else { + /* default curve: ptwos */ + static const uint16_t default_lin_curve[8] = + { 0, 512, 1024, 2048, 4096, 8192, 16384, 32768 }; + memcpy(s->lin_curve, default_lin_curve, sizeof(s->lin_curve)); } ff_permute_scantable(s->qmat, s->prodsp.idct_permutation, qmat); diff --git a/libavcodec/prores_raw.h b/libavcodec/prores_raw.h index 23b55661e4..1e7fee435e 100644 --- a/libavcodec/prores_raw.h +++ b/libavcodec/prores_raw.h @@ -54,6 +54,11 @@ typedef struct ProResRAWContext { DECLARE_ALIGNED(32, uint8_t, scan)[64]; DECLARE_ALIGNED(32, uint8_t, qmat)[64]; + + /* 8-point combined linearization curve + * (inv. transfer fn + encoder-defined shaping) from the frame header, + * applied after iDCT */ + uint16_t lin_curve[8]; } ProResRAWContext; extern const uint8_t ff_prores_raw_dc_cb[13]; -- 2.52.0 >From 563e4dc7e11bbaf5d043e033541e006f05afbc79 Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Fri, 15 May 2026 05:25:19 +0900 Subject: [PATCH 4/4] prores_raw: synchronize decoder with reference implementation This completes the reverse engineering of the decoder. The commit applies the linearization curve from the previous patch. 
--- libavcodec/prores_raw.c | 19 ++++---- libavcodec/proresdsp.c | 50 ++++++++++++++++++--- libavcodec/proresdsp.h | 3 +- libavcodec/vulkan/prores_raw_idct.comp.glsl | 39 ++++++++++------ libavcodec/vulkan_prores_raw.c | 8 ++-- 5 files changed, 86 insertions(+), 33 deletions(-) diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c index 5465e3495c..dd9d735356 100644 --- a/libavcodec/prores_raw.c +++ b/libavcodec/prores_raw.c @@ -45,15 +45,19 @@ static av_cold int decode_init(AVCodecContext *avctx) { ProResRAWContext *s = avctx->priv_data; - avctx->bits_per_raw_sample = 12; + /* The codec outputs linear data, with the transfer function of the + * camera and any adjustments built into an 8-point linearization curve */ + avctx->bits_per_raw_sample = 16; + avctx->color_trc = AVCOL_TRC_LINEAR; avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; - avctx->color_trc = AVCOL_TRC_UNSPECIFIED; avctx->colorspace = AVCOL_SPC_UNSPECIFIED; s->pix_fmt = AV_PIX_FMT_NONE; ff_blockdsp_init(&s->bdsp); - ff_proresdsp_init(&s->prodsp, avctx->bits_per_raw_sample); + /* Coefficients and the iDCT are 12-bit, the linearization curve then + * expands the result to the 16-bit linear output range. 
*/ + ff_proresdsp_init(&s->prodsp, 12); ff_permute_scantable(s->scan, ff_prores_interlaced_scan, s->prodsp.idct_permutation); @@ -137,7 +141,7 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile, const int block_mask = nb_blocks - 1; const int nb_codes = 64 * nb_blocks; - LOCAL_ALIGNED_32(int16_t, block, [64*16]); + LOCAL_ALIGNED_32(int32_t, block, [64*16]); int16_t sign = 0; int16_t dc_add = 0; @@ -158,8 +162,7 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile, if ((ret = init_get_bits8(&gb, data, size)) < 0) return ret; - for (int n = 0; n < nb_blocks; n++) - s->bdsp.clear_block(block + n*64); + memset(block, 0, nb_blocks * 64 * sizeof(*block)); /* Special handling for first block */ int dc = get_value(&gb, 700); @@ -234,7 +237,7 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile, for (int n = 0; n < nb_blocks; n++) { uint16_t *ptr = dst + n*16; - s->prodsp.idct_put_bayer(ptr, linesize, block + n*64, qmat); + s->prodsp.idct_put_bayer(ptr, linesize, block + n*64, qmat, s->lin_curve); } return 0; @@ -265,7 +268,7 @@ static int decode_tile(AVCodecContext *avctx, TileContext *tile, return AVERROR_INVALIDDATA; for (int i = 0; i < 64; i++) - qmat[i] = s->qmat[i] * scale >> 1; + qmat[i] = s->qmat[i] * scale; const uint8_t *comp_start = gb->buffer_start + header_len; diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c index eb5dbf4799..dec3beb9b1 100644 --- a/libavcodec/proresdsp.c +++ b/libavcodec/proresdsp.c @@ -40,6 +40,14 @@ #define BIT_DEPTH 12 #include "simple_idct_template.c" #undef BIT_DEPTH +#undef IN_IDCT_DEPTH + +/* 32bit iDCT for the ProRes RAW */ +#define IN_IDCT_DEPTH 32 +#define BIT_DEPTH 12 +#include "simple_idct_template.c" +#undef BIT_DEPTH +#undef IN_IDCT_DEPTH /** * Special version of ff_simple_idct_int16_10bit() which does dequantization @@ -74,6 +82,24 @@ static void prores_idct_12(int16_t *restrict block, const int16_t *restrict qmat } } +/* + * 32-bit iDCT for the ProRes RAW + * qmat must be 
s->qmat[i] * scale + */ +static void prores_idct_bayer_32(int32_t *restrict block, const int16_t *restrict qmat) +{ + for (int i = 0; i < 64; i++) + block[i] = (block[i] * qmat[i]) >> 1; + + for (int i = 0; i < 8; i++) + idctRowCondDC_int32_12bit(block + i*8, 0); + + for (int i = 0; i < 8; i++) { + block[i] += 8192; + idctSparseCol_int32_12bit(block + i); + } +} + #define CLIP_MIN (1 << 2) ///< minimum value for clipping resulting pixels #define CLIP_MAX_10 (1 << 10) - CLIP_MIN - 1 ///< maximum value for clipping resulting pixels #define CLIP_MAX_12 (1 << 12) - CLIP_MIN - 1 ///< maximum value for clipping resulting pixels @@ -99,12 +125,21 @@ static inline void put_pixel(uint16_t *dst, ptrdiff_t linesize, const int16_t *i } } -static inline void put_pixel_bayer_12(uint16_t *dst, ptrdiff_t linesize, - const int16_t *in) +/* Apply the 8-point combined linearization curve (inv. transfer fn + encoder shaping) */ +static inline void put_pixel_bayer_lin_curve_12(uint16_t *dst, ptrdiff_t linesize, + const int32_t *in, const uint16_t *lin_curve) { for (int y = 0; y < 8; y++, dst += linesize) { - for (int x = 0; x < 8; x++) - dst[x*2] = CLIP_12(in[(y << 3) + x]) << 4; + for (int x = 0; x < 8; x++) { + /* Convert the 32-bit input into 16-bits (lrintf(x*16 - 15.5f) = 16) */ + int u = av_clip_uint16(in[(y << 3) + x]*16 - 16); + unsigned seg = (unsigned)u >> 13; + unsigned frac = (unsigned)u & 0x1FFF; + unsigned cp0 = lin_curve[seg]; + unsigned cp1 = seg < 7 ? 
lin_curve[seg + 1] : 0; + unsigned o = (cp0 * 8192u + ((cp1 - cp0) & 0xFFFFu) * frac + 4096u) >> 13; + dst[x*2] = FFMIN(o, 0xFFFFu); + } } } @@ -131,10 +166,11 @@ static void prores_idct_put_12_c(uint16_t *out, ptrdiff_t linesize, int16_t *blo } static void prores_idct_put_bayer_12_c(uint16_t *out, ptrdiff_t linesize, - int16_t *block, const int16_t *qmat) + int32_t *block, const int16_t *qmat, + const uint16_t *lin_curve) { - prores_idct_12(block, qmat); - put_pixel_bayer_12(out, linesize << 1, block); + prores_idct_bayer_32(block, qmat); + put_pixel_bayer_lin_curve_12(out, linesize << 1, block, lin_curve); } av_cold void ff_proresdsp_init(ProresDSPContext *dsp, int bits_per_raw_sample) diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h index f8b57d7e87..75c782fb56 100644 --- a/libavcodec/proresdsp.h +++ b/libavcodec/proresdsp.h @@ -30,7 +30,8 @@ typedef struct ProresDSPContext { int idct_permutation_type; uint8_t idct_permutation[64]; void (*idct_put)(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat); - void (*idct_put_bayer)(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat); + void (*idct_put_bayer)(uint16_t *out, ptrdiff_t linesize, int32_t *block, const int16_t *qmat, + const uint16_t *lin_curve); } ProresDSPContext; void ff_proresdsp_init(ProresDSPContext *dsp, int bits_per_raw_sample); diff --git a/libavcodec/vulkan/prores_raw_idct.comp.glsl b/libavcodec/vulkan/prores_raw_idct.comp.glsl index ea16272558..2989236513 100644 --- a/libavcodec/vulkan/prores_raw_idct.comp.glsl +++ b/libavcodec/vulkan/prores_raw_idct.comp.glsl @@ -41,6 +41,7 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf { layout (push_constant, scalar) uniform pushConstants { u8buf pkt_data; uint8_t qmat[64]; + uint16_t lin_curve[8]; }; #define COMP_ID (gl_LocalInvocationID.z) @@ -67,6 +68,7 @@ const u8vec2 scan[64] = { }; shared uint8_t qmat_buf[64]; +shared uint lin_curve_buf[8]; void main(void) { @@ -75,32 +77,30 @@ 
void main(void) uint64_t pkt_offset = uint64_t(pkt_data) + td.offset; u8vec2buf hdr_data = u8vec2buf(pkt_offset); - int qscale = pack16(hdr_data[0].v.yx); + int qscale = int(hdr_data[0].v.y); const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1); const uint nb_blocks = 1 << td.log2_nb_blocks; - /* Copy push-constant qmat into shared memory for fast non-uniform access */ - if (gl_LocalInvocationIndex < 64) - qmat_buf[gl_LocalInvocationIndex] = qmat[gl_LocalInvocationIndex]; + if (gl_LocalInvocationIndex == 0) { + [[unroll]] for (uint i = 0; i < 64; i++) qmat_buf[i] = qmat[i]; + [[unroll]] for (uint i = 0; i < 8; i++) lin_curve_buf[i] = uint(lin_curve[i]); + } barrier(); [[unroll]] for (uint y = 0; y < 8; y++) { uint block_off = y*8 + ROW_ID; int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + scan[block_off])[0]); - float vf = float(sign_extend(v, 16)) / 32768.0; - vf *= qmat_buf[block_off] * qscale; - blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID] = (vf / (64*4.56)) * - idct_scale[block_off]; + /* Dequantize (coeff * qmat * qscale), matching the reference decoder */ + float vf = float(sign_extend(v, 16)) * float(qmat_buf[block_off]) * float(qscale); + blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID] = vf * idct_scale[block_off]; } /* Column-wise iDCT */ idct8(BLOCK_ID, COMP_ID*72 + ROW_ID, 9); barrier(); - blocks[BLOCK_ID][COMP_ID*72 + ROW_ID * 9] += 0.5f; - /* Row-wise iDCT */ idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1); barrier(); @@ -112,11 +112,22 @@ void main(void) [[unroll]] for (uint y = 0; y < 8; y++) { - int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0)); - v = clamp(v, 0, 4095); - v <<= 4; + /* Bias the signed iDCT output into the reference's unsigned 16-bit space */ + int u = clamp(int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID])) + 32768, + 0, 65535); + + /* 8-point combined linearization curve (inv. transfer fn + + * encoder-defined shaping). cp1 - cp0 is the segment slope; for the + * final segment cp[8] == 0. 
*/ + uint seg = uint(u) >> 13; + uint frac = uint(u) & 0x1FFFu; + uint cp0 = lin_curve_buf[seg]; + uint cp1 = seg < 7u ? lin_curve_buf[seg + 1u] : 0u; + uint outv = (cp0 * 8192u + ((cp1 - cp0) & 0xFFFFu) * frac + 4096u) >> 13u; + outv = min(outv, 0xFFFFu); + imageStore(dst, offs + 2*ivec2(BLOCK_ID*8 + ROW_ID, y), - ivec4(v)); + ivec4(outv)); } } diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c index 953b67d592..b6314ab693 100644 --- a/libavcodec/vulkan_prores_raw.c +++ b/libavcodec/vulkan_prores_raw.c @@ -52,6 +52,7 @@ typedef struct ProResRAWVulkanDecodeContext { typedef struct DecodePushData { VkDeviceAddress pkt_data; uint8_t qmat[64]; + uint16_t lin_curve[8]; } DecodePushData; typedef struct TileData { @@ -232,9 +233,10 @@ static int vk_prores_raw_end_frame(AVCodecContext *avctx) .pkt_data = slices_buf->address, }; memcpy(pd_decode.qmat, prr->qmat, 64); + memcpy(pd_decode.lin_curve, prr->lin_curve, sizeof(pd_decode.lin_curve)); ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader, VK_SHADER_STAGE_COMPUTE_BIT, - 0, sizeof(pd_decode) - 64, &pd_decode); + 0, offsetof(DecodePushData, qmat), &pd_decode); vk->CmdDispatch(exec->buf, prr->nb_tw, prr->nb_th, 1); @@ -302,7 +304,7 @@ static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s, { int err; - ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData) - 64, + ff_vk_shader_add_push_const(shd, 0, offsetof(DecodePushData, qmat), VK_SHADER_STAGE_COMPUTE_BIT); ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL, (uint32_t []) { 1, 4, 1 }, 0); @@ -338,7 +340,7 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s, }; for (int i = 0; i < 64; i++) SPEC_LIST_ADD(sl, 18 + i, 32, - av_float2int(idct_8_scales[i >> 3]*idct_8_scales[i & 7])); + av_float2int(8*idct_8_scales[i >> 3]*idct_8_scales[i & 7])); ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl, (uint32_t []) { 8, nb_blocks, 4 }, 0); -- 2.52.0 
_______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
