PR #23462 opened by Niklas Haas (haasn) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23462 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23462.patch
# Summary of changes This series adds PAL8 (palettized) input support to the libswscale ops code. It introduces a new SWS_RW_PALETTE read mode, teaches the dispatch layer to place the palette pointer in input plane 1, adds the SWS_UOP_READ_PALETTE micro-op together with a reference C implementation and x86 AVX2/SSE4 paths, enables AV_PIX_FMT_PAL8 as an input-only format, and adds a checkasm test for the new micro-op. <!-- If this PR requires new FATE test samples, attach them to the PR and list their target paths below (relative to the fate-suite root). Attached filenames must match the sample's filename: ```fate-samples # e.g. vorbis/new-sample.ogg ``` --> From 02bd59b16cbd5b9f88da012c380cb0c0392d7910 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Wed, 20 May 2026 16:21:50 +0200 Subject: [PATCH 1/8] swscale/ops: add SWS_RW_PALETTE for PAL8 read type I decided to model this as a separate read/write type, rather than as a separate operation (e.g. SWS_OP_PALETTE), because it makes the semantics surrounding the read value range, plane pointer setup, etc. much cleaner. (This will become evident in the upcoming changes to the dispatch layer) Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops.c | 15 +++++++++------ libswscale/ops.h | 2 ++ libswscale/uops.c | 2 ++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/libswscale/ops.c b/libswscale/ops.c index 4afeaabf54..43fd76e8d4 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -170,8 +170,9 @@ int ff_sws_rw_op_planes(const SwsOp *op) { av_assert2(op->op == SWS_OP_READ || op->op == SWS_OP_WRITE); switch (op->rw.mode) { - case SWS_RW_PLANAR: return op->rw.elems; - case SWS_RW_PACKED: return 1; + case SWS_RW_PLANAR: return op->rw.elems; + case SWS_RW_PACKED: return 1; + case SWS_RW_PALETTE: return 2; } av_unreachable("Invalid read/write mode!"); @@ -384,8 +385,9 @@ void ff_sws_op_list_update_comps(SwsOpList *ops) for (int i = 0; i < op->rw.elems; i++) { int idx = 0; switch (op->rw.mode) { - case SWS_RW_PACKED: idx = i; break; - case SWS_RW_PLANAR: idx = ops->plane_src[i]; break; + case SWS_RW_PALETTE: idx = i; break; + case SWS_RW_PACKED: idx = i; break; + case SWS_RW_PLANAR: idx = ops->plane_src[i]; break; } av_assert0(!(ops->comps_src.flags[idx] & SWS_COMP_GARBAGE)); @@ -869,8 +871,9 
@@ static void print_q4(AVBPrint *bp, const AVRational q4[4], SwsCompMask mask) } static const char *const rw_mode_names[] = { - [SWS_RW_PLANAR] = "planar", - [SWS_RW_PACKED] = "packed", + [SWS_RW_PLANAR] = "planar", + [SWS_RW_PACKED] = "packed", + [SWS_RW_PALETTE] = "palette" }; void ff_sws_op_desc(AVBPrint *bp, const SwsOp *op) diff --git a/libswscale/ops.h b/libswscale/ops.h index 65d9d49e60..ab519117ef 100644 --- a/libswscale/ops.h +++ b/libswscale/ops.h @@ -96,6 +96,7 @@ typedef enum SwsReadWriteMode { */ SWS_RW_PLANAR, /* one plane per component */ SWS_RW_PACKED, /* all components on a single plane */ + SWS_RW_PALETTE, /* plane 0 is 8-bit index, plane 1 is packed palette */ } SwsReadWriteMode; typedef struct SwsReadWriteOp { @@ -106,6 +107,7 @@ typedef struct SwsReadWriteOp { * rgb565 = 1x u16 <- use SWS_OP_UNPACK to unpack * monow = 1x u8 (frac 3) * rgb4 = 1x u8 (frac 1) + * pal8 = 4x u8 (palette) */ SwsReadWriteMode mode; /* how data is laid out in memory */ uint8_t elems; /* number of elements (of type `op.type`) to read/write */ diff --git a/libswscale/uops.c b/libswscale/uops.c index b73aedb6e1..8541649ed0 100644 --- a/libswscale/uops.c +++ b/libswscale/uops.c @@ -495,6 +495,8 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList *ops, SwsUOpFlags flags, if (op->rw.frac) return AVERROR(ENOTSUP); uop.uop = is_read ? SWS_UOP_READ_PACKED : SWS_UOP_WRITE_PACKED; + } else if (op->rw.mode == SWS_RW_PALETTE) { + return AVERROR(ENOTSUP); } else if (op->rw.frac == 3) { uop.uop = is_read ? SWS_UOP_READ_BIT : SWS_UOP_WRITE_BIT; } else if (op->rw.frac == 1) { -- 2.52.0 From eec24173da7dc798ba7c1ae9bc3e7c4e9a9d56ce Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 9 Jun 2026 13:06:04 +0200 Subject: [PATCH 2/8] swscale/format: exclude palette formats for output In theory, we could learn to handle them internally, using the same systematic palette trick, but I'll defer this for now, as vf_scale already handles this internally. 
Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/format.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libswscale/format.c b/libswscale/format.c index d34e9d7be4..7792d17ffc 100644 --- a/libswscale/format.c +++ b/libswscale/format.c @@ -936,7 +936,11 @@ static int test_format_ops(enum AVPixelFormat format, int output) SwsPixelType pixel_type, raw_type; int ret = fmt_analyze(format, &rw, &pack, &swizzle, &shift, &pixel_type, &raw_type); - return ret == 0; + if (ret < 0) + return 0; + if (rw.mode == SWS_RW_PALETTE && output) + return 0; /* palettes are currently only supported as input */ + return 1; } static SwsSwizzleOp swizzle_inv(SwsSwizzleOp swiz) { @@ -1084,6 +1088,9 @@ int ff_sws_encode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt) RET(fmt_analyze(fmt, &rw_op, &pack, &swizzle, &shift, &pixel_type, &raw_type)); + if (rw_op.mode == SWS_RW_PALETTE) + return AVERROR(ENOTSUP); + if (shift.amount) { RET(ff_sws_op_list_append(ops, &(SwsOp) { .op = SWS_OP_LSHIFT, -- 2.52.0 From ea40e7cb3b4a2d2aede015010d6b172fa4134b12 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Wed, 20 May 2026 16:30:32 +0200 Subject: [PATCH 3/8] swscale/ops_dispatch: add support for dispatching palette reads This requires some tiny bit of extra setup work from the dispatch layer. Specifically, we need to arrange for the palette data pointer to end up in exec.in[1], and to disable the pointer advancement logic for this plane (this can be accomplished by just setting the stride and bump to 0). We also want to disable the tail buffer / overflow pixel copying logic for the palette, which can be accomplished by ensuring that p->planes_in only contains the number of *data* planes, excluding the fixed palette. 
Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops_dispatch.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c index 44248195d7..b6cacf16f2 100644 --- a/libswscale/ops_dispatch.c +++ b/libswscale/ops_dispatch.c @@ -45,6 +45,7 @@ typedef struct SwsOpPass { int pixel_bits_out; int idx_in[4]; int idx_out[4]; + int palette_idx; int *offsets_y; int filter_size_h; bool memcpy_first; @@ -267,6 +268,11 @@ static int op_pass_setup(const SwsFrame *out, const SwsFrame *in, exec->out_bump[i] = out->linesize[idx] - loop_size; } + if (p->palette_idx >= 0) { + exec->in[1] = in->data[p->palette_idx]; + exec->in_stride[1] = exec->in_bump[1] = 0; + } + const bool memcpy_in = p->memcpy_first || p->memcpy_last; if (!memcpy_in && !p->memcpy_out) { av_assert0(safe_blocks == num_blocks); @@ -462,8 +468,18 @@ static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y, } } +static int rw_data_planes(const SwsOp *op) +{ + /* Exclude the palette plane from the plane count, since it does not need + * to be directly processed/adjusted by the dispatch layer */ + return op->rw.mode == SWS_RW_PALETTE ? 1 : ff_sws_rw_op_planes(op); +} + static int rw_pixel_bits(const SwsOp *op) { + if (op->rw.mode == SWS_RW_PALETTE) + return 8; /* index size */ + int elems = 0; switch (op->rw.mode) { case SWS_RW_PLANAR: elems = 1; break; @@ -526,8 +542,9 @@ static int compile(SwsGraph *graph, const SwsOpBackend *backend, const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(dst->format); const SwsOp *read = ff_sws_op_list_input(ops); const SwsOp *write = ff_sws_op_list_output(ops); - p->planes_in = ff_sws_rw_op_planes(read); - p->planes_out = ff_sws_rw_op_planes(write); + p->palette_idx = read->rw.mode == SWS_RW_PALETTE ? 
ops->plane_src[1] : -1; + p->planes_in = rw_data_planes(read); + p->planes_out = rw_data_planes(write); p->pixel_bits_in = rw_pixel_bits(read); p->pixel_bits_out = rw_pixel_bits(write); p->exec_base = (SwsOpExec) { -- 2.52.0 From 4ec7cea31d9209e19f47d8e797597b2f086386d3 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 9 Jun 2026 12:44:41 +0200 Subject: [PATCH 4/8] swscale/uops: add SWS_UOP_READ_PALETTE This commit only adds the uop itself; it does not yet add any implementations. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/uops.c | 5 ++++- libswscale/uops.h | 1 + libswscale/uops_macros.h | 8 ++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/libswscale/uops.c b/libswscale/uops.c index 8541649ed0..82682e1a87 100644 --- a/libswscale/uops.c +++ b/libswscale/uops.c @@ -54,6 +54,7 @@ static const struct { UOP_NAME(READ_PACKED, "read_packed"), UOP_NAME(READ_NIBBLE, "read_nibble"), UOP_NAME(READ_BIT, "read_bit"), + UOP_NAME(READ_PALETTE, "read_palette"), UOP_NAME(WRITE_PLANAR, "write_planar"), UOP_NAME(WRITE_PACKED, "write_packed"), UOP_NAME(WRITE_NIBBLE, "write_nibble"), @@ -496,7 +497,9 @@ static int translate_rw_op(SwsContext *ctx, SwsUOpList *ops, SwsUOpFlags flags, return AVERROR(ENOTSUP); uop.uop = is_read ? SWS_UOP_READ_PACKED : SWS_UOP_WRITE_PACKED; } else if (op->rw.mode == SWS_RW_PALETTE) { - return AVERROR(ENOTSUP); + if (op->rw.frac || !is_read) + return AVERROR(ENOTSUP); + uop.uop = SWS_UOP_READ_PALETTE; } else if (op->rw.frac == 3) { uop.uop = is_read ? 
SWS_UOP_READ_BIT : SWS_UOP_WRITE_BIT; } else if (op->rw.frac == 1) { diff --git a/libswscale/uops.h b/libswscale/uops.h index b2e9af30a4..533d036ca6 100644 --- a/libswscale/uops.h +++ b/libswscale/uops.h @@ -96,6 +96,7 @@ typedef enum SwsUOpType { SWS_UOP_READ_PACKED, /* simple packed byte-aligned read */ SWS_UOP_READ_NIBBLE, /* fractional read (4 bits) from single plane */ SWS_UOP_READ_BIT, /* fractional read (1 bit) from single plane */ + SWS_UOP_READ_PALETTE, /* indexed read from palette in plane 1 */ SWS_UOP_WRITE_PLANAR, /* simple planar byte-aligned write */ SWS_UOP_WRITE_PACKED, /* simple packed byte-aligned write */ diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h index f63d046aa3..664575859e 100644 --- a/libswscale/uops_macros.h +++ b/libswscale/uops_macros.h @@ -75,6 +75,8 @@ MACRO(__VA_ARGS__, u8_read_bit_x , SWS_PIXEL_U8 , SWS_UOP_READ_BIT , 0x1) #define SWS_FOR_STRUCT_U8_READ_BIT(MACRO, ...) \ MACRO(__VA_ARGS__, u8_read_bit_x , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_READ_BIT , .mask = 0x1) +#define SWS_FOR_U8_READ_PALETTE(MACRO, ...) +#define SWS_FOR_STRUCT_U8_READ_PALETTE(MACRO, ...) #define SWS_FOR_U8_WRITE_PLANAR(MACRO, ...) \ MACRO(__VA_ARGS__, u8_write_planar_x , SWS_PIXEL_U8 , SWS_UOP_WRITE_PLANAR , 0x1) \ MACRO(__VA_ARGS__, u8_write_planar_xy , SWS_PIXEL_U8 , SWS_UOP_WRITE_PLANAR , 0x3) \ @@ -417,6 +419,8 @@ #define SWS_FOR_STRUCT_U16_READ_NIBBLE(MACRO, ...) #define SWS_FOR_U16_READ_BIT(MACRO, ...) #define SWS_FOR_STRUCT_U16_READ_BIT(MACRO, ...) +#define SWS_FOR_U16_READ_PALETTE(MACRO, ...) +#define SWS_FOR_STRUCT_U16_READ_PALETTE(MACRO, ...) #define SWS_FOR_U16_WRITE_PLANAR(MACRO, ...) \ MACRO(__VA_ARGS__, u16_write_planar_x , SWS_PIXEL_U16, SWS_UOP_WRITE_PLANAR , 0x1) \ MACRO(__VA_ARGS__, u16_write_planar_xy , SWS_PIXEL_U16, SWS_UOP_WRITE_PLANAR , 0x3) \ @@ -721,6 +725,8 @@ #define SWS_FOR_STRUCT_U32_READ_NIBBLE(MACRO, ...) #define SWS_FOR_U32_READ_BIT(MACRO, ...) #define SWS_FOR_STRUCT_U32_READ_BIT(MACRO, ...) 
+#define SWS_FOR_U32_READ_PALETTE(MACRO, ...) +#define SWS_FOR_STRUCT_U32_READ_PALETTE(MACRO, ...) #define SWS_FOR_U32_WRITE_PLANAR(MACRO, ...) \ MACRO(__VA_ARGS__, u32_write_planar_x , SWS_PIXEL_U32, SWS_UOP_WRITE_PLANAR , 0x1) \ MACRO(__VA_ARGS__, u32_write_planar_xy , SWS_PIXEL_U32, SWS_UOP_WRITE_PLANAR , 0x3) \ @@ -1011,6 +1017,8 @@ #define SWS_FOR_STRUCT_F32_READ_NIBBLE(MACRO, ...) #define SWS_FOR_F32_READ_BIT(MACRO, ...) #define SWS_FOR_STRUCT_F32_READ_BIT(MACRO, ...) +#define SWS_FOR_F32_READ_PALETTE(MACRO, ...) +#define SWS_FOR_STRUCT_F32_READ_PALETTE(MACRO, ...) #define SWS_FOR_F32_WRITE_PLANAR(MACRO, ...) #define SWS_FOR_STRUCT_F32_WRITE_PLANAR(MACRO, ...) #define SWS_FOR_F32_WRITE_PACKED(MACRO, ...) -- 2.52.0 From b1cd5e20e0c20c951e0e12544e99ab96a7c71e37 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 9 Jun 2026 12:46:20 +0200 Subject: [PATCH 5/8] swscale/uops_backend: add SWS_UOP_READ_PALETTE reference implementation This does not actually generate any code yet as the macro is still empty, but that will change once I add support for generated palette reads to the format handling code. This logic merely needs to be in place first to avoid introducing broken intermediate states where palette uops are generated but not implemented by the reference backend. 
Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/uops_backend.c | 1 + libswscale/uops_tmpl.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/libswscale/uops_backend.c b/libswscale/uops_backend.c index 50f5302ca6..a9d2ebc6ee 100644 --- a/libswscale/uops_backend.c +++ b/libswscale/uops_backend.c @@ -66,6 +66,7 @@ SWS_FOR(TYPE, READ_PACKED, REF_ENTRY) \ SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY) \ SWS_FOR(TYPE, READ_BIT, REF_ENTRY) \ + SWS_FOR(TYPE, READ_PALETTE, REF_ENTRY) \ SWS_FOR(TYPE, PERMUTE, REF_ENTRY) \ SWS_FOR(TYPE, COPY, REF_ENTRY) \ SWS_FOR(TYPE, WRITE_PLANAR, REF_ENTRY) \ diff --git a/libswscale/uops_tmpl.c b/libswscale/uops_tmpl.c index 44e8551083..214f7a0ef9 100644 --- a/libswscale/uops_tmpl.c +++ b/libswscale/uops_tmpl.c @@ -163,6 +163,24 @@ DECL_READ(read_nibble, const SwsCompMask mask) CONTINUE(x, y, z, w); } +DECL_READ(read_palette, const SwsCompMask mask) +{ + av_assert2(mask == SWS_COMP_ELEMS(4)); + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + const pixel_t index = in0[i]; + const pixel_t *value = &in1[index * 4]; + x[i] = value[0]; + y[i] = value[1]; + z[i] = value[2]; + w[i] = value[3]; + } + + iter->in[0] += SIZEOF_BLOCK; + CONTINUE(x, y, z, w); +} + DECL_WRITE(write_bit, const SwsCompMask mask) { av_assert2(mask == SWS_COMP_ELEMS(1)); @@ -199,6 +217,7 @@ SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar) SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed) SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble) SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit) +SWS_FOR(PX, READ_PALETTE, DECL_IMPL_READ, read_palette) SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar) SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed) SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble) @@ -208,6 +227,7 @@ SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY) SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY) SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY) SWS_FOR_STRUCT(PX, READ_BIT, 
DECL_ENTRY) +SWS_FOR_STRUCT(PX, READ_PALETTE, DECL_ENTRY) SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY) SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY) SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY) -- 2.52.0 From b3231f7c7392429c8b27b399d1179d260b3f0819 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 9 Jun 2026 12:47:31 +0200 Subject: [PATCH 6/8] swscale/format: add support for AV_PIX_FMT_PAL8 This is handled using the new SWS_RW_PALETTE read op mode. We need to be a bit careful to use the correct pixfmt descriptor downstream, because the descriptor for PAL8 itself merely describes the *index*, rather than the actual data values. Accomplish this by introducing a new function to map the palette format to the resulting pixel format after applying the palette (explicitly documented as AV_PIX_FMT_RGB32). +pal8 16x16 -> rgb24 16x16: + [ u8 +++X] SWS_OP_READ : 4 elem(s) palette >> 0 + min: {0 0 0 _}, max: {255 255 255 _} + [ u8 +++X] SWS_OP_SWIZZLE : 2103 + min: {0 0 0 _}, max: {255 255 255 _} + [ u8 XXXX] SWS_OP_WRITE : 3 elem(s) packed >> 0 + (X = unused, z = byteswapped, + = exact, 0 = zero) + translated micro-ops: + u8_read_palette_xyzw + u8_permute_xz_zx + u8_write_packed_xyz ... Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/format.c | 55 +++++++++++++++++++++++++++---------- libswscale/uops_macros.h | 6 ++-- tests/ref/fate/sws-ops-list | 2 +- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/libswscale/format.c b/libswscale/format.c index 7792d17ffc..c9e5fd183b 100644 --- a/libswscale/format.c +++ b/libswscale/format.c @@ -680,10 +680,24 @@ void ff_sws_frame_from_avframe(SwsFrame *dst, const AVFrame *src) #if CONFIG_UNSTABLE +/** + * Returns the underlying descriptor for fake formats like PAL8 whose + * descriptors alone do not fully describe the pixel data. 
+ */ +static inline const AVPixFmtDescriptor *fmt_desc_decoded(enum AVPixelFormat fmt) +{ + if (fmt == AV_PIX_FMT_PAL8) + return av_pix_fmt_desc_get(AV_PIX_FMT_RGB32); + + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + av_assert0(!(desc->flags & AV_PIX_FMT_FLAG_PAL)); + return desc; +} + /* Returns the type suitable for a pixel after fully decoding/unpacking it */ static SwsPixelType fmt_pixel_type(enum AVPixelFormat fmt) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + const AVPixFmtDescriptor *desc = fmt_desc_decoded(fmt); const int bits = FFALIGN(desc->comp[0].depth, 8); if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { switch (bits) { @@ -823,6 +837,18 @@ static FmtInfo fmt_info_irregular(enum AVPixelFormat fmt) case AV_PIX_FMT_XV48LE: case AV_PIX_FMT_XV48BE: return PACKED_FMT(UYVA, 4); + + /* Miscellaneous irregular formats */ + case AV_PIX_FMT_PAL8: + return (FmtInfo) { + .rw = { .elems = 4, .mode = SWS_RW_PALETTE }, + /* PAL8 is explicitly defined as endian-dependent */ + #if AV_HAVE_BIGENDIAN + .swizzle = ARGB, + #else + .swizzle = BGRA, + #endif + }; } return (FmtInfo) {0}; @@ -963,7 +989,7 @@ static SwsSwizzleOp swizzle_inv(SwsSwizzleOp swiz) { */ static SwsClearOp fmt_clear(enum AVPixelFormat fmt) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + const AVPixFmtDescriptor *desc = fmt_desc_decoded(fmt); const bool has_chroma = desc->nb_components >= 3; const bool has_alpha = desc->flags & AV_PIX_FMT_FLAG_ALPHA; @@ -989,7 +1015,7 @@ static SwsClearOp fmt_clear(enum AVPixelFormat fmt) int ff_sws_decode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt) { - const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + const AVPixFmtDescriptor *desc = fmt_desc_decoded(fmt); SwsPixelType pixel_type, raw_type; SwsReadWriteOp rw_op; SwsSwizzleOp swizzle; @@ -1078,7 +1104,7 @@ int ff_sws_decode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt) int ff_sws_encode_pixfmt(SwsOpList *ops, enum AVPixelFormat fmt) { - const 
AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + const AVPixFmtDescriptor *desc = fmt_desc_decoded(fmt); SwsPixelType pixel_type, raw_type; SwsReadWriteOp rw_op; SwsSwizzleOp swizzle; @@ -1161,23 +1187,24 @@ static SwsLinearOp fmt_encode_range(const SwsFormat *fmt, bool *incomplete) { Q0, Q0, Q0, Q1, Q0 }, }}; - const int depth0 = fmt->desc->comp[0].depth; - const int depth1 = fmt->desc->comp[1].depth; - const int depth2 = fmt->desc->comp[2].depth; - const int depth3 = fmt->desc->comp[3].depth; + const AVPixFmtDescriptor *desc = fmt_desc_decoded(fmt->format); + const int depth0 = desc->comp[0].depth; + const int depth1 = desc->comp[1].depth; + const int depth2 = desc->comp[2].depth; + const int depth3 = desc->comp[3].depth; - if (fmt->desc->flags & AV_PIX_FMT_FLAG_FLOAT) + if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) return c; /* floats are directly output as-is */ av_assert0(depth0 < 32 && depth1 < 32 && depth2 < 32 && depth3 < 32); - if (fmt->csp == AVCOL_SPC_RGB || (fmt->desc->flags & AV_PIX_FMT_FLAG_XYZ)) { + if (fmt->csp == AVCOL_SPC_RGB || (desc->flags & AV_PIX_FMT_FLAG_XYZ)) { c.m[0][0] = Q((1 << depth0) - 1); c.m[1][1] = Q((1 << depth1) - 1); c.m[2][2] = Q((1 << depth2) - 1); } else if (fmt->range == AVCOL_RANGE_JPEG) { /* Full range YUV */ c.m[0][0] = Q((1 << depth0) - 1); - if (fmt->desc->nb_components >= 3) { + if (desc->nb_components >= 3) { /* This follows the ITU-R convention, which is slightly different * from the JFIF convention. 
*/ c.m[1][1] = Q((1 << depth1) - 1); @@ -1191,7 +1218,7 @@ static SwsLinearOp fmt_encode_range(const SwsFormat *fmt, bool *incomplete) *incomplete = true; c.m[0][0] = Q(219 << (depth0 - 8)); c.m[0][4] = Q( 16 << (depth0 - 8)); - if (fmt->desc->nb_components >= 3) { + if (desc->nb_components >= 3) { c.m[1][1] = Q(224 << (depth1 - 8)); c.m[2][2] = Q(224 << (depth2 - 8)); c.m[1][4] = Q(128 << (depth1 - 8)); @@ -1199,8 +1226,8 @@ static SwsLinearOp fmt_encode_range(const SwsFormat *fmt, bool *incomplete) } } - if (fmt->desc->flags & AV_PIX_FMT_FLAG_ALPHA) { - const bool is_ya = fmt->desc->nb_components == 2; + if (desc->flags & AV_PIX_FMT_FLAG_ALPHA) { + const bool is_ya = desc->nb_components == 2; c.m[3][3] = Q((1 << (is_ya ? depth1 : depth3)) - 1); } diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h index 664575859e..2ec2d5dcf0 100644 --- a/libswscale/uops_macros.h +++ b/libswscale/uops_macros.h @@ -75,8 +75,10 @@ MACRO(__VA_ARGS__, u8_read_bit_x , SWS_PIXEL_U8 , SWS_UOP_READ_BIT , 0x1) #define SWS_FOR_STRUCT_U8_READ_BIT(MACRO, ...) \ MACRO(__VA_ARGS__, u8_read_bit_x , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_READ_BIT , .mask = 0x1) -#define SWS_FOR_U8_READ_PALETTE(MACRO, ...) -#define SWS_FOR_STRUCT_U8_READ_PALETTE(MACRO, ...) +#define SWS_FOR_U8_READ_PALETTE(MACRO, ...) \ + MACRO(__VA_ARGS__, u8_read_palette_xyzw , SWS_PIXEL_U8 , SWS_UOP_READ_PALETTE , 0xf) +#define SWS_FOR_STRUCT_U8_READ_PALETTE(MACRO, ...) \ + MACRO(__VA_ARGS__, u8_read_palette_xyzw , .type = SWS_PIXEL_U8 , .uop = SWS_UOP_READ_PALETTE , .mask = 0xf) #define SWS_FOR_U8_WRITE_PLANAR(MACRO, ...) 
\ MACRO(__VA_ARGS__, u8_write_planar_x , SWS_PIXEL_U8 , SWS_UOP_WRITE_PLANAR , 0x1) \ MACRO(__VA_ARGS__, u8_write_planar_xy , SWS_PIXEL_U8 , SWS_UOP_WRITE_PLANAR , 0x3) \ diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list index 68a1fc1105..f242b7df31 100644 --- a/tests/ref/fate/sws-ops-list +++ b/tests/ref/fate/sws-ops-list @@ -1 +1 @@ -e2f26cb6df5c11015e613016bb1a004a +6d955baf502cae74c89a4866af8f06d6 -- 2.52.0 From 8195b1d769cd64a01ff98f949f8f2c667d833eb7 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Wed, 20 May 2026 16:44:09 +0200 Subject: [PATCH 7/8] tests/checkasm/sw_ops: add check for SWS_UOP_READ_PALETTE We just need to ensure the palette contains valid data, which will happen automatically as long as plane 1 is large enough. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- tests/checkasm/sw_ops.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/sw_ops.c b/tests/checkasm/sw_ops.c index 99140ced52..fabe9a362c 100644 --- a/tests/checkasm/sw_ops.c +++ b/tests/checkasm/sw_ops.c @@ -177,6 +177,11 @@ static void check_compiled(const char *name, exec.out_bump[i] = exec.out_stride[i] - write_size; } + if (read_op->rw.mode == SWS_RW_PALETTE) { + static_assert(sizeof(src0[1]) >= sizeof(uint32_t[256]), "palette plane too small"); + exec.in_bump[1] = exec.in_stride[1] = 0; + } + int32_t in_bump_y[LINES]; if (read_op->rw.filter.op == SWS_OP_FILTER_V) { const int *offsets = read_op->rw.filter.kernel->offsets; @@ -403,8 +408,9 @@ static void check_read(const char *name, const SwsUOp *uop) switch (uop->uop) { case SWS_UOP_READ_PACKED: case SWS_UOP_READ_BIT: - case SWS_UOP_READ_NIBBLE: mode = SWS_RW_PACKED; break; - case SWS_UOP_READ_PLANAR: mode = SWS_RW_PLANAR; break; + case SWS_UOP_READ_NIBBLE: mode = SWS_RW_PACKED; break; + case SWS_UOP_READ_PLANAR: mode = SWS_RW_PLANAR; break; + case SWS_UOP_READ_PALETTE: mode = SWS_RW_PALETTE; break; default: 
return; } @@ -764,6 +770,7 @@ void checkasm_check_sw_ops(void) CHECK_FOR(READ_PACKED, check_read); CHECK_FOR(READ_NIBBLE, check_read); CHECK_FOR(READ_BIT, check_read); + CHECK_FOR(READ_PALETTE, check_read); CHECK_FOR(WRITE_PLANAR, check_write); CHECK_FOR(WRITE_PACKED, check_write); CHECK_FOR(WRITE_NIBBLE, check_write); -- 2.52.0 From 991aa1409f60f051514693ffb50cb032a410d55a Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Tue, 9 Jun 2026 12:49:20 +0200 Subject: [PATCH 8/8] swscale/x86/ops: add AVX2/SSE4 path for SWS_UOP_READ_PALETTE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AVX2 path is a fairly straightforward vpgatherdd + 4x4 transpose. The SSE4 fallback is an unrolled scalar loop, for lack of anything better to do. checkasm: - CPU: AMD Ryzen 9 9950X3D 16-Core Processor (00B40F40) - Timing source: x86 (rdtsc) - Bench duration: 10000 µs per function (45898205 cycles) - Random seed: 2518020648 Benchmark results: name cycles (vs ref) u8_read_palette_xyzw_c: 2877.5 u8_read_palette_xyzw_x86_sse4: 1951.9 ( 1.47x) u8_read_palette_xyzw_x86_avx2: 1051.6 ( 2.74x) Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 2 ++ libswscale/x86/ops_int.asm | 57 ++++++++++++++++++++++++++++++++ libswscale/x86/uops_macros.asm.h | 1 + 3 files changed, 60 insertions(+) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index 1adb73e21c..c54d213c09 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -312,6 +312,7 @@ static bool uop_is_type_invariant(const SwsUOpType uop) SWS_FOR_STRUCT(TYPE, READ_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \ SWS_FOR_STRUCT(TYPE, READ_NIBBLE, DECL_ENTRY, EXT, NULL, NULL) \ SWS_FOR_STRUCT(TYPE, READ_BIT, DECL_ENTRY, EXT, NULL, NULL) \ +SWS_FOR_STRUCT(TYPE, READ_PALETTE, DECL_ENTRY, EXT, NULL, NULL) \ SWS_FOR_STRUCT(TYPE, WRITE_PACKED, DECL_ENTRY, EXT, NULL, setup_rw_packed) \ SWS_FOR_STRUCT(TYPE, WRITE_NIBBLE, 
DECL_ENTRY, EXT, NULL, NULL) \ SWS_FOR_STRUCT(TYPE, WRITE_BIT, DECL_ENTRY, EXT, NULL, NULL) \ @@ -334,6 +335,7 @@ SWS_FOR_STRUCT(TYPE, DITHER, DECL_ENTRY, EXT, NULL, setup_dither) SWS_FOR(TYPE, READ_PACKED, REF_ENTRY, EXT) \ SWS_FOR(TYPE, READ_NIBBLE, REF_ENTRY, EXT) \ SWS_FOR(TYPE, READ_BIT, REF_ENTRY, EXT) \ + SWS_FOR(TYPE, READ_PALETTE, REF_ENTRY, EXT) \ SWS_FOR(TYPE, WRITE_PACKED, REF_ENTRY, EXT) \ SWS_FOR(TYPE, WRITE_NIBBLE, REF_ENTRY, EXT) \ SWS_FOR(TYPE, WRITE_BIT, REF_ENTRY, EXT) \ diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm index 111e6d0796..ce9ab1fdc9 100644 --- a/libswscale/x86/ops_int.asm +++ b/libswscale/x86/ops_int.asm @@ -282,6 +282,62 @@ IF1 V2, read_packed34 mx2, my2, mz2, mw2, in0q + mmsize * COMPS CONTINUE tmp0q %endmacro +%macro read_pal8 6 ; x, y, z, w, palette, index +%if cpuflag(avx2) + pmovzxbd %1, [%6 + 0] + pmovzxbd %2, [%6 + 8] + pmovzxbd %3, [%6 + 16] + pmovzxbd %4, [%6 + 24] + vperm2i128 m8, %1, %3, q0200 + vperm2i128 m9, %1, %3, q0301 + vperm2i128 m10, %2, %4, q0200 + vperm2i128 m11, %2, %4, q0301 + pcmpeqb m14, m14 + pcmpeqb m15, m15 + vpgatherdd %1, [%5 + 4 * m8], m14 + vpgatherdd %2, [%5 + 4 * m9], m15 + pcmpeqb m14, m14 + pcmpeqb m15, m15 + vpgatherdd %3, [%5 + 4 * m10], m14 + vpgatherdd %4, [%5 + 4 * m11], m15 + pshufb %1, m12 + pshufb %2, m12 + pshufb %3, m12 + pshufb %4, m12 + punpckldq m8, %1, %2 + punpckldq m9, %3, %4 + punpckhdq m10, %1, %2 + punpckhdq m11, %3, %4 + punpcklqdq %1, m8, m9 + punpckhqdq %2, m8, m9 + punpcklqdq %3, m10, m11 + punpckhqdq %4, m10, m11 +%else ; !cpuflag(avx2) + %assign i 0 + %rep 16 + movzx tmp1d, byte [%6 + i] + pinsrb %1, [%5 + 4 * tmp1q + 0], i + pinsrb %2, [%5 + 4 * tmp1q + 1], i + pinsrb %3, [%5 + 4 * tmp1q + 2], i + pinsrb %4, [%5 + 4 * tmp1q + 3], i + %assign i i+1 + %endrep +%endif +%endmacro + +%macro READ_PALETTE 0 +assert COMPS == 4 +assert BITS == 8 + %if cpuflag(avx2) + VBROADCASTI128 m12, [read8_unpack4] + %endif + LOAD_CONT tmp0q + read_pal8 mx, my, mz, mw, 
in1q, in0q +IF1 V2, read_pal8 mx2, my2, mz2, mw2, in1q, in0q + mmsize + add in0q, BLOCK_SIZE + CONTINUE tmp0q +%endmacro + %macro write_packed2 0 %if cpuflag(avx2) vpermq mx, mx, q3120 ; { X0 X2 | X1 X3 } @@ -716,6 +772,7 @@ assert 0, SWS_UOP_DITHER is not implemented for integer types DECL_%1_READ_PACKED (READ_PACKED) DECL_%1_READ_NIBBLE (READ_NIBBLE) DECL_%1_READ_BIT (READ_BIT) + DECL_%1_READ_PALETTE (READ_PALETTE) DECL_%1_WRITE_PACKED (WRITE_PACKED) DECL_%1_WRITE_NIBBLE (WRITE_NIBBLE) DECL_%1_WRITE_BIT (WRITE_BIT) diff --git a/libswscale/x86/uops_macros.asm.h b/libswscale/x86/uops_macros.asm.h index d9565d12f2..fce08f320f 100644 --- a/libswscale/x86/uops_macros.asm.h +++ b/libswscale/x86/uops_macros.asm.h @@ -57,6 +57,7 @@ {DEF_MACRO(READ_PLANAR_FH, TYPE)}, \ {DEF_MACRO(READ_PLANAR_FV, TYPE)}, \ {DEF_MACRO(READ_PLANAR_FV_FMA, TYPE)}, \ + {DEF_MACRO(READ_PALETTE, TYPE)}, \ {DEF_MACRO(WRITE_BIT, TYPE)}, \ {DEF_MACRO(WRITE_NIBBLE, TYPE)}, \ {DEF_MACRO(WRITE_PACKED, TYPE)}, \ -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
