This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 68f3886460e13f32e00a8266852e6e60462a3a08 Author: Niklas Haas <[email protected]> AuthorDate: Fri Feb 27 15:17:24 2026 +0100 Commit: Niklas Haas <[email protected]> CommitDate: Thu Mar 5 23:34:56 2026 +0000 swscale/ops_dispatch: split off compile/dispatch code from ops.c This code is self-contained and logically distinct from the ops-related helpers in ops.c, so it belongs in its own file. Purely cosmetic; no functional change. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/Makefile | 1 + libswscale/ops.c | 381 ------------------------------------------- libswscale/ops_dispatch.c | 408 ++++++++++++++++++++++++++++++++++++++++++++++ libswscale/ops_dispatch.h | 94 +++++++++++ libswscale/ops_internal.h | 66 +------- 5 files changed, 504 insertions(+), 446 deletions(-) diff --git a/libswscale/Makefile b/libswscale/Makefile index c8119e4f43..e76d92d6d2 100644 --- a/libswscale/Makefile +++ b/libswscale/Makefile @@ -30,6 +30,7 @@ OBJS-$(CONFIG_UNSTABLE) += \ ops.o \ ops_backend.o \ ops_chain.o \ + ops_dispatch.o \ ops_memcpy.o \ ops_optimizer.o \ diff --git a/libswscale/ops.c b/libswscale/ops.c index 0c9357e645..d2837d97c7 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -841,384 +841,3 @@ void ff_sws_op_list_print(void *log, int lev, int lev_extra, av_log(log, lev, " (X = unused, z = byteswapped, + = exact, 0 = zero)\n"); } - -int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, - const SwsOpList *ops, SwsCompiledOp *out) -{ - SwsOpList *copy, rest; - SwsCompiledOp compiled = {0}; - int ret = 0; - - copy = ff_sws_op_list_duplicate(ops); - if (!copy) - return AVERROR(ENOMEM); - - /* Ensure these are always set during compilation */ - ff_sws_op_list_update_comps(copy); - - /* Make an on-stack copy of `ops` to ensure we can still properly clean up - * the copy afterwards */ - rest = *copy; - - ret = backend->compile(ctx, &rest, &compiled); - if (ret < 0) { - int msg_lev = ret == 
AVERROR(ENOTSUP) ? AV_LOG_TRACE : AV_LOG_ERROR; - av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n", - backend->name, av_err2str(ret)); - if (rest.num_ops != ops->num_ops) { - av_log(ctx, msg_lev, "Uncompiled remainder:\n"); - ff_sws_op_list_print(ctx, msg_lev, AV_LOG_TRACE, &rest); - } - } else { - *out = compiled; - } - - ff_sws_op_list_free(&copy); - return ret; -} - -int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) -{ - for (int n = 0; ff_sws_op_backends[n]; n++) { - const SwsOpBackend *backend = ff_sws_op_backends[n]; - if (ops->src.hw_format != backend->hw_format || - ops->dst.hw_format != backend->hw_format) - continue; - if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) - continue; - - av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " - "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n", - backend->name, out->block_size, out->over_read, out->over_write, - out->cpu_flags); - return 0; - } - - av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); - ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops); - return AVERROR(ENOTSUP); -} - -typedef struct SwsOpPass { - SwsCompiledOp comp; - SwsOpExec exec_base; - int num_blocks; - int tail_off_in; - int tail_off_out; - int tail_size_in; - int tail_size_out; - int planes_in; - int planes_out; - int pixel_bits_in; - int pixel_bits_out; - int idx_in[4]; - int idx_out[4]; - bool memcpy_in; - bool memcpy_out; -} SwsOpPass; - -static void op_pass_free(void *ptr) -{ - SwsOpPass *p = ptr; - if (!p) - return; - - if (p->comp.free) - p->comp.free(p->comp.priv); - - av_free(p); -} - -static inline void get_row_data(const SwsOpPass *p, const int y, - const uint8_t *in[4], uint8_t *out[4]) -{ - const SwsOpExec *base = &p->exec_base; - for (int i = 0; i < p->planes_in; i++) - in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i]; - for (int i = 0; i < p->planes_out; i++) - out[i] = base->out[i] + (y >> 
base->out_sub_y[i]) * base->out_stride[i]; -} - -static void op_pass_setup(const SwsFrame *out, const SwsFrame *in, - const SwsPass *pass) -{ - const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format); - const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format); - - SwsOpPass *p = pass->priv; - SwsOpExec *exec = &p->exec_base; - const SwsCompiledOp *comp = &p->comp; - const int block_size = comp->block_size; - p->num_blocks = (pass->width + block_size - 1) / block_size; - - /* Set up main loop parameters */ - const int aligned_w = p->num_blocks * block_size; - const int safe_width = (p->num_blocks - 1) * block_size; - const int tail_size = pass->width - safe_width; - p->tail_off_in = safe_width * p->pixel_bits_in >> 3; - p->tail_off_out = safe_width * p->pixel_bits_out >> 3; - p->tail_size_in = tail_size * p->pixel_bits_in >> 3; - p->tail_size_out = tail_size * p->pixel_bits_out >> 3; - p->memcpy_in = false; - p->memcpy_out = false; - - for (int i = 0; i < p->planes_in; i++) { - const int idx = p->idx_in[i]; - const int chroma = idx == 1 || idx == 2; - const int sub_x = chroma ? indesc->log2_chroma_w : 0; - const int sub_y = chroma ? indesc->log2_chroma_h : 0; - const int plane_w = (aligned_w + sub_x) >> sub_x; - const int plane_pad = (comp->over_read + sub_x) >> sub_x; - const int plane_size = plane_w * p->pixel_bits_in >> 3; - if (comp->slice_align) - p->memcpy_in |= plane_size + plane_pad > in->linesize[idx]; - exec->in[i] = in->data[idx]; - exec->in_stride[i] = in->linesize[idx]; - exec->in_sub_y[i] = sub_y; - exec->in_sub_x[i] = sub_x; - } - - for (int i = 0; i < p->planes_out; i++) { - const int idx = p->idx_out[i]; - const int chroma = idx == 1 || idx == 2; - const int sub_x = chroma ? outdesc->log2_chroma_w : 0; - const int sub_y = chroma ? 
outdesc->log2_chroma_h : 0; - const int plane_w = (aligned_w + sub_x) >> sub_x; - const int plane_pad = (comp->over_write + sub_x) >> sub_x; - const int plane_size = plane_w * p->pixel_bits_out >> 3; - if (comp->slice_align) - p->memcpy_out |= plane_size + plane_pad > out->linesize[idx]; - exec->out[i] = out->data[idx]; - exec->out_stride[i] = out->linesize[idx]; - exec->out_sub_y[i] = sub_y; - exec->out_sub_x[i] = sub_x; - } - - /* Pre-fill pointer bump for the main section only; this value does not - * matter at all for the tail / last row handlers because they only ever - * process a single line */ - const int blocks_main = p->num_blocks - p->memcpy_out; - for (int i = 0; i < 4; i++) { - exec->in_bump[i] = exec->in_stride[i] - blocks_main * exec->block_size_in; - exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out; - } - - exec->in_frame = in; - exec->out_frame = out; -} - -/* Dispatch kernel over the last column of the image using memcpy */ -static av_always_inline void -handle_tail(const SwsOpPass *p, SwsOpExec *exec, - const bool copy_out, const bool copy_in, - int y, const int h) -{ - DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])]; - - const SwsOpExec *base = &p->exec_base; - const SwsCompiledOp *comp = &p->comp; - const int tail_size_in = p->tail_size_in; - const int tail_size_out = p->tail_size_out; - const int bx = p->num_blocks - 1; - - const uint8_t *in_data[4]; - uint8_t *out_data[4]; - get_row_data(p, y, in_data, out_data); - - for (int i = 0; i < p->planes_in; i++) { - in_data[i] += p->tail_off_in; - if (copy_in) { - exec->in[i] = (void *) tmp[0][i]; - exec->in_stride[i] = sizeof(tmp[0][i]); - } else { - exec->in[i] = in_data[i]; - } - } - - for (int i = 0; i < p->planes_out; i++) { - out_data[i] += p->tail_off_out; - if (copy_out) { - exec->out[i] = (void *) tmp[1][i]; - exec->out_stride[i] = sizeof(tmp[1][i]); - } else { - exec->out[i] = out_data[i]; - } - } - - for (int y_end = y + h; y < y_end; y++) { - if 
(copy_in) { - for (int i = 0; i < p->planes_in; i++) { - av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]); - memcpy(tmp[0][i], in_data[i], tail_size_in); - in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */ - } - } - - comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1); - - if (copy_out) { - for (int i = 0; i < p->planes_out; i++) { - av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]); - memcpy(out_data[i], tmp[1][i], tail_size_out); - out_data[i] += base->out_stride[i]; - } - } - - for (int i = 0; i < 4; i++) { - if (!copy_in && exec->in[i]) - exec->in[i] += exec->in_stride[i]; - if (!copy_out && exec->out[i]) - exec->out[i] += exec->out_stride[i]; - } - } -} - -static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y, - const int h, const SwsPass *pass) -{ - const SwsOpPass *p = pass->priv; - const SwsCompiledOp *comp = &p->comp; - - /* Fill exec metadata for this slice */ - DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base; - exec.slice_y = y; - exec.slice_h = h; - - /** - * To ensure safety, we need to consider the following: - * - * 1. We can overread the input, unless this is the last line of an - * unpadded buffer. All defined operations can handle arbitrary pixel - * input, so overread of arbitrary data is fine. - * - * 2. We can overwrite the output, as long as we don't write more than the - * amount of pixels that fit into one linesize. So we always need to - * memcpy the last column on the output side if unpadded. - * - * 3. For the last row, we also need to memcpy the remainder of the input, - * to avoid reading past the end of the buffer. Note that since we know - * the run() function is called on stripes of the same buffer, we don't - * need to worry about this for the end of a slice. 
- */ - - const int last_slice = y + h == pass->height; - const bool memcpy_in = last_slice && p->memcpy_in; - const bool memcpy_out = p->memcpy_out; - const int num_blocks = p->num_blocks; - const int blocks_main = num_blocks - memcpy_out; - const int h_main = h - memcpy_in; - - /* Handle main section */ - get_row_data(p, y, exec.in, exec.out); - comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main); - - if (memcpy_in) { - /* Safe part of last row */ - get_row_data(p, y + h_main, exec.in, exec.out); - comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h); - } - - /* Handle last column via memcpy, takes over `exec` so call these last */ - if (memcpy_out) - handle_tail(p, &exec, true, false, y, h_main); - if (memcpy_in) - handle_tail(p, &exec, memcpy_out, true, y + h_main, 1); -} - -static int rw_planes(const SwsOp *op) -{ - return op->rw.packed ? 1 : op->rw.elems; -} - -static int rw_pixel_bits(const SwsOp *op) -{ - const int elems = op->rw.packed ? op->rw.elems : 1; - const int size = ff_sws_pixel_type_size(op->type); - const int bits = 8 >> op->rw.frac; - av_assert1(bits >= 1); - return elems * size * bits; -} - -static int compile(SwsGraph *graph, const SwsOpList *ops, - const SwsFormat *dst, SwsPass *input, SwsPass **output) -{ - SwsContext *ctx = graph->ctx; - SwsOpPass *p = av_mallocz(sizeof(*p)); - if (!p) - return AVERROR(ENOMEM); - - int ret = ff_sws_ops_compile(ctx, ops, &p->comp); - if (ret < 0) - goto fail; - - const SwsOp *read = &ops->ops[0]; - const SwsOp *write = &ops->ops[ops->num_ops - 1]; - p->planes_in = rw_planes(read); - p->planes_out = rw_planes(write); - p->pixel_bits_in = rw_pixel_bits(read); - p->pixel_bits_out = rw_pixel_bits(write); - p->exec_base = (SwsOpExec) { - .width = dst->width, - .height = dst->height, - .block_size_in = p->comp.block_size * p->pixel_bits_in >> 3, - .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3, - }; - - for (int i = 0; i < 4; i++) { - p->idx_in[i] = i < p->planes_in ? 
ops->order_src.in[i] : -1; - p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1; - } - - SwsPass *pass; - pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height, - input, p->comp.slice_align, p, op_pass_run); - if (!pass) { - ret = AVERROR(ENOMEM); - goto fail; - } - pass->setup = op_pass_setup; - pass->free = op_pass_free; - - *output = pass; - return 0; - -fail: - op_pass_free(p); - return ret; -} - -int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, - const SwsFormat *dst, SwsPass *input, SwsPass **output) -{ - SwsContext *ctx = graph->ctx; - const SwsOp *read = &ops->ops[0]; - const SwsOp *write = &ops->ops[ops->num_ops - 1]; - int ret; - - /* Check if the whole operation graph is an end-to-end no-op */ - if (ff_sws_op_list_is_noop(ops)) { - *output = input; - return 0; - } - - if (ops->num_ops < 2) { - av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n"); - return AVERROR(EINVAL); - } - - if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) { - av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " - "and write, respectively.\n"); - return AVERROR(EINVAL); - } - - if (flags & SWS_OP_FLAG_OPTIMIZE) { - ret = ff_sws_op_list_optimize(ops); - if (ret < 0) - return ret; - } else { - ff_sws_op_list_update_comps(ops); - } - - return compile(graph, ops, dst, input, output); -} diff --git a/libswscale/ops_dispatch.c b/libswscale/ops_dispatch.c new file mode 100644 index 0000000000..3faa677858 --- /dev/null +++ b/libswscale/ops_dispatch.c @@ -0,0 +1,408 @@ +/** + * Copyright (C) 2025 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/avassert.h" +#include "libavutil/mem.h" +#include "libavutil/mem_internal.h" + +#include "ops.h" +#include "ops_internal.h" +#include "ops_dispatch.h" + +typedef struct SwsOpPass { + SwsCompiledOp comp; + SwsOpExec exec_base; + int num_blocks; + int tail_off_in; + int tail_off_out; + int tail_size_in; + int tail_size_out; + int planes_in; + int planes_out; + int pixel_bits_in; + int pixel_bits_out; + int idx_in[4]; + int idx_out[4]; + bool memcpy_in; + bool memcpy_out; +} SwsOpPass; + +int ff_sws_ops_compile_backend(SwsContext *ctx, const SwsOpBackend *backend, + const SwsOpList *ops, SwsCompiledOp *out) +{ + SwsOpList *copy, rest; + SwsCompiledOp compiled = {0}; + int ret = 0; + + copy = ff_sws_op_list_duplicate(ops); + if (!copy) + return AVERROR(ENOMEM); + + /* Ensure these are always set during compilation */ + ff_sws_op_list_update_comps(copy); + + /* Make an on-stack copy of `ops` to ensure we can still properly clean up + * the copy afterwards */ + rest = *copy; + + ret = backend->compile(ctx, &rest, &compiled); + if (ret < 0) { + int msg_lev = ret == AVERROR(ENOTSUP) ? 
AV_LOG_TRACE : AV_LOG_ERROR; + av_log(ctx, msg_lev, "Backend '%s' failed to compile operations: %s\n", + backend->name, av_err2str(ret)); + if (rest.num_ops != ops->num_ops) { + av_log(ctx, msg_lev, "Uncompiled remainder:\n"); + ff_sws_op_list_print(ctx, msg_lev, AV_LOG_TRACE, &rest); + } + } else { + *out = compiled; + } + + ff_sws_op_list_free(&copy); + return ret; +} + +int ff_sws_ops_compile(SwsContext *ctx, const SwsOpList *ops, SwsCompiledOp *out) +{ + for (int n = 0; ff_sws_op_backends[n]; n++) { + const SwsOpBackend *backend = ff_sws_op_backends[n]; + if (ops->src.hw_format != backend->hw_format || + ops->dst.hw_format != backend->hw_format) + continue; + if (ff_sws_ops_compile_backend(ctx, backend, ops, out) < 0) + continue; + + av_log(ctx, AV_LOG_VERBOSE, "Compiled using backend '%s': " + "block size = %d, over-read = %d, over-write = %d, cpu flags = 0x%x\n", + backend->name, out->block_size, out->over_read, out->over_write, + out->cpu_flags); + return 0; + } + + av_log(ctx, AV_LOG_WARNING, "No backend found for operations:\n"); + ff_sws_op_list_print(ctx, AV_LOG_WARNING, AV_LOG_TRACE, ops); + return AVERROR(ENOTSUP); +} + +static void op_pass_free(void *ptr) +{ + SwsOpPass *p = ptr; + if (!p) + return; + + if (p->comp.free) + p->comp.free(p->comp.priv); + + av_free(p); +} + +static inline void get_row_data(const SwsOpPass *p, const int y, + const uint8_t *in[4], uint8_t *out[4]) +{ + const SwsOpExec *base = &p->exec_base; + for (int i = 0; i < p->planes_in; i++) + in[i] = base->in[i] + (y >> base->in_sub_y[i]) * base->in_stride[i]; + for (int i = 0; i < p->planes_out; i++) + out[i] = base->out[i] + (y >> base->out_sub_y[i]) * base->out_stride[i]; +} + +static void op_pass_setup(const SwsFrame *out, const SwsFrame *in, + const SwsPass *pass) +{ + const AVPixFmtDescriptor *indesc = av_pix_fmt_desc_get(in->format); + const AVPixFmtDescriptor *outdesc = av_pix_fmt_desc_get(out->format); + + SwsOpPass *p = pass->priv; + SwsOpExec *exec = &p->exec_base; + const 
SwsCompiledOp *comp = &p->comp; + const int block_size = comp->block_size; + p->num_blocks = (pass->width + block_size - 1) / block_size; + + /* Set up main loop parameters */ + const int aligned_w = p->num_blocks * block_size; + const int safe_width = (p->num_blocks - 1) * block_size; + const int tail_size = pass->width - safe_width; + p->tail_off_in = safe_width * p->pixel_bits_in >> 3; + p->tail_off_out = safe_width * p->pixel_bits_out >> 3; + p->tail_size_in = tail_size * p->pixel_bits_in >> 3; + p->tail_size_out = tail_size * p->pixel_bits_out >> 3; + p->memcpy_in = false; + p->memcpy_out = false; + + for (int i = 0; i < p->planes_in; i++) { + const int idx = p->idx_in[i]; + const int chroma = idx == 1 || idx == 2; + const int sub_x = chroma ? indesc->log2_chroma_w : 0; + const int sub_y = chroma ? indesc->log2_chroma_h : 0; + const int plane_w = (aligned_w + sub_x) >> sub_x; + const int plane_pad = (comp->over_read + sub_x) >> sub_x; + const int plane_size = plane_w * p->pixel_bits_in >> 3; + if (comp->slice_align) + p->memcpy_in |= plane_size + plane_pad > in->linesize[idx]; + exec->in[i] = in->data[idx]; + exec->in_stride[i] = in->linesize[idx]; + exec->in_sub_y[i] = sub_y; + exec->in_sub_x[i] = sub_x; + } + + for (int i = 0; i < p->planes_out; i++) { + const int idx = p->idx_out[i]; + const int chroma = idx == 1 || idx == 2; + const int sub_x = chroma ? outdesc->log2_chroma_w : 0; + const int sub_y = chroma ? 
outdesc->log2_chroma_h : 0; + const int plane_w = (aligned_w + sub_x) >> sub_x; + const int plane_pad = (comp->over_write + sub_x) >> sub_x; + const int plane_size = plane_w * p->pixel_bits_out >> 3; + if (comp->slice_align) + p->memcpy_out |= plane_size + plane_pad > out->linesize[idx]; + exec->out[i] = out->data[idx]; + exec->out_stride[i] = out->linesize[idx]; + exec->out_sub_y[i] = sub_y; + exec->out_sub_x[i] = sub_x; + } + + /* Pre-fill pointer bump for the main section only; this value does not + * matter at all for the tail / last row handlers because they only ever + * process a single line */ + const int blocks_main = p->num_blocks - p->memcpy_out; + for (int i = 0; i < 4; i++) { + exec->in_bump[i] = exec->in_stride[i] - blocks_main * exec->block_size_in; + exec->out_bump[i] = exec->out_stride[i] - blocks_main * exec->block_size_out; + } + + exec->in_frame = in; + exec->out_frame = out; +} + +/* Dispatch kernel over the last column of the image using memcpy */ +static av_always_inline void +handle_tail(const SwsOpPass *p, SwsOpExec *exec, + const bool copy_out, const bool copy_in, + int y, const int h) +{ + DECLARE_ALIGNED_64(uint8_t, tmp)[2][4][sizeof(uint32_t[128])]; + + const SwsOpExec *base = &p->exec_base; + const SwsCompiledOp *comp = &p->comp; + const int tail_size_in = p->tail_size_in; + const int tail_size_out = p->tail_size_out; + const int bx = p->num_blocks - 1; + + const uint8_t *in_data[4]; + uint8_t *out_data[4]; + get_row_data(p, y, in_data, out_data); + + for (int i = 0; i < p->planes_in; i++) { + in_data[i] += p->tail_off_in; + if (copy_in) { + exec->in[i] = (void *) tmp[0][i]; + exec->in_stride[i] = sizeof(tmp[0][i]); + } else { + exec->in[i] = in_data[i]; + } + } + + for (int i = 0; i < p->planes_out; i++) { + out_data[i] += p->tail_off_out; + if (copy_out) { + exec->out[i] = (void *) tmp[1][i]; + exec->out_stride[i] = sizeof(tmp[1][i]); + } else { + exec->out[i] = out_data[i]; + } + } + + for (int y_end = y + h; y < y_end; y++) { + if 
(copy_in) { + for (int i = 0; i < p->planes_in; i++) { + av_assert2(tmp[0][i] + tail_size_in < (uint8_t *) tmp[1]); + memcpy(tmp[0][i], in_data[i], tail_size_in); + in_data[i] += base->in_stride[i]; /* exec->in_stride was clobbered */ + } + } + + comp->func(exec, comp->priv, bx, y, p->num_blocks, y + 1); + + if (copy_out) { + for (int i = 0; i < p->planes_out; i++) { + av_assert2(tmp[1][i] + tail_size_out < (uint8_t *) tmp[2]); + memcpy(out_data[i], tmp[1][i], tail_size_out); + out_data[i] += base->out_stride[i]; + } + } + + for (int i = 0; i < 4; i++) { + if (!copy_in && exec->in[i]) + exec->in[i] += exec->in_stride[i]; + if (!copy_out && exec->out[i]) + exec->out[i] += exec->out_stride[i]; + } + } +} + +static void op_pass_run(const SwsFrame *out, const SwsFrame *in, const int y, + const int h, const SwsPass *pass) +{ + const SwsOpPass *p = pass->priv; + const SwsCompiledOp *comp = &p->comp; + + /* Fill exec metadata for this slice */ + DECLARE_ALIGNED_32(SwsOpExec, exec) = p->exec_base; + exec.slice_y = y; + exec.slice_h = h; + + /** + * To ensure safety, we need to consider the following: + * + * 1. We can overread the input, unless this is the last line of an + * unpadded buffer. All defined operations can handle arbitrary pixel + * input, so overread of arbitrary data is fine. + * + * 2. We can overwrite the output, as long as we don't write more than the + * amount of pixels that fit into one linesize. So we always need to + * memcpy the last column on the output side if unpadded. + * + * 3. For the last row, we also need to memcpy the remainder of the input, + * to avoid reading past the end of the buffer. Note that since we know + * the run() function is called on stripes of the same buffer, we don't + * need to worry about this for the end of a slice. 
+ */ + + const int last_slice = y + h == pass->height; + const bool memcpy_in = last_slice && p->memcpy_in; + const bool memcpy_out = p->memcpy_out; + const int num_blocks = p->num_blocks; + const int blocks_main = num_blocks - memcpy_out; + const int h_main = h - memcpy_in; + + /* Handle main section */ + get_row_data(p, y, exec.in, exec.out); + comp->func(&exec, comp->priv, 0, y, blocks_main, y + h_main); + + if (memcpy_in) { + /* Safe part of last row */ + get_row_data(p, y + h_main, exec.in, exec.out); + comp->func(&exec, comp->priv, 0, y + h_main, num_blocks - 1, y + h); + } + + /* Handle last column via memcpy, takes over `exec` so call these last */ + if (memcpy_out) + handle_tail(p, &exec, true, false, y, h_main); + if (memcpy_in) + handle_tail(p, &exec, memcpy_out, true, y + h_main, 1); +} + +static int rw_planes(const SwsOp *op) +{ + return op->rw.packed ? 1 : op->rw.elems; +} + +static int rw_pixel_bits(const SwsOp *op) +{ + const int elems = op->rw.packed ? op->rw.elems : 1; + const int size = ff_sws_pixel_type_size(op->type); + const int bits = 8 >> op->rw.frac; + av_assert1(bits >= 1); + return elems * size * bits; +} + +static int compile(SwsGraph *graph, const SwsOpList *ops, + const SwsFormat *dst, SwsPass *input, SwsPass **output) +{ + SwsContext *ctx = graph->ctx; + SwsOpPass *p = av_mallocz(sizeof(*p)); + if (!p) + return AVERROR(ENOMEM); + + int ret = ff_sws_ops_compile(ctx, ops, &p->comp); + if (ret < 0) + goto fail; + + const SwsOp *read = &ops->ops[0]; + const SwsOp *write = &ops->ops[ops->num_ops - 1]; + p->planes_in = rw_planes(read); + p->planes_out = rw_planes(write); + p->pixel_bits_in = rw_pixel_bits(read); + p->pixel_bits_out = rw_pixel_bits(write); + p->exec_base = (SwsOpExec) { + .width = dst->width, + .height = dst->height, + .block_size_in = p->comp.block_size * p->pixel_bits_in >> 3, + .block_size_out = p->comp.block_size * p->pixel_bits_out >> 3, + }; + + for (int i = 0; i < 4; i++) { + p->idx_in[i] = i < p->planes_in ? 
ops->order_src.in[i] : -1; + p->idx_out[i] = i < p->planes_out ? ops->order_dst.in[i] : -1; + } + + SwsPass *pass; + pass = ff_sws_graph_add_pass(graph, dst->format, dst->width, dst->height, + input, p->comp.slice_align, p, op_pass_run); + if (!pass) { + ret = AVERROR(ENOMEM); + goto fail; + } + pass->setup = op_pass_setup; + pass->free = op_pass_free; + + *output = pass; + return 0; + +fail: + op_pass_free(p); + return ret; +} + +int ff_sws_compile_pass(SwsGraph *graph, SwsOpList *ops, int flags, + const SwsFormat *dst, SwsPass *input, SwsPass **output) +{ + SwsContext *ctx = graph->ctx; + const SwsOp *read = &ops->ops[0]; + const SwsOp *write = &ops->ops[ops->num_ops - 1]; + int ret; + + /* Check if the whole operation graph is an end-to-end no-op */ + if (ff_sws_op_list_is_noop(ops)) { + *output = input; + return 0; + } + + if (ops->num_ops < 2) { + av_log(ctx, AV_LOG_ERROR, "Need at least two operations.\n"); + return AVERROR(EINVAL); + } + + if (read->op != SWS_OP_READ || write->op != SWS_OP_WRITE) { + av_log(ctx, AV_LOG_ERROR, "First and last operations must be a read " + "and write, respectively.\n"); + return AVERROR(EINVAL); + } + + if (flags & SWS_OP_FLAG_OPTIMIZE) { + ret = ff_sws_op_list_optimize(ops); + if (ret < 0) + return ret; + } else { + ff_sws_op_list_update_comps(ops); + } + + return compile(graph, ops, dst, input, output); +} diff --git a/libswscale/ops_dispatch.h b/libswscale/ops_dispatch.h new file mode 100644 index 0000000000..270ff09e31 --- /dev/null +++ b/libswscale/ops_dispatch.h @@ -0,0 +1,94 @@ +/** + * Copyright (C) 2026 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef SWSCALE_OPS_DISPATCH_H +#define SWSCALE_OPS_DISPATCH_H + +#include <assert.h> + +#include "libavutil/frame.h" +#include "graph.h" + +/** + * Global execution context for all compiled functions. + * + * Note: This struct is hard-coded in assembly, so do not change the layout + * without updating the corresponding assembly definitions. + */ +typedef struct SwsOpExec { + /* The data pointers point to the first pixel to process */ + const uint8_t *in[4]; + uint8_t *out[4]; + + /* Separation between lines in bytes */ + ptrdiff_t in_stride[4]; + ptrdiff_t out_stride[4]; + + /* Pointer bump, difference between stride and processed line size */ + ptrdiff_t in_bump[4]; + ptrdiff_t out_bump[4]; + + /* Extra metadata, may or may not be useful */ + int32_t width, height; /* Overall image dimensions */ + int32_t slice_y, slice_h; /* Start and height of current slice */ + int32_t block_size_in; /* Size of a block of pixels in bytes */ + int32_t block_size_out; + + /* Subsampling factors for each plane */ + uint8_t in_sub_y[4], out_sub_y[4]; + uint8_t in_sub_x[4], out_sub_x[4]; + + /* Pointers back to the original SwsFrame */ + const SwsFrame *in_frame; + const SwsFrame *out_frame; +} SwsOpExec; + +static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) + + 6 * sizeof(int32_t) + + 16 * sizeof(uint8_t) + + 2 * sizeof(void *), + "SwsOpExec layout mismatch"); + +/** + * Process a given range of pixel blocks. + * + * Note: `bx_start` and `bx_end` are in units of `SwsCompiledOp.block_size`. 
+ */ +typedef void (*SwsOpFunc)(const SwsOpExec *exec, const void *priv, + int bx_start, int y_start, int bx_end, int y_end); + +#define SWS_DECL_FUNC(NAME) \ + void NAME(const SwsOpExec *, const void *, int, int, int, int) + +typedef struct SwsCompiledOp { + SwsOpFunc func; + + int slice_align; /* slice height alignment */ + int block_size; /* number of pixels processed per iteration */ + int over_read; /* implementation over-reads input by this many bytes */ + int over_write; /* implementation over-writes output by this many bytes */ + int cpu_flags; /* active set of CPU flags (informative) */ + + /* Arbitrary private data */ + void *priv; + void (*free)(void *priv); +} SwsCompiledOp; + +#endif /* SWSCALE_OPS_DISPATCH_H */ diff --git a/libswscale/ops_internal.h b/libswscale/ops_internal.h index 808b9ba4e0..3db850c290 100644 --- a/libswscale/ops_internal.h +++ b/libswscale/ops_internal.h @@ -24,6 +24,7 @@ #include "libavutil/mem_internal.h" #include "ops.h" +#include "ops_dispatch.h" #define Q(N) ((AVRational) { N, 1 }) @@ -51,71 +52,6 @@ static inline void ff_sws_pack_op_decode(const SwsOp *op, uint64_t mask[4], int } } -/** - * Global execution context for all compiled functions. - * - * Note: This struct is hard-coded in assembly, so do not change the layout - * without updating the corresponding assembly definitions. 
- */ -typedef struct SwsOpExec { - /* The data pointers point to the first pixel to process */ - const uint8_t *in[4]; - uint8_t *out[4]; - - /* Separation between lines in bytes */ - ptrdiff_t in_stride[4]; - ptrdiff_t out_stride[4]; - - /* Pointer bump, difference between stride and processed line size */ - ptrdiff_t in_bump[4]; - ptrdiff_t out_bump[4]; - - /* Extra metadata, may or may not be useful */ - int32_t width, height; /* Overall image dimensions */ - int32_t slice_y, slice_h; /* Start and height of current slice */ - int32_t block_size_in; /* Size of a block of pixels in bytes */ - int32_t block_size_out; - - /* Subsampling factors for each plane */ - uint8_t in_sub_y[4], out_sub_y[4]; - uint8_t in_sub_x[4], out_sub_x[4]; - - /* Pointers back to the original SwsFrame */ - const SwsFrame *in_frame; - const SwsFrame *out_frame; -} SwsOpExec; - -static_assert(sizeof(SwsOpExec) == 24 * sizeof(void *) + - 6 * sizeof(int32_t) + - 16 * sizeof(uint8_t) + - 2 * sizeof(void *), - "SwsOpExec layout mismatch"); - -/** - * Process a given range of pixel blocks. - * - * Note: `bx_start` and `bx_end` are in units of `SwsCompiledOp.block_size`. 
- */ -typedef void (*SwsOpFunc)(const SwsOpExec *exec, const void *priv, - int bx_start, int y_start, int bx_end, int y_end); - -#define SWS_DECL_FUNC(NAME) \ - void NAME(const SwsOpExec *, const void *, int, int, int, int) - -typedef struct SwsCompiledOp { - SwsOpFunc func; - - int slice_align; /* slice height alignment */ - int block_size; /* number of pixels processed per iteration */ - int over_read; /* implementation over-reads input by this many bytes */ - int over_write; /* implementation over-writes output by this many bytes */ - int cpu_flags; /* active set of CPU flags (informative) */ - - /* Arbitrary private data */ - void *priv; - void (*free)(void *priv); -} SwsCompiledOp; - typedef struct SwsOpBackend { const char *name; /* Descriptive name for this backend */ _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
