This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch release/8.0 in repository ffmpeg.
commit ce87d2918405ccf1e73d797003496dd2a38b24bc Author: Niklas Haas <[email protected]> AuthorDate: Sun Jun 28 18:10:37 2026 +0200 Commit: Marvin Scholz <[email protected]> CommitDate: Mon Jun 29 14:11:02 2026 +0200 avfilter/vf_scale_cuda: add `use_filters` option This may be faster or slower than the existing specialized kernels, so I opted not to prefer it by default. I also deliberately didn't expose additional filter function capabilites yet. The main motivating reason here is to get correct anti-aliasing behavior when downscaling, which is currently completely broken. Signed-off-by: Niklas Haas <[email protected]> (cherry-picked from commit 5d0748243f87aed58f1f3bfe085518811a9e23ee) Signed-off-by: Marvin Scholz <[email protected]> --- doc/filters.texi | 6 + libavfilter/Makefile | 2 +- libavfilter/vf_scale_cuda.c | 333 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 322 insertions(+), 19 deletions(-) diff --git a/doc/filters.texi b/doc/filters.texi index f0962755df..2c40404c4f 100644 --- a/doc/filters.texi +++ b/doc/filters.texi @@ -26903,6 +26903,12 @@ frame-consumer that exhausts the limited decoder frame pool. If set to 1, frames are passed through as-is if they match the desired output parameters. This is the default behaviour. +@item use_filters +If set to 1, filter with a generic weight LUT instead of using fixed-function +shader kernels. May be faster or slower depending on the hardware. A value +of @code{auto} (the default) enables this automatically when required for +correct anti-aliasing when downscaling. + @item param Algorithm-Specific parameter. diff --git a/libavfilter/Makefile b/libavfilter/Makefile index fa93829a94..e7c598a6b9 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -467,7 +467,7 @@ OBJS-$(CONFIG_ROTATE_FILTER) += vf_rotate.o OBJS-$(CONFIG_SAB_FILTER) += vf_sab.o OBJS-$(CONFIG_SCALE_FILTER) += vf_scale.o scale_eval.o framesync.o OBJS-$(CONFIG_SCALE_D3D11_FILTER) += vf_scale_d3d11.o scale_eval.o -OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o scale_eval.o \ +OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o scale_eval.o scale_filters.o \ vf_scale_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_SCALE_NPP_FILTER) += vf_scale_npp.o scale_eval.o OBJS-$(CONFIG_SCALE_QSV_FILTER) += vf_vpp_qsv.o diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index df7362bde4..ed7bed9ea3 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -22,14 +22,20 @@ #include <float.h> #include <stdio.h> +#include <string.h> +#include "libavutil/avassert.h" #include "libavutil/common.h" #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_cuda_internal.h" #include "libavutil/cuda_check.h" #include "libavutil/internal.h" +#include "libavutil/mem.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" +#include "libavutil/refstruct.h" + +#include "libswscale/filters.h" #include "avfilter.h" #include "filters.h" @@ -81,6 +87,19 @@ enum { INTERP_ALGO_COUNT }; +enum { + FILTER_OUT, + FILTER_TMP, + FILTER_NB, +}; + +typedef struct CUDAScaleFilter { + CUdeviceptr weights; ///< float[dst_size][filter_size] + CUdeviceptr offsets; ///< int[dst_size] + int filter_size; + int dst_size; +} CUDAScaleFilter; + typedef struct CUDAScaleContext { const AVClass *class; @@ -112,14 +131,19 @@ typedef struct CUDAScaleContext { CUcontext cu_ctx; CUmodule cu_module; - CUfunction cu_func; - CUfunction cu_func_uv; + CUfunction cu_func[FILTER_NB]; + CUfunction cu_func_uv[FILTER_NB]; CUstream cu_stream; int interp_algo; int interp_use_linear; int interp_as_integer; + CUDAScaleFilter filters[FILTER_NB]; + CUDAScaleFilter filters_uv[FILTER_NB]; + AVFrame *inter_buf; /* intermediate buffer for separated scaling */ + int use_filters; /* -1 for auto */ + float param; } CUDAScaleContext; @@ -138,23 +162,42 @@ static av_cold int cudascale_init(AVFilterContext *ctx) return 0; } +static void filter_uninit(CudaFunctions *cu, CUDAScaleFilter *filter) +{ + if (filter->weights) + cu->cuMemFree(filter->weights); + if (filter->offsets) + cu->cuMemFree(filter->offsets); + memset(filter, 0, sizeof(*filter)); +} + static av_cold void cudascale_uninit(AVFilterContext *ctx) { CUDAScaleContext *s = ctx->priv; - if (s->hwctx && s->cu_module) { + if (s->hwctx) { CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUcontext dummy; CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); - CHECK_CU(cu->cuModuleUnload(s->cu_module)); - s->cu_module = NULL; + + for (int i = 0; i < FF_ARRAY_ELEMS(s->filters); i++) { + filter_uninit(cu, &s->filters[i]); + filter_uninit(cu, &s->filters_uv[i]); + } + + if (s->cu_module) { + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + } + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); } av_frame_free(&s->frame); av_buffer_unref(&s->frames_ctx); av_frame_free(&s->tmp_frame); + av_frame_free(&s->inter_buf); } static av_cold int init_hwframe_ctx(CUDAScaleContext *s, AVBufferRef *device_ctx, int width, int height) @@ -194,6 +237,50 @@ fail: return ret; } +static av_cold int inter_buf_init(CUDAScaleContext *s, AVBufferRef *device_ctx, + enum AVPixelFormat format, int width, int height) +{ + AVBufferRef *ref = NULL; + AVHWFramesContext *fctx; + int ret; + + ref = av_hwframe_ctx_alloc(device_ctx); + if (!ref) + return AVERROR(ENOMEM); + fctx = (AVHWFramesContext*)ref->data; + + fctx->format = AV_PIX_FMT_CUDA; + fctx->sw_format = format; + fctx->width = FFALIGN(width, 32); + fctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(ref); + if (ret < 0) + goto fail; + + av_assert0(!s->inter_buf); + s->inter_buf = av_frame_alloc(); + if (!s->inter_buf) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = av_hwframe_get_buffer(ref, s->inter_buf, 0); + if (ret < 0) + goto fail; + + s->inter_buf->width = width; + s->inter_buf->height = height; + + av_buffer_unref(&ref); + return 0; + +fail: + av_frame_free(&s->inter_buf); + av_buffer_unref(&ref); + return ret; +} + static int format_is_supported(enum AVPixelFormat fmt) { for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) @@ -285,6 +372,18 @@ static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int if (in_width == out_width && in_height == out_height && in_format == out_format && s->interp_algo == INTERP_ALGO_DEFAULT) s->interp_algo = INTERP_ALGO_NEAREST; + + if (s->interp_algo == INTERP_ALGO_NEAREST) { + s->use_filters = 0; + } else if (s->use_filters < 0 && (in_width < out_width || in_height < out_height)) + s->use_filters = 1; /* downscaling; needed for anti-aliasing */ + + if (s->use_filters) { + ret = inter_buf_init(s, in_frames_ctx->device_ref, in_format, + out_width, in_height); + if (ret < 0) + return ret; + } } outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); @@ -310,6 +409,14 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) extern const unsigned char ff_vf_scale_cuda_ptx_data[]; extern const unsigned int ff_vf_scale_cuda_ptx_len; + if (s->use_filters) { + /* Final pass is always vertical unless not vertically scaling */ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + function_infix = inlink->h == outlink->h ? "Generic_h" : "Generic_v"; + s->interp_use_linear = 0; + s->interp_as_integer = 0; + } else { switch(s->interp_algo) { case INTERP_ALGO_NEAREST: function_infix = "Nearest"; @@ -336,6 +443,7 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n"); return AVERROR_BUG; } + } ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); if (ret < 0) @@ -347,7 +455,7 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) goto fail; snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s", function_infix, in_fmt_name, out_fmt_name); - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, buf)); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_OUT], s->cu_module, buf)); if (ret < 0) { av_log(ctx, AV_LOG_FATAL, "Unsupported conversion: %s -> %s\n", in_fmt_name, out_fmt_name); ret = AVERROR(ENOSYS); @@ -356,17 +464,173 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) av_log(ctx, AV_LOG_DEBUG, "Luma filter: %s (%s -> %s)\n", buf, av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt)); snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, in_fmt_name, out_fmt_name); - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, buf)); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv[FILTER_OUT], s->cu_module, buf)); if (ret < 0) goto fail; av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt)); + if (s->inter_buf) { + /* Intermediate pass is always horizontal */ + snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s", in_fmt_name, in_fmt_name); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_TMP], s->cu_module, buf)); + if (ret < 0) + goto fail; + + snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s_uv", in_fmt_name, in_fmt_name); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv[FILTER_TMP], s->cu_module, buf)); + if (ret < 0) + goto fail; + } + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } +static av_cold int cudascale_filter_init(AVFilterContext *ctx, + CUDAScaleFilter *f, + int src_size, int dst_size, + double virtual_size) +{ + CUDAScaleContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + SwsFilterParams params = { + .scaler_params = { SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT }, + .src_size = src_size, + .dst_size = dst_size, + .virtual_size = virtual_size, + }; + + switch (s->interp_algo) { + case INTERP_ALGO_NEAREST: return 0; /* no weights needed */ + case INTERP_ALGO_BILINEAR: params.scaler = SWS_SCALE_BILINEAR; break; + case INTERP_ALGO_LANCZOS: params.scaler = SWS_SCALE_LANCZOS; break; + case INTERP_ALGO_DEFAULT: + case INTERP_ALGO_BICUBIC: + params.scaler = SWS_SCALE_BICUBIC; + params.scaler_params[0] = params.scaler_params[1] = 0.0; + if (s->param != SCALE_CUDA_PARAM_DEFAULT) + params.scaler_params[1] = s->param; + break; + } + + SwsFilterWeights *weights = NULL; + int ret = ff_sws_filter_generate(ctx, ¶ms, &weights); + if (ret < 0) { + if (ret == AVERROR(ENOTSUP)) { + av_log(ctx, AV_LOG_ERROR, "Filter size exceeds the maximum " + "currently supported by the CUDA scaler (%d).\n", + SWS_FILTER_SIZE_MAX); + } + return ret; + } + + float *tmp = av_malloc_array(weights->num_weights, sizeof(*tmp)); + if (!tmp) { + ret = AVERROR(ENOMEM); + goto fail; + } + for (size_t i = 0; i < weights->num_weights; i++) + tmp[i] = weights->weights[i] / (float) SWS_FILTER_SCALE; + + f->filter_size = weights->filter_size; + f->dst_size = dst_size; + + const size_t weights_size = weights->num_weights * sizeof(*tmp); + ret = CHECK_CU(cu->cuMemAlloc(&f->weights, weights_size)); + if (ret < 0) + goto fail; + ret = CHECK_CU(cu->cuMemcpyHtoD(f->weights, tmp, weights_size)); + if (ret < 0) + goto fail; + + const size_t offsets_size = dst_size * sizeof(*weights->offsets); + ret = CHECK_CU(cu->cuMemAlloc(&f->offsets, offsets_size)); + if (ret < 0) + goto fail; + ret = CHECK_CU(cu->cuMemcpyHtoD(f->offsets, weights->offsets, offsets_size)); + if (ret < 0) + goto fail; + + av_log(ctx, AV_LOG_VERBOSE, " using %d tap '%s' filter: %d -> %d\n", + f->filter_size, weights->name, src_size, dst_size); + + ret = 0; + +fail: + av_free(tmp); + av_refstruct_unref(&weights); + return ret; +} + +static av_cold int cudascale_setup_filters(AVFilterContext *ctx) +{ + CUDAScaleContext *s = ctx->priv; + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + int ret; + + const int sub_x = s->in_desc->log2_chroma_w; + const int sub_y = s->in_desc->log2_chroma_h; + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; + + int pass_x = -1, pass_y = -1; + if (inlink->w != outlink->w && inlink->h != outlink->h) { + /* Always perform the horizontal scaling pass first */ + pass_x = FILTER_TMP; + pass_y = FILTER_OUT; + } else if (inlink->w != outlink->w) { + pass_x = FILTER_OUT; + } else if (inlink->h != outlink->h) { + pass_y = FILTER_OUT; + } + + if (pass_x >= 0) { + ret = cudascale_filter_init(ctx, &s->filters[pass_x], + inlink->w, outlink->w, 0.0); + if (ret < 0) + goto fail; + if (s->in_planes > 1) { + const int src_size = AV_CEIL_RSHIFT(inlink->w, sub_x); + const int dst_size = AV_CEIL_RSHIFT(outlink->w, sub_x); + const double ratio = (double) outlink->w / inlink->w; + ret = cudascale_filter_init(ctx, &s->filters_uv[pass_x], + src_size, dst_size, src_size * ratio); + if (ret < 0) + goto fail; + } + } + + if (pass_y >= 0) { + ret = cudascale_filter_init(ctx, &s->filters[pass_y], + inlink->h, outlink->h, 0.0); + if (ret < 0) + goto fail; + if (s->in_planes > 1) { + const int src_size = AV_CEIL_RSHIFT(inlink->h, sub_y); + const int dst_size = AV_CEIL_RSHIFT(outlink->h, sub_y); + const double ratio = (double) outlink->h / inlink->h; + ret = cudascale_filter_init(ctx, &s->filters_uv[pass_y], + src_size, dst_size, src_size * ratio); + if (ret < 0) + goto fail; + } + } + + ret = 0; + +fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + return ret; +} + static av_cold int cudascale_config_props(AVFilterLink *outlink) { AVFilterContext *ctx = outlink->src; @@ -424,6 +688,12 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink) outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), s->passthrough ? " (passthrough)" : ""); + if (s->use_filters) { + ret = cudascale_setup_filters(ctx); + if (ret < 0) + return ret; + } + ret = cudascale_load_functions(ctx); if (ret < 0) return ret; @@ -436,7 +706,8 @@ fail: static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height, - AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch, int mpeg_range) + AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch, int mpeg_range, + const CUDAScaleFilter *filter) { CUDAScaleContext *s = ctx->priv; CudaFunctions *cu = s->hwctx->internal->cuda_dl; @@ -457,9 +728,15 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, .src_width = src_width, .src_height = src_height, .param = s->param, - .mpeg_range = mpeg_range + .mpeg_range = mpeg_range, }; + if (filter) { + params.weights = filter->weights; + params.offsets = filter->offsets; + params.filter_size = filter->filter_size; + } + void *args[] = { ¶ms }; return CHECK_CU(cu->cuLaunchKernel(func, @@ -467,7 +744,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); } -static int scalecuda_resize(AVFilterContext *ctx, +static int scalecuda_resize(AVFilterContext *ctx, int pass, AVFrame *out, AVFrame *in) { CUDAScaleContext *s = ctx->priv; @@ -476,6 +753,13 @@ static int scalecuda_resize(AVFilterContext *ctx, int i, ret; int mpeg_range = in->color_range != AVCOL_RANGE_JPEG; + const AVPixFmtDescriptor *out_desc = s->out_desc; + int out_planes = s->out_planes; + if (pass == FILTER_TMP) { + out_desc = s->in_desc; + out_planes = s->in_planes; + } + CUtexObject tex[4] = { 0, 0, 0, 0 }; int crop_width = (in->width - in->crop_right) - in->crop_left; @@ -517,23 +801,25 @@ static int scalecuda_resize(AVFilterContext *ctx, } // scale primary plane(s). Usually Y (and A), or single plane of RGB frames. - ret = call_resize_kernel(ctx, s->cu_func, + ret = call_resize_kernel(ctx, s->cu_func[pass], tex, in->crop_left, in->crop_top, crop_width, crop_height, - out, out->width, out->height, out->linesize[0], mpeg_range); + out, out->width, out->height, out->linesize[0], mpeg_range, + &s->filters[pass]); if (ret < 0) goto exit; - if (s->out_planes > 1) { + if (out_planes > 1) { // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane. - ret = call_resize_kernel(ctx, s->cu_func_uv, tex, + ret = call_resize_kernel(ctx, s->cu_func_uv[pass], tex, AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w), AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h), AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w), AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h), out, - AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w), - AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h), - out->linesize[1], mpeg_range); + AV_CEIL_RSHIFT(out->width, out_desc->log2_chroma_w), + AV_CEIL_RSHIFT(out->height, out_desc->log2_chroma_h), + out->linesize[1], mpeg_range, + &s->filters_uv[pass]); if (ret < 0) goto exit; } @@ -555,7 +841,16 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) AVFrame *src = in; int ret; - ret = scalecuda_resize(ctx, s->frame, src); + if (s->inter_buf) { + /* Handle first pass separately */ + s->inter_buf->color_range = in->color_range; + ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in); + if (ret < 0) + return ret; + src = s->inter_buf; + } + + ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src); if (ret < 0) return ret; @@ -650,6 +945,8 @@ static const AVOption options[] = { { "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_LANCZOS }, 0, 0, FLAGS, .unit = "interp_algo" }, { "format", "Output video pixel format", OFFSET(format), AV_OPT_TYPE_PIXEL_FMT, { .i64 = AV_PIX_FMT_NONE }, INT_MIN, INT_MAX, .flags=FLAGS }, { "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS }, + { "use_filters", "Use generic filters instead of fixed function kernels", OFFSET(use_filters), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, FLAGS, .unit = "use_filters" }, + { "auto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = -1}, 0, 0, FLAGS, .unit = "use_filters" }, { "param", "Algorithm-Specific parameter", OFFSET(param), AV_OPT_TYPE_FLOAT, { .dbl = SCALE_CUDA_PARAM_DEFAULT }, -FLT_MAX, FLT_MAX, FLAGS }, { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 2, FLAGS, .unit = "force_oar" }, { "disable", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, 0, 0, FLAGS, .unit = "force_oar" }, _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
