[FFmpeg-devel] [PR] avfilter/scale_cude: convolve using generic separated filter kernel weights (PR #23559)

Niklas Haas via ffmpeg-devel Mon, 22 Jun 2026 10:55:18 -0700

PR #23559 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23559
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23559.patch


This fixes downscaling case by implementing proper anti-aliasing.


>From ae542f0c67b1f9fe9ae687d5927a72b3eeb6ed1e Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 22 Jun 2026 16:20:14 +0200
Subject: [PATCH 1/4] avfilter/scale_filters: add internal copy of
 libswscale/filters.c

Useful for GPU-based filters, which may also need to compute filter weights.
Since we cannot cross-link to internal functions, we need to recompile this
helper inside libavfilter.c.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/scale_filters.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 libavfilter/scale_filters.c

diff --git a/libavfilter/scale_filters.c b/libavfilter/scale_filters.c
new file mode 100644
index 0000000000..347d073210
--- /dev/null
+++ b/libavfilter/scale_filters.c
@@ -0,0 +1,19 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libswscale/filters.c"
-- 
2.52.0


>From 12fc0251ba6fe1728fd3f7ca817d5c9b05d510d3 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 22 Jun 2026 18:43:59 +0200
Subject: [PATCH 2/4] avfilter/vf_scale_cuda: generalize kernel signature to
 accept weights

Ignored for now by the existing fixed function kernels.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.cu | 27 ++++++++++++++++++++-------
 libavfilter/vf_scale_cuda.h  |  5 +++++
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index ca7955344b..032b0e9c48 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -28,7 +28,9 @@ using subsample_function_t = T (*)(cudaTextureObject_t tex, 
int xo, int yo,
                                    int dst_width, int dst_height,
                                    int src_left, int src_top,
                                    int src_width, int src_height,
-                                   int bit_depth, float param);
+                                   int bit_depth, float param,
+                                   const float *weights, const int *offsets,
+                                   int filter_size);
 
 // --- CONVERSION LOGIC ---
 
@@ -90,14 +92,16 @@ static inline __device__ ushort conv_16to10pl(ushort in)
     __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], 
int xo, int yo, \
                                     int dst_width, int dst_height, int 
dst_pitch,              \
                                     int src_left, int src_top, int src_width, 
int src_height,  \
-                                    float param, int mpeg_range)
+                                    float param, int mpeg_range,               
                \
+                                    const float *weights, const int *offsets, 
int filter_size)
 
 #define SUB_F(m, plane) \
     subsample_func_##m(src_tex[plane], xo, yo, \
                        dst_width, dst_height,  \
                        src_left, src_top,      \
                        src_width, src_height,  \
-                       in_bit_depth, param)
+                       in_bit_depth, param,    \
+                       weights, offsets, filter_size)
 
 // FFmpeg passes pitch in bytes, CUDA uses potentially larger types
 #define FIXED_PITCH \
@@ -1095,7 +1099,9 @@ __device__ static inline T 
Subsample_Nearest(cudaTextureObject_t tex,
                                              int dst_width, int dst_height,
                                              int src_left, int src_top,
                                              int src_width, int src_height,
-                                             int bit_depth, float param)
+                                             int bit_depth, float param,
+                                             const float *weights, const int 
*offsets,
+                                             int filter_size)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
@@ -1111,7 +1117,9 @@ __device__ static inline T 
Subsample_Bilinear(cudaTextureObject_t tex,
                                               int dst_width, int dst_height,
                                               int src_left, int src_top,
                                               int src_width, int src_height,
-                                              int bit_depth, float param)
+                                              int bit_depth, float param,
+                                              const float *weights, const int 
*offsets,
+                                              int filter_size)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
@@ -1143,7 +1151,9 @@ __device__ static inline T 
Subsample_Bicubic(cudaTextureObject_t tex,
                                              int dst_width, int dst_height,
                                              int src_left, int src_top,
                                              int src_width, int src_height,
-                                             int bit_depth, float param)
+                                             int bit_depth, float param,
+                                             const float *weights, const int 
*offsets,
+                                             int filter_size)
 {
     float hscale = (float)src_width / (float)dst_width;
     float vscale = (float)src_height / (float)dst_height;
@@ -1197,7 +1207,10 @@ __device__ static inline T 
Subsample_Bicubic(cudaTextureObject_t tex,
         params.dst_width, params.dst_height, params.dst_pitch, \
         params.src_left, params.src_top,                \
         params.src_width, params.src_height,            \
-        params.param, params.mpeg_range);
+        params.param, params.mpeg_range,                \
+        (const float*) params.weights,                  \
+        (const int*) params.offsets,                    \
+        params.filter_size);
 
 extern "C" {
 
diff --git a/libavfilter/vf_scale_cuda.h b/libavfilter/vf_scale_cuda.h
index d685f73072..391631699e 100644
--- a/libavfilter/vf_scale_cuda.h
+++ b/libavfilter/vf_scale_cuda.h
@@ -44,6 +44,11 @@ typedef struct {
     int src_height;
     float param;
     int mpeg_range;
+
+    /* Weights for the generic filter kernel */
+    CUdeviceptr weights;
+    CUdeviceptr offsets;
+    int filter_size;
 } CUDAScaleKernelParams;
 
 #endif
-- 
2.52.0


>From 19491071cf22fc09bd36dddb581f8e5c7df44f73 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 22 Jun 2026 18:51:01 +0200
Subject: [PATCH 3/4] avfilter/vf_scale_cuda: add generic 1D filter kernel

This can be useful for any sort of separable filtering with arbitrary
weights.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.cu | 79 ++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index 032b0e9c48..01398c3db8 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -1186,6 +1186,43 @@ __device__ static inline T 
Subsample_Bicubic(cudaTextureObject_t tex,
 #undef PIX
 }
 
+enum ScaleDir {
+    SCALE_DIR_X,
+    SCALE_DIR_Y,
+};
+
+template<typename T, int dir>
+__device__ static inline T Subsample_Generic(cudaTextureObject_t tex,
+                                             int xo, int yo,
+                                             int dst_width, int dst_height,
+                                             int src_left, int src_top,
+                                             int src_width, int src_height,
+                                             int bit_depth, float param,
+                                             const float *weights, const int 
*offsets,
+                                             int filter_size)
+{
+    const float factor = bit_depth > 8 ? 0xFFFF : 0xFF;
+
+    floatT sum;
+    vec_set_scalar(sum, 0.0f);
+
+    if (dir == SCALE_DIR_X) {
+        const float *row = &weights[xo * filter_size];
+        const float x = 0.5f + src_left + offsets[xo];
+        const float y = 0.5f + src_top  + yo;
+        for (int i = 0; i < filter_size; i++)
+            sum += tex2D<floatT>(tex, x + i, y) * row[i];
+    } else {
+        const float *col = &weights[yo * filter_size];
+        const float x = 0.5f + src_left + xo;
+        const float y = 0.5f + src_top  + offsets[yo];
+        for (int i = 0; i < filter_size; i++)
+            sum += tex2D<floatT>(tex, x, y + i) * col[i];
+    }
+
+    return from_floatN<T, floatT>(sum * factor);
+}
+
 /// --- FUNCTION EXPORTS ---
 
 #define KERNEL_ARGS(T) CUDAScaleKernelParams params
@@ -1373,4 +1410,46 @@ LANCZOS_KERNELS_RGB(rgb0)
 LANCZOS_KERNELS_RGB(bgr0)
 LANCZOS_KERNELS_RGB(rgba)
 LANCZOS_KERNELS_RGB(bgra)
+
+#define GENERIC_KERNEL(D, DIR, C, S) \
+    __global__ void Subsample_Generic_##D##_##C##S(                     \
+        KERNEL_ARGS(Convert_##C::out_T##S))                             \
+    {                                                                   \
+        SUBSAMPLE((Convert_##C::Convert##S<                             \
+                       Subsample_Generic<Convert_##C::in_T, DIR>,       \
+                       Subsample_Generic<Convert_##C::in_T_uv, DIR> >), \
+                  Convert_##C::out_T##S) \
+    }
+
+#define GENERIC_KERNEL_RAW(C) \
+    GENERIC_KERNEL(h, SCALE_DIR_X, C,)      \
+    GENERIC_KERNEL(h, SCALE_DIR_X, C,_uv)   \
+    GENERIC_KERNEL(v, SCALE_DIR_Y, C,)      \
+    GENERIC_KERNEL(v, SCALE_DIR_Y, C,_uv)
+
+#define GENERIC_KERNELS(C) \
+    GENERIC_KERNEL_RAW(planar8_ ## C)       \
+    GENERIC_KERNEL_RAW(planar10_ ## C)      \
+    GENERIC_KERNEL_RAW(planar16_ ## C)      \
+    GENERIC_KERNEL_RAW(semiplanar8_ ## C)   \
+    GENERIC_KERNEL_RAW(semiplanar10_ ## C)  \
+    GENERIC_KERNEL_RAW(semiplanar16_ ## C)
+
+#define GENERIC_KERNELS_RGB(C) \
+    GENERIC_KERNEL_RAW(rgb0_ ## C)  \
+    GENERIC_KERNEL_RAW(bgr0_ ## C)  \
+    GENERIC_KERNEL_RAW(rgba_ ## C)  \
+    GENERIC_KERNEL_RAW(bgra_ ## C)
+
+GENERIC_KERNELS(planar8)
+GENERIC_KERNELS(planar10)
+GENERIC_KERNELS(planar16)
+GENERIC_KERNELS(semiplanar8)
+GENERIC_KERNELS(semiplanar10)
+GENERIC_KERNELS(semiplanar16)
+
+GENERIC_KERNELS_RGB(rgb0)
+GENERIC_KERNELS_RGB(bgr0)
+GENERIC_KERNELS_RGB(rgba)
+GENERIC_KERNELS_RGB(bgra)
 }
-- 
2.52.0


>From dbe67fd95ad95f9378d5e1702cb1add577fadd88 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Mon, 22 Jun 2026 19:16:31 +0200
Subject: [PATCH 4/4] avfilter/vf_scale_cuda: add generic filter kernel
 implementation

This may be faster or slower than the existing specialized kernels,
so I opted not to prefer it by default. I also deliberately didn't expose
additional filter function capabilites yet.

The main motivating reason here is to get correct anti-aliasing behavior
when downscaling, which is currently completely broken.

Signed-off-by: Niklas Haas <[email protected]>
---
 doc/filters.texi            |   6 +
 libavfilter/Makefile        |   2 +-
 libavfilter/vf_scale_cuda.c | 333 ++++++++++++++++++++++++++++++++++--
 3 files changed, 322 insertions(+), 19 deletions(-)

diff --git a/doc/filters.texi b/doc/filters.texi
index 1a649cf794..d4062353a2 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -27463,6 +27463,12 @@ frame-consumer that exhausts the limited decoder frame 
pool.
 If set to 1, frames are passed through as-is if they match the desired output
 parameters. This is the default behaviour.
 
+@item use_filters
+If set to 1, filter with a generic weight LUt instead of using fixed-function
+shader kernels. May be faster or slower depending on the hardware. A value
+of @code{auto} (the default) enables this automatically when required for
+correct anti-aliasing when downscaling.
+
 @item param
 Algorithm-Specific parameter.
 
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 5f0760a2ff..a8c14230ce 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -473,7 +473,7 @@ OBJS-$(CONFIG_SAB_FILTER)                    += vf_sab.o
 OBJS-$(CONFIG_SCALE_FILTER)                  += vf_scale.o scale_eval.o 
framesync.o
 OBJS-$(CONFIG_SCALE_D3D11_FILTER)            += vf_scale_d3d11.o scale_eval.o
 OBJS-$(CONFIG_SCALE_D3D12_FILTER)            += vf_scale_d3d12.o scale_eval.o
-OBJS-$(CONFIG_SCALE_CUDA_FILTER)             += vf_scale_cuda.o scale_eval.o \
+OBJS-$(CONFIG_SCALE_CUDA_FILTER)             += vf_scale_cuda.o scale_eval.o 
scale_filters.o \
                                                 vf_scale_cuda.ptx.o 
cuda/load_helper.o
 OBJS-$(CONFIG_SCALE_NPP_FILTER)              += vf_scale_npp.o scale_eval.o
 OBJS-$(CONFIG_SCALE_QSV_FILTER)              += vf_vpp_qsv.o
diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 01cac53348..403e4a2cca 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -22,14 +22,20 @@
 
 #include <float.h>
 #include <stdio.h>
+#include <string.h>
 
+#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_cuda_internal.h"
 #include "libavutil/cuda_check.h"
 #include "libavutil/internal.h"
+#include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
+#include "libavutil/refstruct.h"
+
+#include "libswscale/filters.h"
 
 #include "avfilter.h"
 #include "filters.h"
@@ -81,6 +87,19 @@ enum {
     INTERP_ALGO_COUNT
 };
 
+enum {
+    FILTER_OUT,
+    FILTER_TMP,
+    FILTER_NB,
+};
+
+typedef struct CUDAScaleFilter {
+    CUdeviceptr weights; ///< float[dst_size][filter_size]
+    CUdeviceptr offsets; ///< int[dst_size]
+    int filter_size;
+    int dst_size;
+} CUDAScaleFilter;
+
 typedef struct CUDAScaleContext {
     const AVClass *class;
 
@@ -112,14 +131,19 @@ typedef struct CUDAScaleContext {
 
     CUcontext   cu_ctx;
     CUmodule    cu_module;
-    CUfunction  cu_func;
-    CUfunction  cu_func_uv;
+    CUfunction  cu_func[FILTER_NB];
+    CUfunction  cu_func_uv[FILTER_NB];
     CUstream    cu_stream;
 
     int interp_algo;
     int interp_use_linear;
     int interp_as_integer;
 
+    CUDAScaleFilter filters[FILTER_NB];
+    CUDAScaleFilter filters_uv[FILTER_NB];
+    AVFrame *inter_buf; /* intermediate buffer for separated scaling */
+    int use_filters; /* -1 for auto */
+
     float param;
 } CUDAScaleContext;
 
@@ -138,23 +162,42 @@ static av_cold int cudascale_init(AVFilterContext *ctx)
     return 0;
 }
 
+static void filter_uninit(CudaFunctions *cu, CUDAScaleFilter *filter)
+{
+    if (filter->weights)
+        cu->cuMemFree(filter->weights);
+    if (filter->offsets)
+        cu->cuMemFree(filter->offsets);
+    memset(filter, 0, sizeof(*filter));
+}
+
 static av_cold void cudascale_uninit(AVFilterContext *ctx)
 {
     CUDAScaleContext *s = ctx->priv;
 
-    if (s->hwctx && s->cu_module) {
+    if (s->hwctx) {
         CudaFunctions *cu = s->hwctx->internal->cuda_dl;
         CUcontext dummy;
 
         CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
-        CHECK_CU(cu->cuModuleUnload(s->cu_module));
-        s->cu_module = NULL;
+
+        for (int i = 0; i < FF_ARRAY_ELEMS(s->filters); i++) {
+            filter_uninit(cu, &s->filters[i]);
+            filter_uninit(cu, &s->filters_uv[i]);
+        }
+
+        if (s->cu_module) {
+            CHECK_CU(cu->cuModuleUnload(s->cu_module));
+            s->cu_module = NULL;
+        }
+
         CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     }
 
     av_frame_free(&s->frame);
     av_buffer_unref(&s->frames_ctx);
     av_frame_free(&s->tmp_frame);
+    av_frame_free(&s->inter_buf);
 }
 
 static av_cold int init_hwframe_ctx(CUDAScaleContext *s, AVBufferRef 
*device_ctx, int width, int height)
@@ -194,6 +237,50 @@ fail:
     return ret;
 }
 
+static av_cold int inter_buf_init(CUDAScaleContext *s, AVBufferRef *device_ctx,
+                                  enum AVPixelFormat format, int width, int 
height)
+{
+    AVBufferRef *ref = NULL;
+    AVHWFramesContext *fctx;
+    int ret;
+
+    ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!ref)
+        return AVERROR(ENOMEM);
+    fctx = (AVHWFramesContext*)ref->data;
+
+    fctx->format    = AV_PIX_FMT_CUDA;
+    fctx->sw_format = format;
+    fctx->width     = FFALIGN(width,  32);
+    fctx->height    = FFALIGN(height, 32);
+
+    ret = av_hwframe_ctx_init(ref);
+    if (ret < 0)
+        goto fail;
+
+    av_assert0(!s->inter_buf);
+    s->inter_buf = av_frame_alloc();
+    if (!s->inter_buf) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    ret = av_hwframe_get_buffer(ref, s->inter_buf, 0);
+    if (ret < 0)
+        goto fail;
+
+    s->inter_buf->width  = width;
+    s->inter_buf->height = height;
+
+    av_buffer_unref(&ref);
+    return 0;
+
+fail:
+    av_frame_free(&s->inter_buf);
+    av_buffer_unref(&ref);
+    return ret;
+}
+
 static int format_is_supported(enum AVPixelFormat fmt)
 {
     for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
@@ -285,6 +372,18 @@ static av_cold int init_processing_chain(AVFilterContext 
*ctx, int in_width, int
         if (in_width == out_width && in_height == out_height &&
             in_format == out_format && s->interp_algo == INTERP_ALGO_DEFAULT)
             s->interp_algo = INTERP_ALGO_NEAREST;
+
+        if (s->interp_algo == INTERP_ALGO_NEAREST) {
+            s->use_filters = 0;
+        } else if (s->use_filters < 0 && (in_width < out_width || in_height < 
out_height))
+            s->use_filters = 1; /* downscaling; needed for anti-aliasing */
+
+        if (s->use_filters) {
+            ret = inter_buf_init(s, in_frames_ctx->device_ref, in_format,
+                                 out_width, in_height);
+            if (ret < 0)
+                return ret;
+        }
     }
 
     outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
@@ -310,6 +409,14 @@ static av_cold int 
cudascale_load_functions(AVFilterContext *ctx)
     extern const unsigned char ff_vf_scale_cuda_ptx_data[];
     extern const unsigned int ff_vf_scale_cuda_ptx_len;
 
+    if (s->use_filters) {
+        /* Final pass is always vertical unless not vertically scaling */
+        AVFilterLink  *inlink = ctx->inputs[0];
+        AVFilterLink *outlink = ctx->outputs[0];
+        function_infix = inlink->h == outlink->h ? "Generic_h" : "Generic_v";
+        s->interp_use_linear = 0;
+        s->interp_as_integer = 0;
+    } else {
     switch(s->interp_algo) {
     case INTERP_ALGO_NEAREST:
         function_infix = "Nearest";
@@ -336,6 +443,7 @@ static av_cold int cudascale_load_functions(AVFilterContext 
*ctx)
         av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n");
         return AVERROR_BUG;
     }
+    }
 
     ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
     if (ret < 0)
@@ -347,7 +455,7 @@ static av_cold int cudascale_load_functions(AVFilterContext 
*ctx)
         goto fail;
 
     snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s", function_infix, 
in_fmt_name, out_fmt_name);
-    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func, s->cu_module, buf));
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_OUT], 
s->cu_module, buf));
     if (ret < 0) {
         av_log(ctx, AV_LOG_FATAL, "Unsupported conversion: %s -> %s\n", 
in_fmt_name, out_fmt_name);
         ret = AVERROR(ENOSYS);
@@ -356,17 +464,173 @@ static av_cold int 
cudascale_load_functions(AVFilterContext *ctx)
     av_log(ctx, AV_LOG_DEBUG, "Luma filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
 
     snprintf(buf, sizeof(buf), "Subsample_%s_%s_%s_uv", function_infix, 
in_fmt_name, out_fmt_name);
-    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv, s->cu_module, buf));
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv[FILTER_OUT], 
s->cu_module, buf));
     if (ret < 0)
         goto fail;
     av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
 
+    if (s->inter_buf) {
+        /* Intermediate pass is always horizontal */
+        snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s", in_fmt_name, 
in_fmt_name);
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_TMP], 
s->cu_module, buf));
+        if (ret < 0)
+            goto fail;
+
+        snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s_uv", 
in_fmt_name, in_fmt_name);
+        ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uv[FILTER_TMP], 
s->cu_module, buf));
+        if (ret < 0)
+            goto fail;
+    }
+
 fail:
     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 
     return ret;
 }
 
+static av_cold int cudascale_filter_init(AVFilterContext *ctx,
+                                         CUDAScaleFilter *f,
+                                         int src_size, int dst_size,
+                                         double virtual_size)
+{
+    CUDAScaleContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    SwsFilterParams params = {
+        .scaler_params = { SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT },
+        .src_size      = src_size,
+        .dst_size      = dst_size,
+        .virtual_size  = virtual_size,
+    };
+
+    switch (s->interp_algo) {
+    case INTERP_ALGO_NEAREST:  return 0; /* no weights needed */
+    case INTERP_ALGO_BILINEAR: params.scaler = SWS_SCALE_BILINEAR; break;
+    case INTERP_ALGO_LANCZOS:  params.scaler = SWS_SCALE_LANCZOS;  break;
+    case INTERP_ALGO_DEFAULT:
+    case INTERP_ALGO_BICUBIC:
+        params.scaler = SWS_SCALE_BICUBIC;
+        params.scaler_params[0] = params.scaler_params[1] = 0.0;
+        if (s->param != SCALE_CUDA_PARAM_DEFAULT)
+            params.scaler_params[1] = s->param;
+        break;
+    }
+
+    SwsFilterWeights *weights = NULL;
+    int ret = ff_sws_filter_generate(ctx, &params, &weights);
+    if (ret < 0) {
+        if (ret == AVERROR(ENOTSUP)) {
+            av_log(ctx, AV_LOG_ERROR, "Filter size exceeds the maximum "
+                   "currently supported by the CUDA scaler (%d).\n",
+                   SWS_FILTER_SIZE_MAX);
+        }
+        return ret;
+    }
+
+    float *tmp = av_malloc_array(weights->num_weights, sizeof(*tmp));
+    if (!tmp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+    for (size_t i = 0; i < weights->num_weights; i++)
+        tmp[i] = weights->weights[i] / (float) SWS_FILTER_SCALE;
+
+    f->filter_size = weights->filter_size;
+    f->dst_size    = dst_size;
+
+    const size_t weights_size = weights->num_weights * sizeof(*tmp);
+    ret = CHECK_CU(cu->cuMemAlloc(&f->weights, weights_size));
+    if (ret < 0)
+        goto fail;
+    ret = CHECK_CU(cu->cuMemcpyHtoD(f->weights, tmp, weights_size));
+    if (ret < 0)
+        goto fail;
+
+    const size_t offsets_size = dst_size * sizeof(*weights->offsets);
+    ret = CHECK_CU(cu->cuMemAlloc(&f->offsets, offsets_size));
+    if (ret < 0)
+        goto fail;
+    ret = CHECK_CU(cu->cuMemcpyHtoD(f->offsets, weights->offsets, 
offsets_size));
+    if (ret < 0)
+        goto fail;
+
+    av_log(ctx, AV_LOG_VERBOSE, "  using %d tap '%s' filter: %d -> %d\n",
+           f->filter_size, weights->name, src_size, dst_size);
+
+    ret = 0;
+
+fail:
+    av_free(tmp);
+    av_refstruct_unref(&weights);
+    return ret;
+}
+
+static av_cold int cudascale_setup_filters(AVFilterContext *ctx)
+{
+    CUDAScaleContext *s = ctx->priv;
+    AVFilterLink  *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+    CUcontext dummy;
+    int ret;
+
+    const int sub_x = s->in_desc->log2_chroma_w;
+    const int sub_y = s->in_desc->log2_chroma_h;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    int pass_x = -1, pass_y = -1;
+    if (inlink->w != outlink->w && inlink->h != outlink->h) {
+        /* Always perform the horizontal scaling pass first */
+        pass_x = FILTER_TMP;
+        pass_y = FILTER_OUT;
+    } else if (inlink->w != outlink->w) {
+        pass_x = FILTER_OUT;
+    } else if (inlink->h != outlink->h) {
+        pass_y = FILTER_OUT;
+    }
+
+    if (pass_x >= 0) {
+        ret = cudascale_filter_init(ctx, &s->filters[pass_x],
+                                    inlink->w, outlink->w, 0.0);
+        if (ret < 0)
+            goto fail;
+        if (s->in_planes > 1) {
+            const int src_size = AV_CEIL_RSHIFT(inlink->w,  sub_x);
+            const int dst_size = AV_CEIL_RSHIFT(outlink->w, sub_x);
+            const double ratio = (double) outlink->w / inlink->w;
+            ret = cudascale_filter_init(ctx, &s->filters_uv[pass_x],
+                                        src_size, dst_size, src_size * ratio);
+            if (ret < 0)
+                goto fail;
+        }
+    }
+
+    if (pass_y >= 0) {
+        ret = cudascale_filter_init(ctx, &s->filters[pass_y],
+                                    inlink->h, outlink->h, 0.0);
+        if (ret < 0)
+            goto fail;
+        if (s->in_planes > 1) {
+            const int src_size = AV_CEIL_RSHIFT(inlink->h,  sub_y);
+            const int dst_size = AV_CEIL_RSHIFT(outlink->h, sub_y);
+            const double ratio = (double) outlink->h / inlink->h;
+            ret = cudascale_filter_init(ctx, &s->filters_uv[pass_y],
+                                        src_size, dst_size, src_size * ratio);
+            if (ret < 0)
+                goto fail;
+        }
+    }
+
+    ret = 0;
+
+fail:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    return ret;
+}
+
 static av_cold int cudascale_config_props(AVFilterLink *outlink)
 {
     AVFilterContext *ctx = outlink->src;
@@ -427,6 +691,12 @@ static av_cold int cudascale_config_props(AVFilterLink 
*outlink)
            outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt),
            s->passthrough ? " (passthrough)" : "");
 
+    if (s->use_filters) {
+        ret = cudascale_setup_filters(ctx);
+        if (ret < 0)
+            return ret;
+    }
+
     ret = cudascale_load_functions(ctx);
     if (ret < 0)
         return ret;
@@ -439,7 +709,8 @@ fail:
 
 static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
                               CUtexObject src_tex[4], int src_left, int 
src_top, int src_width, int src_height,
-                              AVFrame *out_frame, int dst_width, int 
dst_height, int dst_pitch, int mpeg_range)
+                              AVFrame *out_frame, int dst_width, int 
dst_height, int dst_pitch, int mpeg_range,
+                              const CUDAScaleFilter *filter)
 {
     CUDAScaleContext *s = ctx->priv;
     CudaFunctions *cu = s->hwctx->internal->cuda_dl;
@@ -460,9 +731,15 @@ static int call_resize_kernel(AVFilterContext *ctx, 
CUfunction func,
         .src_width = src_width,
         .src_height = src_height,
         .param = s->param,
-        .mpeg_range = mpeg_range
+        .mpeg_range = mpeg_range,
     };
 
+    if (filter) {
+        params.weights = filter->weights;
+        params.offsets = filter->offsets;
+        params.filter_size = filter->filter_size;
+    }
+
     void *args[] = { &params };
 
     return CHECK_CU(cu->cuLaunchKernel(func,
@@ -470,7 +747,7 @@ static int call_resize_kernel(AVFilterContext *ctx, 
CUfunction func,
                                        BLOCKX, BLOCKY, 1, 0, s->cu_stream, 
args, NULL));
 }
 
-static int scalecuda_resize(AVFilterContext *ctx,
+static int scalecuda_resize(AVFilterContext *ctx, int pass,
                             AVFrame *out, AVFrame *in)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -479,6 +756,13 @@ static int scalecuda_resize(AVFilterContext *ctx,
     int i, ret;
     int mpeg_range = in->color_range != AVCOL_RANGE_JPEG;
 
+    const AVPixFmtDescriptor *out_desc = s->out_desc;
+    int out_planes = s->out_planes;
+    if (pass == FILTER_TMP) {
+        out_desc   = s->in_desc;
+        out_planes = s->in_planes;
+    }
+
     CUtexObject tex[4] = { 0, 0, 0, 0 };
 
     int crop_width = (in->width - in->crop_right) - in->crop_left;
@@ -520,23 +804,25 @@ static int scalecuda_resize(AVFilterContext *ctx,
     }
 
     // scale primary plane(s). Usually Y (and A), or single plane of RGB 
frames.
-    ret = call_resize_kernel(ctx, s->cu_func,
+    ret = call_resize_kernel(ctx, s->cu_func[pass],
                              tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
-                             out, out->width, out->height, out->linesize[0], 
mpeg_range);
+                             out, out->width, out->height, out->linesize[0], 
mpeg_range,
+                             &s->filters[pass]);
     if (ret < 0)
         goto exit;
 
-    if (s->out_planes > 1) {
+    if (out_planes > 1) {
         // scale UV plane. Scale function sets both U and V plane, or singular 
interleaved plane.
-        ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
+        ret = call_resize_kernel(ctx, s->cu_func_uv[pass], tex,
                                  AV_CEIL_RSHIFT(in->crop_left, 
s->in_desc->log2_chroma_w),
                                  AV_CEIL_RSHIFT(in->crop_top, 
s->in_desc->log2_chroma_h),
                                  AV_CEIL_RSHIFT(crop_width, 
s->in_desc->log2_chroma_w),
                                  AV_CEIL_RSHIFT(crop_height, 
s->in_desc->log2_chroma_h),
                                  out,
-                                 AV_CEIL_RSHIFT(out->width, 
s->out_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(out->height, 
s->out_desc->log2_chroma_h),
-                                 out->linesize[1], mpeg_range);
+                                 AV_CEIL_RSHIFT(out->width, 
out_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(out->height, 
out_desc->log2_chroma_h),
+                                 out->linesize[1], mpeg_range,
+                                 &s->filters_uv[pass]);
         if (ret < 0)
             goto exit;
     }
@@ -558,7 +844,16 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
     AVFrame *src = in;
     int ret;
 
-    ret = scalecuda_resize(ctx, s->frame, src);
+    if (s->inter_buf) {
+        /* Handle first pass separately */
+        s->inter_buf->color_range = in->color_range;
+        ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in);
+        if (ret < 0)
+            return ret;
+        src = s->inter_buf;
+    }
+
+    ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src);
     if (ret < 0)
         return ret;
 
@@ -653,6 +948,8 @@ static const AVOption options[] = {
         { "lanczos",  "lanczos",  0, AV_OPT_TYPE_CONST, { .i64 = 
INTERP_ALGO_LANCZOS  }, 0, 0, FLAGS, .unit = "interp_algo" },
     { "format", "Output video pixel format", OFFSET(format), 
AV_OPT_TYPE_PIXEL_FMT, { .i64 = AV_PIX_FMT_NONE }, INT_MIN, INT_MAX, 
.flags=FLAGS },
     { "passthrough", "Do not process frames at all if parameters match", 
OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
+    { "use_filters", "Use generic filters instead of fixed function kernels", 
OFFSET(use_filters), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 1, FLAGS, .unit = 
"use_filters" },
+        { "auto",    NULL,  0, AV_OPT_TYPE_CONST, {.i64 = -1}, 0, 0, FLAGS, 
.unit = "use_filters" },
     { "param", "Algorithm-Specific parameter", OFFSET(param), 
AV_OPT_TYPE_FLOAT, { .dbl = SCALE_CUDA_PARAM_DEFAULT }, -FLT_MAX, FLT_MAX, 
FLAGS },
     { "force_original_aspect_ratio", "decrease or increase w/h if necessary to 
keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { 
.i64 = 0 }, 0, SCALE_FORCE_OAR_NB-1, FLAGS, .unit = "force_oar" },
         { "disable",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 
SCALE_FORCE_OAR_DISABLE  }, 0, 0, FLAGS, .unit = "force_oar" },
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avfilter/scale_cude: convolve using generic separated filter kernel weights (PR #23559)

Reply via email to