PR #23606 opened by Niklas Haas (haasn)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23606
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23606.patch

The biggest impact here is that the previous math was pretty broken when 
combining different subsampling ratios; e.g. 420 -> 444 or vice versa. This is 
primarily because the luma and chroma planes might require differently sized 
intermediate buffers, which doesn't really map onto the AVFrame / 
AVPixelFormat layer.

The proper solution is to stop allocating the intermediate buffer as an AVFrame 
at all and just allocate raw CUDA buffers. This series accomplishes that, as 
well as fixing some pre-existing bugs.

I have verified that the new implementation round-trips correctly with the 
filter math in #23555 for odd chroma planes, though it does *not* round-trip 
with legacy swscale. I'll leave that issue open as out-of-scope for now, under 
the assumption that this might be a genuine legacy swscale bug. (The new 
filter math round-trips with itself.)


>From 2eb3955ca89d8d2f3bd12365df4762b0f879f518 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 13:19:10 +0200
Subject: [PATCH 1/9] avutil/hwcontext_cuda: fix off-by-one in
 cuda_transfer_data()

Height of the chroma plane was computed using a naive right shift instead of
AV_CEIL_RSHIFT, so the last chroma line was left uncopied whenever the frame
height is not a multiple of the vertical subsampling factor.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavutil/hwcontext_cuda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c
index c57ea4bb7a..6342683a17 100644
--- a/libavutil/hwcontext_cuda.c
+++ b/libavutil/hwcontext_cuda.c
@@ -261,7 +261,7 @@ static int cuda_transfer_data(AVHWFramesContext *ctx, 
AVFrame *dst,
             .srcPitch      = src->linesize[i],
             .dstPitch      = dst->linesize[i],
             .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
-            .Height        = src->height >> ((i == 0 || i == 3) ? 0 : 
priv->shift_height),
+            .Height        = AV_CEIL_RSHIFT(src->height, ((i == 0 || i == 3) ? 
0 : priv->shift_height)),
         };
 
         if (src->hw_frames_ctx) {
-- 
2.52.0


>From c4545f5db2ff616422848904dfbd4595d65a8a8b Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 14:54:51 +0200
Subject: [PATCH 2/9] avfilter/vf_scale_cuda: eliminate redundant context
 push/pop

This is already done by cudascale_filter_frame().

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 2a7dc300f5..d429f791b0 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -756,7 +756,6 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass,
 {
     CUDAScaleContext *s = ctx->priv;
     CudaFunctions *cu = s->hwctx->internal->cuda_dl;
-    CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx;
     int i, ret;
     int mpeg_range = in->color_range != AVCOL_RANGE_JPEG;
 
@@ -772,10 +771,6 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass,
     int crop_width = (in->width - in->crop_right) - in->crop_left;
     int crop_height = (in->height - in->crop_bottom) - in->crop_top;
 
-    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
-    if (ret < 0)
-        return ret;
-
     for (i = 0; i < s->in_planes; i++) {
         CUDA_TEXTURE_DESC tex_desc = {
             .filterMode = s->interp_use_linear ?
@@ -836,8 +831,6 @@ exit:
         if (tex[i])
             CHECK_CU(cu->cuTexObjectDestroy(tex[i]));
 
-    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
-
     return ret;
 }
 
-- 
2.52.0


>From f35ef1d7b802065eb2118ae6b1dbd4b85e6f32ad Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 15:43:45 +0200
Subject: [PATCH 3/9] avfilter/vf_scale_cuda: add fail: label (cosmetic)

Make the next commit a bit easier to review.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index d429f791b0..d0c3f10f20 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -838,26 +838,25 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
 {
     CUDAScaleContext *s = ctx->priv;
     AVFilterLink *outlink = ctx->outputs[0];
-    AVFrame *src = in;
-    int ret;
+    int ret = 0;
 
+    AVFrame *src = in;
     if (s->inter_buf) {
         /* Handle first pass separately */
         s->inter_buf->color_range = in->color_range;
         ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in);
         if (ret < 0)
-            return ret;
+            goto fail;
         src = s->inter_buf;
     }
 
     ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src);
     if (ret < 0)
-        return ret;
+        goto fail;
 
-    src = s->frame;
-    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
+    ret = av_hwframe_get_buffer(s->frame->hw_frames_ctx, s->tmp_frame, 0);
     if (ret < 0)
-        return ret;
+        goto fail;
 
     av_frame_move_ref(out, s->frame);
     av_frame_move_ref(s->frame, s->tmp_frame);
@@ -867,14 +866,15 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
 
     ret = av_frame_copy_props(out, in);
     if (ret < 0)
-        return ret;
+        goto fail;
 
     if (out->width != in->width || out->height != in->height) {
         av_frame_side_data_remove_by_props(&out->side_data, &out->nb_side_data,
                                            AV_SIDE_DATA_PROP_SIZE_DEPENDENT);
     }
 
-    return 0;
+fail:
+    return ret;
 }
 
 static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
-- 
2.52.0


>From c636b27272ad1dadb497c51c46bb50a51bc1f427 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 15:48:44 +0200
Subject: [PATCH 4/9] avfilter/vf_scale_cuda: introduce CUDATex and mapping
 helper

I want to disentangle the internal logic from AVFrame, because some
intermediate states (e.g. for partially subsampled chroma with simultaneous
scaling) may not directly map to a valid AVPixelFormat.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 186 +++++++++++++++++++++++-------------
 1 file changed, 122 insertions(+), 64 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index d0c3f10f20..540b869f7c 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -104,6 +104,15 @@ typedef struct CUDAScaleFilter {
     int dst_size;
 } CUDAScaleFilter;
 
+typedef struct CUDATex {
+    CUtexObject tex[4];
+    CUdeviceptr data[4];
+    int         linesize[4];
+    int         width, height;
+    int         crop_left, crop_top, crop_width, crop_height;
+    int         color_range;
+} CUDATex;
+
 typedef struct CUDAScaleContext {
     const AVClass *class;
 
@@ -175,6 +184,16 @@ static void filter_uninit(CudaFunctions *cu, 
CUDAScaleFilter *filter)
     memset(filter, 0, sizeof(*filter));
 }
 
+static void cuda_tex_uninit(CudaFunctions *cu, CUDATex *t)
+{
+    for (int i = 0; i < FF_ARRAY_ELEMS(t->tex); i++) {
+        if (t->tex[i])
+            cu->cuTexObjectDestroy(t->tex[i]);
+    }
+
+    memset(t, 0, sizeof(*t));
+}
+
 static av_cold void cudascale_uninit(AVFilterContext *ctx)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -711,9 +730,71 @@ fail:
     return ret;
 }
 
+/* if depths/channels are NULL, only maps pointers without creating textures */
+static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame,
+                              const int depths[4], const int channels[4],
+                              CUDATex *tex)
+{
+    CUDAScaleContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    const AVHWFramesContext *fctx = (const 
AVHWFramesContext*)frame->hw_frames_ctx->data;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fctx->sw_format);
+    const int planes = av_pix_fmt_count_planes(fctx->sw_format);
+
+    *tex = (CUDATex) {
+        .width       = frame->width,
+        .height      = frame->height,
+        .crop_left   = frame->crop_left,
+        .crop_top    = frame->crop_top,
+        .crop_width  = (frame->width  - frame->crop_right)  - frame->crop_left,
+        .crop_height = (frame->height - frame->crop_bottom) - frame->crop_top,
+        .color_range = frame->color_range,
+    };
+
+    for (int i = 0; i < planes; i++) {
+        tex->data[i]     = (CUdeviceptr)frame->data[i];
+        tex->linesize[i] = frame->linesize[i];
+        if (!depths || !channels)
+            continue;
+
+        CUDA_TEXTURE_DESC tex_desc = {
+            .filterMode = s->interp_use_linear ?
+                          CU_TR_FILTER_MODE_LINEAR :
+                          CU_TR_FILTER_MODE_POINT,
+            .flags = s->interp_as_integer ? CU_TRSF_READ_AS_INTEGER : 0,
+        };
+
+        const int is_chroma = i == 1 || i == 2;
+        const int sub_x = is_chroma ? desc->log2_chroma_w : 0;
+        const int sub_y = is_chroma ? desc->log2_chroma_h : 0;
+        CUDA_RESOURCE_DESC res_desc = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = depths[i] <= 8 ?
+                                  CU_AD_FORMAT_UNSIGNED_INT8 :
+                                  CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels = channels[i],
+            .res.pitch2D.pitchInBytes = tex->linesize[i],
+            .res.pitch2D.devPtr = tex->data[i],
+            .res.pitch2D.width  = AV_CEIL_RSHIFT(frame->width,  sub_x),
+            .res.pitch2D.height = AV_CEIL_RSHIFT(frame->height, sub_y),
+        };
+
+        int ret = CHECK_CU(cu->cuTexObjectCreate(&tex->tex[i], &res_desc, 
&tex_desc, NULL));
+        if (ret < 0) {
+            cuda_tex_uninit(cu, tex);
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
 static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
-                              CUtexObject src_tex[4], int src_left, int 
src_top, int src_width, int src_height,
-                              AVFrame *out_frame, int dst_width, int 
dst_height, int dst_pitch, int mpeg_range,
+                              const CUtexObject src_tex[4],
+                              int src_left, int src_top, int src_width, int 
src_height,
+                              const CUdeviceptr out_data[4],
+                              int dst_width, int dst_height, int dst_pitch, 
int mpeg_range,
                               const CUDAScaleFilter *filter)
 {
     CUDAScaleContext *s = ctx->priv;
@@ -722,10 +803,10 @@ static int call_resize_kernel(AVFilterContext *ctx, 
CUfunction func,
     CUDAScaleKernelParams params = {
         .src_tex = {src_tex[0], src_tex[1], src_tex[2], src_tex[3]},
         .dst = {
-            (CUdeviceptr)out_frame->data[0],
-            (CUdeviceptr)out_frame->data[1],
-            (CUdeviceptr)out_frame->data[2],
-            (CUdeviceptr)out_frame->data[3]
+            out_data[0],
+            out_data[1],
+            out_data[2],
+            out_data[3]
         },
         .dst_width = dst_width,
         .dst_height = dst_height,
@@ -752,12 +833,11 @@ static int call_resize_kernel(AVFilterContext *ctx, 
CUfunction func,
 }
 
 static int scalecuda_resize(AVFilterContext *ctx, int pass,
-                            AVFrame *out, AVFrame *in)
+                            const CUDATex *out, const CUDATex *in)
 {
     CUDAScaleContext *s = ctx->priv;
-    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
-    int i, ret;
     int mpeg_range = in->color_range != AVCOL_RANGE_JPEG;
+    int ret;
 
     const AVPixFmtDescriptor *out_desc = s->out_desc;
     int out_planes = s->out_planes;
@@ -766,91 +846,66 @@ static int scalecuda_resize(AVFilterContext *ctx, int 
pass,
         out_planes = s->in_planes;
     }
 
-    CUtexObject tex[4] = { 0, 0, 0, 0 };
-
-    int crop_width = (in->width - in->crop_right) - in->crop_left;
-    int crop_height = (in->height - in->crop_bottom) - in->crop_top;
-
-    for (i = 0; i < s->in_planes; i++) {
-        CUDA_TEXTURE_DESC tex_desc = {
-            .filterMode = s->interp_use_linear ?
-                          CU_TR_FILTER_MODE_LINEAR :
-                          CU_TR_FILTER_MODE_POINT,
-            .flags = s->interp_as_integer ? CU_TRSF_READ_AS_INTEGER : 0,
-        };
-
-        CUDA_RESOURCE_DESC res_desc = {
-            .resType = CU_RESOURCE_TYPE_PITCH2D,
-            .res.pitch2D.format = s->in_plane_depths[i] <= 8 ?
-                                  CU_AD_FORMAT_UNSIGNED_INT8 :
-                                  CU_AD_FORMAT_UNSIGNED_INT16,
-            .res.pitch2D.numChannels = s->in_plane_channels[i],
-            .res.pitch2D.pitchInBytes = in->linesize[i],
-            .res.pitch2D.devPtr = (CUdeviceptr)in->data[i],
-        };
-
-        if (i == 1 || i == 2) {
-            res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, 
s->in_desc->log2_chroma_w);
-            res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, 
s->in_desc->log2_chroma_h);
-        } else {
-            res_desc.res.pitch2D.width = in->width;
-            res_desc.res.pitch2D.height = in->height;
-        }
-
-        ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, 
NULL));
-        if (ret < 0)
-            goto exit;
-    }
-
     // scale primary plane(s). Usually Y (and A), or single plane of RGB 
frames.
     ret = call_resize_kernel(ctx, s->cu_func[pass],
-                             tex, in->crop_left, in->crop_top, crop_width, 
crop_height,
-                             out, out->width, out->height, out->linesize[0], 
mpeg_range,
+                             in->tex, in->crop_left, in->crop_top,
+                             in->crop_width, in->crop_height,
+                             out->data, out->width, out->height,
+                             out->linesize[0], mpeg_range,
                              &s->filters[pass]);
     if (ret < 0)
-        goto exit;
+        return ret;
 
     if (out_planes > 1) {
         // scale UV plane. Scale function sets both U and V plane, or singular 
interleaved plane.
-        ret = call_resize_kernel(ctx, s->cu_func_uv[pass], tex,
+        ret = call_resize_kernel(ctx, s->cu_func_uv[pass], in->tex,
                                  AV_CEIL_RSHIFT(in->crop_left, 
s->in_desc->log2_chroma_w),
                                  AV_CEIL_RSHIFT(in->crop_top, 
s->in_desc->log2_chroma_h),
-                                 AV_CEIL_RSHIFT(crop_width, 
s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(crop_height, 
s->in_desc->log2_chroma_h),
-                                 out,
+                                 AV_CEIL_RSHIFT(in->crop_width, 
s->in_desc->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(in->crop_height, 
s->in_desc->log2_chroma_h),
+                                 out->data,
                                  AV_CEIL_RSHIFT(out->width, 
out_desc->log2_chroma_w),
                                  AV_CEIL_RSHIFT(out->height, 
out_desc->log2_chroma_h),
                                  out->linesize[1], mpeg_range,
                                  &s->filters_uv[pass]);
         if (ret < 0)
-            goto exit;
+            return ret;
     }
 
-exit:
-    for (i = 0; i < s->in_planes; i++)
-        if (tex[i])
-            CHECK_CU(cu->cuTexObjectDestroy(tex[i]));
-
-    return ret;
+    return 0;
 }
 
 static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in)
 {
     CUDAScaleContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
     AVFilterLink *outlink = ctx->outputs[0];
     int ret = 0;
 
-    AVFrame *src = in;
+    CUDATex in_tex = {0}, out_tex = {0}, inter_tex = {0};
+    ret = cuda_tex_map_frame(ctx, in, s->in_plane_depths, 
s->in_plane_channels, &in_tex);
+    if (ret < 0)
+        goto fail;
+
+    ret = cuda_tex_map_frame(ctx, s->frame, NULL, NULL, &out_tex);
+    if (ret < 0)
+        goto fail;
+
+    const CUDATex *src = &in_tex;
     if (s->inter_buf) {
         /* Handle first pass separately */
-        s->inter_buf->color_range = in->color_range;
-        ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in);
+        ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths,
+                                 s->in_plane_channels, &inter_tex);
         if (ret < 0)
             goto fail;
-        src = s->inter_buf;
+        inter_tex.color_range = in->color_range;
+        ret = scalecuda_resize(ctx, FILTER_TMP, &inter_tex, src);
+        if (ret < 0)
+            goto fail;
+        src = &inter_tex;
     }
 
-    ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src);
+    ret = scalecuda_resize(ctx, FILTER_OUT, &out_tex, src);
     if (ret < 0)
         goto fail;
 
@@ -874,6 +929,9 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
     }
 
 fail:
+    cuda_tex_uninit(cu, &in_tex);
+    cuda_tex_uninit(cu, &out_tex);
+    cuda_tex_uninit(cu, &inter_tex);
     return ret;
 }
 
-- 
2.52.0


>From caa07f07a67dddc0cebb609408e42e1f0ae7b1d1 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 16:42:19 +0200
Subject: [PATCH 5/9] avfilter/vf_scale_cuda: defer buffer allocation to
 setup_filters()

At this point, s->hwctx and CudaFunctions * are available.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 540b869f7c..a71f91d824 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -260,20 +260,22 @@ fail:
     return ret;
 }
 
-static av_cold int inter_buf_init(CUDAScaleContext *s, AVBufferRef *device_ctx,
-                                  enum AVPixelFormat format, int width, int 
height)
+static av_cold int inter_buf_init(CUDAScaleContext *s, AVFilterLink *inlink,
+                                  int width, int height)
 {
     AVBufferRef *ref = NULL;
     AVHWFramesContext *fctx;
     int ret;
 
-    ref = av_hwframe_ctx_alloc(device_ctx);
+    FilterLink *inl = ff_filter_link(inlink);
+    AVHWFramesContext *in_frames_ctx = 
(AVHWFramesContext*)inl->hw_frames_ctx->data;
+    ref = av_hwframe_ctx_alloc(in_frames_ctx->device_ref);
     if (!ref)
         return AVERROR(ENOMEM);
     fctx = (AVHWFramesContext*)ref->data;
 
     fctx->format    = AV_PIX_FMT_CUDA;
-    fctx->sw_format = format;
+    fctx->sw_format = in_frames_ctx->sw_format;
     fctx->width     = FFALIGN(width,  32);
     fctx->height    = FFALIGN(height, 32);
 
@@ -400,13 +402,6 @@ static av_cold int init_processing_chain(AVFilterContext 
*ctx, int in_width, int
             s->use_filters = 0;
         } else if (s->use_filters < 0 && (in_width < out_width || in_height < 
out_height))
             s->use_filters = 1; /* downscaling; needed for anti-aliasing */
-
-        if (s->use_filters) {
-            ret = inter_buf_init(s, in_frames_ctx->device_ref, in_format,
-                                 out_width, in_height);
-            if (ret < 0)
-                return ret;
-        }
     }
 
     outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
@@ -647,6 +642,10 @@ static av_cold int cudascale_setup_filters(AVFilterContext 
*ctx)
         }
     }
 
+    ret = inter_buf_init(s, inlink, outlink->w, inlink->h);
+    if (ret < 0)
+        goto fail;
+
     ret = 0;
 
 fail:
-- 
2.52.0


>From 64f36337a528c8e1565ce5c96d77b597ac45678a Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 16:44:37 +0200
Subject: [PATCH 6/9] avfilter/vf_scale_cuda: use persistent intermediate
 CUDATex

Instead of re-creating this object every frame.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index a71f91d824..1f0ef2555a 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -155,6 +155,7 @@ typedef struct CUDAScaleContext {
     CUDAScaleFilter filters[FILTER_NB];
     CUDAScaleFilter filters_uv[FILTER_NB];
     AVFrame *inter_buf; /* intermediate buffer for separated scaling */
+    CUDATex inter_tex;
     int use_filters; /* -1 for auto */
 
     float param;
@@ -204,6 +205,7 @@ static av_cold void cudascale_uninit(AVFilterContext *ctx)
 
         CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
 
+        cuda_tex_uninit(cu, &s->inter_tex);
         for (int i = 0; i < FF_ARRAY_ELEMS(s->filters); i++) {
             filter_uninit(cu, &s->filters[i]);
             filter_uninit(cu, &s->filters_uv[i]);
@@ -260,9 +262,15 @@ fail:
     return ret;
 }
 
-static av_cold int inter_buf_init(CUDAScaleContext *s, AVFilterLink *inlink,
+static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame,
+                              const int depths[4], const int channels[4],
+                              CUDATex *tex);
+
+static av_cold int inter_buf_init(AVFilterContext *ctx, AVFilterLink *inlink,
                                   int width, int height)
 {
+    CUDAScaleContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
     AVBufferRef *ref = NULL;
     AVHWFramesContext *fctx;
     int ret;
@@ -297,10 +305,16 @@ static av_cold int inter_buf_init(CUDAScaleContext *s, 
AVFilterLink *inlink,
     s->inter_buf->width  = width;
     s->inter_buf->height = height;
 
+    ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths,
+                             s->in_plane_channels, &s->inter_tex);
+    if (ret < 0)
+        goto fail;
+
     av_buffer_unref(&ref);
     return 0;
 
 fail:
+    cuda_tex_uninit(cu, &s->inter_tex);
     av_frame_free(&s->inter_buf);
     av_buffer_unref(&ref);
     return ret;
@@ -642,7 +656,7 @@ static av_cold int cudascale_setup_filters(AVFilterContext 
*ctx)
         }
     }
 
-    ret = inter_buf_init(s, inlink, outlink->w, inlink->h);
+    ret = inter_buf_init(ctx, inlink, outlink->w, inlink->h);
     if (ret < 0)
         goto fail;
 
@@ -881,7 +895,7 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
     AVFilterLink *outlink = ctx->outputs[0];
     int ret = 0;
 
-    CUDATex in_tex = {0}, out_tex = {0}, inter_tex = {0};
+    CUDATex in_tex = {0}, out_tex = {0};
     ret = cuda_tex_map_frame(ctx, in, s->in_plane_depths, 
s->in_plane_channels, &in_tex);
     if (ret < 0)
         goto fail;
@@ -893,15 +907,11 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
     const CUDATex *src = &in_tex;
     if (s->inter_buf) {
         /* Handle first pass separately */
-        ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths,
-                                 s->in_plane_channels, &inter_tex);
+        s->inter_tex.color_range = in->color_range;
+        ret = scalecuda_resize(ctx, FILTER_TMP, &s->inter_tex, src);
         if (ret < 0)
             goto fail;
-        inter_tex.color_range = in->color_range;
-        ret = scalecuda_resize(ctx, FILTER_TMP, &inter_tex, src);
-        if (ret < 0)
-            goto fail;
-        src = &inter_tex;
+        src = &s->inter_tex;
     }
 
     ret = scalecuda_resize(ctx, FILTER_OUT, &out_tex, src);
@@ -930,7 +940,6 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
 fail:
     cuda_tex_uninit(cu, &in_tex);
     cuda_tex_uninit(cu, &out_tex);
-    cuda_tex_uninit(cu, &inter_tex);
     return ret;
 }
 
-- 
2.52.0


>From 684a4ce8fa3c6d2b34160cbd5ce41faa6ed63cb6 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 16:47:27 +0200
Subject: [PATCH 7/9] avfilter/vf_scale_cuda: allocate intermediate buffer
 directly

Instead of going via an AVFrame at all. This will allow us to fix the
intermediate chroma plane size for split downscaling.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 100 +++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 1f0ef2555a..6204da98dc 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -24,7 +24,6 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "libavutil/avassert.h"
 #include "libavutil/common.h"
 #include "libavutil/hwcontext.h"
 #include "libavutil/hwcontext_cuda_internal.h"
@@ -111,6 +110,7 @@ typedef struct CUDATex {
     int         width, height;
     int         crop_left, crop_top, crop_width, crop_height;
     int         color_range;
+    int         external_data;
 } CUDATex;
 
 typedef struct CUDAScaleContext {
@@ -154,7 +154,6 @@ typedef struct CUDAScaleContext {
 
     CUDAScaleFilter filters[FILTER_NB];
     CUDAScaleFilter filters_uv[FILTER_NB];
-    AVFrame *inter_buf; /* intermediate buffer for separated scaling */
     CUDATex inter_tex;
     int use_filters; /* -1 for auto */
 
@@ -190,6 +189,8 @@ static void cuda_tex_uninit(CudaFunctions *cu, CUDATex *t)
     for (int i = 0; i < FF_ARRAY_ELEMS(t->tex); i++) {
         if (t->tex[i])
             cu->cuTexObjectDestroy(t->tex[i]);
+        if (t->data[i] && !t->external_data)
+            cu->cuMemFree(t->data[i]);
     }
 
     memset(t, 0, sizeof(*t));
@@ -222,7 +223,6 @@ static av_cold void cudascale_uninit(AVFilterContext *ctx)
     av_frame_free(&s->frame);
     av_buffer_unref(&s->frames_ctx);
     av_frame_free(&s->tmp_frame);
-    av_frame_free(&s->inter_buf);
 }
 
 static av_cold int init_hwframe_ctx(CUDAScaleContext *s, AVBufferRef 
*device_ctx, int width, int height)
@@ -262,61 +262,64 @@ fail:
     return ret;
 }
 
-static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame,
-                              const int depths[4], const int channels[4],
-                              CUDATex *tex);
-
-static av_cold int inter_buf_init(AVFilterContext *ctx, AVFilterLink *inlink,
-                                  int width, int height)
+static av_cold int inter_buf_init(AVFilterContext *ctx, int width, int height)
 {
     CUDAScaleContext *s = ctx->priv;
     CudaFunctions *cu = s->hwctx->internal->cuda_dl;
-    AVBufferRef *ref = NULL;
-    AVHWFramesContext *fctx;
-    int ret;
+    int ret = 0;
 
-    FilterLink *inl = ff_filter_link(inlink);
-    AVHWFramesContext *in_frames_ctx = 
(AVHWFramesContext*)inl->hw_frames_ctx->data;
-    ref = av_hwframe_ctx_alloc(in_frames_ctx->device_ref);
-    if (!ref)
-        return AVERROR(ENOMEM);
-    fctx = (AVHWFramesContext*)ref->data;
+    cuda_tex_uninit(cu, &s->inter_tex);
+    s->inter_tex = (CUDATex) {
+        .width       = width,
+        .height      = height,
+        .crop_width  = width,
+        .crop_height = height,
+    };
 
-    fctx->format    = AV_PIX_FMT_CUDA;
-    fctx->sw_format = in_frames_ctx->sw_format;
-    fctx->width     = FFALIGN(width,  32);
-    fctx->height    = FFALIGN(height, 32);
+    for (int i = 0; i < s->in_planes; i++) {
+        const int is_chroma = i == 1 || i == 2;
+        const int sub_x   = is_chroma ? s->in_desc->log2_chroma_w : 0;
+        const int sub_y   = is_chroma ? s->in_desc->log2_chroma_h : 0;
+        const int plane_w = AV_CEIL_RSHIFT(width,  sub_x);
+        const int plane_h = AV_CEIL_RSHIFT(height, sub_y);
+        const int sizeof_pixel = (s->in_plane_depths[i] <= 8 ? 1 : 2) *
+                                  s->in_plane_channels[i];
 
-    ret = av_hwframe_ctx_init(ref);
-    if (ret < 0)
-        goto fail;
+        size_t pitch;
+        ret = CHECK_CU(cu->cuMemAllocPitch(&s->inter_tex.data[i], &pitch,
+                                           (size_t) plane_w * sizeof_pixel,
+                                           plane_h, 16));
+        if (ret < 0)
+            goto fail;
+        s->inter_tex.linesize[i] = pitch;
 
-    av_assert0(!s->inter_buf);
-    s->inter_buf = av_frame_alloc();
-    if (!s->inter_buf) {
-        ret = AVERROR(ENOMEM);
-        goto fail;
+        CUDA_TEXTURE_DESC tex_desc = {
+            /* inter tex is always read as float */
+            .filterMode = CU_TR_FILTER_MODE_POINT,
+        };
+
+        CUDA_RESOURCE_DESC res_desc = {
+            .resType = CU_RESOURCE_TYPE_PITCH2D,
+            .res.pitch2D.format = s->in_plane_depths[i] <= 8 ?
+                                  CU_AD_FORMAT_UNSIGNED_INT8 :
+                                  CU_AD_FORMAT_UNSIGNED_INT16,
+            .res.pitch2D.numChannels  = s->in_plane_channels[i],
+            .res.pitch2D.devPtr       = s->inter_tex.data[i],
+            .res.pitch2D.pitchInBytes = pitch,
+            .res.pitch2D.width        = plane_w,
+            .res.pitch2D.height       = plane_h,
+        };
+
+        ret = CHECK_CU(cu->cuTexObjectCreate(&s->inter_tex.tex[i], &res_desc,
+                                             &tex_desc, NULL));
+        if (ret < 0)
+            goto fail;
     }
 
-    ret = av_hwframe_get_buffer(ref, s->inter_buf, 0);
-    if (ret < 0)
-        goto fail;
-
-    s->inter_buf->width  = width;
-    s->inter_buf->height = height;
-
-    ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths,
-                             s->in_plane_channels, &s->inter_tex);
-    if (ret < 0)
-        goto fail;
-
-    av_buffer_unref(&ref);
     return 0;
 
 fail:
     cuda_tex_uninit(cu, &s->inter_tex);
-    av_frame_free(&s->inter_buf);
-    av_buffer_unref(&ref);
     return ret;
 }
 
@@ -501,7 +504,7 @@ static av_cold int cudascale_load_functions(AVFilterContext 
*ctx)
         goto fail;
     av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, 
av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt));
 
-    if (s->inter_buf) {
+    if (s->use_filters) {
         /* Intermediate pass is always horizontal */
         snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s", in_fmt_name, 
in_fmt_name);
         ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_TMP], 
s->cu_module, buf));
@@ -656,7 +659,7 @@ static av_cold int cudascale_setup_filters(AVFilterContext 
*ctx)
         }
     }
 
-    ret = inter_buf_init(ctx, inlink, outlink->w, inlink->h);
+    ret = inter_buf_init(ctx, outlink->w, inlink->h);
     if (ret < 0)
         goto fail;
 
@@ -763,6 +766,7 @@ static int cuda_tex_map_frame(AVFilterContext *ctx, const 
AVFrame *frame,
         .crop_width  = (frame->width  - frame->crop_right)  - frame->crop_left,
         .crop_height = (frame->height - frame->crop_bottom) - frame->crop_top,
         .color_range = frame->color_range,
+        .external_data = 1,
     };
 
     for (int i = 0; i < planes; i++) {
@@ -905,7 +909,7 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame 
*out, AVFrame *in)
         goto fail;
 
     const CUDATex *src = &in_tex;
-    if (s->inter_buf) {
+    if (s->use_filters) {
         /* Handle first pass separately */
         s->inter_tex.color_range = in->color_range;
         ret = scalecuda_resize(ctx, FILTER_TMP, &s->inter_tex, src);
-- 
2.52.0


>From f3a436daaa4b95ae74baec14e257c5a82f18d769 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 17:07:05 +0200
Subject: [PATCH 8/9] avfilter/vf_scale_cuda: allocate inter buffer with
 correct subsampling

Since the input and output format can differ (e.g. 444 -> 420), we need to
reference the correct subsampling for the partially applied filter.

Keep track of this in the CUDATex itself.

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 62 ++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 6204da98dc..9ee962ac9a 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -108,6 +108,7 @@ typedef struct CUDATex {
     CUdeviceptr data[4];
     int         linesize[4];
     int         width, height;
+    int         log2_chroma_w, log2_chroma_h;
     int         crop_left, crop_top, crop_width, crop_height;
     int         color_range;
     int         external_data;
@@ -262,7 +263,7 @@ fail:
     return ret;
 }
 
-static av_cold int inter_buf_init(AVFilterContext *ctx, int width, int height)
+static av_cold int inter_buf_init(AVFilterContext *ctx, int out_width, int 
in_height)
 {
     CUDAScaleContext *s = ctx->priv;
     CudaFunctions *cu = s->hwctx->internal->cuda_dl;
@@ -270,18 +271,20 @@ static av_cold int inter_buf_init(AVFilterContext *ctx, 
int width, int height)
 
     cuda_tex_uninit(cu, &s->inter_tex);
     s->inter_tex = (CUDATex) {
-        .width       = width,
-        .height      = height,
-        .crop_width  = width,
-        .crop_height = height,
+        .width          = out_width,
+        .height         = in_height,
+        .crop_width     = out_width,
+        .crop_height    = in_height,
+        .log2_chroma_w  = s->out_desc->log2_chroma_w,
+        .log2_chroma_h  = s->in_desc->log2_chroma_h,
     };
 
     for (int i = 0; i < s->in_planes; i++) {
         const int is_chroma = i == 1 || i == 2;
-        const int sub_x   = is_chroma ? s->in_desc->log2_chroma_w : 0;
-        const int sub_y   = is_chroma ? s->in_desc->log2_chroma_h : 0;
-        const int plane_w = AV_CEIL_RSHIFT(width,  sub_x);
-        const int plane_h = AV_CEIL_RSHIFT(height, sub_y);
+        const int sub_x   = is_chroma ? s->inter_tex.log2_chroma_w : 0;
+        const int sub_y   = is_chroma ? s->inter_tex.log2_chroma_h : 0;
+        const int plane_w = AV_CEIL_RSHIFT(out_width, sub_x);
+        const int plane_h = AV_CEIL_RSHIFT(in_height, sub_y);
         const int sizeof_pixel = (s->in_plane_depths[i] <= 8 ? 1 : 2) *
                                   s->in_plane_channels[i];
 
@@ -609,8 +612,10 @@ static av_cold int cudascale_setup_filters(AVFilterContext 
*ctx)
     CUcontext dummy;
     int ret;
 
-    const int sub_x = s->in_desc->log2_chroma_w;
-    const int sub_y = s->in_desc->log2_chroma_h;
+    const int in_sub_x  = s->in_desc->log2_chroma_w;
+    const int in_sub_y  = s->in_desc->log2_chroma_h;
+    const int out_sub_x = s->out_desc->log2_chroma_w;
+    const int out_sub_y = s->out_desc->log2_chroma_h;
 
     ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
     if (ret < 0)
@@ -633,11 +638,11 @@ static av_cold int 
cudascale_setup_filters(AVFilterContext *ctx)
         if (ret < 0)
             goto fail;
         if (s->in_planes > 1) {
-            const int src_size = AV_CEIL_RSHIFT(inlink->w,  sub_x);
-            const int dst_size = AV_CEIL_RSHIFT(outlink->w, sub_x);
-            const double ratio = (double) outlink->w / inlink->w;
+            const int src_size = AV_CEIL_RSHIFT(inlink->w,  in_sub_x);
+            const int dst_size = AV_CEIL_RSHIFT(outlink->w, out_sub_x);
+            const double virtual_size = (double) outlink->w / (1 << out_sub_x);
             ret = cudascale_filter_init(ctx, &s->filters_uv[pass_x],
-                                        src_size, dst_size, src_size * ratio);
+                                        src_size, dst_size, virtual_size);
             if (ret < 0)
                 goto fail;
         }
@@ -649,11 +654,11 @@ static av_cold int 
cudascale_setup_filters(AVFilterContext *ctx)
         if (ret < 0)
             goto fail;
         if (s->in_planes > 1) {
-            const int src_size = AV_CEIL_RSHIFT(inlink->h,  sub_y);
-            const int dst_size = AV_CEIL_RSHIFT(outlink->h, sub_y);
-            const double ratio = (double) outlink->h / inlink->h;
+            const int src_size = AV_CEIL_RSHIFT(inlink->h,  in_sub_y);
+            const int dst_size = AV_CEIL_RSHIFT(outlink->h, out_sub_y);
+            const double virtual_size = (double) outlink->h / (1 << out_sub_y);
             ret = cudascale_filter_init(ctx, &s->filters_uv[pass_y],
-                                        src_size, dst_size, src_size * ratio);
+                                        src_size, dst_size, virtual_size);
             if (ret < 0)
                 goto fail;
         }
@@ -766,6 +771,8 @@ static int cuda_tex_map_frame(AVFilterContext *ctx, const 
AVFrame *frame,
         .crop_width  = (frame->width  - frame->crop_right)  - frame->crop_left,
         .crop_height = (frame->height - frame->crop_bottom) - frame->crop_top,
         .color_range = frame->color_range,
+        .log2_chroma_w = desc->log2_chroma_w,
+        .log2_chroma_h = desc->log2_chroma_h,
         .external_data = 1,
     };
 
@@ -856,12 +863,9 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass,
     int mpeg_range = in->color_range != AVCOL_RANGE_JPEG;
     int ret;
 
-    const AVPixFmtDescriptor *out_desc = s->out_desc;
     int out_planes = s->out_planes;
-    if (pass == FILTER_TMP) {
-        out_desc   = s->in_desc;
+    if (pass == FILTER_TMP)
         out_planes = s->in_planes;
-    }
 
     // scale primary plane(s). Usually Y (and A), or single plane of RGB 
frames.
     ret = call_resize_kernel(ctx, s->cu_func[pass],
@@ -876,13 +880,13 @@ static int scalecuda_resize(AVFilterContext *ctx, int 
pass,
     if (out_planes > 1) {
         // scale UV plane. Scale function sets both U and V plane, or singular 
interleaved plane.
         ret = call_resize_kernel(ctx, s->cu_func_uv[pass], in->tex,
-                                 AV_CEIL_RSHIFT(in->crop_left, 
s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(in->crop_top, 
s->in_desc->log2_chroma_h),
-                                 AV_CEIL_RSHIFT(in->crop_width, 
s->in_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(in->crop_height, 
s->in_desc->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(in->crop_left, 
in->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(in->crop_top, 
in->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(in->crop_width, 
in->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(in->crop_height, 
in->log2_chroma_h),
                                  out->data,
-                                 AV_CEIL_RSHIFT(out->width, 
out_desc->log2_chroma_w),
-                                 AV_CEIL_RSHIFT(out->height, 
out_desc->log2_chroma_h),
+                                 AV_CEIL_RSHIFT(out->width, 
out->log2_chroma_w),
+                                 AV_CEIL_RSHIFT(out->height, 
out->log2_chroma_h),
                                  out->linesize[1], mpeg_range,
                                  &s->filters_uv[pass]);
         if (ret < 0)
-- 
2.52.0


>From ecb43d3b62a1f7be919908efa49ea792e0b76599 Mon Sep 17 00:00:00 2001
From: Niklas Haas <[email protected]>
Date: Fri, 26 Jun 2026 17:24:36 +0200
Subject: [PATCH 9/9] avfilter/vf_scale_cuda: fix inverted downscaling check

Signed-off-by: Niklas Haas <[email protected]>
---
 libavfilter/vf_scale_cuda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
index 9ee962ac9a..409145c44f 100644
--- a/libavfilter/vf_scale_cuda.c
+++ b/libavfilter/vf_scale_cuda.c
@@ -420,7 +420,7 @@ static av_cold int init_processing_chain(AVFilterContext 
*ctx, int in_width, int
 
         if (s->interp_algo == INTERP_ALGO_NEAREST) {
             s->use_filters = 0;
-        } else if (s->use_filters < 0 && (in_width < out_width || in_height < 
out_height))
+        } else if (s->use_filters < 0 && (out_width < in_width || out_height < 
in_height))
             s->use_filters = 1; /* downscaling; needed for anti-aliasing */
     }
 
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to