PR #23606 opened by Niklas Haas (haasn) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23606 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23606.patch
The biggest impact here is that the previous math was pretty broken when combining different subsampling ratios; e.g. 420 -> 444 or vice versa. This is primarily because the luma and chroma might require a different intermediate buffer size, but this doesn't really map onto the AVFrame / AVPixelFormat layer. The proper solution is to stop allocating the intermediate buffer as an AVFrame at all and just allocate raw CUDA buffers. This series accomplishes that, as well as fixing some pre-existing bugs. I have verified that the new implementation round-trips correctly with the filter math in #23555 for odd chroma planes, though it does *not* round-trip with legacy swscale. I'll leave that issue open as out-of-scope for now, under the assumption that this might be a genuine legacy swscale bug. (The new filter math round-trips with itself.) >From 2eb3955ca89d8d2f3bd12365df4762b0f879f518 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 13:19:10 +0200 Subject: [PATCH 1/9] avutil/hwcontext_cuda: fix off-by-one in cuda_transfer_data() Height of the chroma plane was computed using a naive right shift instead of AV_CEIL_RSHIFT, leading to the last line of chroma being uncopied. Signed-off-by: Niklas Haas <[email protected]> --- libavutil/hwcontext_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c index c57ea4bb7a..6342683a17 100644 --- a/libavutil/hwcontext_cuda.c +++ b/libavutil/hwcontext_cuda.c @@ -261,7 +261,7 @@ static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst, .srcPitch = src->linesize[i], .dstPitch = dst->linesize[i], .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), - .Height = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height), + .Height = AV_CEIL_RSHIFT(src->height, ((i == 0 || i == 3) ? 
0 : priv->shift_height)), }; if (src->hw_frames_ctx) { -- 2.52.0 >From c4545f5db2ff616422848904dfbd4595d65a8a8b Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 14:54:51 +0200 Subject: [PATCH 2/9] avfilter/vf_scale_cuda: eliminate redundant context push/pop This is already done by cudascale_filter_frame(). Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 2a7dc300f5..d429f791b0 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -756,7 +756,6 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass, { CUDAScaleContext *s = ctx->priv; CudaFunctions *cu = s->hwctx->internal->cuda_dl; - CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; int i, ret; int mpeg_range = in->color_range != AVCOL_RANGE_JPEG; @@ -772,10 +771,6 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass, int crop_width = (in->width - in->crop_right) - in->crop_left; int crop_height = (in->height - in->crop_bottom) - in->crop_top; - ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); - if (ret < 0) - return ret; - for (i = 0; i < s->in_planes; i++) { CUDA_TEXTURE_DESC tex_desc = { .filterMode = s->interp_use_linear ? @@ -836,8 +831,6 @@ exit: if (tex[i]) CHECK_CU(cu->cuTexObjectDestroy(tex[i])); - CHECK_CU(cu->cuCtxPopCurrent(&dummy)); - return ret; } -- 2.52.0 >From f35ef1d7b802065eb2118ae6b1dbd4b85e6f32ad Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 15:43:45 +0200 Subject: [PATCH 3/9] avfilter/vf_scale_cuda: add fail: label (cosmetic) Make the next commit a bit easier to review. 
Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index d429f791b0..d0c3f10f20 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -838,26 +838,25 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) { CUDAScaleContext *s = ctx->priv; AVFilterLink *outlink = ctx->outputs[0]; - AVFrame *src = in; - int ret; + int ret = 0; + AVFrame *src = in; if (s->inter_buf) { /* Handle first pass separately */ s->inter_buf->color_range = in->color_range; ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in); if (ret < 0) - return ret; + goto fail; src = s->inter_buf; } ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src); if (ret < 0) - return ret; + goto fail; - src = s->frame; - ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0); + ret = av_hwframe_get_buffer(s->frame->hw_frames_ctx, s->tmp_frame, 0); if (ret < 0) - return ret; + goto fail; av_frame_move_ref(out, s->frame); av_frame_move_ref(s->frame, s->tmp_frame); @@ -867,14 +866,15 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) ret = av_frame_copy_props(out, in); if (ret < 0) - return ret; + goto fail; if (out->width != in->width || out->height != in->height) { av_frame_side_data_remove_by_props(&out->side_data, &out->nb_side_data, AV_SIDE_DATA_PROP_SIZE_DEPENDENT); } - return 0; +fail: + return ret; } static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) -- 2.52.0 >From c636b27272ad1dadb497c51c46bb50a51bc1f427 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 15:48:44 +0200 Subject: [PATCH 4/9] avfilter/vf_scale_cuda: introduce CUDATex and mapping helper I want to disentangle the internal logic from AVFrame, because some intermediate states (e.g. 
for partially subsampled chroma with simultaneous scaling) may not directly map to a valid AVPixelFormat. Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 186 +++++++++++++++++++++++------------- 1 file changed, 122 insertions(+), 64 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index d0c3f10f20..540b869f7c 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -104,6 +104,15 @@ typedef struct CUDAScaleFilter { int dst_size; } CUDAScaleFilter; +typedef struct CUDATex { + CUtexObject tex[4]; + CUdeviceptr data[4]; + int linesize[4]; + int width, height; + int crop_left, crop_top, crop_width, crop_height; + int color_range; +} CUDATex; + typedef struct CUDAScaleContext { const AVClass *class; @@ -175,6 +184,16 @@ static void filter_uninit(CudaFunctions *cu, CUDAScaleFilter *filter) memset(filter, 0, sizeof(*filter)); } +static void cuda_tex_uninit(CudaFunctions *cu, CUDATex *t) +{ + for (int i = 0; i < FF_ARRAY_ELEMS(t->tex); i++) { + if (t->tex[i]) + cu->cuTexObjectDestroy(t->tex[i]); + } + + memset(t, 0, sizeof(*t)); +} + static av_cold void cudascale_uninit(AVFilterContext *ctx) { CUDAScaleContext *s = ctx->priv; @@ -711,9 +730,71 @@ fail: return ret; } +/* if depths/channels are NULL, only maps pointers without creating textures */ +static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame, + const int depths[4], const int channels[4], + CUDATex *tex) +{ + CUDAScaleContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + const AVHWFramesContext *fctx = (const AVHWFramesContext*)frame->hw_frames_ctx->data; + const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fctx->sw_format); + const int planes = av_pix_fmt_count_planes(fctx->sw_format); + + *tex = (CUDATex) { + .width = frame->width, + .height = frame->height, + .crop_left = frame->crop_left, + .crop_top = frame->crop_top, + .crop_width = (frame->width - frame->crop_right) - 
frame->crop_left, + .crop_height = (frame->height - frame->crop_bottom) - frame->crop_top, + .color_range = frame->color_range, + }; + + for (int i = 0; i < planes; i++) { + tex->data[i] = (CUdeviceptr)frame->data[i]; + tex->linesize[i] = frame->linesize[i]; + if (!depths || !channels) + continue; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = s->interp_use_linear ? + CU_TR_FILTER_MODE_LINEAR : + CU_TR_FILTER_MODE_POINT, + .flags = s->interp_as_integer ? CU_TRSF_READ_AS_INTEGER : 0, + }; + + const int is_chroma = i == 1 || i == 2; + const int sub_x = is_chroma ? desc->log2_chroma_w : 0; + const int sub_y = is_chroma ? desc->log2_chroma_h : 0; + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = depths[i] <= 8 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels[i], + .res.pitch2D.pitchInBytes = tex->linesize[i], + .res.pitch2D.devPtr = tex->data[i], + .res.pitch2D.width = AV_CEIL_RSHIFT(frame->width, sub_x), + .res.pitch2D.height = AV_CEIL_RSHIFT(frame->height, sub_y), + }; + + int ret = CHECK_CU(cu->cuTexObjectCreate(&tex->tex[i], &res_desc, &tex_desc, NULL)); + if (ret < 0) { + cuda_tex_uninit(cu, tex); + return ret; + } + } + + return 0; +} + static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, - CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height, - AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch, int mpeg_range, + const CUtexObject src_tex[4], + int src_left, int src_top, int src_width, int src_height, + const CUdeviceptr out_data[4], + int dst_width, int dst_height, int dst_pitch, int mpeg_range, const CUDAScaleFilter *filter) { CUDAScaleContext *s = ctx->priv; @@ -722,10 +803,10 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, CUDAScaleKernelParams params = { .src_tex = {src_tex[0], src_tex[1], src_tex[2], src_tex[3]}, .dst = { - (CUdeviceptr)out_frame->data[0], - 
(CUdeviceptr)out_frame->data[1], - (CUdeviceptr)out_frame->data[2], - (CUdeviceptr)out_frame->data[3] + out_data[0], + out_data[1], + out_data[2], + out_data[3] }, .dst_width = dst_width, .dst_height = dst_height, @@ -752,12 +833,11 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, } static int scalecuda_resize(AVFilterContext *ctx, int pass, - AVFrame *out, AVFrame *in) + const CUDATex *out, const CUDATex *in) { CUDAScaleContext *s = ctx->priv; - CudaFunctions *cu = s->hwctx->internal->cuda_dl; - int i, ret; int mpeg_range = in->color_range != AVCOL_RANGE_JPEG; + int ret; const AVPixFmtDescriptor *out_desc = s->out_desc; int out_planes = s->out_planes; @@ -766,91 +846,66 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass, out_planes = s->in_planes; } - CUtexObject tex[4] = { 0, 0, 0, 0 }; - - int crop_width = (in->width - in->crop_right) - in->crop_left; - int crop_height = (in->height - in->crop_bottom) - in->crop_top; - - for (i = 0; i < s->in_planes; i++) { - CUDA_TEXTURE_DESC tex_desc = { - .filterMode = s->interp_use_linear ? - CU_TR_FILTER_MODE_LINEAR : - CU_TR_FILTER_MODE_POINT, - .flags = s->interp_as_integer ? CU_TRSF_READ_AS_INTEGER : 0, - }; - - CUDA_RESOURCE_DESC res_desc = { - .resType = CU_RESOURCE_TYPE_PITCH2D, - .res.pitch2D.format = s->in_plane_depths[i] <= 8 ? - CU_AD_FORMAT_UNSIGNED_INT8 : - CU_AD_FORMAT_UNSIGNED_INT16, - .res.pitch2D.numChannels = s->in_plane_channels[i], - .res.pitch2D.pitchInBytes = in->linesize[i], - .res.pitch2D.devPtr = (CUdeviceptr)in->data[i], - }; - - if (i == 1 || i == 2) { - res_desc.res.pitch2D.width = AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w); - res_desc.res.pitch2D.height = AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h); - } else { - res_desc.res.pitch2D.width = in->width; - res_desc.res.pitch2D.height = in->height; - } - - ret = CHECK_CU(cu->cuTexObjectCreate(&tex[i], &res_desc, &tex_desc, NULL)); - if (ret < 0) - goto exit; - } - // scale primary plane(s). 
Usually Y (and A), or single plane of RGB frames. ret = call_resize_kernel(ctx, s->cu_func[pass], - tex, in->crop_left, in->crop_top, crop_width, crop_height, - out, out->width, out->height, out->linesize[0], mpeg_range, + in->tex, in->crop_left, in->crop_top, + in->crop_width, in->crop_height, + out->data, out->width, out->height, + out->linesize[0], mpeg_range, &s->filters[pass]); if (ret < 0) - goto exit; + return ret; if (out_planes > 1) { // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane. - ret = call_resize_kernel(ctx, s->cu_func_uv[pass], tex, + ret = call_resize_kernel(ctx, s->cu_func_uv[pass], in->tex, AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w), AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h), - AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w), - AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h), - out, + AV_CEIL_RSHIFT(in->crop_width, s->in_desc->log2_chroma_w), + AV_CEIL_RSHIFT(in->crop_height, s->in_desc->log2_chroma_h), + out->data, AV_CEIL_RSHIFT(out->width, out_desc->log2_chroma_w), AV_CEIL_RSHIFT(out->height, out_desc->log2_chroma_h), out->linesize[1], mpeg_range, &s->filters_uv[pass]); if (ret < 0) - goto exit; + return ret; } -exit: - for (i = 0; i < s->in_planes; i++) - if (tex[i]) - CHECK_CU(cu->cuTexObjectDestroy(tex[i])); - - return ret; + return 0; } static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) { CUDAScaleContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; AVFilterLink *outlink = ctx->outputs[0]; int ret = 0; - AVFrame *src = in; + CUDATex in_tex = {0}, out_tex = {0}, inter_tex = {0}; + ret = cuda_tex_map_frame(ctx, in, s->in_plane_depths, s->in_plane_channels, &in_tex); + if (ret < 0) + goto fail; + + ret = cuda_tex_map_frame(ctx, s->frame, NULL, NULL, &out_tex); + if (ret < 0) + goto fail; + + const CUDATex *src = &in_tex; if (s->inter_buf) { /* Handle first pass separately */ - s->inter_buf->color_range = 
in->color_range; - ret = scalecuda_resize(ctx, FILTER_TMP, s->inter_buf, in); + ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths, + s->in_plane_channels, &inter_tex); if (ret < 0) goto fail; - src = s->inter_buf; + inter_tex.color_range = in->color_range; + ret = scalecuda_resize(ctx, FILTER_TMP, &inter_tex, src); + if (ret < 0) + goto fail; + src = &inter_tex; } - ret = scalecuda_resize(ctx, FILTER_OUT, s->frame, src); + ret = scalecuda_resize(ctx, FILTER_OUT, &out_tex, src); if (ret < 0) goto fail; @@ -874,6 +929,9 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) } fail: + cuda_tex_uninit(cu, &in_tex); + cuda_tex_uninit(cu, &out_tex); + cuda_tex_uninit(cu, &inter_tex); return ret; } -- 2.52.0 >From caa07f07a67dddc0cebb609408e42e1f0ae7b1d1 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 16:42:19 +0200 Subject: [PATCH 5/9] avfilter/vf_scale_cuda: defer buffer allocation to setup_filters() At this point, s->hwctx and CudaFunctions * are available. 
Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 540b869f7c..a71f91d824 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -260,20 +260,22 @@ fail: return ret; } -static av_cold int inter_buf_init(CUDAScaleContext *s, AVBufferRef *device_ctx, - enum AVPixelFormat format, int width, int height) +static av_cold int inter_buf_init(CUDAScaleContext *s, AVFilterLink *inlink, + int width, int height) { AVBufferRef *ref = NULL; AVHWFramesContext *fctx; int ret; - ref = av_hwframe_ctx_alloc(device_ctx); + FilterLink *inl = ff_filter_link(inlink); + AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + ref = av_hwframe_ctx_alloc(in_frames_ctx->device_ref); if (!ref) return AVERROR(ENOMEM); fctx = (AVHWFramesContext*)ref->data; fctx->format = AV_PIX_FMT_CUDA; - fctx->sw_format = format; + fctx->sw_format = in_frames_ctx->sw_format; fctx->width = FFALIGN(width, 32); fctx->height = FFALIGN(height, 32); @@ -400,13 +402,6 @@ static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int s->use_filters = 0; } else if (s->use_filters < 0 && (in_width < out_width || in_height < out_height)) s->use_filters = 1; /* downscaling; needed for anti-aliasing */ - - if (s->use_filters) { - ret = inter_buf_init(s, in_frames_ctx->device_ref, in_format, - out_width, in_height); - if (ret < 0) - return ret; - } } outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); @@ -647,6 +642,10 @@ static av_cold int cudascale_setup_filters(AVFilterContext *ctx) } } + ret = inter_buf_init(s, inlink, outlink->w, inlink->h); + if (ret < 0) + goto fail; + ret = 0; fail: -- 2.52.0 >From 64f36337a528c8e1565ce5c96d77b597ac45678a Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 16:44:37 +0200 Subject: [PATCH 6/9] 
avfilter/vf_scale_cuda: use persistent intermediate CUDATex Instead of re-creating this object every frame. Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index a71f91d824..1f0ef2555a 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -155,6 +155,7 @@ typedef struct CUDAScaleContext { CUDAScaleFilter filters[FILTER_NB]; CUDAScaleFilter filters_uv[FILTER_NB]; AVFrame *inter_buf; /* intermediate buffer for separated scaling */ + CUDATex inter_tex; int use_filters; /* -1 for auto */ float param; @@ -204,6 +205,7 @@ static av_cold void cudascale_uninit(AVFilterContext *ctx) CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + cuda_tex_uninit(cu, &s->inter_tex); for (int i = 0; i < FF_ARRAY_ELEMS(s->filters); i++) { filter_uninit(cu, &s->filters[i]); filter_uninit(cu, &s->filters_uv[i]); @@ -260,9 +262,15 @@ fail: return ret; } -static av_cold int inter_buf_init(CUDAScaleContext *s, AVFilterLink *inlink, +static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame, + const int depths[4], const int channels[4], + CUDATex *tex); + +static av_cold int inter_buf_init(AVFilterContext *ctx, AVFilterLink *inlink, int width, int height) { + CUDAScaleContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; AVBufferRef *ref = NULL; AVHWFramesContext *fctx; int ret; @@ -297,10 +305,16 @@ static av_cold int inter_buf_init(CUDAScaleContext *s, AVFilterLink *inlink, s->inter_buf->width = width; s->inter_buf->height = height; + ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths, + s->in_plane_channels, &s->inter_tex); + if (ret < 0) + goto fail; + av_buffer_unref(&ref); return 0; fail: + cuda_tex_uninit(cu, &s->inter_tex); av_frame_free(&s->inter_buf); av_buffer_unref(&ref); return ret; @@ -642,7 +656,7 @@ static av_cold int 
cudascale_setup_filters(AVFilterContext *ctx) } } - ret = inter_buf_init(s, inlink, outlink->w, inlink->h); + ret = inter_buf_init(ctx, inlink, outlink->w, inlink->h); if (ret < 0) goto fail; @@ -881,7 +895,7 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) AVFilterLink *outlink = ctx->outputs[0]; int ret = 0; - CUDATex in_tex = {0}, out_tex = {0}, inter_tex = {0}; + CUDATex in_tex = {0}, out_tex = {0}; ret = cuda_tex_map_frame(ctx, in, s->in_plane_depths, s->in_plane_channels, &in_tex); if (ret < 0) goto fail; @@ -893,15 +907,11 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) const CUDATex *src = &in_tex; if (s->inter_buf) { /* Handle first pass separately */ - ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths, - s->in_plane_channels, &inter_tex); + s->inter_tex.color_range = in->color_range; + ret = scalecuda_resize(ctx, FILTER_TMP, &s->inter_tex, src); if (ret < 0) goto fail; - inter_tex.color_range = in->color_range; - ret = scalecuda_resize(ctx, FILTER_TMP, &inter_tex, src); - if (ret < 0) - goto fail; - src = &inter_tex; + src = &s->inter_tex; } ret = scalecuda_resize(ctx, FILTER_OUT, &out_tex, src); @@ -930,7 +940,6 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) fail: cuda_tex_uninit(cu, &in_tex); cuda_tex_uninit(cu, &out_tex); - cuda_tex_uninit(cu, &inter_tex); return ret; } -- 2.52.0 >From 684a4ce8fa3c6d2b34160cbd5ce41faa6ed63cb6 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 16:47:27 +0200 Subject: [PATCH 7/9] avfilter/vf_scale_cuda: allocate intermediate buffer directly Instead of going via an AVFrame at all. This will allow us to fix the intermediate chroma plane size for split downscaling. 
Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 100 +++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 1f0ef2555a..6204da98dc 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -24,7 +24,6 @@ #include <stdio.h> #include <string.h> -#include "libavutil/avassert.h" #include "libavutil/common.h" #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_cuda_internal.h" @@ -111,6 +110,7 @@ typedef struct CUDATex { int width, height; int crop_left, crop_top, crop_width, crop_height; int color_range; + int external_data; } CUDATex; typedef struct CUDAScaleContext { @@ -154,7 +154,6 @@ typedef struct CUDAScaleContext { CUDAScaleFilter filters[FILTER_NB]; CUDAScaleFilter filters_uv[FILTER_NB]; - AVFrame *inter_buf; /* intermediate buffer for separated scaling */ CUDATex inter_tex; int use_filters; /* -1 for auto */ @@ -190,6 +189,8 @@ static void cuda_tex_uninit(CudaFunctions *cu, CUDATex *t) for (int i = 0; i < FF_ARRAY_ELEMS(t->tex); i++) { if (t->tex[i]) cu->cuTexObjectDestroy(t->tex[i]); + if (t->data[i] && !t->external_data) + cu->cuMemFree(t->data[i]); } memset(t, 0, sizeof(*t)); @@ -222,7 +223,6 @@ static av_cold void cudascale_uninit(AVFilterContext *ctx) av_frame_free(&s->frame); av_buffer_unref(&s->frames_ctx); av_frame_free(&s->tmp_frame); - av_frame_free(&s->inter_buf); } static av_cold int init_hwframe_ctx(CUDAScaleContext *s, AVBufferRef *device_ctx, int width, int height) @@ -262,61 +262,64 @@ fail: return ret; } -static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame, - const int depths[4], const int channels[4], - CUDATex *tex); - -static av_cold int inter_buf_init(AVFilterContext *ctx, AVFilterLink *inlink, - int width, int height) +static av_cold int inter_buf_init(AVFilterContext *ctx, int width, int height) { CUDAScaleContext *s = ctx->priv; CudaFunctions *cu 
= s->hwctx->internal->cuda_dl; - AVBufferRef *ref = NULL; - AVHWFramesContext *fctx; - int ret; + int ret = 0; - FilterLink *inl = ff_filter_link(inlink); - AVHWFramesContext *in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; - ref = av_hwframe_ctx_alloc(in_frames_ctx->device_ref); - if (!ref) - return AVERROR(ENOMEM); - fctx = (AVHWFramesContext*)ref->data; + cuda_tex_uninit(cu, &s->inter_tex); + s->inter_tex = (CUDATex) { + .width = width, + .height = height, + .crop_width = width, + .crop_height = height, + }; - fctx->format = AV_PIX_FMT_CUDA; - fctx->sw_format = in_frames_ctx->sw_format; - fctx->width = FFALIGN(width, 32); - fctx->height = FFALIGN(height, 32); + for (int i = 0; i < s->in_planes; i++) { + const int is_chroma = i == 1 || i == 2; + const int sub_x = is_chroma ? s->in_desc->log2_chroma_w : 0; + const int sub_y = is_chroma ? s->in_desc->log2_chroma_h : 0; + const int plane_w = AV_CEIL_RSHIFT(width, sub_x); + const int plane_h = AV_CEIL_RSHIFT(height, sub_y); + const int sizeof_pixel = (s->in_plane_depths[i] <= 8 ? 1 : 2) * + s->in_plane_channels[i]; - ret = av_hwframe_ctx_init(ref); - if (ret < 0) - goto fail; + size_t pitch; + ret = CHECK_CU(cu->cuMemAllocPitch(&s->inter_tex.data[i], &pitch, + (size_t) plane_w * sizeof_pixel, + plane_h, 16)); + if (ret < 0) + goto fail; + s->inter_tex.linesize[i] = pitch; - av_assert0(!s->inter_buf); - s->inter_buf = av_frame_alloc(); - if (!s->inter_buf) { - ret = AVERROR(ENOMEM); - goto fail; + CUDA_TEXTURE_DESC tex_desc = { + /* inter tex is always read as float */ + .filterMode = CU_TR_FILTER_MODE_POINT, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = s->in_plane_depths[i] <= 8 ? 
+ CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = s->in_plane_channels[i], + .res.pitch2D.devPtr = s->inter_tex.data[i], + .res.pitch2D.pitchInBytes = pitch, + .res.pitch2D.width = plane_w, + .res.pitch2D.height = plane_h, + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&s->inter_tex.tex[i], &res_desc, + &tex_desc, NULL)); + if (ret < 0) + goto fail; } - ret = av_hwframe_get_buffer(ref, s->inter_buf, 0); - if (ret < 0) - goto fail; - - s->inter_buf->width = width; - s->inter_buf->height = height; - - ret = cuda_tex_map_frame(ctx, s->inter_buf, s->in_plane_depths, - s->in_plane_channels, &s->inter_tex); - if (ret < 0) - goto fail; - - av_buffer_unref(&ref); return 0; fail: cuda_tex_uninit(cu, &s->inter_tex); - av_frame_free(&s->inter_buf); - av_buffer_unref(&ref); return ret; } @@ -501,7 +504,7 @@ static av_cold int cudascale_load_functions(AVFilterContext *ctx) goto fail; av_log(ctx, AV_LOG_DEBUG, "Chroma filter: %s (%s -> %s)\n", buf, av_get_pix_fmt_name(s->in_fmt), av_get_pix_fmt_name(s->out_fmt)); - if (s->inter_buf) { + if (s->use_filters) { /* Intermediate pass is always horizontal */ snprintf(buf, sizeof(buf), "Subsample_Generic_h_%s_%s", in_fmt_name, in_fmt_name); ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func[FILTER_TMP], s->cu_module, buf)); @@ -656,7 +659,7 @@ static av_cold int cudascale_setup_filters(AVFilterContext *ctx) } } - ret = inter_buf_init(ctx, inlink, outlink->w, inlink->h); + ret = inter_buf_init(ctx, outlink->w, inlink->h); if (ret < 0) goto fail; @@ -763,6 +766,7 @@ static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame, .crop_width = (frame->width - frame->crop_right) - frame->crop_left, .crop_height = (frame->height - frame->crop_bottom) - frame->crop_top, .color_range = frame->color_range, + .external_data = 1, }; for (int i = 0; i < planes; i++) { @@ -905,7 +909,7 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) goto fail; const CUDATex *src = 
&in_tex; - if (s->inter_buf) { + if (s->use_filters) { /* Handle first pass separately */ s->inter_tex.color_range = in->color_range; ret = scalecuda_resize(ctx, FILTER_TMP, &s->inter_tex, src); -- 2.52.0 >From f3a436daaa4b95ae74baec14e257c5a82f18d769 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 17:07:05 +0200 Subject: [PATCH 8/9] avfilter/vf_scale_cuda: allocate inter buffer with correct subsampling Since the input and output format can differ (e.g. 444 -> 420), we need to reference the correct subsampling for the partially applied filter. Keep track of this in the CUDATex itself. Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 62 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 6204da98dc..9ee962ac9a 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -108,6 +108,7 @@ typedef struct CUDATex { CUdeviceptr data[4]; int linesize[4]; int width, height; + int log2_chroma_w, log2_chroma_h; int crop_left, crop_top, crop_width, crop_height; int color_range; int external_data; @@ -262,7 +263,7 @@ fail: return ret; } -static av_cold int inter_buf_init(AVFilterContext *ctx, int width, int height) +static av_cold int inter_buf_init(AVFilterContext *ctx, int out_width, int in_height) { CUDAScaleContext *s = ctx->priv; CudaFunctions *cu = s->hwctx->internal->cuda_dl; @@ -270,18 +271,20 @@ static av_cold int inter_buf_init(AVFilterContext *ctx, int width, int height) cuda_tex_uninit(cu, &s->inter_tex); s->inter_tex = (CUDATex) { - .width = width, - .height = height, - .crop_width = width, - .crop_height = height, + .width = out_width, + .height = in_height, + .crop_width = out_width, + .crop_height = in_height, + .log2_chroma_w = s->out_desc->log2_chroma_w, + .log2_chroma_h = s->in_desc->log2_chroma_h, }; for (int i = 0; i < s->in_planes; i++) { const int is_chroma 
= i == 1 || i == 2; - const int sub_x = is_chroma ? s->in_desc->log2_chroma_w : 0; - const int sub_y = is_chroma ? s->in_desc->log2_chroma_h : 0; - const int plane_w = AV_CEIL_RSHIFT(width, sub_x); - const int plane_h = AV_CEIL_RSHIFT(height, sub_y); + const int sub_x = is_chroma ? s->inter_tex.log2_chroma_w : 0; + const int sub_y = is_chroma ? s->inter_tex.log2_chroma_h : 0; + const int plane_w = AV_CEIL_RSHIFT(out_width, sub_x); + const int plane_h = AV_CEIL_RSHIFT(in_height, sub_y); const int sizeof_pixel = (s->in_plane_depths[i] <= 8 ? 1 : 2) * s->in_plane_channels[i]; @@ -609,8 +612,10 @@ static av_cold int cudascale_setup_filters(AVFilterContext *ctx) CUcontext dummy; int ret; - const int sub_x = s->in_desc->log2_chroma_w; - const int sub_y = s->in_desc->log2_chroma_h; + const int in_sub_x = s->in_desc->log2_chroma_w; + const int in_sub_y = s->in_desc->log2_chroma_h; + const int out_sub_x = s->out_desc->log2_chroma_w; + const int out_sub_y = s->out_desc->log2_chroma_h; ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); if (ret < 0) @@ -633,11 +638,11 @@ static av_cold int cudascale_setup_filters(AVFilterContext *ctx) if (ret < 0) goto fail; if (s->in_planes > 1) { - const int src_size = AV_CEIL_RSHIFT(inlink->w, sub_x); - const int dst_size = AV_CEIL_RSHIFT(outlink->w, sub_x); - const double ratio = (double) outlink->w / inlink->w; + const int src_size = AV_CEIL_RSHIFT(inlink->w, in_sub_x); + const int dst_size = AV_CEIL_RSHIFT(outlink->w, out_sub_x); + const double virtual_size = (double) outlink->w / (1 << out_sub_x); ret = cudascale_filter_init(ctx, &s->filters_uv[pass_x], - src_size, dst_size, src_size * ratio); + src_size, dst_size, virtual_size); if (ret < 0) goto fail; } @@ -649,11 +654,11 @@ static av_cold int cudascale_setup_filters(AVFilterContext *ctx) if (ret < 0) goto fail; if (s->in_planes > 1) { - const int src_size = AV_CEIL_RSHIFT(inlink->h, sub_y); - const int dst_size = AV_CEIL_RSHIFT(outlink->h, sub_y); - const double ratio = 
(double) outlink->h / inlink->h; + const int src_size = AV_CEIL_RSHIFT(inlink->h, in_sub_y); + const int dst_size = AV_CEIL_RSHIFT(outlink->h, out_sub_y); + const double virtual_size = (double) outlink->h / (1 << out_sub_y); ret = cudascale_filter_init(ctx, &s->filters_uv[pass_y], - src_size, dst_size, src_size * ratio); + src_size, dst_size, virtual_size); if (ret < 0) goto fail; } @@ -766,6 +771,8 @@ static int cuda_tex_map_frame(AVFilterContext *ctx, const AVFrame *frame, .crop_width = (frame->width - frame->crop_right) - frame->crop_left, .crop_height = (frame->height - frame->crop_bottom) - frame->crop_top, .color_range = frame->color_range, + .log2_chroma_w = desc->log2_chroma_w, + .log2_chroma_h = desc->log2_chroma_h, .external_data = 1, }; @@ -856,12 +863,9 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass, int mpeg_range = in->color_range != AVCOL_RANGE_JPEG; int ret; - const AVPixFmtDescriptor *out_desc = s->out_desc; int out_planes = s->out_planes; - if (pass == FILTER_TMP) { - out_desc = s->in_desc; + if (pass == FILTER_TMP) out_planes = s->in_planes; - } // scale primary plane(s). Usually Y (and A), or single plane of RGB frames. ret = call_resize_kernel(ctx, s->cu_func[pass], @@ -876,13 +880,13 @@ static int scalecuda_resize(AVFilterContext *ctx, int pass, if (out_planes > 1) { // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane. 
ret = call_resize_kernel(ctx, s->cu_func_uv[pass], in->tex, - AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w), - AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h), - AV_CEIL_RSHIFT(in->crop_width, s->in_desc->log2_chroma_w), - AV_CEIL_RSHIFT(in->crop_height, s->in_desc->log2_chroma_h), + AV_CEIL_RSHIFT(in->crop_left, in->log2_chroma_w), + AV_CEIL_RSHIFT(in->crop_top, in->log2_chroma_h), + AV_CEIL_RSHIFT(in->crop_width, in->log2_chroma_w), + AV_CEIL_RSHIFT(in->crop_height, in->log2_chroma_h), out->data, - AV_CEIL_RSHIFT(out->width, out_desc->log2_chroma_w), - AV_CEIL_RSHIFT(out->height, out_desc->log2_chroma_h), + AV_CEIL_RSHIFT(out->width, out->log2_chroma_w), + AV_CEIL_RSHIFT(out->height, out->log2_chroma_h), out->linesize[1], mpeg_range, &s->filters_uv[pass]); if (ret < 0) -- 2.52.0 >From ecb43d3b62a1f7be919908efa49ea792e0b76599 Mon Sep 17 00:00:00 2001 From: Niklas Haas <[email protected]> Date: Fri, 26 Jun 2026 17:24:36 +0200 Subject: [PATCH 9/9] avfilter/vf_scale_cuda: fix inverted downscaling check Signed-off-by: Niklas Haas <[email protected]> --- libavfilter/vf_scale_cuda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 9ee962ac9a..409145c44f 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -420,7 +420,7 @@ static av_cold int init_processing_chain(AVFilterContext *ctx, int in_width, int if (s->interp_algo == INTERP_ALGO_NEAREST) { s->use_filters = 0; - } else if (s->use_filters < 0 && (in_width < out_width || in_height < out_height)) + } else if (s->use_filters < 0 && (out_width < in_width || out_height < in_height)) s->use_filters = 1; /* downscaling; needed for anti-aliasing */ } -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
