[FFmpeg-devel] [PR] avfilter: add transpose_cuda video filter (PR #22493)

nyanmisaka via ffmpeg-devel Fri, 13 Mar 2026 07:29:52 -0700

PR #22493 opened by nyanmisaka
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22493
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22493.patch


This patch adds the transpose_cuda video filter.
It's similar to the existing transpose filter but accelerated by CUDA.

It supports the same pixel formats as the scale_cuda filter.
This also supersedes the deprecated transpose_npp filter.

Example usage:
```
ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i <INPUT> -vf 
"transpose_cuda=dir=clock" <OUTPUT>
```

Signed-off-by: nyanmisaka <[email protected]>



>From b367927ddf8bf12c9df09a3675986367b4440778 Mon Sep 17 00:00:00 2001
From: nyanmisaka <[email protected]>
Date: Fri, 13 Mar 2026 22:18:59 +0800
Subject: [PATCH] avfilter: add transpose_cuda video filter

This patch adds the transpose_cuda video filter.
It's similar to the existing transpose filter but accelerated by CUDA.

It supports the same pixel formats as the scale_cuda filter.
This also supersedes the deprecated transpose_npp filter.

Example usage:
ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i <INPUT> -vf 
"transpose_cuda=dir=clock" <OUTPUT>

Signed-off-by: nyanmisaka <[email protected]>
---
 Changelog                        |   1 +
 configure                        |   2 +
 doc/filters.texi                 |  50 +++
 libavfilter/Makefile             |   2 +
 libavfilter/allfilters.c         |   1 +
 libavfilter/vf_transpose_cuda.c  | 516 +++++++++++++++++++++++++++++++
 libavfilter/vf_transpose_cuda.cu |  65 ++++
 7 files changed, 637 insertions(+)
 create mode 100644 libavfilter/vf_transpose_cuda.c
 create mode 100644 libavfilter/vf_transpose_cuda.cu

diff --git a/Changelog b/Changelog
index 1cd8f5461b..52277ac7ae 100644
--- a/Changelog
+++ b/Changelog
@@ -2,6 +2,7 @@ Entries are sorted chronologically from oldest to youngest 
within each release,
 releases are sorted from youngest to oldest.
 
 version <next>:
+- transpose_cuda filter
 
 
 version 8.1:
diff --git a/configure b/configure
index 2f6167fddb..c9a51efba2 100755
--- a/configure
+++ b/configure
@@ -3530,6 +3530,8 @@ scale_cuda_filter_deps="ffnvcodec"
 scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 thumbnail_cuda_filter_deps="ffnvcodec"
 thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+transpose_cuda_filter_deps="ffnvcodec"
+transpose_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 transpose_npp_filter_deps="ffnvcodec libnpp"
 overlay_cuda_filter_deps="ffnvcodec"
 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
diff --git a/doc/filters.texi b/doc/filters.texi
index 569ff516d4..086ad77138 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -27440,6 +27440,56 @@ Thumbnails are extracted from every @var{n}=150-frame 
batch, selecting one per b
 
 @end itemize
 
+@subsection transpose_cuda
+
+Transpose rows with columns in the input video and optionally flip it.
+For more in depth examples see the @ref{transpose} video filter, which shares 
mostly the same options.
+
+It accepts the following parameters:
+
+@table @option
+
+@item dir
+Specify the transposition direction.
+
+Can assume the following values:
+@table @samp
+@item cclock_flip
+Rotate by 90 degrees counterclockwise and vertically flip. (default)
+
+@item clock
+Rotate by 90 degrees clockwise.
+
+@item cclock
+Rotate by 90 degrees counterclockwise.
+
+@item clock_flip
+Rotate by 90 degrees clockwise and vertically flip.
+
+@item reversal
+Rotate by 180 degrees.
+
+@item hflip
+Flip horizontally.
+
+@item vflip
+Flip vertically.
+@end table
+
+@item passthrough
+Do not apply the transposition if the input geometry matches the one
+specified by the specified value. It accepts the following values:
+@table @samp
+@item none
+Always apply transposition. (default)
+@item portrait
+Preserve portrait geometry (when @var{height} >= @var{width}).
+@item landscape
+Preserve landscape geometry (when @var{width} >= @var{height}).
+@end table
+
+@end table
+
 @subsection yadif_cuda
 
 Deinterlace the input video using the @ref{yadif} algorithm, but implemented
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index a530cfae29..305e05d228 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -546,6 +546,8 @@ OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER)         += 
vf_tonemap_opencl.o opencl.o \
 OBJS-$(CONFIG_TONEMAP_VAAPI_FILTER)          += vf_tonemap_vaapi.o vaapi_vpp.o
 OBJS-$(CONFIG_TPAD_FILTER)                   += vf_tpad.o
 OBJS-$(CONFIG_TRANSPOSE_FILTER)              += vf_transpose.o
+OBJS-$(CONFIG_TRANSPOSE_CUDA_FILTER)         += vf_transpose_cuda.o 
vf_transpose_cuda.ptx.o \
+                                                cuda/load_helper.o
 OBJS-$(CONFIG_TRANSPOSE_NPP_FILTER)          += vf_transpose_npp.o
 OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER)       += vf_transpose_opencl.o opencl.o 
opencl/transpose.o
 OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER)        += vf_transpose_vaapi.o 
vaapi_vpp.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index e26859e159..75592d55d7 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -513,6 +513,7 @@ extern const FFFilter ff_vf_tonemap_opencl;
 extern const FFFilter ff_vf_tonemap_vaapi;
 extern const FFFilter ff_vf_tpad;
 extern const FFFilter ff_vf_transpose;
+extern const FFFilter ff_vf_transpose_cuda;
 extern const FFFilter ff_vf_transpose_npp;
 extern const FFFilter ff_vf_transpose_opencl;
 extern const FFFilter ff_vf_transpose_vaapi;
diff --git a/libavfilter/vf_transpose_cuda.c b/libavfilter/vf_transpose_cuda.c
new file mode 100644
index 0000000000..234cb366de
--- /dev/null
+++ b/libavfilter/vf_transpose_cuda.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2026 NyanMisaka
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/common.h"
+#include "libavutil/hwcontext.h"
+#include "libavutil/hwcontext_cuda_internal.h"
+#include "libavutil/cuda_check.h"
+#include "libavutil/internal.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "filters.h"
+#include "transpose.h"
+#include "video.h"
+
+#include "cuda/load_helper.h"
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define BLOCK_X 32
+#define BLOCK_Y 16
+
+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x)
+
+static const enum AVPixelFormat supported_formats[] = {
+    AV_PIX_FMT_YUV420P,
+    AV_PIX_FMT_YUV422P,
+    AV_PIX_FMT_YUV444P,
+    AV_PIX_FMT_YUV420P10,
+    AV_PIX_FMT_YUV422P10,
+    AV_PIX_FMT_YUV444P10,
+    AV_PIX_FMT_YUV444P16,
+    AV_PIX_FMT_NV12,
+    AV_PIX_FMT_NV16,
+    AV_PIX_FMT_P010,
+    AV_PIX_FMT_P210,
+    AV_PIX_FMT_P016,
+    AV_PIX_FMT_P216,
+    AV_PIX_FMT_0RGB32,
+    AV_PIX_FMT_0BGR32,
+    AV_PIX_FMT_RGB32,
+    AV_PIX_FMT_BGR32,
+};
+
+typedef struct TransposeCUDAContext {
+    const AVClass *class;
+
+    AVCUDADeviceContext *hwctx;
+    AVBufferRef         *frames_ctx;
+    AVFrame             *frame;
+    AVFrame             *tmp_frame;
+
+    const AVPixFmtDescriptor *pix_desc;
+
+    CUcontext  cu_ctx;
+    CUmodule   cu_module;
+    CUfunction cu_func_uchar;
+    CUfunction cu_func_ushort;
+    CUfunction cu_func_uchar2;
+    CUfunction cu_func_ushort2;
+    CUfunction cu_func_uchar4;
+    CUstream   cu_stream;
+
+    int passthrough;    ///< PassthroughType, landscape passthrough mode 
enabled
+    int dir;            ///< TransposeDir
+} TransposeCUDAContext;
+
+static av_cold int cudatranspose_init(AVFilterContext *ctx)
+{
+    TransposeCUDAContext *s = ctx->priv;
+
+    s->frame = av_frame_alloc();
+    if (!s->frame)
+        return AVERROR(ENOMEM);
+
+    s->tmp_frame = av_frame_alloc();
+    if (!s->tmp_frame)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static av_cold void cudatranspose_uninit(AVFilterContext *ctx)
+{
+    TransposeCUDAContext *s = ctx->priv;
+
+    if (s->hwctx && s->cu_module) {
+        CUcontext dummy;
+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+        CHECK_CU(cu->cuCtxPushCurrent(s->cu_ctx));
+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    }
+
+    av_frame_free(&s->frame);
+    av_buffer_unref(&s->frames_ctx);
+    av_frame_free(&s->tmp_frame);
+}
+
+static av_cold int init_hwframe_ctx(TransposeCUDAContext *s,
+                                    AVBufferRef *device_ctx,
+                                    int width, int height,
+                                    enum AVPixelFormat sw_format)
+{
+    AVBufferRef *out_ref = NULL;
+    AVHWFramesContext *out_ctx;
+    int ret;
+
+    out_ref = av_hwframe_ctx_alloc(device_ctx);
+    if (!out_ref)
+        return AVERROR(ENOMEM);
+    out_ctx = (AVHWFramesContext*)out_ref->data;
+
+    out_ctx->format    = AV_PIX_FMT_CUDA;
+    out_ctx->sw_format = sw_format;
+    out_ctx->width     = FFALIGN(width,  32);
+    out_ctx->height    = FFALIGN(height, 32);
+
+    ret = av_hwframe_ctx_init(out_ref);
+    if (ret < 0)
+        goto fail;
+
+    av_frame_unref(s->frame);
+    ret = av_hwframe_get_buffer(out_ref, s->frame, 0);
+    if (ret < 0)
+        goto fail;
+
+    s->frame->width  = width;
+    s->frame->height = height;
+
+    av_buffer_unref(&s->frames_ctx);
+    s->frames_ctx = out_ref;
+
+    return 0;
+fail:
+    av_buffer_unref(&out_ref);
+    return ret;
+}
+
+static int format_is_supported(enum AVPixelFormat fmt)
+{
+    int i;
+
+    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+        if (supported_formats[i] == fmt)
+            return 1;
+
+    return 0;
+}
+
+static int init_processing_chain(AVFilterContext *ctx,
+                                 int out_width, int out_height)
+{
+    FilterLink *inl  = ff_filter_link(ctx->inputs[0]);
+    FilterLink *outl = ff_filter_link(ctx->outputs[0]);
+    TransposeCUDAContext *s = ctx->priv;
+    AVHWFramesContext *in_frames_ctx;
+    enum AVPixelFormat format;
+    int ret;
+
+    /* check that we have a hw context */
+    if (!inl->hw_frames_ctx) {
+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
+        return AVERROR(EINVAL);
+    }
+
+    in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data;
+    format        = in_frames_ctx->sw_format;
+    s->pix_desc   = av_pix_fmt_desc_get(format);
+
+    if (!format_is_supported(format)) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n",
+               av_get_pix_fmt_name(format));
+        return AVERROR(ENOSYS);
+    }
+
+    ret = init_hwframe_ctx(s, in_frames_ctx->device_ref,
+                           out_width, out_height, format);
+    if (ret < 0)
+        return ret;
+
+    s->hwctx = in_frames_ctx->device_ctx->hwctx;
+    s->cu_stream = s->hwctx->stream;
+
+    outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
+    if (!outl->hw_frames_ctx)
+        return AVERROR(ENOMEM);
+
+    return 0;
+}
+
+static int cudatranspose_config_props(AVFilterLink *outlink)
+{
+    extern const unsigned char ff_vf_transpose_cuda_ptx_data[];
+    extern const unsigned int ff_vf_transpose_cuda_ptx_len;
+    FilterLink     *outl = ff_filter_link(outlink);
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = ctx->inputs[0];
+    FilterLink      *inl = ff_filter_link(inlink);
+    TransposeCUDAContext *s = ctx->priv;
+    CUcontext dummy, cuda_ctx;
+    CudaFunctions *cu;
+    int ret;
+
+    if ((inlink->w >= inlink->h && s->passthrough == 
TRANSPOSE_PT_TYPE_LANDSCAPE) ||
+        (inlink->w <= inlink->h && s->passthrough == 
TRANSPOSE_PT_TYPE_PORTRAIT)) {
+        if (inl->hw_frames_ctx) {
+            outl->hw_frames_ctx = av_buffer_ref(inl->hw_frames_ctx);
+            if (!outl->hw_frames_ctx)
+                return AVERROR(ENOMEM);
+        }
+
+        av_log(ctx, AV_LOG_VERBOSE,
+               "w:%d h:%d -> w:%d h:%d (passthrough mode)\n",
+               inlink->w, inlink->h, inlink->w, inlink->h);
+        return 0;
+    } else {
+        s->passthrough = TRANSPOSE_PT_TYPE_NONE;
+    }
+
+    switch (s->dir) {
+    case TRANSPOSE_CCLOCK_FLIP:
+    case TRANSPOSE_CCLOCK:
+    case TRANSPOSE_CLOCK:
+    case TRANSPOSE_CLOCK_FLIP:
+        outlink->w = inlink->h;
+        outlink->h = inlink->w;
+        break;
+    default:
+        outlink->w = inlink->w;
+        outlink->h = inlink->h;
+        break;
+    }
+
+    if (inlink->sample_aspect_ratio.num)
+        outlink->sample_aspect_ratio = av_div_q((AVRational) { 1, 1 },
+                                                inlink->sample_aspect_ratio);
+    else
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+    ret = init_processing_chain(ctx, outlink->w, outlink->h);
+    if (ret < 0)
+        return ret;
+
+    cuda_ctx = s->cu_ctx = s->hwctx->cuda_ctx;
+    cu = s->hwctx->internal->cuda_dl;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
+    if (ret < 0)
+        return ret;
+
+    ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module,
+                              ff_vf_transpose_cuda_ptx_data, 
ff_vf_transpose_cuda_ptx_len);
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, 
"Transpose_Cuda_uchar"));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, 
"Transpose_Cuda_ushort"));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, 
"Transpose_Cuda_uchar2"));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, 
"Transpose_Cuda_ushort2"));
+    if (ret < 0)
+        goto exit;
+
+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, 
"Transpose_Cuda_uchar4"));
+    if (ret < 0)
+        goto exit;
+
+    av_log(ctx, AV_LOG_VERBOSE,
+           "w:%d h:%d dir:%d -> w:%d h:%d\n",
+           inlink->w, inlink->h, s->dir, outlink->w, outlink->h);
+exit:
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+
+    return 0;
+}
+
+static CUresult call_kernel(AVFilterContext *ctx,
+                            CUfunction cu_func,
+                            int is_422_uv,    // Dst* & Src* are 4:2:2 UV 
planes
+                            CUdeviceptr dst0,
+                            CUdeviceptr dst1, // Dst1 is for planar V, optional
+                            int dst_width,    // Width is pixels per channel
+                            int dst_height,   // Height is pixels per channel
+                            int dst_pitch,
+                            CUdeviceptr src0,
+                            CUdeviceptr src1, // Dst1 is for planar V, optional
+                            int src_pitch)
+{
+    TransposeCUDAContext *s = ctx->priv;
+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
+
+    void* kernel_args[] = {
+        &dst0, &dst1, &dst_width, &dst_height, &dst_pitch,
+        &src0, &src1, &src_pitch, &is_422_uv, &s->dir,
+    };
+
+    return CHECK_CU(cu->cuLaunchKernel(cu_func,
+                                       DIV_UP(dst_width, BLOCK_X), 
DIV_UP(dst_height, BLOCK_Y), 1,
+                                       BLOCK_X, BLOCK_Y, 1, 0, s->cu_stream, 
kernel_args, NULL));
+}
+
+static int cudatranspose_rotate(AVFilterContext *ctx,
+                                AVFrame *out, AVFrame *in)
+{
+    TransposeCUDAContext *s = ctx->priv;
+    int ret;
+
+    for (int c = 0; c < s->pix_desc->nb_components; c++) {
+        const AVComponentDescriptor *comp = &s->pix_desc->comp[c];
+        const int p = comp->plane;
+        int pix_size, channels;
+        int is_planar_u, is_planar_v, is_422_uv;
+        CUfunction func;
+
+        pix_size = (comp->depth + 7) / 8;
+        channels = comp->step / pix_size;
+        if (pix_size > 2 || channels > 4)
+            av_unreachable("Unsupported pixel format!");
+
+        is_planar_u = p == 1 && channels == 1;
+        is_planar_v = p == 2 && channels == 1;
+        is_422_uv = p && s->pix_desc->log2_chroma_w == 1 && 
!s->pix_desc->log2_chroma_h;
+
+        if (comp->plane < c || is_planar_v) {
+            // We process planes as a whole, so don't reprocess
+            // them for additional components
+            continue;
+        }
+
+        switch (pix_size) {
+        case 1:
+            func = channels == 4 ? s->cu_func_uchar4 :
+                   channels == 2 ? s->cu_func_uchar2 : s->cu_func_uchar;
+            break;
+        case 2:
+            func = channels == 2 ? s->cu_func_ushort2 : s->cu_func_ushort;
+            break;
+        default:
+            av_unreachable("Unsupported pixel format!");
+        }
+
+        ret = call_kernel(ctx, func, is_422_uv,
+                          (CUdeviceptr)out->data[p],
+                          (CUdeviceptr)(is_planar_u ? out->data[p+1] : NULL),
+                          AV_CEIL_RSHIFT(out->width, p ? 
s->pix_desc->log2_chroma_w : 0),
+                          AV_CEIL_RSHIFT(out->height, p ? 
s->pix_desc->log2_chroma_h : 0),
+                          out->linesize[p],
+                          (CUdeviceptr)in->data[p],
+                          (CUdeviceptr)(is_planar_u ? in->data[p+1] : NULL),
+                          in->linesize[p]);
+        if (ret < 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+static int cudatranspose_transpose(AVFilterContext *ctx,
+                                   AVFrame *out, AVFrame *in)
+{
+    TransposeCUDAContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *src = in;
+    int ret;
+
+    ret = cudatranspose_rotate(ctx, s->frame, src);
+    if (ret < 0)
+        return ret;
+
+    src = s->frame;
+    ret = av_hwframe_get_buffer(src->hw_frames_ctx, s->tmp_frame, 0);
+    if (ret < 0)
+        return ret;
+
+    av_frame_move_ref(out, s->frame);
+    av_frame_move_ref(s->frame, s->tmp_frame);
+
+    s->frame->width  = outlink->w;
+    s->frame->height = outlink->h;
+
+    ret = av_frame_copy_props(out, in);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+static int cudatranspose_filter_frame(AVFilterLink *link, AVFrame *in)
+{
+    AVFilterContext      *ctx = link->dst;
+    TransposeCUDAContext *s = ctx->priv;
+    AVFilterLink         *outlink = ctx->outputs[0];
+    CudaFunctions *cu;
+    AVFrame *out = NULL;
+    CUcontext dummy;
+    int ret = 0;
+
+    if (s->passthrough)
+        return ff_filter_frame(outlink, in);
+
+    out = av_frame_alloc();
+    if (!out) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    cu = s->hwctx->internal->cuda_dl;
+
+    ret = CHECK_CU(cu->cuCtxPushCurrent(s->cu_ctx));
+    if (ret < 0)
+        goto fail;
+
+    ret = cudatranspose_transpose(ctx, out, in);
+
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+    if (ret < 0)
+        goto fail;
+
+    av_frame_free(&in);
+
+    return ff_filter_frame(outlink, out);
+
+fail:
+    av_frame_free(&in);
+    av_frame_free(&out);
+    return ret;
+}
+
+static AVFrame *cudatranspose_get_video_buffer(AVFilterLink *inlink, int w, 
int h)
+{
+    TransposeCUDAContext *s = inlink->dst->priv;
+
+    return s->passthrough ?
+        ff_null_get_video_buffer   (inlink, w, h) :
+        ff_default_get_video_buffer(inlink, w, h);
+}
+
+#define OFFSET(x) offsetof(TransposeCUDAContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+
+static const AVOption cudatranspose_options[] = {
+    { "dir", "set transpose direction", OFFSET(dir), AV_OPT_TYPE_INT, { .i64 = 
TRANSPOSE_CCLOCK_FLIP }, 0, 6, FLAGS, .unit = "dir" },
+        { "cclock_flip", "rotate counter-clockwise with vertical flip", 0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_CCLOCK_FLIP }, 0, 0, FLAGS, .unit = "dir" 
},
+        { "clock",       "rotate clockwise",                            0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_CLOCK       }, 0, 0, FLAGS, .unit = "dir" 
},
+        { "cclock",      "rotate counter-clockwise",                    0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_CCLOCK      }, 0, 0, FLAGS, .unit = "dir" 
},
+        { "clock_flip",  "rotate clockwise with vertical flip",         0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_CLOCK_FLIP  }, 0, 0, FLAGS, .unit = "dir" 
},
+        { "reversal",    "rotate by half-turn",                         0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_REVERSAL    }, 0, 0, FLAGS, .unit = "dir" 
},
+        { "hflip",       "flip horizontally",                           0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_HFLIP       }, 0, 0, FLAGS, .unit = "dir" 
},
+        { "vflip",       "flip vertically",                             0, 
AV_OPT_TYPE_CONST, { .i64 = TRANSPOSE_VFLIP       }, 0, 0, FLAGS, .unit = "dir" 
},
+
+    { "passthrough", "do not apply transposition if the input matches the 
specified geometry", OFFSET(passthrough), AV_OPT_TYPE_INT, { .i64 = 
TRANSPOSE_PT_TYPE_NONE },  0, 2, FLAGS, .unit = "passthrough" },
+        { "none",      "always apply transposition",  0, AV_OPT_TYPE_CONST, { 
.i64 = TRANSPOSE_PT_TYPE_NONE },      0, 0, FLAGS, .unit = "passthrough" },
+        { "landscape", "preserve landscape geometry", 0, AV_OPT_TYPE_CONST, { 
.i64 = TRANSPOSE_PT_TYPE_LANDSCAPE }, 0, 0, FLAGS, .unit = "passthrough" },
+        { "portrait",  "preserve portrait geometry",  0, AV_OPT_TYPE_CONST, { 
.i64 = TRANSPOSE_PT_TYPE_PORTRAIT },  0, 0, FLAGS, .unit = "passthrough" },
+
+    { NULL },
+};
+
+AVFILTER_DEFINE_CLASS(cudatranspose);
+
+static const AVFilterPad cudatranspose_inputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .filter_frame = cudatranspose_filter_frame,
+        .get_buffer.video = cudatranspose_get_video_buffer,
+    },
+};
+
+static const AVFilterPad cudatranspose_outputs[] = {
+    {
+        .name         = "default",
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .config_props = cudatranspose_config_props,
+    },
+};
+
+const FFFilter ff_vf_transpose_cuda = {
+    .p.name         = "transpose_cuda",
+    .p.description  = NULL_IF_CONFIG_SMALL("Transpose input video using CUDA"),
+    .p.priv_class   = &cudatranspose_class,
+    .init           = cudatranspose_init,
+    .uninit         = cudatranspose_uninit,
+    .priv_size      = sizeof(TransposeCUDAContext),
+    FILTER_INPUTS(cudatranspose_inputs),
+    FILTER_OUTPUTS(cudatranspose_outputs),
+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
diff --git a/libavfilter/vf_transpose_cuda.cu b/libavfilter/vf_transpose_cuda.cu
new file mode 100644
index 0000000000..f6ee401358
--- /dev/null
+++ b/libavfilter/vf_transpose_cuda.cu
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2026 NyanMisaka
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+__inline__ __device__ void map_input_coords(int *xi, int *yi,
+                                            int  xo, int  yo,
+                                            int dst_width,
+                                            int dst_height,
+                                            int is_422_uv,
+                                            int dir)
+{
+    *xi = (dir < 4) ? ((dir &  2) ? (dst_height - 1 - yo) : yo)
+                    : ((dir == 6) ? xo : (dst_width  - 1 - xo));
+    *yi = (dir < 4) ? ((dir &  1) ? (dst_width  - 1 - xo) : xo)
+                    : ((dir == 5) ? yo : (dst_height - 1 - yo));
+
+    *xi >>= (dir < 4 && is_422_uv ? 1 : 0);
+    *yi <<= (dir < 4 && is_422_uv ? 1 : 0);
+}
+
+extern "C" {
+
+#define TRANSPOSE_KERNELS(NAME, TYPE) \
+__global__ void Transpose_Cuda_ ## NAME( \
+    TYPE *dst0, TYPE *dst1, int dst_width, int dst_height, int dst_pitch, \
+    TYPE *src0, TYPE *src1, int src_pitch, int is_422_uv, int dir) \
+{ \
+    int xo = blockIdx.x * blockDim.x + threadIdx.x; \
+    int yo = blockIdx.y * blockDim.y + threadIdx.y; \
+    if (xo >= dst_width || yo >= dst_height) \
+        return; \
+    int xi, yi; \
+    map_input_coords(&xi, &yi, xo, yo, \
+                     dst_width, dst_height, \
+                     is_422_uv, dir); \
+    dst_pitch /= sizeof(TYPE); \
+    src_pitch /= sizeof(TYPE); \
+    dst0[yo*dst_pitch+xo] = src0[yi*src_pitch+xi]; \
+    if (dst1 && src1) \
+        dst1[yo*dst_pitch+xo] = src1[yi*src_pitch+xi]; \
+}
+
+TRANSPOSE_KERNELS(uchar, unsigned char)
+TRANSPOSE_KERNELS(ushort, unsigned short)
+TRANSPOSE_KERNELS(uchar2, uchar2)
+TRANSPOSE_KERNELS(ushort2, ushort2)
+TRANSPOSE_KERNELS(uchar4, uchar4)
+
+} /* extern "C" */
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avfilter: add transpose_cuda video filter (PR #22493)

Reply via email to