PR #21532 opened by Steven Xiao (younengxiao)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21532
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21532.patch

This patch adds ONNX Runtime as a new DNN backend for FFmpeg's dnn_processing
filter, enabling hardware-accelerated neural network inference on multiple
GPU platforms.

Features:
- CPU execution provider (default)
- CUDA execution provider (NVIDIA GPUs)
- DirectML execution provider (AMD/Intel/NVIDIA GPUs on Windows)
- Configurable GPU device selection via gpu_device_id option
- Thread count configuration via num_threads option

New filter options for dnn_processing with dnn_backend=onnx:
- execution_provider: cpu, cuda, or dml (default: cpu)
- gpu_device_id: GPU device index (default: 0)
- num_threads: inference thread count (default: 0, auto)

Example usage:
  **CPU inference**
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx" output.mp4

  **CUDA GPU inference**
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx:execution_provider=cuda:gpu_device_id=0" output.mp4

  **DirectML GPU inference (Windows)**
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx:execution_provider=dml:gpu_device_id=0" output.mp4
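
  **CPU inference with an explicit thread count** (illustrative combination of the options above; num_threads sets ONNX Runtime's intra-op thread count, 0 leaves the choice to the runtime)
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx:execution_provider=cpu:num_threads=4" output.mp4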

Build Instructions (MSYS2 MinGW):

1. Install MSYS2 from https://www.msys2.org/

2. Install build tools in MSYS2 MinGW 64-bit terminal:
   pacman -S mingw-w64-x86_64-gcc mingw-w64-x86_64-nasm make diffutils pkg-config

3. Download ONNX Runtime (GPU version):
   https://github.com/microsoft/onnxruntime/releases/download/v1.23.2/onnxruntime-win-x64-gpu-1.23.2.zip

4. Extract to C:\onnxruntime with structure:
   C:\onnxruntime\include\onnxruntime_c_api.h
   C:\onnxruntime\lib\onnxruntime.dll
   C:\onnxruntime\lib\onnxruntime.lib
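
   For reference, one way to do steps 3-4 from the MSYS2 shell (illustrative
   commands; curl/unzip may need "pacman -S curl unzip", and the archive's
   top-level directory name is assumed to match the release):
   cd /c
   curl -LO https://github.com/microsoft/onnxruntime/releases/download/v1.23.2/onnxruntime-win-x64-gpu-1.23.2.zip
   unzip onnxruntime-win-x64-gpu-1.23.2.zip
   mv onnxruntime-win-x64-gpu-1.23.2 onnxruntime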

5. Configure and build:
   ./configure \
       --enable-gpl \
       --enable-libonnxruntime \
       --extra-cflags="-I/c/onnxruntime/include -D_stdcall=__stdcall" \
       --extra-ldflags="-L/c/onnxruntime/lib" \
       --extra-libs="-lonnxruntime"
   make -j$(nproc)

Note: The -D_stdcall=__stdcall flag is required for MinGW GCC compatibility:
the ONNX Runtime headers use MSVC's _stdcall spelling, while GCC only accepts
__stdcall.
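
A quick way to sanity-check the build (not part of the patch; assumes the
build tree layout above):

   grep LIBONNXRUNTIME config.h
   # expect: #define CONFIG_LIBONNXRUNTIME 1
   export PATH="/c/onnxruntime/lib:$PATH"   # so ffmpeg.exe can locate onnxruntime.dll at runtime
   ./ffmpeg -hide_banner -h filter=dnn_processing
   # "onnx" should be listed among the dnn_backend values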

External dependencies:
- onnxruntime_c_api.h (header)
- libonnxruntime (library)


From 26780145b167a25b51f256ad752775ea81b4a89c Mon Sep 17 00:00:00 2001
From: stevxiao <[email protected]>
Date: Tue, 20 Jan 2026 15:31:40 -0500
Subject: [PATCH] avfilter/dnn: add ONNX Runtime backend with GPU execution
 provider support

This patch adds ONNX Runtime as a new DNN backend for FFmpeg's dnn_processing
filter, enabling hardware-accelerated neural network inference on multiple
GPU platforms.

Features:
- CPU execution provider (default)
- CUDA execution provider (NVIDIA GPUs)
- DirectML execution provider (AMD/Intel/NVIDIA GPUs on Windows)
- Configurable GPU device selection via gpu_device_id option
- Thread count configuration via num_threads option

New filter options for dnn_processing with dnn_backend=onnx:
- execution_provider: cpu, cuda, or dml (default: cpu)
- gpu_device_id: GPU device index (default: 0)
- num_threads: inference thread count (default: 0, auto)

Example usage:
  # CPU inference
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx" output.mp4

  # CUDA GPU inference
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx:execution_provider=cuda:gpu_device_id=0" output.mp4

  # DirectML GPU inference (Windows)
  ffmpeg -i input.mp4 -vf "dnn_processing=dnn_backend=onnx:model=model.onnx:execution_provider=dml:gpu_device_id=0" output.mp4

Build Instructions (MSYS2 MinGW):

1. Install MSYS2 from https://www.msys2.org/

2. Install build tools in MSYS2 MinGW 64-bit terminal:
   pacman -S mingw-w64-x86_64-gcc mingw-w64-x86_64-nasm make diffutils pkg-config

3. Download ONNX Runtime (GPU version):
   https://github.com/microsoft/onnxruntime/releases/download/v1.23.2/onnxruntime-win-x64-gpu-1.23.2.zip

4. Extract to C:\onnxruntime with structure:
   C:\onnxruntime\include\onnxruntime_c_api.h
   C:\onnxruntime\lib\onnxruntime.dll
   C:\onnxruntime\lib\onnxruntime.lib

5. Configure and build:
   ./configure \
       --enable-gpl \
       --enable-libonnxruntime \
       --extra-cflags="-I/c/onnxruntime/include -D_stdcall=__stdcall" \
       --extra-ldflags="-L/c/onnxruntime/lib" \
       --extra-libs="-lonnxruntime"
   make -j$(nproc)

Note: The -D_stdcall=__stdcall flag is required for MinGW GCC compatibility:
the ONNX Runtime headers use MSVC's _stdcall spelling, while GCC only accepts
__stdcall.

External dependencies:
- onnxruntime_c_api.h (header)
- libonnxruntime (library)
---
 configure                          |   5 +-
 libavfilter/dnn/Makefile           |   1 +
 libavfilter/dnn/dnn_backend_onnx.c | 851 +++++++++++++++++++++++++++++
 libavfilter/dnn/dnn_interface.c    |   4 +
 libavfilter/dnn_interface.h        |  13 +-
 libavfilter/vf_dnn_processing.c    |   5 +-
 6 files changed, 876 insertions(+), 3 deletions(-)
 create mode 100644 libavfilter/dnn/dnn_backend_onnx.c

diff --git a/configure b/configure
index 01edfacacc..53b53780be 100755
--- a/configure
+++ b/configure
@@ -289,6 +289,7 @@ External library support:
   --enable-libtls          enable LibreSSL (via libtls), needed for https 
support
                            if openssl, gnutls or mbedtls is not used [no]
   --enable-libtorch        enable Torch as one DNN backend [no]
+  --enable-libonnxruntime  enable ONNX Runtime as a DNN module backend [no]
   --enable-libtwolame      enable MP2 encoding via libtwolame [no]
   --enable-libuavs3d       enable AVS3 decoding via libuavs3d [no]
   --enable-libv4l2         enable libv4l2/v4l-utils [no]
@@ -2058,6 +2059,7 @@ EXTERNAL_LIBRARY_LIST="
     libtheora
     libtls
     libtorch
+    libonnxruntime
     libtwolame
     libuavs3d
     libv4l2
@@ -2978,7 +2980,7 @@ dirac_parse_select="golomb"
 dovi_rpudec_select="golomb"
 dovi_rpuenc_select="golomb"
 dnn_deps="avformat swscale"
-dnn_deps_any="libtensorflow libopenvino libtorch"
+dnn_deps_any="libtensorflow libopenvino libtorch libonnxruntime"
 error_resilience_select="me_cmp"
 evcparse_select="golomb"
 faandct_deps="faan"
@@ -7321,6 +7323,7 @@ enabled libtheora         && require libtheora 
theora/theoraenc.h th_info_init -
 enabled libtls            && require_pkg_config libtls libtls tls.h 
tls_configure &&
                              { enabled gpl && ! enabled nonfree && die "ERROR: 
LibreSSL is incompatible with the gpl"; }
 enabled libtorch          && check_cxxflags -std=c++17 && require_cxx libtorch 
torch/torch.h "torch::Tensor" -ltorch -lc10 -ltorch_cpu -lstdc++ -lpthread
+enabled libonnxruntime    && require libonnxruntime onnxruntime_c_api.h 
OrtGetApiBase -lonnxruntime
 enabled libtwolame        && require libtwolame twolame.h twolame_init 
-ltwolame &&
                              { check_lib libtwolame twolame.h 
twolame_encode_buffer_float32_interleaved -ltwolame ||
                                die "ERROR: libtwolame must be installed and 
version must be >= 0.3.10"; }
diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index 3d09927c98..7c5d7d8ab6 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -7,5 +7,6 @@ OBJS-$(CONFIG_DNN)                           += 
dnn/dnn_backend_common.o
 DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn/dnn_backend_tf.o
 DNN-OBJS-$(CONFIG_LIBOPENVINO)               += dnn/dnn_backend_openvino.o
 DNN-OBJS-$(CONFIG_LIBTORCH)                  += dnn/dnn_backend_torch.o
+DNN-OBJS-$(CONFIG_LIBONNXRUNTIME)            += dnn/dnn_backend_onnx.o
 
 OBJS-$(CONFIG_DNN)                           += $(DNN-OBJS-yes)
diff --git a/libavfilter/dnn/dnn_backend_onnx.c 
b/libavfilter/dnn/dnn_backend_onnx.c
new file mode 100644
index 0000000000..ce3c656fbc
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_onnx.c
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DNN ONNX Runtime backend implementation.
+ */
+
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/avstring.h"
+#include "../filters.h"
+#include "dnn_io_proc.h"
+#include "dnn_backend_common.h"
+#include "queue.h"
+#include "safe_queue.h"
+#include <onnxruntime_c_api.h>
+#include <string.h>
+
+typedef struct ONNXModel {
+    DNNModel model;
+    DnnContext *ctx;
+    OrtEnv *env;
+    OrtSession *session;
+    OrtSessionOptions *session_options;
+    OrtAllocator *allocator;
+    SafeQueue *request_queue;
+    Queue *task_queue;
+    Queue *lltask_queue;
+} ONNXModel;
+
+typedef struct ONNXInferRequest {
+    OrtValue *input_tensor;
+    OrtValue *output_tensor;
+} ONNXInferRequest;
+
+typedef struct ONNXRequestItem {
+    ONNXInferRequest *infer_request;
+    LastLevelTaskItem *lltask;
+    DNNAsyncExecModule exec_module;
+} ONNXRequestItem;
+
+#define OFFSET(x) offsetof(ONNXOptions, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM
+static const AVOption dnn_onnx_options[] = {
+    { "num_threads",        "number of threads for ONNX inference",            
       OFFSET(num_threads),        AV_OPT_TYPE_INT,    { .i64 = 0 },     0, 
INT_MAX, FLAGS },
+    { "execution_provider", "execution provider for ONNX inference (cpu, cuda, 
dml)", OFFSET(execution_provider), AV_OPT_TYPE_STRING, { .str = "cpu" }, 0, 0,  
     FLAGS },
+    { "gpu_device_id",      "GPU device ID for ONNX inference",                
       OFFSET(gpu_device_id),      AV_OPT_TYPE_INT,    { .i64 = 0 },     0, 
INT_MAX, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(dnn_onnx);
+
+static const OrtApi *g_ort = NULL;
+
+#define ORT_ABORT_ON_ERROR(expr)                                \
+    do {                                                        \
+        OrtStatus *status = (expr);                             \
+        if (status != NULL) {                                   \
+            const char *msg = g_ort->GetErrorMessage(status);   \
+            av_log(ctx, AV_LOG_ERROR, "ONNX Runtime error: %s\n", msg); \
+            g_ort->ReleaseStatus(status);                       \
+            goto err;                                           \
+        }                                                       \
+    } while (0)
+
+static int extract_lltask_from_task(TaskItem *task, Queue *lltask_queue)
+{
+    ONNXModel     *onnx_model = (ONNXModel *)task->model;
+    DnnContext           *ctx = onnx_model->ctx;
+    LastLevelTaskItem *lltask = av_malloc(sizeof(*lltask));
+
+    if (!lltask) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to allocate memory for 
LastLevelTaskItem\n");
+        return AVERROR(ENOMEM);
+    }
+    task->inference_todo = 1;
+    task->inference_done = 0;
+    lltask->task = task;
+    if (ff_queue_push_back(lltask_queue, lltask) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to push back lltask_queue.\n");
+        av_freep(&lltask);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
+}
+
+static void onnx_free_request(ONNXInferRequest *request)
+{
+    if (!request)
+        return;
+    if (request->input_tensor) {
+        g_ort->ReleaseValue(request->input_tensor);
+        request->input_tensor = NULL;
+    }
+    if (request->output_tensor) {
+        g_ort->ReleaseValue(request->output_tensor);
+        request->output_tensor = NULL;
+    }
+}
+
+static inline void destroy_request_item(ONNXRequestItem **arg)
+{
+    ONNXRequestItem *item;
+    if (!arg || !*arg)
+        return;
+    item = *arg;
+    onnx_free_request(item->infer_request);
+    av_freep(&item->infer_request);
+    av_freep(&item->lltask);
+    ff_dnn_async_module_cleanup(&item->exec_module);
+    av_freep(arg);
+}
+
+static void dnn_free_model_onnx(DNNModel **model)
+{
+    ONNXModel *onnx_model;
+    if (!model || !*model)
+        return;
+
+    onnx_model = (ONNXModel *)(*model);
+    
+    while (ff_safe_queue_size(onnx_model->request_queue) != 0) {
+        ONNXRequestItem *item = (ONNXRequestItem 
*)ff_safe_queue_pop_front(onnx_model->request_queue);
+        destroy_request_item(&item);
+    }
+    ff_safe_queue_destroy(onnx_model->request_queue);
+
+    while (ff_queue_size(onnx_model->lltask_queue) != 0) {
+        LastLevelTaskItem *item = (LastLevelTaskItem 
*)ff_queue_pop_front(onnx_model->lltask_queue);
+        av_freep(&item);
+    }
+    ff_queue_destroy(onnx_model->lltask_queue);
+
+    while (ff_queue_size(onnx_model->task_queue) != 0) {
+        TaskItem *item = (TaskItem 
*)ff_queue_pop_front(onnx_model->task_queue);
+        av_frame_free(&item->in_frame);
+        av_frame_free(&item->out_frame);
+        av_freep(&item);
+    }
+    ff_queue_destroy(onnx_model->task_queue);
+
+    if (onnx_model->session)
+        g_ort->ReleaseSession(onnx_model->session);
+    if (onnx_model->session_options)
+        g_ort->ReleaseSessionOptions(onnx_model->session_options);
+    if (onnx_model->env)
+        g_ort->ReleaseEnv(onnx_model->env);
+
+    av_freep(&onnx_model);
+    *model = NULL;
+}
+
+static int get_input_onnx(DNNModel *model, DNNData *input, const char 
*input_name)
+{
+    ONNXModel  *onnx_model = (ONNXModel *)model;
+    DnnContext        *ctx = onnx_model->ctx;
+    OrtTypeInfo *type_info = NULL;
+    const OrtTensorTypeAndShapeInfo *tensor_info = NULL;
+    size_t num_dims;
+    int64_t *dims;
+    ONNXTensorElementDataType tensor_type;
+    OrtStatus *status;
+
+    status = g_ort->SessionGetInputTypeInfo(onnx_model->session, 0, 
&type_info);
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "Failed to get input type info: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        return AVERROR(EINVAL);
+    }
+
+    status = g_ort->CastTypeInfoToTensorInfo(type_info, &tensor_info);
+    if (status != NULL) {
+        g_ort->ReleaseTypeInfo(type_info);
+        g_ort->ReleaseStatus(status);
+        return AVERROR(EINVAL);
+    }
+
+    status = g_ort->GetDimensionsCount(tensor_info, &num_dims);
+    if (status != NULL) {
+        g_ort->ReleaseTypeInfo(type_info);
+        g_ort->ReleaseStatus(status);
+        return AVERROR(EINVAL);
+    }
+
+    dims = av_malloc(num_dims * sizeof(int64_t));
+    if (!dims) {
+        g_ort->ReleaseTypeInfo(type_info);
+        return AVERROR(ENOMEM);
+    }
+
+    g_ort->GetDimensions(tensor_info, dims, num_dims);
+    g_ort->GetTensorElementType(tensor_info, &tensor_type);
+    
+    // Assume NCHW layout for now
+    input->layout = DL_NCHW;
+    input->dims[0] = dims[0] > 0 ? dims[0] : 1;
+    input->dims[1] = dims[1] > 0 ? dims[1] : 3;
+    input->dims[2] = dims[2] > 0 ? dims[2] : -1;
+    input->dims[3] = dims[3] > 0 ? dims[3] : -1;
+    
+    if (tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+        input->dt = DNN_FLOAT;
+    } else if (tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8) {
+        input->dt = DNN_UINT8;
+    } else {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported tensor data type\n");
+        av_free(dims);
+        g_ort->ReleaseTypeInfo(type_info);
+        return AVERROR(ENOSYS);
+    }
+
+    input->order = DCO_RGB;
+    av_free(dims);
+    g_ort->ReleaseTypeInfo(type_info);
+    return 0;
+}
+
+static int fill_model_input_onnx(ONNXModel *onnx_model, ONNXRequestItem 
*request)
+{
+    LastLevelTaskItem       *lltask = NULL;
+    TaskItem                  *task = NULL;
+    ONNXInferRequest *infer_request = NULL;
+    DNNData                   input = { 0 };
+    DnnContext                 *ctx = onnx_model->ctx;
+    int ret, width_idx, height_idx, channel_idx;
+    int64_t input_shape[4];
+    size_t input_tensor_size;
+    OrtMemoryInfo *memory_info;
+    OrtStatus *status;
+
+    lltask = (LastLevelTaskItem *)ff_queue_pop_front(onnx_model->lltask_queue);
+    if (!lltask) {
+        ret = AVERROR(EINVAL);
+        goto err;
+    }
+    request->lltask = lltask;
+    task = lltask->task;
+    infer_request = request->infer_request;
+
+    ret = get_input_onnx(&onnx_model->model, &input, NULL);
+    if (ret != 0) {
+        goto err;
+    }
+
+    width_idx   = dnn_get_width_idx_by_layout(input.layout);
+    height_idx  = dnn_get_height_idx_by_layout(input.layout);
+    channel_idx = dnn_get_channel_idx_by_layout(input.layout);
+    
+    input.dims[height_idx] = task->in_frame->height;
+    input.dims[width_idx]  = task->in_frame->width;
+    
+    input_shape[0] = input.dims[0];
+    input_shape[1] = input.dims[channel_idx];
+    input_shape[2] = input.dims[height_idx];
+    input_shape[3] = input.dims[width_idx];
+
+    input_tensor_size = input_shape[0] * input_shape[1] * input_shape[2] * 
input_shape[3];
+    
+    if (input.dt == DNN_FLOAT) {
+        input_tensor_size *= sizeof(float);
+    } else {
+        input_tensor_size *= sizeof(uint8_t);
+    }
+
+    input.data = av_malloc(input_tensor_size);
+    if (!input.data) {
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+
+    switch (onnx_model->model.func_type) {
+    case DFT_PROCESS_FRAME:
+        if (input.dt == DNN_FLOAT) {
+            input.scale = 255;
+        }
+        if (task->do_ioproc) {
+            if (onnx_model->model.frame_pre_proc != NULL) {
+                onnx_model->model.frame_pre_proc(task->in_frame, &input, 
onnx_model->model.filter_ctx);
+            } else {
+                ff_proc_from_frame_to_dnn(task->in_frame, &input, ctx);
+            }
+        }
+        break;
+    case DFT_ANALYTICS_DETECT:
+        ff_frame_to_dnn_detect(task->in_frame, &input, ctx);
+        break;
+    default:
+        avpriv_report_missing_feature(ctx, "model function type %d", 
onnx_model->model.func_type);
+        av_freep(&input.data);
+        ret = AVERROR(ENOSYS);
+        goto err;
+    }
+
+    status = g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, 
&memory_info);
+    if (status != NULL) {
+        av_freep(&input.data);
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+
+    if (input.dt == DNN_FLOAT) {
+        status = g_ort->CreateTensorWithDataAsOrtValue(
+            memory_info, input.data, input_tensor_size,
+            input_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+            &infer_request->input_tensor);
+    } else {
+        status = g_ort->CreateTensorWithDataAsOrtValue(
+            memory_info, input.data, input_tensor_size,
+            input_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8,
+            &infer_request->input_tensor);
+    }
+
+    g_ort->ReleaseMemoryInfo(memory_info);
+
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "Failed to create input tensor: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        av_freep(&input.data);
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+
+    return 0;
+
+err:
+    onnx_free_request(infer_request);
+    return ret;
+}
+
+static int onnx_start_inference(void *args)
+{
+    ONNXRequestItem        *request = (ONNXRequestItem *)args;
+    ONNXInferRequest *infer_request = NULL;
+    LastLevelTaskItem       *lltask = NULL;
+    TaskItem                  *task = NULL;
+    ONNXModel           *onnx_model = NULL;
+    DnnContext                 *ctx = NULL;
+    OrtStatus *status;
+    const char  *input_names[] = {"input"};
+    const char *output_names[] = {"output"};
+
+    if (!request) {
+        av_log(NULL, AV_LOG_ERROR, "ONNXRequestItem is NULL\n");
+        return AVERROR(EINVAL);
+    }
+
+    infer_request = request->infer_request;
+    lltask = request->lltask;
+    task = lltask->task;
+    onnx_model = (ONNXModel *)task->model;
+    ctx = onnx_model->ctx;
+
+    if (!infer_request->input_tensor) {
+        av_log(ctx, AV_LOG_ERROR, "Input tensor is NULL\n");
+        return DNN_GENERIC_ERROR;
+    }
+
+    status = g_ort->Run(onnx_model->session, NULL,
+                        input_names, (const OrtValue *const 
*)&infer_request->input_tensor, 1,
+                        output_names, 1, &infer_request->output_tensor);
+
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "ONNX inference failed: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        return DNN_GENERIC_ERROR;
+    }
+
+    return 0;
+}
+
+static void infer_completion_callback(void *args)
+{
+    ONNXRequestItem  *request = (ONNXRequestItem *)args;
+    LastLevelTaskItem *lltask = request->lltask;
+    TaskItem            *task = lltask->task;
+    DNNData           outputs = { 0 };
+    ONNXInferRequest *infer_request = request->infer_request;
+    ONNXModel           *onnx_model = (ONNXModel *)task->model;
+    DnnContext                 *ctx = onnx_model->ctx;
+    OrtTensorTypeAndShapeInfo *tensor_info;
+    size_t num_dims;
+    int64_t *dims;
+    void *output_data;
+    OrtStatus *status;
+
+    if (!infer_request->output_tensor) {
+        av_log(ctx, AV_LOG_ERROR, "Output tensor is NULL\n");
+        goto err;
+    }
+
+    status = g_ort->GetTensorTypeAndShape(infer_request->output_tensor, 
&tensor_info);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get output tensor info\n");
+        goto err;
+    }
+
+    g_ort->GetDimensionsCount(tensor_info, &num_dims);
+    dims = av_malloc(num_dims * sizeof(int64_t));
+    g_ort->GetDimensions(tensor_info, dims, num_dims);
+
+    outputs.layout = DL_NCHW;
+    outputs.order = DCO_RGB;
+    outputs.dt = DNN_FLOAT;
+
+    if (num_dims == 4) {
+        outputs.dims[0] = dims[0];
+        outputs.dims[1] = dims[1];
+        outputs.dims[2] = dims[2];
+        outputs.dims[3] = dims[3];
+    } else {
+        avpriv_report_missing_feature(ctx, "Support for %zu dimensional 
output", num_dims);
+        av_free(dims);
+        g_ort->ReleaseTensorTypeAndShapeInfo(tensor_info);
+        goto err;
+    }
+
+    status = g_ort->GetTensorMutableData(infer_request->output_tensor, 
&output_data);
+    if (status != NULL) {
+        av_free(dims);
+        g_ort->ReleaseTensorTypeAndShapeInfo(tensor_info);
+        goto err;
+    }
+
+    outputs.data = output_data;
+
+    switch (onnx_model->model.func_type) {
+    case DFT_PROCESS_FRAME:
+        if (task->do_ioproc) {
+            outputs.scale = 255;
+            if (onnx_model->model.frame_post_proc != NULL) {
+                onnx_model->model.frame_post_proc(task->out_frame, &outputs, 
onnx_model->model.filter_ctx);
+            } else {
+                ff_proc_from_dnn_to_frame(task->out_frame, &outputs, ctx);
+            }
+        } else {
+            task->out_frame->width = 
outputs.dims[dnn_get_width_idx_by_layout(outputs.layout)];
+            task->out_frame->height = 
outputs.dims[dnn_get_height_idx_by_layout(outputs.layout)];
+        }
+        break;
+    default:
+        avpriv_report_missing_feature(ctx, "model function type %d", 
onnx_model->model.func_type);
+        av_free(dims);
+        g_ort->ReleaseTensorTypeAndShapeInfo(tensor_info);
+        goto err;
+    }
+
+    av_free(dims);
+    g_ort->ReleaseTensorTypeAndShapeInfo(tensor_info);
+    task->inference_done++;
+
+err:
+    onnx_free_request(infer_request);
+    if (ff_safe_queue_push_back(onnx_model->request_queue, request) < 0) {
+        destroy_request_item(&request);
+        av_log(ctx, AV_LOG_ERROR, "Unable to push back request_queue.\n");
+    }
+}
+
+static int execute_model_onnx(ONNXRequestItem *request, Queue *lltask_queue)
+{
+    ONNXModel *onnx_model = NULL;
+    LastLevelTaskItem *lltask;
+    TaskItem *task = NULL;
+    int ret = 0;
+
+    if (ff_queue_size(lltask_queue) == 0) {
+        destroy_request_item(&request);
+        return 0;
+    }
+
+    lltask = (LastLevelTaskItem *)ff_queue_peek_front(lltask_queue);
+    if (lltask == NULL) {
+        av_log(NULL, AV_LOG_ERROR, "Failed to get LastLevelTaskItem\n");
+        ret = AVERROR(EINVAL);
+        goto err;
+    }
+    task = lltask->task;
+    onnx_model = (ONNXModel *)task->model;
+
+    ret = fill_model_input_onnx(onnx_model, request);
+    if (ret != 0) {
+        goto err;
+    }
+
+    if (task->async) {
+        avpriv_report_missing_feature(onnx_model->ctx, "ONNX async inference");
+        ret = AVERROR(ENOSYS);
+        goto err;
+    } else {
+        ret = onnx_start_inference((void *)request);
+        if (ret != 0) {
+            goto err;
+        }
+        infer_completion_callback(request);
+        return (task->inference_done == task->inference_todo) ? 0 : 
DNN_GENERIC_ERROR;
+    }
+
+err:
+    onnx_free_request(request->infer_request);
+    if (ff_safe_queue_push_back(onnx_model->request_queue, request) < 0) {
+        destroy_request_item(&request);
+    }
+    return ret;
+}
+
+static int get_output_onnx(DNNModel *model, const char *input_name, int 
input_width, int input_height,
+                           const char *output_name, int *output_width, int 
*output_height)
+{
+    int ret = 0;
+    ONNXModel    *onnx_model = (ONNXModel *)model;
+    DnnContext          *ctx = onnx_model->ctx;
+    TaskItem            task = { 0 };
+    ONNXRequestItem *request = NULL;
+    DNNExecBaseParams exec_params = {
+        .input_name   = input_name,
+        .output_names = &output_name,
+        .nb_output    = 1,
+        .in_frame     = NULL,
+        .out_frame    = NULL,
+    };
+
+    ret = ff_dnn_fill_gettingoutput_task(&task, &exec_params, onnx_model, 
input_height, input_width, ctx);
+    if (ret != 0) {
+        goto err;
+    }
+
+    ret = extract_lltask_from_task(&task, onnx_model->lltask_queue);
+    if (ret != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to extract last level task from 
task.\n");
+        goto err;
+    }
+
+    request = (ONNXRequestItem 
*)ff_safe_queue_pop_front(onnx_model->request_queue);
+    if (!request) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to get infer request.\n");
+        ret = AVERROR(EINVAL);
+        goto err;
+    }
+
+    ret = execute_model_onnx(request, onnx_model->lltask_queue);
+    *output_width = task.out_frame->width;
+    *output_height = task.out_frame->height;
+
+err:
+    av_frame_free(&task.out_frame);
+    av_frame_free(&task.in_frame);
+    return ret;
+}
+
+static ONNXInferRequest *onnx_create_inference_request(void)
+{
+    ONNXInferRequest *request = av_malloc(sizeof(ONNXInferRequest));
+    if (!request)
+        return NULL;
+    request->input_tensor = NULL;
+    request->output_tensor = NULL;
+    return request;
+}
+
+static DNNModel *dnn_load_model_onnx(DnnContext *ctx, DNNFunctionType 
func_type, AVFilterContext *filter_ctx)
+{
+    DNNModel       *model = NULL;
+    ONNXModel *onnx_model = NULL;
+    ONNXRequestItem *item = NULL;
+    ONNXOptions  *options = &ctx->onnx_option;
+    OrtStatus *status;
+
+    if (!g_ort) {
+        g_ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
+        if (!g_ort) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get ONNX Runtime API\n");
+            return NULL;
+        }
+    }
+
+    onnx_model = av_mallocz(sizeof(ONNXModel));
+    if (!onnx_model)
+        return NULL;
+
+    model = &onnx_model->model;
+    onnx_model->ctx = ctx;
+
+    // Create ONNX Runtime environment
+    status = g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "FFmpeg", 
&onnx_model->env);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to create ONNX Runtime 
environment\n");
+        goto fail;
+    }
+
+    // Create session options
+    status = g_ort->CreateSessionOptions(&onnx_model->session_options);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to create session options\n");
+        goto fail;
+    }
+
+    // Set number of threads
+    if (options->num_threads > 0) {
+        g_ort->SetIntraOpNumThreads(onnx_model->session_options, 
options->num_threads);
+    }
+
+    // Set graph optimization level
+    g_ort->SetSessionGraphOptimizationLevel(onnx_model->session_options, 
ORT_ENABLE_ALL);
+
+    // Configure execution provider based on user selection
+    if (options->execution_provider && 
av_strcasecmp(options->execution_provider, "cpu") != 0) {
+        // GPU execution providers
+        if (av_strcasecmp(options->execution_provider, "cuda") == 0) {
+            // CUDA execution provider for NVIDIA GPUs
+            if (g_ort->SessionOptionsAppendExecutionProvider_CUDA) {
+                OrtCUDAProviderOptions cuda_options;
+                memset(&cuda_options, 0, sizeof(cuda_options));
+                cuda_options.device_id = options->gpu_device_id;
+                
+                status = g_ort->SessionOptionsAppendExecutionProvider_CUDA(
+                    onnx_model->session_options, &cuda_options);
+                if (status != NULL) {
+                    const char *msg = g_ort->GetErrorMessage(status);
+                    av_log(ctx, AV_LOG_WARNING, "Failed to enable CUDA (device 
%d): %s. Falling back to CPU\n", 
+                           options->gpu_device_id, msg);
+                    g_ort->ReleaseStatus(status);
+                } else {
+                    av_log(ctx, AV_LOG_INFO, "Using CUDA execution provider on 
device %d\n", options->gpu_device_id);
+                }
+            } else {
+                av_log(ctx, AV_LOG_WARNING, "CUDA provider function not 
available in this ONNX Runtime API version. Falling back to CPU\n");
+            }
+        } else if (av_strcasecmp(options->execution_provider, "dml") == 0) {
+            // DirectML execution provider for AMD/Intel/NVIDIA GPUs on Windows
+#ifdef _WIN32
+            // Use generic SessionOptionsAppendExecutionProvider with "DML" 
provider name
+            const char* dml_options_keys[] = {"device_id"};
+            const char* dml_options_values[] = {NULL};
+            char device_id_str[32];
+            snprintf(device_id_str, sizeof(device_id_str), "%d", 
options->gpu_device_id);
+            dml_options_values[0] = device_id_str;
+            
+            if (g_ort->SessionOptionsAppendExecutionProvider) {
+                status = g_ort->SessionOptionsAppendExecutionProvider(
+                    onnx_model->session_options, "DML",
+                    dml_options_keys, dml_options_values, 1);
+                if (status != NULL) {
+                    const char *msg = g_ort->GetErrorMessage(status);
+                    av_log(ctx, AV_LOG_WARNING, "Failed to enable DirectML 
(device %d): %s. Falling back to CPU\n", 
+                           options->gpu_device_id, msg);
+                    g_ort->ReleaseStatus(status);
+                } else {
+                    av_log(ctx, AV_LOG_INFO, "Using DirectML execution 
provider on device %d\n", options->gpu_device_id);
+                }
+            } else {
+                av_log(ctx, AV_LOG_WARNING, "DirectML provider function not 
available in this ONNX Runtime API version. Falling back to CPU\n");
+            }
+#else
+            av_log(ctx, AV_LOG_WARNING, "DirectML is only available on 
Windows. Falling back to CPU\n");
+#endif
+        } else {
+            av_log(ctx, AV_LOG_WARNING, "Unknown execution provider '%s'. 
Supported: cpu, cuda, dml. Using CPU\n", 
+                   options->execution_provider);
+        }
+    } else {
+        av_log(ctx, AV_LOG_INFO, "Using CPU execution provider\n");
+    }
+
+    // Create session
+#ifdef _WIN32
+    {
+        wchar_t *wfilename;
+        int wlen = MultiByteToWideChar(CP_UTF8, 0, ctx->model_filename, -1, 
NULL, 0);
+        wfilename = av_malloc(wlen * sizeof(wchar_t));
+        if (!wfilename) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to allocate memory for wide 
filename\n");
+            goto fail;
+        }
+        MultiByteToWideChar(CP_UTF8, 0, ctx->model_filename, -1, wfilename, 
wlen);
+        status = g_ort->CreateSession(onnx_model->env, wfilename, 
+                                      onnx_model->session_options, 
&onnx_model->session);
+        av_free(wfilename);
+    }
+#else
+    status = g_ort->CreateSession(onnx_model->env, ctx->model_filename, 
+                                  onnx_model->session_options, 
&onnx_model->session);
+#endif
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "Failed to create ONNX session: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        goto fail;
+    }
+
+    // Get allocator
+    status = g_ort->GetAllocatorWithDefaultOptions(&onnx_model->allocator);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get allocator\n");
+        goto fail;
+    }
+
+    // Create request queue
+    onnx_model->request_queue = ff_safe_queue_create();
+    if (!onnx_model->request_queue) {
+        goto fail;
+    }
+
+    // Create and add initial request item
+    item = av_mallocz(sizeof(ONNXRequestItem));
+    if (!item) {
+        goto fail;
+    }
+    item->lltask = NULL;
+    item->infer_request = onnx_create_inference_request();
+    if (!item->infer_request) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to allocate memory for ONNX 
inference request\n");
+        goto fail;
+    }
+    item->exec_module.start_inference = &onnx_start_inference;
+    item->exec_module.callback = &infer_completion_callback;
+    item->exec_module.args = item;
+
+    if (ff_safe_queue_push_back(onnx_model->request_queue, item) < 0) {
+        goto fail;
+    }
+    item = NULL;
+
+    // Create task queues
+    onnx_model->task_queue = ff_queue_create();
+    if (!onnx_model->task_queue) {
+        goto fail;
+    }
+
+    onnx_model->lltask_queue = ff_queue_create();
+    if (!onnx_model->lltask_queue) {
+        goto fail;
+    }
+
+    model->get_input  = &get_input_onnx;
+    model->get_output = &get_output_onnx;
+    model->filter_ctx = filter_ctx;
+    model->func_type  = func_type;
+
+    return model;
+
+fail:
+    if (item) {
+        destroy_request_item(&item);
+    }
+    dnn_free_model_onnx(&model);
+    return NULL;
+}
+
+static int dnn_execute_model_onnx(const DNNModel *model, DNNExecBaseParams 
*exec_params)
+{
+    ONNXModel *onnx_model = (ONNXModel *)model;
+    DnnContext *ctx = onnx_model->ctx;
+    TaskItem *task;
+    ONNXRequestItem *request;
+    int ret = 0;
+
+    ret = ff_check_exec_params(ctx, DNN_ONNX, model->func_type, exec_params);
+    if (ret != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Exec parameter checking failed.\n");
+        return ret;
+    }
+
+    task = av_malloc(sizeof(TaskItem));
+    if (!task) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to alloc memory for task item.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    ret = ff_dnn_fill_task(task, exec_params, onnx_model, 0, 1);
+    if (ret != 0) {
+        av_freep(&task);
+        av_log(ctx, AV_LOG_ERROR, "Unable to fill task.\n");
+        return ret;
+    }
+
+    ret = ff_queue_push_back(onnx_model->task_queue, task);
+    if (ret < 0) {
+        av_freep(&task);
+        av_log(ctx, AV_LOG_ERROR, "Unable to push back task_queue.\n");
+        return ret;
+    }
+
+    ret = extract_lltask_from_task(task, onnx_model->lltask_queue);
+    if (ret != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to extract last level task from 
task.\n");
+        return ret;
+    }
+
+    request = (ONNXRequestItem 
*)ff_safe_queue_pop_front(onnx_model->request_queue);
+    if (!request) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to get infer request.\n");
+        return AVERROR(EINVAL);
+    }
+
+    return execute_model_onnx(request, onnx_model->lltask_queue);
+}
+
+static DNNAsyncStatusType dnn_get_result_onnx(const DNNModel *model, AVFrame 
**in, AVFrame **out)
+{
+    ONNXModel *onnx_model = (ONNXModel *)model;
+    return ff_dnn_get_result_common(onnx_model->task_queue, in, out);
+}
+
+static int dnn_flush_onnx(const DNNModel *model)
+{
+    ONNXModel *onnx_model = (ONNXModel *)model;
+    ONNXRequestItem *request;
+
+    if (ff_queue_size(onnx_model->lltask_queue) == 0)
+        return 0;
+
+    request = (ONNXRequestItem 
*)ff_safe_queue_pop_front(onnx_model->request_queue);
+    if (!request) {
+        av_log(onnx_model->ctx, AV_LOG_ERROR, "Unable to get infer 
request.\n");
+        return AVERROR(EINVAL);
+    }
+
+    return execute_model_onnx(request, onnx_model->lltask_queue);
+}
+
+const DNNModule ff_dnn_backend_onnx = {
+    .clazz = DNN_DEFINE_CLASS(dnn_onnx),
+    .type = DNN_ONNX,
+    .load_model = dnn_load_model_onnx,
+    .execute_model = dnn_execute_model_onnx,
+    .get_result = dnn_get_result_onnx,
+    .flush = dnn_flush_onnx,
+    .free_model = dnn_free_model_onnx,
+};
diff --git a/libavfilter/dnn/dnn_interface.c b/libavfilter/dnn/dnn_interface.c
index c4e410756b..31f22e26b0 100644
--- a/libavfilter/dnn/dnn_interface.c
+++ b/libavfilter/dnn/dnn_interface.c
@@ -33,6 +33,7 @@
 extern const DNNModule ff_dnn_backend_openvino;
 extern const DNNModule ff_dnn_backend_tf;
 extern const DNNModule ff_dnn_backend_torch;
+extern const DNNModule ff_dnn_backend_onnx;
 
 #define OFFSET(x) offsetof(DnnContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM
@@ -78,6 +79,9 @@ static const DnnBackendInfo dnn_backend_info_list[] = {
 #if CONFIG_LIBTORCH
         {offsetof(DnnContext, torch_option), .module = &ff_dnn_backend_torch},
 #endif
+#if CONFIG_LIBONNXRUNTIME
+        {offsetof(DnnContext, onnx_option), .module = &ff_dnn_backend_onnx},
+#endif
 };
 
 const DNNModule *ff_get_dnn_module(DNNBackendType backend_type, void *log_ctx)
diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
index 66086409be..6aede1f37c 100644
--- a/libavfilter/dnn_interface.h
+++ b/libavfilter/dnn_interface.h
@@ -35,7 +35,8 @@
 typedef enum {
     DNN_TF = 1,
     DNN_OV = 1 << 1,
-    DNN_TH = 1 << 2
+    DNN_TH = 1 << 2,
+    DNN_ONNX = 1 << 3
 } DNNBackendType;
 
 typedef enum {DNN_FLOAT = 1, DNN_UINT8 = 4} DNNDataType;
@@ -138,6 +139,13 @@ typedef struct THOptions {
     int optimize;
 } THOptions;
 
+typedef struct ONNXOptions {
+    const AVClass *clazz;
+    int num_threads;
+    char *execution_provider;  // "cpu", "cuda", "dml" (DirectML)
+    int gpu_device_id;
+} ONNXOptions;
+
 typedef struct DNNModule DNNModule;
 
 typedef struct DnnContext {
@@ -169,6 +177,9 @@ typedef struct DnnContext {
 #if CONFIG_LIBTORCH
     THOptions torch_option;
 #endif
+#if CONFIG_LIBONNXRUNTIME
+    ONNXOptions onnx_option;
+#endif
 } DnnContext;
 
 // Stores pointers to functions for loading, executing, freeing DNN models for 
one of the backends.
diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
index 0771ceb5fc..7ffa700cc5 100644
--- a/libavfilter/vf_dnn_processing.c
+++ b/libavfilter/vf_dnn_processing.c
@@ -52,11 +52,14 @@ static const AVOption dnn_processing_options[] = {
 #endif
 #if (CONFIG_LIBTORCH == 1)
     { "torch",       "torch backend flag",         0,                        
AV_OPT_TYPE_CONST,     { .i64 = DNN_TH },    0, 0, FLAGS, "backend" },
+#endif
+#if (CONFIG_LIBONNXRUNTIME == 1)
+    { "onnx",        "onnx backend flag",          0,                        
AV_OPT_TYPE_CONST,     { .i64 = DNN_ONNX },  0, 0, FLAGS, "backend" },
 #endif
     { NULL }
 };
 
-AVFILTER_DNN_DEFINE_CLASS(dnn_processing, DNN_TF | DNN_OV | DNN_TH);
+AVFILTER_DNN_DEFINE_CLASS(dnn_processing, DNN_TF | DNN_OV | DNN_TH | DNN_ONNX);
 
 static av_cold int init(AVFilterContext *context)
 {
-- 
2.52.0
