[FFmpeg-cvslog] [ffmpeg] avfilter/dnn: add ONNX Runtime backend with GPU execution provider support (branch master)

stevxiao via ffmpeg-cvslog Mon, 22 Jun 2026 05:08:37 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


The following commit(s) were added to refs/heads/master by this push:
     new a2856b3c30 avfilter/dnn: add ONNX Runtime backend with GPU execution 
provider support
a2856b3c30 is described below

commit a2856b3c3084ea45ca5b94a5daded25f85c41dac
Author:     stevxiao <[email protected]>
AuthorDate: Tue Jan 20 15:31:40 2026 -0500
Commit:     Guo Yejun <[email protected]>
CommitDate: Mon Jun 22 20:06:40 2026 +0800

    avfilter/dnn: add ONNX Runtime backend with GPU execution provider support
    
    This patch adds ONNX Runtime as a new DNN backend for FFmpeg's 
dnn_processing
    filter, enabling hardware-accelerated neural network inference on multiple
    GPU and NPU platforms.
    
    Execution Providers Supported:
    - CPU execution provider (default)
    - CUDA execution provider (NVIDIA GPUs)
    - DirectML execution provider (AMD/Intel/NVIDIA GPUs on Windows)
    - VitisAI execution provider (AMD Ryzen AI NPU)
    
    The options for dnn_processing with dnn_backend=onnx:
    - device: execution provider — cpu, cuda, dml, or vitisai (default: cpu)
    - device_id: GPU device index (default: 0)
    - threads_per_operation: inference thread count for CPU EP (default: 0, 
auto)
    - input: input tensor name. When omitted, the backend resolves it from the
      loaded session
    - output: output tensor name. When omitted, the backend resolves it from the
      loaded session
    
    Example usage:
      # CPU inference
      ffmpeg -i input.mp4 -vf 
"format=rgb24,dnn_processing=dnn_backend=onnx:model=model.onnx:input=image_in:output=image_out"
 output.mp4
    
      # CUDA GPU inference
      ffmpeg -i input.mp4 -vf 
"dnn_processing=dnn_backend=onnx:model=model.onnx:device=cuda:device_id=0" 
output.mp4
    
      # DirectML GPU inference (Windows)
      ffmpeg -i input.mp4 -vf 
"dnn_processing=dnn_backend=onnx:model=model.onnx:device=dml:device_id=0" 
output.mp4
    
      # VitisAI NPU inference
      ffmpeg -i input.mp4 -vf 
"dnn_processing=dnn_backend=onnx:model=model.onnx:device=vitisai" output.mp4
    
      Note: depending on the model, you may need a format filter (e.g. 
format=rgb24 or format=grayf32) before dnn_processing to convert the frames to 
the pixel format the model's input tensor expects.
    
    Signed-off-by: younengxiao <[email protected]>
    Reviewed-by: Guo Yejun <[email protected]>
---
 Changelog                          |    1 +
 configure                          |    5 +-
 doc/filters.texi                   |   46 +-
 doc/general_contents.texi          |   33 ++
 libavfilter/dnn/Makefile           |    1 +
 libavfilter/dnn/dnn_backend_onnx.c | 1104 ++++++++++++++++++++++++++++++++++++
 libavfilter/dnn/dnn_interface.c    |    8 +
 libavfilter/dnn_filter_common.c    |   25 +-
 libavfilter/dnn_interface.h        |   14 +-
 libavfilter/vf_dnn_processing.c    |    5 +-
 10 files changed, 1234 insertions(+), 8 deletions(-)

diff --git a/Changelog b/Changelog
index 2ad3ee255f..8cfac4c1a7 100644
--- a/Changelog
+++ b/Changelog
@@ -18,6 +18,7 @@ version <next>:
 - Remove ogg/celt parsing
 - Bitstream filter to split Dolby Vision multi-layer HEVC
 - Add AMF hardware memory mapping support.
+- ONNX Runtime DNN backend with GPU execution provider support
 
 
 version 8.1:
diff --git a/configure b/configure
index a6bbb86807..c8a0eac79b 100755
--- a/configure
+++ b/configure
@@ -253,6 +253,7 @@ External library support:
   --enable-libmp3lame      enable MP3 encoding via libmp3lame [no]
   --enable-libmpeghdec     enable MPEG-H 3DA decoding via libmpeghdec [no]
   --enable-liboapv         enable APV encoding via liboapv [no]
+  --enable-libonnxruntime  enable ONNX Runtime as a DNN module backend [no]
   --enable-libopencore-amrnb enable AMR-NB de/encoding via libopencore-amrnb 
[no]
   --enable-libopencore-amrwb enable AMR-WB decoding via libopencore-amrwb [no]
   --enable-libopencv       enable video filtering via libopencv [no]
@@ -2108,6 +2109,7 @@ EXTERNAL_LIBRARY_LIST="
     libmp3lame
     libmysofa
     liboapv
+    libonnxruntime
     libopencv
     libopencolorio
     libopenh264
@@ -3089,7 +3091,7 @@ dirac_parse_select="golomb"
 dovi_rpudec_select="golomb"
 dovi_rpuenc_select="golomb"
 dnn_deps="avformat swscale"
-dnn_deps_any="libtensorflow libopenvino libtorch"
+dnn_deps_any="libtensorflow libopenvino libtorch libonnxruntime"
 error_resilience_select="me_cmp"
 evcparse_select="golomb"
 faandct_deps="faan"
@@ -7440,6 +7442,7 @@ enabled libnpp            && { test_cpp_condition "$(cd 
"$source_path"; pwd)/lib
                                die "ERROR: libnpp not found"; } &&
                              { check_func_headers "nppi.h" 
nppiYCbCr420_8u_P2P3R $libnpp_extralibs ||
                                die "ERROR: libnpp support is deprecated, 
version 13.0 and up are not supported"; }
+enabled libonnxruntime    && require libonnxruntime onnxruntime_c_api.h 
OrtGetApiBase -lonnxruntime
 enabled libopencore_amrnb && { check_pkg_config libopencore_amrnb 
opencore-amrnb opencore-amrnb/interf_dec.h Decoder_Interface_init ||
                                require libopencore_amrnb 
opencore-amrnb/interf_dec.h Decoder_Interface_init -lopencore-amrnb; }
 enabled libopencore_amrwb && { check_pkg_config libopencore_amrwb 
opencore-amrwb opencore-amrwb/dec_if.h D_IF_init ||
diff --git a/doc/filters.texi b/doc/filters.texi
index 2cae41c7c5..1a649cf794 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -12221,18 +12221,42 @@ and configure FFmpeg with @code{--enable-libtorch
 --extra-cflags=-I/libtorch_root/libtorch/include/torch/csrc/api/include
 --extra-ldflags=-L/libtorch_root/libtorch/lib/}
 
+@item onnx
+ONNX Runtime backend. To enable this backend you need to install the
+ONNX Runtime library (see @url{https://onnxruntime.ai/}) and configure
+FFmpeg with @code{--enable-libonnxruntime}.
+
+The current ONNX Runtime backend expects 4-D input and output tensors
+with NCHW layout and 32-bit floating-point element type (ONNX
+@code{FLOAT}); models with integer or other element types (e.g.
+@code{UINT8}) are not supported and will be rejected at load time.
+Models using NHWC layout or other ranks are not yet
+supported. Only single-input models are supported; the backend binds
+exactly one input tensor when running the model.
+
+The @option{input} and @option{output} options are optional for the
+ONNX Runtime backend; when they are omitted the backend resolves the
+tensor names from the session.
+
+The ONNX Runtime backend runs inference synchronously using a single
+inference request. The shared @option{async} and @option{nireq} options
+therefore have no effect for @code{dnn_backend=onnx}; inference always
+runs synchronously regardless of their values.
+
 @end table
 
 @item model
 Set path to model file specifying network architecture and its parameters.
-Note that different backends use different file formats. TensorFlow, OpenVINO
-and Libtorch backend can load files for only its format.
+Note that different backends use different file formats. TensorFlow, OpenVINO,
+Libtorch, and ONNX Runtime backends can each load only files in their own
+format.
 
 @item input
-Set the input name of the dnn network.
+Set the input name of the dnn network. Required for the TensorFlow backend;
+optional for the ONNX Runtime backend.
 
 @item output
-Set the output name of the dnn network.
+Set the output name of the dnn network. Required for the TensorFlow backend;
+optional for the ONNX Runtime backend.
 
 @item backend_configs
 Set the configs to be passed into backend. To use async execution, set async 
(default: set).
@@ -12241,6 +12265,20 @@ Roll back to sync execution if the backend does not 
support async.
 For tensorflow backend, you can set its configs with @option{sess_config} 
options,
 please use tools/python/tf_sess_config.py to get the configs of TensorFlow 
backend for your system.
 
+@item device
+Set the device to run the model. For the ONNX Runtime backend this selects the
+execution provider: @code{cpu} (default), @code{cuda} (NVIDIA GPU),
+@code{dml} (DirectML, Windows only) or @code{vitisai} (AMD Ryzen AI NPU).
+
+@item device_id
+Set the device index used by GPU execution providers (e.g. @code{cuda} or
+@code{dml}) for the ONNX Runtime backend. Default is 0.
+
+@item threads_per_operation
+ONNX Runtime backend only. Set the number of CPU threads used per ONNX
+Runtime operator when running with @code{device=cpu}. Default is 0 (let
+ONNX Runtime choose automatically). Has no effect for GPU/NPU providers.
+
 @end table
 
 @subsection Examples
diff --git a/doc/general_contents.texi b/doc/general_contents.texi
index 5fed093642..7a1c4f3a21 100644
--- a/doc/general_contents.texi
+++ b/doc/general_contents.texi
@@ -205,6 +205,39 @@ FFmpeg can make use of this library, originating in 
Modplug-XMMS, to read from M
 See @url{https://github.com/Konstanty/libmodplug}. Pass 
@code{--enable-libmodplug} to configure to
 enable it.
 
+@section ONNX Runtime
+
+FFmpeg can make use of the ONNX Runtime library as a backend for DNN-based
+filters.
+
+Go to @url{https://onnxruntime.ai/} and follow the instructions for installing
+the library. Pre-built packages are available for various platforms.
+
+Configure FFmpeg with ONNX Runtime support:
+
+@example
+./configure \
+    --enable-libonnxruntime \
+    --extra-cflags="-I/path/to/onnxruntime/include" \
+    --extra-ldflags="-L/path/to/onnxruntime/lib"
+@end example
+
+On ELF-based systems (Linux, BSD) you may also want to embed the
+library search path so that the installed @file{ffmpeg} binary finds the
+shared library at run time without setting @env{LD_LIBRARY_PATH}:
+
+@example
+./configure \
+    --enable-libonnxruntime \
+    --extra-cflags="-I/path/to/onnxruntime/include" \
+    --extra-ldflags="-L/path/to/onnxruntime/lib 
-Wl,-rpath,/path/to/onnxruntime/lib"
+@end example
+
+The @code{-Wl,-rpath} flag is ELF-specific and should be omitted on
+other platforms (Windows, macOS).
+
+For MinGW GCC builds, add @code{-D_stdcall=__stdcall} to the compiler flags
+for compatibility with the ONNX Runtime headers.
+
 @section OpenCORE, VisualOn, and Fraunhofer libraries
 
 Spun off Google Android sources, OpenCore, VisualOn and Fraunhofer
diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index 3d09927c98..7c5d7d8ab6 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -7,5 +7,6 @@ OBJS-$(CONFIG_DNN)                           += 
dnn/dnn_backend_common.o
 DNN-OBJS-$(CONFIG_LIBTENSORFLOW)             += dnn/dnn_backend_tf.o
 DNN-OBJS-$(CONFIG_LIBOPENVINO)               += dnn/dnn_backend_openvino.o
 DNN-OBJS-$(CONFIG_LIBTORCH)                  += dnn/dnn_backend_torch.o
+DNN-OBJS-$(CONFIG_LIBONNXRUNTIME)            += dnn/dnn_backend_onnx.o
 
 OBJS-$(CONFIG_DNN)                           += $(DNN-OBJS-yes)
diff --git a/libavfilter/dnn/dnn_backend_onnx.c 
b/libavfilter/dnn/dnn_backend_onnx.c
new file mode 100644
index 0000000000..0ff0ffb285
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_onnx.c
@@ -0,0 +1,1104 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * DNN ONNX Runtime backend implementation.
+ */
+
+#include "libavutil/opt.h"
+#include "libavutil/avassert.h"
+#include "libavutil/mem.h"
+#include "libavutil/avstring.h"
+#include "libavutil/thread.h"
+#include "libavutil/wchar_filename.h"
+#include "../filters.h"
+#include "dnn_io_proc.h"
+#include "dnn_backend_common.h"
+#include "queue.h"
+#include "safe_queue.h"
+#include <onnxruntime_c_api.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+typedef struct ONNXModel { /* State of one loaded ONNX Runtime model. */
+    DNNModel model; /* must stay first: DNNModel pointers are cast to ONNXModel */
+    DnnContext *ctx; /* used as the av_log() context */
+    OrtEnv *env; /* released in dnn_free_model_onnx() */
+    OrtSession *session; /* released in dnn_free_model_onnx() */
+    OrtSessionOptions *session_options; /* released in dnn_free_model_onnx() */
+    OrtAllocator *allocator; /* used to obtain and free tensor name strings */
+    SafeQueue *request_queue; /* queue of ONNXRequestItem */
+    Queue *task_queue; /* queue of TaskItem */
+    Queue *lltask_queue; /* queue of LastLevelTaskItem */
+    DNNData input_info; /* input description cached by get_input_onnx() */
+    int     input_resolved; /* non-zero once input_info is valid */
+    int     output_resolved; /* non-zero once the output name was found in the session */
+} ONNXModel;
+
+typedef struct ONNXInferRequest { /* Tensors and buffer of a single inference. */
+    OrtValue *input_tensor; /* created over input_data in fill_model_input_onnx() */
+    OrtValue *output_tensor; /* receives the result of Run() in onnx_start_inference() */
+    void     *input_data; /* av_malloc()ed input buffer; freed in onnx_free_request() */
+} ONNXInferRequest;
+
+typedef struct ONNXRequestItem { /* One entry of ONNXModel.request_queue. */
+    ONNXInferRequest *infer_request; /* freed by destroy_request_item() */
+    LastLevelTaskItem *lltask; /* task being served; set in fill_model_input_onnx() */
+    DNNAsyncExecModule exec_module; /* cleaned up with ff_dnn_async_module_cleanup() */
+} ONNXRequestItem;
+
+#define OFFSET(x) offsetof(ONNXOptions, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM
+/*
+ * Private options of the ONNX Runtime backend.
+ * NOTE(review): ONNXOptions is not declared in this file; presumably it comes
+ * from dnn_interface.h -- confirm.
+ */
+static const AVOption dnn_onnx_options[] = {
+    { "threads_per_operation", "number of CPU threads per ORT operator (device=cpu only)",
+      OFFSET(num_threads),       AV_OPT_TYPE_INT,    { .i64 = 0 },    0, INT_MAX, FLAGS },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(dnn_onnx);
+
+/* ONNX Runtime API table used by every function in this file. */
+static const OrtApi *g_ort = NULL;
+/* Once-control object; presumably used to run init_ort_api() a single time. */
+static AVOnce g_ort_init_once = AV_ONCE_INIT;
+
+/**
+ * Fetch the OrtApi table for the ORT_API_VERSION this file was compiled
+ * against and store it in g_ort.
+ *
+ * NOTE(review): GetApi() may yield NULL when the runtime library is older than
+ * the headers; this function does not check for that.
+ */
+static void init_ort_api(void)
+{
+    const OrtApiBase *api_base = OrtGetApiBase();
+
+    g_ort = api_base->GetApi(ORT_API_VERSION);
+}
+
+/*
+ * Evaluate an OrtApi call. On failure, log the ONNX Runtime message, release
+ * the status object and jump to the `err` label of the enclosing function.
+ *
+ * The enclosing function must provide a `ctx` variable usable as av_log()
+ * context and an `err:` label. The macro-local names carry an `ort_` prefix
+ * and trailing underscore so they cannot shadow a caller-local `status`/`msg`.
+ */
+#define ORT_ABORT_ON_ERROR(expr)                                             \
+    do {                                                                     \
+        OrtStatus *ort_status_ = (expr);                                     \
+        if (ort_status_ != NULL) {                                           \
+            const char *ort_msg_ = g_ort->GetErrorMessage(ort_status_);      \
+            av_log(ctx, AV_LOG_ERROR, "ONNX Runtime error: %s\n", ort_msg_); \
+            g_ort->ReleaseStatus(ort_status_);                               \
+            goto err;                                                        \
+        }                                                                    \
+    } while (0)
+
+/**
+ * Create the last-level task for @p task and append it to @p lltask_queue.
+ *
+ * The task is marked as having one inference to do and none done.
+ *
+ * @param task         task to schedule; task->model must be an ONNXModel
+ * @param lltask_queue queue receiving the newly allocated LastLevelTaskItem
+ * @return 0 on success, AVERROR(ENOMEM) if the item cannot be allocated or
+ *         queued (the item is freed again in the latter case)
+ */
+static int extract_lltask_from_task(TaskItem *task, Queue *lltask_queue)
+{
+    ONNXModel     *onnx_model = (ONNXModel *)task->model;
+    DnnContext           *ctx = onnx_model->ctx;
+    LastLevelTaskItem *lltask = av_malloc(sizeof(*lltask));
+
+    if (!lltask) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to allocate memory for LastLevelTaskItem\n");
+        return AVERROR(ENOMEM);
+    }
+    task->inference_todo = 1;
+    task->inference_done = 0;
+    lltask->task = task;
+    if (ff_queue_push_back(lltask_queue, lltask) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to push back lltask_queue.\n");
+        av_freep(&lltask);
+        return AVERROR(ENOMEM);
+    }
+    return 0;
+}
+
+/**
+ * Release the tensors and the input buffer held by @p request and reset the
+ * corresponding fields. The request structure itself is not freed.
+ * Passing NULL is a no-op.
+ */
+static void onnx_free_request(ONNXInferRequest *request)
+{
+    if (request == NULL)
+        return;
+
+    /* The input tensor was created over input_data, so drop it before the buffer. */
+    if (request->input_tensor != NULL)
+        g_ort->ReleaseValue(request->input_tensor);
+    request->input_tensor = NULL;
+    av_freep(&request->input_data);
+
+    if (request->output_tensor != NULL)
+        g_ort->ReleaseValue(request->output_tensor);
+    request->output_tensor = NULL;
+}
+
+/**
+ * Free a request item together with everything it holds (inference request,
+ * pending last-level task, async execution module) and set *arg to NULL.
+ * A NULL @p arg or NULL *arg is a no-op.
+ */
+static inline void destroy_request_item(ONNXRequestItem **arg)
+{
+    ONNXRequestItem *req = arg ? *arg : NULL;
+
+    if (!req)
+        return;
+
+    onnx_free_request(req->infer_request);
+    av_freep(&req->infer_request);
+    av_freep(&req->lltask);
+    ff_dnn_async_module_cleanup(&req->exec_module);
+    av_freep(arg);
+}
+
+/**
+ * Destroy an ONNX model: drain and destroy the request, last-level-task and
+ * task queues, release the ONNX Runtime session, session options and
+ * environment, free the model and set *model to NULL.
+ *
+ * @param model address of the model pointer; NULL or a NULL *model is a no-op
+ */
+static void dnn_free_model_onnx(DNNModel **model)
+{
+    ONNXModel *onnx_model;
+    if (!model || !*model)
+        return;
+
+    onnx_model = (ONNXModel *)(*model);
+
+    while (ff_safe_queue_size(onnx_model->request_queue) != 0) {
+        ONNXRequestItem *item = ff_safe_queue_pop_front(onnx_model->request_queue);
+        destroy_request_item(&item);
+    }
+    ff_safe_queue_destroy(onnx_model->request_queue);
+
+    while (ff_queue_size(onnx_model->lltask_queue) != 0) {
+        LastLevelTaskItem *item = ff_queue_pop_front(onnx_model->lltask_queue);
+        av_freep(&item);
+    }
+    ff_queue_destroy(onnx_model->lltask_queue);
+
+    /* Tasks still queued own their input and output frames. */
+    while (ff_queue_size(onnx_model->task_queue) != 0) {
+        TaskItem *item = ff_queue_pop_front(onnx_model->task_queue);
+        av_frame_free(&item->in_frame);
+        av_frame_free(&item->out_frame);
+        av_freep(&item);
+    }
+    ff_queue_destroy(onnx_model->task_queue);
+
+    if (onnx_model->session)
+        g_ort->ReleaseSession(onnx_model->session);
+    if (onnx_model->session_options)
+        g_ort->ReleaseSessionOptions(onnx_model->session_options);
+    if (onnx_model->env)
+        g_ort->ReleaseEnv(onnx_model->env);
+
+    av_freep(&onnx_model);
+    *model = NULL;
+}
+
+/* Log @p what followed by the message carried by @p status, then release @p status. */
+static void onnx_report_status(DnnContext *ctx, OrtStatus *status, const char *what)
+{
+    av_log(ctx, AV_LOG_ERROR, "%s: %s\n", what, g_ort->GetErrorMessage(status));
+    g_ort->ReleaseStatus(status);
+}
+
+/**
+ * Describe the model input named @p input_name.
+ *
+ * The first successful call looks the name up in the session, validates the
+ * tensor (rank 4, float elements, batch size not larger than 1) and caches the
+ * result in the model; later calls return the cached description.
+ *
+ * Dynamic (non-positive) dimensions are reported as batch 1, 3 channels and
+ * -1 for height and width.
+ *
+ * @param model      the ONNX model
+ * @param input      filled with layout, dims, data type and channel order
+ * @param input_name name of the input tensor; must be non-empty
+ * @return 0 on success, AVERROR(EINVAL) for a missing name or a failed ONNX
+ *         Runtime query, AVERROR(ENOSYS) for an unsupported tensor
+ */
+static int get_input_onnx(DNNModel *model, DNNData *input, const char *input_name)
+{
+    ONNXModel  *onnx_model = (ONNXModel *)model;
+    DnnContext        *ctx = onnx_model->ctx;
+    OrtTypeInfo *type_info = NULL;
+    const OrtTensorTypeAndShapeInfo *tensor_info = NULL;
+    ONNXTensorElementDataType tensor_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
+    int64_t dims[4]    = { 0 };
+    size_t num_dims    = 0;
+    size_t input_count = 0;
+    size_t input_index = 0;
+    int    found_input = 0;
+    OrtStatus *status;
+    int ret;
+
+    if (!input_name || !*input_name) {
+        av_log(ctx, AV_LOG_ERROR, "ONNX input name is not specified\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (onnx_model->input_resolved) {
+        *input = onnx_model->input_info;
+        return 0;
+    }
+
+    status = g_ort->SessionGetInputCount(onnx_model->session, &input_count);
+    if (status != NULL) {
+        onnx_report_status(ctx, status, "Failed to get input count");
+        return AVERROR(EINVAL);
+    }
+
+    for (size_t i = 0; i < input_count && !found_input; i++) {
+        char *name = NULL;
+        status = g_ort->SessionGetInputName(onnx_model->session, i,
+                                            onnx_model->allocator, &name);
+        if (status != NULL) {
+            /* An unreadable name is skipped rather than treated as fatal. */
+            g_ort->ReleaseStatus(status);
+            continue;
+        }
+        if (!strcmp(name, input_name)) {
+            input_index = i;
+            found_input = 1;
+        }
+        onnx_model->allocator->Free(onnx_model->allocator, name);
+    }
+
+    if (!found_input) {
+        av_log(ctx, AV_LOG_ERROR, "Input name '%s' not found in ONNX model\n",
+               input_name);
+        return AVERROR(EINVAL);
+    }
+
+    status = g_ort->SessionGetInputTypeInfo(onnx_model->session, input_index,
+                                            &type_info);
+    if (status != NULL) {
+        onnx_report_status(ctx, status, "Failed to get input type info");
+        return AVERROR(EINVAL);
+    }
+
+    /* From here on type_info is owned by this function; leave through 'end'. */
+    status = g_ort->CastTypeInfoToTensorInfo(type_info, &tensor_info);
+    if (status != NULL) {
+        onnx_report_status(ctx, status, "Failed to get input tensor info");
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    status = g_ort->GetDimensionsCount(tensor_info, &num_dims);
+    if (status != NULL) {
+        onnx_report_status(ctx, status, "Failed to get input dimensions count");
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    if (num_dims != 4) {
+        avpriv_report_missing_feature(ctx, "Support for %zu dimensional input", num_dims);
+        ret = AVERROR(ENOSYS);
+        goto end;
+    }
+
+    status = g_ort->GetDimensions(tensor_info, dims, num_dims);
+    if (status != NULL) {
+        onnx_report_status(ctx, status, "Failed to get input dimensions");
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    status = g_ort->GetTensorElementType(tensor_info, &tensor_type);
+    if (status != NULL) {
+        onnx_report_status(ctx, status, "Failed to get input element type");
+        ret = AVERROR(EINVAL);
+        goto end;
+    }
+
+    if (dims[0] > 1) {
+        av_log(ctx, AV_LOG_ERROR,
+               "ONNX model has fixed batch size %"PRId64", but the backend "
+               "only supports a batch size of 1\n", dims[0]);
+        ret = AVERROR(ENOSYS);
+        goto end;
+    }
+
+    if (tensor_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported input tensor data type, only float is supported\n");
+        ret = AVERROR(ENOSYS);
+        goto end;
+    }
+
+    /*
+     * The ONNX backend assumes a 4-D NCHW input tensor (the rank check
+     * above already rejects anything else).
+     */
+    input->layout  = DL_NCHW;
+    input->dims[0] = dims[0] > 0 ? dims[0] : 1;
+    input->dims[1] = dims[1] > 0 ? dims[1] : 3;
+    input->dims[2] = dims[2] > 0 ? dims[2] : -1;
+    input->dims[3] = dims[3] > 0 ? dims[3] : -1;
+    input->dt      = DNN_FLOAT;
+
+    /*
+     * The DCO_RGB setting below is only consulted by the dnn_detect and
+     * dnn_classify filters; the dnn_processing path lets the source AVFrame
+     * pixel format determine the tensor channel order, so both RGB24 and
+     * BGR24 inputs work transparently for that flow.
+     */
+    input->order = DCO_RGB;
+
+    onnx_model->input_info     = *input;
+    onnx_model->input_resolved = 1;
+    ret = 0;
+
+end:
+    g_ort->ReleaseTypeInfo(type_info);
+    return ret;
+}
+
+/**
+ * Take the next last-level task from the model's queue and prepare
+ * @p request for it: allocate the input buffer, fill it from the task's input
+ * frame and wrap it in an ONNX Runtime tensor.
+ *
+ * Height and width of the tensor are taken from the input frame; batch and
+ * channel counts come from get_input_onnx().
+ *
+ * @param onnx_model model owning the task queue
+ * @param request    request item to populate; request->lltask is set as soon
+ *                   as a task has been dequeued
+ * @return 0 on success, a negative AVERROR code otherwise; on failure the
+ *         tensors and buffer of request->infer_request are released
+ */
+static int fill_model_input_onnx(ONNXModel *onnx_model, ONNXRequestItem *request)
+{
+    LastLevelTaskItem       *lltask = NULL;
+    TaskItem                  *task = NULL;
+    ONNXInferRequest *infer_request = NULL;
+    DNNData                   input = { 0 };
+    DnnContext                 *ctx = onnx_model->ctx;
+    int ret, width_idx, height_idx, channel_idx;
+    int64_t input_shape[4];
+    size_t input_tensor_size;
+    OrtMemoryInfo *memory_info = NULL;
+    OrtStatus *status;
+
+    lltask = (LastLevelTaskItem *)ff_queue_pop_front(onnx_model->lltask_queue);
+    if (!lltask) {
+        ret = AVERROR(EINVAL);
+        goto err;
+    }
+    request->lltask = lltask;
+    task = lltask->task;
+    infer_request = request->infer_request;
+
+    ret = get_input_onnx(&onnx_model->model, &input, task->input_name);
+    if (ret != 0) {
+        goto err;
+    }
+
+    width_idx   = dnn_get_width_idx_by_layout(input.layout);
+    height_idx  = dnn_get_height_idx_by_layout(input.layout);
+    channel_idx = dnn_get_channel_idx_by_layout(input.layout);
+
+    /* The model description may carry dynamic sizes; the frame decides. */
+    input.dims[height_idx] = task->in_frame->height;
+    input.dims[width_idx]  = task->in_frame->width;
+
+    input_shape[0] = input.dims[0];
+    input_shape[1] = input.dims[channel_idx];
+    input_shape[2] = input.dims[height_idx];
+    input_shape[3] = input.dims[width_idx];
+
+    input_tensor_size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
+    input_tensor_size *= sizeof(float);
+
+    input.data = av_malloc(input_tensor_size);
+    if (!input.data) {
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+    /* Hand the buffer to the request right away so the err path frees it. */
+    infer_request->input_data = input.data;
+
+    switch (onnx_model->model.func_type) {
+    case DFT_PROCESS_FRAME:
+        input.scale = 255;
+        if (task->do_ioproc) {
+            if (onnx_model->model.frame_pre_proc != NULL) {
+                onnx_model->model.frame_pre_proc(task->in_frame, &input, onnx_model->model.filter_ctx);
+            } else {
+                ff_proc_from_frame_to_dnn(task->in_frame, &input, ctx);
+            }
+        }
+        break;
+    case DFT_ANALYTICS_DETECT:
+        ff_frame_to_dnn_detect(task->in_frame, &input, ctx);
+        break;
+    default:
+        avpriv_report_missing_feature(ctx, "model function type %d", onnx_model->model.func_type);
+        ret = AVERROR(ENOSYS);
+        goto err;
+    }
+
+    status = g_ort->CreateCpuMemoryInfo(OrtArenaAllocator, OrtMemTypeDefault, &memory_info);
+    if (status != NULL) {
+        /* The status object must be released, otherwise it leaks. */
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "Failed to create CPU memory info: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+
+    /* The tensor wraps input.data without copying it. */
+    status = g_ort->CreateTensorWithDataAsOrtValue(
+        memory_info, input.data, input_tensor_size,
+        input_shape, 4, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
+        &infer_request->input_tensor);
+
+    g_ort->ReleaseMemoryInfo(memory_info);
+
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "Failed to create input tensor: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        ret = AVERROR(ENOMEM);
+        goto err;
+    }
+
+    return 0;
+
+err:
+    onnx_free_request(infer_request);
+    return ret;
+}
+
+/**
+ * Run the model synchronously for one prepared request.
+ *
+ * On the first call the requested output name is checked against the
+ * session's outputs; the result of that check is remembered in the model.
+ * The output tensor produced by Run() is stored in the request.
+ *
+ * @param args an ONNXRequestItem prepared by fill_model_input_onnx()
+ * @return 0 on success; AVERROR(EINVAL), AVERROR(ENOSYS) or
+ *         DNN_GENERIC_ERROR on failure
+ */
+static int onnx_start_inference(void *args)
+{
+    ONNXRequestItem        *request = (ONNXRequestItem *)args;
+    ONNXInferRequest *infer_request = NULL;
+    LastLevelTaskItem       *lltask = NULL;
+    TaskItem                  *task = NULL;
+    ONNXModel           *onnx_model = NULL;
+    DnnContext                 *ctx = NULL;
+    OrtStatus *status;
+    const char  *input_names[1];
+    const char *output_names[1];
+
+    if (!request) {
+        av_log(NULL, AV_LOG_ERROR, "ONNXRequestItem is NULL\n");
+        return AVERROR(EINVAL);
+    }
+
+    infer_request = request->infer_request;
+    lltask = request->lltask;
+    /* Guard against a request that was never filled in. */
+    if (!infer_request || !lltask || !lltask->task) {
+        av_log(NULL, AV_LOG_ERROR, "ONNXRequestItem is not prepared for inference\n");
+        return AVERROR(EINVAL);
+    }
+    task = lltask->task;
+    onnx_model = (ONNXModel *)task->model;
+    ctx = onnx_model->ctx;
+
+    if (task->nb_output > 1) {
+        avpriv_report_missing_feature(ctx,
+            "Multiple output tensors (%u) for ONNX backend", task->nb_output);
+        return AVERROR(ENOSYS);
+    }
+
+    if (!task->input_name || !task->output_names || !task->output_names[0]) {
+        av_log(ctx, AV_LOG_ERROR,
+               "ONNX backend: input/output tensor name was not resolved at load time\n");
+        return AVERROR(EINVAL);
+    }
+
+    if (!infer_request->input_tensor) {
+        av_log(ctx, AV_LOG_ERROR, "Input tensor is NULL\n");
+        return DNN_GENERIC_ERROR;
+    }
+
+    if (!onnx_model->output_resolved) {
+        size_t output_count = 0;
+        int    found_output = 0;
+
+        status = g_ort->SessionGetOutputCount(onnx_model->session, &output_count);
+        if (status != NULL) {
+            const char *msg = g_ort->GetErrorMessage(status);
+            av_log(ctx, AV_LOG_ERROR, "Failed to get output count: %s\n", msg);
+            g_ort->ReleaseStatus(status);
+            return AVERROR(EINVAL);
+        }
+
+        for (size_t i = 0; i < output_count; i++) {
+            char *name = NULL;
+            status = g_ort->SessionGetOutputName(onnx_model->session, i,
+                                                 onnx_model->allocator, &name);
+            if (status != NULL) {
+                /* An unreadable name is skipped rather than treated as fatal. */
+                g_ort->ReleaseStatus(status);
+                continue;
+            }
+            if (!strcmp(name, task->output_names[0]))
+                found_output = 1;
+            onnx_model->allocator->Free(onnx_model->allocator, name);
+            if (found_output)
+                break;
+        }
+
+        if (!found_output) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "Output name '%s' not found in ONNX model\n",
+                   task->output_names[0]);
+            return AVERROR(EINVAL);
+        }
+
+        onnx_model->output_resolved = 1;
+    }
+
+    input_names[0]  = task->input_name;
+    output_names[0] = task->output_names[0];
+
+    status = g_ort->Run(onnx_model->session, NULL,
+                        input_names, (const OrtValue *const *)&infer_request->input_tensor, 1,
+                        output_names, 1, &infer_request->output_tensor);
+
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "ONNX inference failed: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        return DNN_GENERIC_ERROR;
+    }
+
+    return 0;
+}
+
+/**
+ * Post-process a finished inference request.
+ *
+ * Queries rank, element type, shape and data pointer of the request's output
+ * tensor, then either runs frame post-processing or, when I/O processing is
+ * disabled for the task, only stores the output width/height in
+ * task->out_frame. Whatever the outcome, the request is finally recycled into
+ * the model's request queue.
+ *
+ * task->inference_done is incremented only when every step succeeded, so the
+ * caller can detect a failure by comparing it with task->inference_todo.
+ *
+ * @param args the ONNXRequestItem whose inference has completed
+ */
+static void infer_completion_callback(void *args)
+{
+    ONNXRequestItem  *request = (ONNXRequestItem *)args;
+    LastLevelTaskItem *lltask = request->lltask;
+    TaskItem            *task = lltask->task;
+    DNNData           outputs = { 0 };
+    ONNXInferRequest *infer_request = request->infer_request;
+    ONNXModel           *onnx_model = (ONNXModel *)task->model;
+    DnnContext                 *ctx = onnx_model->ctx;
+    OrtTensorTypeAndShapeInfo *tensor_info = NULL;
+    ONNXTensorElementDataType tensor_type;
+    size_t num_dims = 0;
+    int64_t dims[4];
+    void *output_data = NULL;
+    /* Any non-NULL status still held when reaching "done" is released there. */
+    OrtStatus *status = NULL;
+
+    if (!infer_request->output_tensor) {
+        av_log(ctx, AV_LOG_ERROR, "Output tensor is NULL\n");
+        goto done;
+    }
+
+    status = g_ort->GetTensorTypeAndShape(infer_request->output_tensor, &tensor_info);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get output tensor info\n");
+        goto done;
+    }
+
+    status = g_ort->GetDimensionsCount(tensor_info, &num_dims);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get output tensor dimension count\n");
+        goto done;
+    }
+
+    status = g_ort->GetTensorElementType(tensor_info, &tensor_type);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get output tensor element type\n");
+        goto done;
+    }
+    if (tensor_type != ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) {
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output tensor data type, only float is supported\n");
+        goto done;
+    }
+
+    /* Only rank-4 outputs are handled, so a fixed-size buffer is sufficient. */
+    if (num_dims != 4) {
+        avpriv_report_missing_feature(ctx, "Support for %zu dimensional output", num_dims);
+        goto done;
+    }
+
+    status = g_ort->GetDimensions(tensor_info, dims, num_dims);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get output tensor dimensions\n");
+        goto done;
+    }
+
+    /* Output is interpreted as NCHW, matching the input assumption. */
+    outputs.layout = DL_NCHW;
+    outputs.order  = DCO_RGB;
+    outputs.dt     = DNN_FLOAT;
+    for (int i = 0; i < 4; i++)
+        outputs.dims[i] = dims[i];
+
+    status = g_ort->GetTensorMutableData(infer_request->output_tensor, &output_data);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get tensor data\n");
+        goto done;
+    }
+
+    outputs.data = output_data;
+
+    switch (onnx_model->model.func_type) {
+    case DFT_PROCESS_FRAME:
+        if (task->do_ioproc) {
+            outputs.scale = 255;
+            if (onnx_model->model.frame_post_proc != NULL) {
+                onnx_model->model.frame_post_proc(task->out_frame, &outputs, onnx_model->model.filter_ctx);
+            } else {
+                ff_proc_from_dnn_to_frame(task->out_frame, &outputs, ctx);
+            }
+        } else {
+            /* No pixel conversion requested: only report the output size. */
+            task->out_frame->width  = outputs.dims[dnn_get_width_idx_by_layout(outputs.layout)];
+            task->out_frame->height = outputs.dims[dnn_get_height_idx_by_layout(outputs.layout)];
+        }
+        break;
+    default:
+        avpriv_report_missing_feature(ctx, "model function type %d", onnx_model->model.func_type);
+        goto done;
+    }
+
+    task->inference_done++;
+
+done:
+    if (status)
+        g_ort->ReleaseStatus(status);
+    if (tensor_info)
+        g_ort->ReleaseTensorTypeAndShapeInfo(tensor_info);
+    av_freep(&request->lltask);
+    onnx_free_request(infer_request);
+    if (ff_safe_queue_push_back(onnx_model->request_queue, request) < 0) {
+        destroy_request_item(&request);
+        av_log(ctx, AV_LOG_ERROR, "Unable to push back request_queue.\n");
+    }
+}
+
+/**
+ * Run one inference for the task at the front of the last-level task queue.
+ *
+ * An empty queue is not an error: the request is destroyed and 0 is
+ * returned. Asynchronous execution is not implemented and is reported as
+ * AVERROR(ENOSYS). In the synchronous case the completion callback is invoked
+ * directly and success is judged by comparing the task's inference_done and
+ * inference_todo counters.
+ *
+ * @param request      inference request to use
+ * @param lltask_queue queue of pending last-level tasks
+ * @return 0 on success, a negative AVERROR code or DNN_GENERIC_ERROR otherwise
+ */
+static int execute_model_onnx(ONNXRequestItem *request, Queue *lltask_queue)
+{
+    ONNXModel *onnx_model = NULL;
+    LastLevelTaskItem *front;
+    TaskItem *task = NULL;
+    int ret = 0;
+
+    if (!ff_queue_size(lltask_queue)) {
+        destroy_request_item(&request);
+        return 0;
+    }
+
+    front = ff_queue_peek_front(lltask_queue);
+    if (!front) {
+        av_log(NULL, AV_LOG_ERROR, "Failed to get LastLevelTaskItem\n");
+        destroy_request_item(&request);
+        return AVERROR(EINVAL);
+    }
+    task       = front->task;
+    onnx_model = (ONNXModel *)task->model;
+
+    ret = fill_model_input_onnx(onnx_model, request);
+    if (ret != 0)
+        goto err;
+
+    if (task->async) {
+        avpriv_report_missing_feature(onnx_model->ctx, "ONNX async inference");
+        ret = AVERROR(ENOSYS);
+        goto err;
+    }
+
+    ret = onnx_start_inference((void *)request);
+    if (ret != 0)
+        goto err;
+
+    /* The callback also recycles the request, so it must not be touched afterwards. */
+    infer_completion_callback(request);
+    if (task->inference_done != task->inference_todo)
+        return DNN_GENERIC_ERROR;
+    return 0;
+
+err:
+    av_freep(&request->lltask);
+    onnx_free_request(request->infer_request);
+    if (ff_safe_queue_push_back(onnx_model->request_queue, request) < 0)
+        destroy_request_item(&request);
+    return ret;
+}
+
+/**
+ * Determine the model's output frame size for a given input size by running
+ * one inference on a temporary task.
+ *
+ * @param model         the ONNX model (an ONNXModel behind the DNNModel pointer)
+ * @param input_name    name of the input tensor
+ * @param input_width   width of the probe input
+ * @param input_height  height of the probe input
+ * @param output_name   name of the output tensor
+ * @param output_width  receives the width stored in the task's output frame
+ * @param output_height receives the height stored in the task's output frame
+ * @return 0 on success, an error code otherwise
+ */
+static int get_output_onnx(DNNModel *model, const char *input_name, int input_width, int input_height,
+                           const char *output_name, int *output_width, int *output_height)
+{
+    ONNXModel *onnx_model = (ONNXModel *)model;
+    DnnContext       *ctx = onnx_model->ctx;
+    ONNXRequestItem  *req = NULL;
+    TaskItem         task = { 0 };
+    DNNExecBaseParams exec_params = {
+        .input_name   = input_name,
+        .output_names = &output_name,
+        .nb_output    = 1,
+        .in_frame     = NULL,
+        .out_frame    = NULL,
+    };
+    int ret;
+
+    ret = ff_dnn_fill_gettingoutput_task(&task, &exec_params, onnx_model, input_height, input_width, ctx);
+    if (ret != 0)
+        goto err;
+
+    /*
+     * NOTE(review): task lives on the stack; if a later step fails before the
+     * queued last-level task is consumed, the queue may keep a reference to
+     * it -- confirm against extract_lltask_from_task().
+     */
+    ret = extract_lltask_from_task(&task, onnx_model->lltask_queue);
+    if (ret != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to extract last level task from task.\n");
+        goto err;
+    }
+
+    req = (ONNXRequestItem *)ff_safe_queue_pop_front(onnx_model->request_queue);
+    if (req == NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to get infer request.\n");
+        ret = AVERROR(EINVAL);
+        goto err;
+    }
+
+    ret = execute_model_onnx(req, onnx_model->lltask_queue);
+    /* The size is copied out regardless of the execution result. */
+    *output_width  = task.out_frame->width;
+    *output_height = task.out_frame->height;
+
+err:
+    av_frame_free(&task.out_frame);
+    av_frame_free(&task.in_frame);
+    return ret;
+}
+
+/**
+ * Allocate an inference request with no tensors or input buffer attached.
+ *
+ * @return the new request, or NULL if the allocation failed
+ */
+static ONNXInferRequest *onnx_create_inference_request(void)
+{
+    ONNXInferRequest *req = av_malloc(sizeof(*req));
+
+    if (req) {
+        req->input_tensor  = NULL;
+        req->output_tensor = NULL;
+        req->input_data    = NULL;
+    }
+    return req;
+}
+
+/**
+ * Load an ONNX model and prepare it for inference.
+ *
+ * Steps, in order:
+ *  - initialise the ONNX Runtime API once per process,
+ *  - create the environment and session options,
+ *  - select the execution provider named by ctx->device (cpu, cuda, dml,
+ *    vitisai); a provider that cannot be enabled only produces a warning and
+ *    the session is created without it,
+ *  - create the session from ctx->model_filename,
+ *  - reject models that do not have exactly one input,
+ *  - fill in ctx->model_inputname / ctx->model_outputnames from the session
+ *    when the user did not provide them,
+ *  - create the request queue (holding a single request), the task queue and
+ *    the last-level task queue.
+ *
+ * @param ctx        DNN context carrying the user options
+ * @param func_type  function type stored in the returned model
+ * @param filter_ctx filter context stored in the returned model
+ * @return the loaded model, or NULL on failure
+ */
+static DNNModel *dnn_load_model_onnx(DnnContext *ctx, DNNFunctionType func_type, AVFilterContext *filter_ctx)
+{
+    DNNModel       *model = NULL;
+    ONNXModel *onnx_model = NULL;
+    ONNXRequestItem *item = NULL;
+    ONNXOptions  *options = &ctx->onnx_option;
+    OrtStatus *status;
+
+    ff_thread_once(&g_ort_init_once, init_ort_api);
+    if (!g_ort) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get ONNX Runtime API\n");
+        return NULL;
+    }
+
+    onnx_model = av_mallocz(sizeof(ONNXModel));
+    if (!onnx_model)
+        return NULL;
+
+    model = &onnx_model->model;
+    onnx_model->ctx = ctx;
+
+    status = g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "FFmpeg", &onnx_model->env);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to create ONNX Runtime environment\n");
+        g_ort->ReleaseStatus(status);
+        goto fail;
+    }
+
+    status = g_ort->CreateSessionOptions(&onnx_model->session_options);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to create session options\n");
+        g_ort->ReleaseStatus(status);
+        goto fail;
+    }
+
+    /* The thread count option is only applied when running on the CPU. */
+    if (options->num_threads > 0 &&
+        (!ctx->device || av_strcasecmp(ctx->device, "cpu") == 0)) {
+        status = g_ort->SetIntraOpNumThreads(onnx_model->session_options, options->num_threads);
+        if (status != NULL) {
+            /* Tuning only: keep going with the runtime's default. */
+            av_log(ctx, AV_LOG_WARNING, "Failed to set intra-op thread count: %s\n",
+                   g_ort->GetErrorMessage(status));
+            g_ort->ReleaseStatus(status);
+        }
+    }
+    status = g_ort->SetSessionGraphOptimizationLevel(onnx_model->session_options, ORT_ENABLE_ALL);
+    if (status != NULL) {
+        /* Tuning only: keep going with the runtime's default. */
+        av_log(ctx, AV_LOG_WARNING, "Failed to set graph optimization level: %s\n",
+               g_ort->GetErrorMessage(status));
+        g_ort->ReleaseStatus(status);
+    }
+
+    if (ctx->device && av_strcasecmp(ctx->device, "cpu") != 0) {
+        if (av_strcasecmp(ctx->device, "cuda") == 0) {
+            if (g_ort->SessionOptionsAppendExecutionProvider_CUDA) {
+                /*
+                 * NOTE(review): every field other than device_id is zero,
+                 * including any memory limit field -- confirm that zero is
+                 * an acceptable value for the CUDA provider.
+                 */
+                OrtCUDAProviderOptions cuda_options;
+                memset(&cuda_options, 0, sizeof(cuda_options));
+                cuda_options.device_id = ctx->device_id;
+
+                status = g_ort->SessionOptionsAppendExecutionProvider_CUDA(
+                    onnx_model->session_options, &cuda_options);
+                if (status != NULL) {
+                    const char *msg = g_ort->GetErrorMessage(status);
+                    av_log(ctx, AV_LOG_WARNING, "Failed to enable CUDA (device %d): %s. Falling back to CPU\n",
+                           ctx->device_id, msg);
+                    g_ort->ReleaseStatus(status);
+                } else {
+                    av_log(ctx, AV_LOG_INFO, "Using CUDA execution provider on device %d\n", ctx->device_id);
+                }
+            } else {
+                av_log(ctx, AV_LOG_WARNING, "CUDA provider function not available in this ONNX Runtime API version. Falling back to CPU\n");
+            }
+        } else if (av_strcasecmp(ctx->device, "dml") == 0) {
+#ifdef _WIN32
+            const char* dml_options_keys[] = {"device_id"};
+            const char* dml_options_values[] = {NULL};
+            char device_id_str[32];
+            snprintf(device_id_str, sizeof(device_id_str), "%d", ctx->device_id);
+            dml_options_values[0] = device_id_str;
+
+            /* DirectML cannot use ORT's memory-pattern optimizer and only
+             * supports sequential execution. */
+            status = g_ort->SetSessionExecutionMode(onnx_model->session_options, ORT_SEQUENTIAL);
+            if (status)
+                g_ort->ReleaseStatus(status);
+            status = g_ort->DisableMemPattern(onnx_model->session_options);
+            if (status)
+                g_ort->ReleaseStatus(status);
+
+            if (g_ort->SessionOptionsAppendExecutionProvider) {
+                status = g_ort->SessionOptionsAppendExecutionProvider(
+                    onnx_model->session_options, "DML",
+                    dml_options_keys, dml_options_values, 1);
+                if (status != NULL) {
+                    const char *msg = g_ort->GetErrorMessage(status);
+                    av_log(ctx, AV_LOG_WARNING, "Failed to enable DirectML (device %d): %s. Falling back to CPU\n",
+                           ctx->device_id, msg);
+                    g_ort->ReleaseStatus(status);
+                } else {
+                    av_log(ctx, AV_LOG_INFO, "Using DirectML execution provider on device %d\n", ctx->device_id);
+                }
+            } else {
+                av_log(ctx, AV_LOG_WARNING, "DirectML provider function not available in this ONNX Runtime API version. Falling back to CPU\n");
+            }
+#else
+            av_log(ctx, AV_LOG_WARNING, "DirectML is only available on Windows. Falling back to CPU\n");
+#endif
+        } else if (av_strcasecmp(ctx->device, "vitisai") == 0) {
+            if (g_ort->SessionOptionsAppendExecutionProvider) {
+                status = g_ort->SessionOptionsAppendExecutionProvider(
+                    onnx_model->session_options, "VitisAI",
+                    NULL, NULL, 0);
+                if (status != NULL) {
+                    const char *msg = g_ort->GetErrorMessage(status);
+                    av_log(ctx, AV_LOG_WARNING,
+                           "Failed to enable VitisAI EP: %s. Falling back to CPU\n", msg);
+                    g_ort->ReleaseStatus(status);
+                } else {
+                    av_log(ctx, AV_LOG_INFO, "Using VitisAI execution provider (AMD Ryzen AI NPU)\n");
+                }
+            } else {
+                av_log(ctx, AV_LOG_WARNING,
+                       "VitisAI provider function not available in this ONNX Runtime API version. Falling back to CPU.\n");
+            }
+        } else {
+#ifdef _WIN32
+            av_log(ctx, AV_LOG_WARNING,
+                   "Unknown device '%s'. Supported: cpu, cuda, dml, vitisai. Using CPU\n",
+                   ctx->device);
+#else
+            av_log(ctx, AV_LOG_WARNING,
+                   "Unknown device '%s'. Supported: cpu, cuda, vitisai. Using CPU\n",
+                   ctx->device);
+#endif
+        }
+    } else {
+        av_log(ctx, AV_LOG_INFO, "Using CPU execution provider\n");
+    }
+
+#ifdef _WIN32
+    {
+        /* On Windows the session is created from a wide-character path. */
+        wchar_t *wfilename = NULL;
+        if (utf8towchar(ctx->model_filename, &wfilename)) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to convert model filename to UTF-16\n");
+            goto fail;
+        }
+        if (!wfilename) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to convert model filename to UTF-16\n");
+            goto fail;
+        }
+
+        status = g_ort->CreateSession(onnx_model->env, wfilename,
+                                      onnx_model->session_options, &onnx_model->session);
+        av_free(wfilename);
+    }
+#else
+    status = g_ort->CreateSession(onnx_model->env, ctx->model_filename,
+                                  onnx_model->session_options, &onnx_model->session);
+#endif
+    if (status != NULL) {
+        const char *msg = g_ort->GetErrorMessage(status);
+        av_log(ctx, AV_LOG_ERROR, "Failed to create ONNX session: %s\n", msg);
+        g_ort->ReleaseStatus(status);
+        goto fail;
+    }
+
+    status = g_ort->GetAllocatorWithDefaultOptions(&onnx_model->allocator);
+    if (status != NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to get allocator\n");
+        g_ort->ReleaseStatus(status);
+        goto fail;
+    }
+
+    /*
+     * The ONNX backend binds exactly one input tensor to Run(), so only
+     * single-input models are supported.
+     */
+    {
+        size_t input_count = 0;
+        status = g_ort->SessionGetInputCount(onnx_model->session, &input_count);
+        if (status != NULL) {
+            const char *msg = g_ort->GetErrorMessage(status);
+            av_log(ctx, AV_LOG_ERROR, "Failed to get model input count: %s\n", msg);
+            g_ort->ReleaseStatus(status);
+            goto fail;
+        }
+        if (input_count == 0) {
+            av_log(ctx, AV_LOG_ERROR, "ONNX model exposes no input tensors\n");
+            goto fail;
+        }
+        if (input_count > 1) {
+            av_log(ctx, AV_LOG_ERROR,
+                   "ONNX model exposes %zu input tensors; the ONNX backend "
+                   "supports single-input models only.\n",
+                   input_count);
+            goto fail;
+        }
+    }
+
+    /* Auto-detect the input tensor name when the user did not pass input=NAME. */
+    if (!ctx->model_inputname || !*ctx->model_inputname) {
+        char *name = NULL;
+        status = g_ort->SessionGetInputName(onnx_model->session, 0,
+                                            onnx_model->allocator, &name);
+        if (status != NULL) {
+            const char *msg = g_ort->GetErrorMessage(status);
+            av_log(ctx, AV_LOG_ERROR, "Failed to get model input name: %s\n", msg);
+            g_ort->ReleaseStatus(status);
+            goto fail;
+        }
+        /* Keep a private copy; the runtime's string goes back to its allocator. */
+        av_freep(&ctx->model_inputname);
+        ctx->model_inputname = av_strdup(name);
+        onnx_model->allocator->Free(onnx_model->allocator, name);
+        if (!ctx->model_inputname)
+            goto fail;
+        av_log(ctx, AV_LOG_INFO, "Auto-detected ONNX input tensor '%s'\n",
+               ctx->model_inputname);
+    }
+
+    /* Auto-detect the output tensor name when the user did not pass output=NAME. */
+    if (!ctx->model_outputnames) {
+        size_t output_count = 0;
+        char *name = NULL;
+        status = g_ort->SessionGetOutputCount(onnx_model->session, &output_count);
+        if (status != NULL) {
+            const char *msg = g_ort->GetErrorMessage(status);
+            av_log(ctx, AV_LOG_ERROR, "Failed to get model output count: %s\n", msg);
+            g_ort->ReleaseStatus(status);
+            goto fail;
+        }
+        if (output_count == 0) {
+            av_log(ctx, AV_LOG_ERROR, "ONNX model exposes no output tensors\n");
+            goto fail;
+        }
+        status = g_ort->SessionGetOutputName(onnx_model->session, 0,
+                                             onnx_model->allocator, &name);
+        if (status != NULL) {
+            const char *msg = g_ort->GetErrorMessage(status);
+            av_log(ctx, AV_LOG_ERROR, "Failed to get model output name: %s\n", msg);
+            g_ort->ReleaseStatus(status);
+            goto fail;
+        }
+        ctx->model_outputnames = av_calloc(1, sizeof(*ctx->model_outputnames));
+        if (!ctx->model_outputnames) {
+            onnx_model->allocator->Free(onnx_model->allocator, name);
+            goto fail;
+        }
+        ctx->model_outputnames[0] = av_strdup(name);
+        onnx_model->allocator->Free(onnx_model->allocator, name);
+        if (!ctx->model_outputnames[0]) {
+            av_freep(&ctx->model_outputnames);
+            goto fail;
+        }
+        ctx->nb_outputs = 1;
+        if (output_count == 1) {
+            av_log(ctx, AV_LOG_INFO, "Auto-detected ONNX output tensor '%s'\n",
+                   ctx->model_outputnames[0]);
+        } else {
+            av_log(ctx, AV_LOG_WARNING,
+                   "ONNX model exposes %zu output tensors; auto-using index 0 ('%s'). "
+                   "Specify output=NAME to choose a different one.\n",
+                   output_count, ctx->model_outputnames[0]);
+        }
+    }
+
+    onnx_model->request_queue = ff_safe_queue_create();
+    if (!onnx_model->request_queue) {
+        goto fail;
+    }
+
+    /* A single request item is created and queued. */
+    item = av_mallocz(sizeof(ONNXRequestItem));
+    if (!item) {
+        goto fail;
+    }
+    item->lltask = NULL;
+    item->infer_request = onnx_create_inference_request();
+    if (!item->infer_request) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to allocate memory for ONNX inference request\n");
+        goto fail;
+    }
+    item->exec_module.start_inference = &onnx_start_inference;
+    item->exec_module.callback = &infer_completion_callback;
+    item->exec_module.args = item;
+
+    if (ff_safe_queue_push_back(onnx_model->request_queue, item) < 0) {
+        goto fail;
+    }
+    /* The queue holds the item now; it must not be destroyed again on failure. */
+    item = NULL;
+
+    onnx_model->task_queue = ff_queue_create();
+    if (!onnx_model->task_queue) {
+        goto fail;
+    }
+
+    onnx_model->lltask_queue = ff_queue_create();
+    if (!onnx_model->lltask_queue) {
+        goto fail;
+    }
+
+    model->get_input  = &get_input_onnx;
+    model->get_output = &get_output_onnx;
+    model->filter_ctx = filter_ctx;
+    model->func_type  = func_type;
+
+    return model;
+
+fail:
+    if (item) {
+        destroy_request_item(&item);
+    }
+    /* NOTE(review): presumably releases everything created so far -- confirm in dnn_free_model_onnx(). */
+    dnn_free_model_onnx(&model);
+    return NULL;
+}
+
+/**
+ * Queue a new task for the given execution parameters and run it.
+ *
+ * The parameters are validated, a task is allocated, filled and appended to
+ * the model's task queue, its last-level task is extracted, and the inference
+ * is then executed with a request taken from the request queue.
+ *
+ * @param model       the ONNX model (an ONNXModel behind the DNNModel pointer)
+ * @param exec_params frames and tensor names to use for this execution
+ * @return 0 on success, an error code otherwise
+ */
+static int dnn_execute_model_onnx(const DNNModel *model, DNNExecBaseParams *exec_params)
+{
+    ONNXModel *onnx_model = (ONNXModel *)model;
+    DnnContext *ctx = onnx_model->ctx;
+    ONNXRequestItem *req;
+    TaskItem *new_task;
+    int err;
+
+    err = ff_check_exec_params(ctx, DNN_ONNX, model->func_type, exec_params);
+    if (err != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Exec parameter checking failed.\n");
+        return err;
+    }
+
+    new_task = av_malloc(sizeof(*new_task));
+    if (new_task == NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to alloc memory for task item.\n");
+        return AVERROR(ENOMEM);
+    }
+
+    err = ff_dnn_fill_task(new_task, exec_params, onnx_model, 0, 1);
+    if (err != 0) {
+        av_freep(&new_task);
+        av_log(ctx, AV_LOG_ERROR, "Unable to fill task.\n");
+        return err;
+    }
+
+    err = ff_queue_push_back(onnx_model->task_queue, new_task);
+    if (err < 0) {
+        av_freep(&new_task);
+        av_log(ctx, AV_LOG_ERROR, "Unable to push back task_queue.\n");
+        return err;
+    }
+
+    /* From here on the task stays in task_queue, including on the error returns below. */
+    err = extract_lltask_from_task(new_task, onnx_model->lltask_queue);
+    if (err != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to extract last level task from task.\n");
+        return err;
+    }
+
+    req = (ONNXRequestItem *)ff_safe_queue_pop_front(onnx_model->request_queue);
+    if (req == NULL) {
+        av_log(ctx, AV_LOG_ERROR, "Unable to get infer request.\n");
+        return AVERROR(EINVAL);
+    }
+
+    return execute_model_onnx(req, onnx_model->lltask_queue);
+}
+
+/**
+ * Fetch the next finished input/output frame pair from the model's task
+ * queue by delegating to the common DNN helper.
+ *
+ * @param model the ONNX model (an ONNXModel behind the DNNModel pointer)
+ * @param in    receives the input frame
+ * @param out   receives the output frame
+ * @return the status reported by ff_dnn_get_result_common()
+ */
+static DNNAsyncStatusType dnn_get_result_onnx(const DNNModel *model, AVFrame **in, AVFrame **out)
+{
+    return ff_dnn_get_result_common(((ONNXModel *)model)->task_queue, in, out);
+}
+
+/**
+ * Execute a pending last-level task, if there is one.
+ *
+ * @param model the ONNX model (an ONNXModel behind the DNNModel pointer)
+ * @return 0 when nothing is pending or the execution succeeded,
+ *         an error code otherwise
+ */
+static int dnn_flush_onnx(const DNNModel *model)
+{
+    ONNXModel *onnx_model = (ONNXModel *)model;
+    ONNXRequestItem *req;
+
+    /* Check first so that no request is taken when there is nothing to run. */
+    if (!ff_queue_size(onnx_model->lltask_queue))
+        return 0;
+
+    req = (ONNXRequestItem *)ff_safe_queue_pop_front(onnx_model->request_queue);
+    if (req == NULL) {
+        av_log(onnx_model->ctx, AV_LOG_ERROR, "Unable to get infer request.\n");
+        return AVERROR(EINVAL);
+    }
+
+    return execute_model_onnx(req, onnx_model->lltask_queue);
+}
+
+const DNNModule ff_dnn_backend_onnx = { /* ONNX Runtime backend entry points, referenced from dnn_backend_info_list */
+    .clazz = DNN_DEFINE_CLASS(dnn_onnx),
+    .type = DNN_ONNX,
+    .load_model = dnn_load_model_onnx, /* creates the ORT session and the request/task queues */
+    .execute_model = dnn_execute_model_onnx, /* queues one task and runs it */
+    .get_result = dnn_get_result_onnx, /* delegates to ff_dnn_get_result_common() */
+    .flush = dnn_flush_onnx, /* runs a pending last-level task, if any */
+    .free_model = dnn_free_model_onnx,
+};
diff --git a/libavfilter/dnn/dnn_interface.c b/libavfilter/dnn/dnn_interface.c
index 7080ab12e4..010677dd81 100644
--- a/libavfilter/dnn/dnn_interface.c
+++ b/libavfilter/dnn/dnn_interface.c
@@ -33,6 +33,9 @@
 extern const DNNModule ff_dnn_backend_openvino;
 extern const DNNModule ff_dnn_backend_tf;
 extern const DNNModule ff_dnn_backend_torch;
+#if CONFIG_LIBONNXRUNTIME
+extern const DNNModule ff_dnn_backend_onnx;
+#endif
 
 #define OFFSET(x) offsetof(DnnContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM
@@ -53,6 +56,8 @@ static const AVOption dnn_base_options[] = {
                 OFFSET(async), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS},
         {"device", "device to run model",
                 OFFSET(device), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, 
FLAGS},
+        {"device_id", "device ID to run model",
+                OFFSET(device_id), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, 
FLAGS},
         {NULL}
 };
 
@@ -78,6 +83,9 @@ static const DnnBackendInfo dnn_backend_info_list[] = {
 #if CONFIG_LIBTORCH
         {offsetof(DnnContext, torch_option), .module = &ff_dnn_backend_torch},
 #endif
+#if CONFIG_LIBONNXRUNTIME
+        {offsetof(DnnContext, onnx_option), .module = &ff_dnn_backend_onnx},
+#endif
 };
 
 const DNNModule *ff_get_dnn_module(DNNBackendType backend_type, void *log_ctx)
diff --git a/libavfilter/dnn_filter_common.c b/libavfilter/dnn_filter_common.c
index 6b9c6f8d7f..73c5e6b33c 100644
--- a/libavfilter/dnn_filter_common.c
+++ b/libavfilter/dnn_filter_common.c
@@ -31,12 +31,15 @@ static char **separate_output_names(const char *expr, const 
char *val_sep, int *
         return NULL;
     }
 
-    parsed_vals = av_calloc(MAX_SUPPORTED_OUTPUTS_NB, sizeof(*parsed_vals));
+    parsed_vals = av_calloc(MAX_SUPPORTED_OUTPUTS_NB + 1, 
sizeof(*parsed_vals));
     if (!parsed_vals) {
         return NULL;
     }
 
     do {
+        if (val_num >= MAX_SUPPORTED_OUTPUTS_NB) {
+            goto err;
+        }
         val = av_get_token(&expr, val_sep);
         if(val) {
             parsed_vals[val_num] = val;
@@ -51,6 +54,12 @@ static char **separate_output_names(const char *expr, const 
char *val_sep, int *
     *separated_nb = val_num;
 
     return parsed_vals;
+
+err:
+    for (int i = 0; i < val_num; i++)
+        av_free(parsed_vals[i]);
+    av_freep(&parsed_vals);
+    return NULL;
 }
 
 typedef struct DnnFilterBase {
@@ -97,6 +106,20 @@ int ff_dnn_init(DnnContext *ctx, DNNFunctionType func_type, 
AVFilterContext *fil
             av_log(filter_ctx, AV_LOG_ERROR, "could not parse model output 
names\n");
             return AVERROR(EINVAL);
         }
+    } else if (backend == DNN_ONNX) {
+        /* ONNX: input and output tensor names are optional.*/
+        if (ctx->model_outputnames_string) {
+            ctx->model_outputnames = 
separate_output_names(ctx->model_outputnames_string, "&", &ctx->nb_outputs);
+            if (!ctx->model_outputnames) {
+                av_log(filter_ctx, AV_LOG_ERROR, "could not parse model output 
names\n");
+                return AVERROR(EINVAL);
+            }
+            if (ctx->nb_outputs != 1) {
+                av_log(filter_ctx, AV_LOG_ERROR,
+                       "ONNX backend supports a single output name only\n");
+                return AVERROR(EINVAL);
+            }
+        }
     }
 
     ctx->dnn_module = ff_get_dnn_module(ctx->backend_type, filter_ctx);
diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
index 66086409be..69a8b0a669 100644
--- a/libavfilter/dnn_interface.h
+++ b/libavfilter/dnn_interface.h
@@ -35,7 +35,8 @@
 typedef enum {
     DNN_TF = 1,
     DNN_OV = 1 << 1,
-    DNN_TH = 1 << 2
+    DNN_TH = 1 << 2,
+    DNN_ONNX = 1 << 3
 } DNNBackendType;
 
 typedef enum {DNN_FLOAT = 1, DNN_UINT8 = 4} DNNDataType;
@@ -138,6 +139,13 @@ typedef struct THOptions {
     int optimize;
 } THOptions;
 
+#if CONFIG_LIBONNXRUNTIME
+typedef struct ONNXOptions { /* options specific to the ONNX Runtime backend */
+    const AVClass *clazz;
+    int num_threads; /* passed to SetIntraOpNumThreads when > 0 and the device is unset or "cpu" */
+} ONNXOptions;
+#endif
+
 typedef struct DNNModule DNNModule;
 
 typedef struct DnnContext {
@@ -158,6 +166,7 @@ typedef struct DnnContext {
 
     int nireq;
     char *device;
+    int device_id;
 
 #if CONFIG_LIBTENSORFLOW
     TFOptions tf_option;
@@ -169,6 +178,9 @@ typedef struct DnnContext {
 #if CONFIG_LIBTORCH
     THOptions torch_option;
 #endif
+#if CONFIG_LIBONNXRUNTIME
+    ONNXOptions onnx_option;
+#endif
 } DnnContext;
 
 // Stores pointers to functions for loading, executing, freeing DNN models for 
one of the backends.
diff --git a/libavfilter/vf_dnn_processing.c b/libavfilter/vf_dnn_processing.c
index 0771ceb5fc..7ffa700cc5 100644
--- a/libavfilter/vf_dnn_processing.c
+++ b/libavfilter/vf_dnn_processing.c
@@ -52,11 +52,14 @@ static const AVOption dnn_processing_options[] = {
 #endif
 #if (CONFIG_LIBTORCH == 1)
     { "torch",       "torch backend flag",         0,                        
AV_OPT_TYPE_CONST,     { .i64 = DNN_TH },    0, 0, FLAGS, "backend" },
+#endif
+#if (CONFIG_LIBONNXRUNTIME == 1)
+    { "onnx",        "onnx backend flag",          0,                        
AV_OPT_TYPE_CONST,     { .i64 = DNN_ONNX },  0, 0, FLAGS, "backend" },
 #endif
     { NULL }
 };
 
-AVFILTER_DNN_DEFINE_CLASS(dnn_processing, DNN_TF | DNN_OV | DNN_TH);
+AVFILTER_DNN_DEFINE_CLASS(dnn_processing, DNN_TF | DNN_OV | DNN_TH | DNN_ONNX);
 
 static av_cold int init(AVFilterContext *context)
 {

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] avfilter/dnn: add ONNX Runtime backend with GPU execution provider support (branch master)

Reply via email to