[FFmpeg-devel] [PR] avfilter/dnn: implement persistent input buffer for torch backend (PR #23169)

Raja-89 via ffmpeg-devel Wed, 20 May 2026 02:43:24 -0700

PR #23169 opened by Raja-89
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23169
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23169.patch


Replace the per-frame av_malloc/av_free pattern with a persistent
buffer in THInferRequest that grows lazily on resolution increases
but is reused for every subsequent frame of the same or smaller size.

Key changes:
- Add input_data/input_data_size fields to THInferRequest to hold the
  persistent pixel buffer across frames
- Add persistent_buf_deleter() no-op deleter: memory is owned by
  THInferRequest, not the LibTorch tensor. This same ownership pattern
  will be reused for zero-copy CUDA tensors in a follow-up commit.
- Update th_create_inference_request() to zero-initialise the new fields
- Update th_free_request() to release the persistent buffer on teardown
- Add AV_PIX_FMT_CUDA detection with a clear ENOSYS error as a hook
  point for the zero-copy GPU path (follow-up commit)
- Fix pre-existing SIGSEGV: parameters().begin() was unconditionally
  dereferenced in th_start_inference() even when the model has no
  learnable parameters. Parameterless TorchScript models now default
  to the CPU device instead of crashing.

The lazy reallocation logic also lays the groundwork for dynamic-shape
handling (Phase 3 of the GSoC project).

Tested with:
  ./ffmpeg -f lavfi -i testsrc=duration=5:size=640x480:rate=25 \
    -vf format=rgb24,dnn_processing=dnn_backend=torch:model=dummy_model.pt \
    -vcodec rawvideo -f null /dev/null
  (125 frames @ 16.2x speed, exit 0, sync and async modes)

Signed-off-by: Raja Rathour <[email protected]>


From af2d996fc608e352e7b1a30fd5ae7f4c5b387d23 Mon Sep 17 00:00:00 2001
From: Raja Rathour <[email protected]>
Date: Wed, 20 May 2026 14:53:27 +0530
Subject: [PATCH] avfilter/dnn: implement persistent input buffer for torch
 backend

Replace the per-frame av_malloc/av_free pattern with a persistent
buffer in THInferRequest that grows lazily on resolution increases
but is reused for every subsequent frame of the same or smaller size.

Key changes:
- Add input_data/input_data_size fields to THInferRequest to hold the
  persistent pixel buffer across frames
- Add persistent_buf_deleter() no-op deleter: memory is owned by
  THInferRequest, not the LibTorch tensor. This same ownership pattern
  will be reused for zero-copy CUDA tensors in a follow-up commit.
- Update th_create_inference_request() to zero-initialise the new fields
- Update th_free_request() to release the persistent buffer on teardown
- Add AV_PIX_FMT_CUDA detection with a clear ENOSYS error as a hook
  point for the zero-copy GPU path (follow-up commit)
- Fix pre-existing SIGSEGV: parameters().begin() was unconditionally
  dereferenced in th_start_inference() even when the model has no
  learnable parameters. Parameterless TorchScript models now default
  to the CPU device instead of crashing.

The lazy reallocation logic also lays the groundwork for dynamic-shape
handling (Phase 3 of the GSoC project).

Tested with:
  ./ffmpeg -f lavfi -i testsrc=duration=5:size=640x480:rate=25 \
    -vf format=rgb24,dnn_processing=dnn_backend=torch:model=dummy_model.pt \
    -vcodec rawvideo -f null /dev/null
  (125 frames @ 16.2x speed, exit 0, sync and async modes)

Signed-off-by: Raja Rathour <[email protected]>
---
 libavfilter/dnn/dnn_backend_torch.cpp | 61 ++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_torch.cpp b/libavfilter/dnn/dnn_backend_torch.cpp
index 24a202f493..e1f972510b 100644
--- a/libavfilter/dnn/dnn_backend_torch.cpp
+++ b/libavfilter/dnn/dnn_backend_torch.cpp
@@ -31,6 +31,7 @@ extern "C" {
 #include "dnn_backend_common.h"
 #include "libavutil/opt.h"
 #include "libavutil/mem.h"
+#include "libavutil/pixfmt.h"
 #include "queue.h"
 #include "safe_queue.h"
 }
@@ -47,6 +48,8 @@ typedef struct THModel {
 typedef struct THInferRequest {
     torch::Tensor *output;
     torch::Tensor *input_tensor;
+    uint8_t *input_data;      ///< Persistent buffer for input pixels
+    size_t   input_data_size; ///< Current allocated size of input_data
 } THInferRequest;
 
 typedef struct THRequestItem {
@@ -95,6 +98,10 @@ static void th_free_request(THInferRequest *request)
         delete(request->input_tensor);
         request->input_tensor = NULL;
     }
+    if (request->input_data) {
+        av_freep(&request->input_data);
+        request->input_data_size = 0;
+    }
     return;
 }
 
@@ -152,9 +159,9 @@ static int get_input_th(DNNModel *model, DNNData *input, const char *input_name)
     return 0;
 }
 
-static void deleter(void *arg)
+static void persistent_buf_deleter(void *arg)
 {
-    av_freep(&arg);
+    (void)arg;
 }
 
 static int fill_model_input_th(THModel *th_model, THRequestItem *request)
@@ -165,6 +172,7 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
     DNNData input = { 0 };
     DnnContext *ctx = th_model->ctx;
     int ret, width_idx, height_idx, channel_idx;
+    size_t required_size;
 
     lltask = (LastLevelTaskItem *)ff_queue_pop_front(th_model->lltask_queue);
     if (!lltask) {
@@ -175,19 +183,38 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
     task = lltask->task;
     infer_request = request->infer_request;
 
-    ret = get_input_th(&th_model->model, &input, NULL);
-    if ( ret != 0) {
+    if (task->in_frame->format == AV_PIX_FMT_CUDA) {
+        av_log(ctx, AV_LOG_ERROR,
+               "CUDA frame input is not yet supported. "
+               "Use the 'format=rgb24' filter before dnn_processing.\n");
+        ret = AVERROR(ENOSYS);
         goto err;
     }
-    width_idx = dnn_get_width_idx_by_layout(input.layout);
+
+    ret = get_input_th(&th_model->model, &input, NULL);
+    if (ret != 0) {
+        goto err;
+    }
+    width_idx  = dnn_get_width_idx_by_layout(input.layout);
     height_idx = dnn_get_height_idx_by_layout(input.layout);
     channel_idx = dnn_get_channel_idx_by_layout(input.layout);
     input.dims[height_idx] = task->in_frame->height;
-    input.dims[width_idx] = task->in_frame->width;
-    input.data = av_malloc(input.dims[height_idx] * input.dims[width_idx] *
-                           input.dims[channel_idx] * sizeof(float));
-    if (!input.data)
-        return AVERROR(ENOMEM);
+    input.dims[width_idx]  = task->in_frame->width;
+
+    required_size = (size_t)input.dims[height_idx] * input.dims[width_idx] *
+                    input.dims[channel_idx] * sizeof(float);
+
+    if (infer_request->input_data_size < required_size) {
+        av_freep(&infer_request->input_data);
+        infer_request->input_data = (uint8_t *)av_malloc(required_size);
+        if (!infer_request->input_data) {
+            infer_request->input_data_size = 0;
+            return AVERROR(ENOMEM);
+        }
+        infer_request->input_data_size = required_size;
+    }
+    input.data = infer_request->input_data;
+
     infer_request->input_tensor = new torch::Tensor();
     infer_request->output = new torch::Tensor();
 
@@ -208,7 +235,7 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
     }
     *infer_request->input_tensor = torch::from_blob(input.data,
         {1, input.dims[channel_idx], input.dims[height_idx], input.dims[width_idx]},
-        deleter, torch::kFloat32);
+        persistent_buf_deleter, torch::kFloat32);
     return 0;
 
 err:
@@ -246,8 +273,10 @@ static int th_start_inference(void *args)
         av_log(ctx, AV_LOG_ERROR, "input or output tensor is NULL\n");
         return DNN_GENERIC_ERROR;
     }
-    // Transfer tensor to the same device as model
-    c10::Device device = (*th_model->jit_model->parameters().begin()).device();
+    auto params = th_model->jit_model->parameters();
+    c10::Device device(torch::kCPU);
+    if (params.begin() != params.end())
+        device = (*params.begin()).device();
     if (infer_request->input_tensor->device() != device)
         *infer_request->input_tensor = infer_request->input_tensor->to(device);
     inputs.push_back(*infer_request->input_tensor);
@@ -410,8 +439,10 @@ static THInferRequest *th_create_inference_request(void)
     if (!request) {
         return NULL;
     }
-    request->input_tensor = NULL;
-    request->output = NULL;
+    request->input_tensor    = NULL;
+    request->output          = NULL;
+    request->input_data      = NULL;
+    request->input_data_size = 0;
     return request;
 }
 
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avfilter/dnn: implement persistent input buffer for torch backend (PR #23169)

Reply via email to