PR #23169 opened by Raja-89
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23169
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23169.patch
Replace the per-frame av_malloc/av_free pattern with a persistent
buffer in THInferRequest that grows lazily on resolution increases
but is reused for every subsequent frame of the same or smaller size.
Key changes:
- Add input_data/input_data_size fields to THInferRequest to hold the
persistent pixel buffer across frames
- Add persistent_buf_deleter() no-op deleter: memory is owned by
THInferRequest, not the LibTorch tensor. This same ownership pattern
will be reused for zero-copy CUDA tensors in a follow-up commit.
- Update th_create_inference_request() to zero-initialise the new fields
- Update th_free_request() to release the persistent buffer on teardown
- Add AV_PIX_FMT_CUDA detection with a clear ENOSYS error as a hook
point for the zero-copy GPU path (follow-up commit)
- Fix pre-existing SIGSEGV: parameters().begin() was unconditionally
dereferenced in th_start_inference() even when the model has no
learnable parameters. Parameterless TorchScript models now default
to the CPU device instead of crashing.
The lazy reallocation logic also lays the groundwork for dynamic-shape
handling (Phase 3 of the GSoC project).
Tested with:
./ffmpeg -f lavfi -i testsrc=duration=5:size=640x480:rate=25 \
-vf format=rgb24,dnn_processing=dnn_backend=torch:model=dummy_model.pt \
-vcodec rawvideo -f null /dev/null
(125 frames @ 16.2x speed, exit 0, sync and async modes)
Signed-off-by: Raja Rathour <[email protected]>
From af2d996fc608e352e7b1a30fd5ae7f4c5b387d23 Mon Sep 17 00:00:00 2001
From: Raja Rathour <[email protected]>
Date: Wed, 20 May 2026 14:53:27 +0530
Subject: [PATCH] avfilter/dnn: implement persistent input buffer for torch
backend
Replace the per-frame av_malloc/av_free pattern with a persistent
buffer in THInferRequest that grows lazily on resolution increases
but is reused for every subsequent frame of the same or smaller size.
Key changes:
- Add input_data/input_data_size fields to THInferRequest to hold the
persistent pixel buffer across frames
- Add persistent_buf_deleter() no-op deleter: memory is owned by
THInferRequest, not the LibTorch tensor. This same ownership pattern
will be reused for zero-copy CUDA tensors in a follow-up commit.
- Update th_create_inference_request() to zero-initialise the new fields
- Update th_free_request() to release the persistent buffer on teardown
- Add AV_PIX_FMT_CUDA detection with a clear ENOSYS error as a hook
point for the zero-copy GPU path (follow-up commit)
- Fix pre-existing SIGSEGV: parameters().begin() was unconditionally
dereferenced in th_start_inference() even when the model has no
learnable parameters. Parameterless TorchScript models now default
to the CPU device instead of crashing.
The lazy reallocation logic also lays the groundwork for dynamic-shape
handling (Phase 3 of the GSoC project).
Tested with:
./ffmpeg -f lavfi -i testsrc=duration=5:size=640x480:rate=25 \
-vf format=rgb24,dnn_processing=dnn_backend=torch:model=dummy_model.pt \
-vcodec rawvideo -f null /dev/null
(125 frames @ 16.2x speed, exit 0, sync and async modes)
Signed-off-by: Raja Rathour <[email protected]>
---
libavfilter/dnn/dnn_backend_torch.cpp | 61 ++++++++++++++++++++-------
1 file changed, 46 insertions(+), 15 deletions(-)
diff --git a/libavfilter/dnn/dnn_backend_torch.cpp b/libavfilter/dnn/dnn_backend_torch.cpp
index 24a202f493..e1f972510b 100644
--- a/libavfilter/dnn/dnn_backend_torch.cpp
+++ b/libavfilter/dnn/dnn_backend_torch.cpp
@@ -31,6 +31,7 @@ extern "C" {
#include "dnn_backend_common.h"
#include "libavutil/opt.h"
#include "libavutil/mem.h"
+#include "libavutil/pixfmt.h"
#include "queue.h"
#include "safe_queue.h"
}
@@ -47,6 +48,8 @@ typedef struct THModel {
typedef struct THInferRequest {
torch::Tensor *output;
torch::Tensor *input_tensor;
+ uint8_t *input_data; ///< Persistent float32 input buffer wrapped by input_tensor; owned by this request (released in th_free_request)
+ size_t input_data_size; ///< Allocated size of input_data in bytes (0 when input_data is NULL)
} THInferRequest;
typedef struct THRequestItem {
@@ -95,6 +98,10 @@ static void th_free_request(THInferRequest *request)
delete(request->input_tensor);
request->input_tensor = NULL;
}
+ if (request->input_data) {
+ av_freep(&request->input_data);
+ request->input_data_size = 0;
+ }
return;
}
@@ -152,9 +159,9 @@ static int get_input_th(DNNModel *model, DNNData *input, const char *input_name)
return 0;
}
-static void deleter(void *arg)
+static void persistent_buf_deleter(void *arg)
{
- av_freep(&arg);
+ (void)arg; /* intentionally a no-op: the blob memory is owned by THInferRequest, not by the tensor */
}
static int fill_model_input_th(THModel *th_model, THRequestItem *request)
@@ -165,6 +172,7 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
DNNData input = { 0 };
DnnContext *ctx = th_model->ctx;
int ret, width_idx, height_idx, channel_idx;
+ size_t required_size;
lltask = (LastLevelTaskItem *)ff_queue_pop_front(th_model->lltask_queue);
if (!lltask) {
@@ -175,19 +183,38 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
task = lltask->task;
infer_request = request->infer_request;
- ret = get_input_th(&th_model->model, &input, NULL);
- if ( ret != 0) {
+ if (task->in_frame->format == AV_PIX_FMT_CUDA) {
+ av_log(ctx, AV_LOG_ERROR,
+ "CUDA frame input is not yet supported. "
+ "Use the 'format=rgb24' filter before dnn_processing.\n");
+ ret = AVERROR(ENOSYS);
goto err;
}
- width_idx = dnn_get_width_idx_by_layout(input.layout);
+
+ ret = get_input_th(&th_model->model, &input, NULL);
+ if (ret != 0) {
+ goto err;
+ }
+ width_idx = dnn_get_width_idx_by_layout(input.layout);
height_idx = dnn_get_height_idx_by_layout(input.layout);
channel_idx = dnn_get_channel_idx_by_layout(input.layout);
input.dims[height_idx] = task->in_frame->height;
- input.dims[width_idx] = task->in_frame->width;
- input.data = av_malloc(input.dims[height_idx] * input.dims[width_idx] *
- input.dims[channel_idx] * sizeof(float));
- if (!input.data)
- return AVERROR(ENOMEM);
+ input.dims[width_idx] = task->in_frame->width;
+
+ required_size = (size_t)input.dims[height_idx] * input.dims[width_idx] *
+ input.dims[channel_idx] * sizeof(float);
+
+ if (infer_request->input_data_size < required_size) {
+ av_freep(&infer_request->input_data);
+ infer_request->input_data = (uint8_t *)av_malloc(required_size);
+ if (!infer_request->input_data) {
+ infer_request->input_data_size = 0;
+ return AVERROR(ENOMEM);
+ }
+ infer_request->input_data_size = required_size;
+ }
+ input.data = infer_request->input_data;
+
infer_request->input_tensor = new torch::Tensor();
infer_request->output = new torch::Tensor();
@@ -208,7 +235,7 @@ static int fill_model_input_th(THModel *th_model, THRequestItem *request)
}
*infer_request->input_tensor = torch::from_blob(input.data,
{1, input.dims[channel_idx], input.dims[height_idx],
input.dims[width_idx]},
- deleter, torch::kFloat32);
+ persistent_buf_deleter, torch::kFloat32);
return 0;
err:
@@ -246,8 +273,10 @@ static int th_start_inference(void *args)
av_log(ctx, AV_LOG_ERROR, "input or output tensor is NULL\n");
return DNN_GENERIC_ERROR;
}
- // Transfer tensor to the same device as model
- c10::Device device = (*th_model->jit_model->parameters().begin()).device();
+ auto params = th_model->jit_model->parameters();
+ c10::Device device(torch::kCPU);
+ if (params.begin() != params.end())
+ device = (*params.begin()).device();
if (infer_request->input_tensor->device() != device)
*infer_request->input_tensor = infer_request->input_tensor->to(device);
inputs.push_back(*infer_request->input_tensor);
@@ -410,8 +439,10 @@ static THInferRequest *th_create_inference_request(void)
if (!request) {
return NULL;
}
- request->input_tensor = NULL;
- request->output = NULL;
+ request->input_tensor = NULL;
+ request->output = NULL;
+ request->input_data = NULL;
+ request->input_data_size = 0;
return request;
}
--
2.52.0
_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]