Hi Vittorio On Thu, Jul 10, 2025 at 12:25:41PM +0200, Vittorio Palmisano wrote: > It adds a new audio filter for running audio transcriptions with the whisper > model. > Documentation and examples are included into the patch. > > Signed-off-by: Vittorio Palmisano <[email protected]> > --- > configure | 5 + > doc/filters.texi | 101 ++++++++ > libavfilter/Makefile | 2 + > libavfilter/af_whisper.c | 488 +++++++++++++++++++++++++++++++++++++++ > libavfilter/allfilters.c | 2 + > 5 files changed, 598 insertions(+) > create mode 100644 libavfilter/af_whisper.c > [...] > diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c > new file mode 100644 > index 0000000000..81d90a77d7 > --- /dev/null > +++ b/libavfilter/af_whisper.c > @@ -0,0 +1,488 @@ > +/* > + * Copyright (c) 2025 Vittorio Palmisano > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public License > + * as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public License > + * along with FFmpeg; if not, write to the Free Software Foundation, Inc., > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include <stdio.h> > +#include <stdint.h> > +#include <stdlib.h> > + > +#include "libavutil/avutil.h" > +#include "libavutil/opt.h" > +#include "libavutil/channel_layout.h" > +#include "libavutil/samplefmt.h" > +#include "libavfilter/avfilter.h" > +#include "libavfilter/audio.h" > +#include "libavutil/mem.h" > +#include "libavutil/avstring.h" > +#include "libavutil/internal.h" > +#include "libavformat/avio.h" > +#include "libavutil/thread.h" > + > +#include "formats.h" > +
> +#include "whisper.h"
I presume that's meant to be #include <whisper.h> or something similar.
> +
> +typedef struct WhisperContext {
> + const AVClass *class;
> + char *model_path;
> + char *language;
> + bool use_gpu;
> + int gpu_device;
> + char *vad_model_path;
> + float vad_threshold;
> + int vad_min_speech_duration;
> + int vad_min_silence_duration;
> +
> + int queue;
> + char *destination;
> + char *format;
> +
> + struct whisper_context *ctx_wsp;
> + struct whisper_vad_context *ctx_vad;
> + struct whisper_vad_params vad_params;
> +
> + float *audio_buffer;
> + int audio_buffer_queue_size;
> + int audio_buffer_fill_size;
> + int audio_buffer_vad_size;
> +
> + int eof;
> + int64_t next_pts;
> +
> + AVIOContext *avio_context;
> + int index;
> + int64_t timestamp;
> +} WhisperContext;
> +
> +static void cb_log_disable(enum ggml_log_level level, const char *text, void
> *user_data)
> +{
> +}
This should probably be forwarded to av_log().
> +
> +static int init(AVFilterContext *ctx)
> +{
> + WhisperContext *wctx = ctx->priv;
> +
> + ggml_backend_load_all();
Is this thread safe?
> + whisper_log_set(cb_log_disable, NULL);
This is not thread safe; it's directly changing global state:
void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
g_state.log_callback = log_callback ? log_callback :
whisper_log_callback_default;
g_state.log_callback_user_data = user_data;
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
}
Not a bug in this patch of course, but in whisper itself.
Until whisper is actually thread safe, this should be wrapped with
ff_thread_once(), though that only protects af_whisper from itself, not
from any other potential callers.
[...]
[...]
> + wctx->timestamp += (int64_t) (duration * 1000);
That cast looks weirdly placed.
> +
> + if (metadata && segments_text) {
> + av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
> + char *duration_text = av_asprintf("%f", duration);
> + av_dict_set(metadata, "lavfi.whisper.duration", duration_text, 0);
> + av_freep(&duration_text);
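Use AV_DICT_DONT_STRDUP_VAL here, so that av_dict_set() takes ownership of duration_text instead of duplicating it (the av_freep() then goes away).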
> + }
> + av_freep(&segments_text);
> +
> + memcpy(wctx->audio_buffer, wctx->audio_buffer + end_pos,
> + end_pos * sizeof(float));
sizeof(*wctx->audio_buffer) is more robust than sizeof(float).
[...]
> +#define OFFSET(x) offsetof(WhisperContext, x)
> +#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
> +
> +static const AVOption whisper_options[] = {
> + { "model", "Path to the whisper.cpp model file", OFFSET(model_path),
> + AV_OPT_TYPE_STRING,.flags = FLAGS },
> + { "language", "Language for transcription ('auto' for auto-detect)",
> + OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"},.flags =
> + FLAGS },
> + { "queue", "Audio queue size in milliseconds", OFFSET(queue),
> + AV_OPT_TYPE_INT, {.i64 = 3000}, 20, INT_MAX,.flags = FLAGS },
> + { "use_gpu", "Use GPU for processing", OFFSET(use_gpu),
> + AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1,.flags = FLAGS },
> + { "gpu_device", "GPU device to use", OFFSET(gpu_device),
> + AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX,.flags = FLAGS },
> + { "destination", "Output destination", OFFSET(destination),
> + AV_OPT_TYPE_STRING, {.str = ""},.flags = FLAGS },
> + { "format", "Output format (text|srt|json)", OFFSET(format),
> + AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
> + { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path),
> + AV_OPT_TYPE_STRING,.flags = FLAGS },
> + { "vad_threshold", "VAD threshold", OFFSET(vad_threshold),
> + AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0,.flags = FLAGS },
> + { "vad_min_speech_duration",
> + "Minimum speech duration in milliseconds for VAD",
> + OFFSET(vad_min_speech_duration), AV_OPT_TYPE_INT, {.i64 = 50}, 20,
> + INT_MAX,.flags = FLAGS },
> + { "vad_min_silence_duration",
> + "Minimum silence duration in milliseconds for VAD",
> + OFFSET(vad_min_silence_duration), AV_OPT_TYPE_INT, {.i64 = 500}, 0,
> + INT_MAX,.flags = FLAGS },
> + { NULL }
Not sure what others think of this, but I would ignore the 80 char limit and
format this like:
static const AVOption whisper_options[] = {
{ "model" , "Path to the whisper.cpp model file" ,
OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
{ "language", "Language for transcription ('auto' for auto-detect)",
OFFSET(language) , AV_OPT_TYPE_STRING, {.str = "auto"}, .flags =
FLAGS },
{ "queue" , "Audio queue size in milliseconds" ,
OFFSET(queue) , AV_OPT_TYPE_INT , {.i64 = 3000 }, 20, INT_MAX,.flags =
FLAGS },
{ "use_gpu" , "Use GPU for processing" ,
OFFSET(use_gpu) , AV_OPT_TYPE_BOOL , {.i64 = 1 }, 0 , 1 ,.flags =
FLAGS },
....
or:
static const AVOption whisper_options[] = {
{ "model" , "Path to the whisper.cpp model file" ,
OFFSET(model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
{ "language", "Language for transcription ('auto' for auto-detect)",
OFFSET(language) , AV_OPT_TYPE_STRING, {.str = "auto"},
.flags = FLAGS },
{ "queue" , "Audio queue size in milliseconds" ,
OFFSET(queue) , AV_OPT_TYPE_INT , {.i64 = 3000 }, 20,
INT_MAX,.flags = FLAGS },
{ "use_gpu" , "Use GPU for processing" ,
OFFSET(use_gpu) , AV_OPT_TYPE_BOOL , {.i64 = 1 }, 0 , 1
,.flags = FLAGS },
....
Also, it seems this is a lot slower than whisper-cli:
time whisper-cli matrix.wav -m ~/whisper.cpp/models/ggml-base.en.bin
--output-srt
real 0m16,283s
user 1m3,644s
sys 0m0,581s
time ./ffmpeg -v 99 -i matrix.wav -af
"aformat=sample_rates=16000:channel_layouts=mono,whisper=model=/home/michael/whisper.cpp/models/ggml-base.en.bin:language=en:queue=3000:destination=output.srt:format=srt"
-f null - 2> /tmp/log
real 1m30,827s
user 6m0,590s
sys 0m0,756s
And it's af_whisper, not the other processing:
time ./ffmpeg -v 99 -i matrix.wav -af
"aformat=sample_rates=16000:channel_layouts=mono" -f null - 2> /tmp/nolog
real 0m0,151s
user 0m0,185s
sys 0m0,048s
Also, the SRT output is different:
whisper-cli:
00:00:17,500 --> 00:00:22,000
Would you please remove any metallic items you're carrying, keys, boost change?
3
00:00:22,000 --> 00:00:24,000
[Music]
4
00:00:24,000 --> 00:00:26,000
Holy shit!
5
00:00:26,000 --> 00:00:37,000
[Music]
6
00:00:37,000 --> 00:00:38,000
Back up!
7
00:00:38,000 --> 00:00:39,000
Stand back up!
vs.
af_whisper
6
00:00:17.915 --> 00:00:20.815
Please remove any metallic items you're carrying, keys.
7
00:00:20.901 --> 00:00:21.741
boost change.
8
00:00:23.887 --> 00:00:25.887
Holy shit.
10
00:00:29.859 --> 00:00:32.119
(explosion)
11
00:00:32.845 --> 00:00:35.105
(explosion)
12
00:00:35.831 --> 00:00:37.831
Back on!
12
00:00:37.831 --> 00:00:38.831
Stand back on!
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Frequently ignored answer#1 FFmpeg bugs should be sent to our bugtracker. User
questions about the command line tools should be sent to the ffmpeg-user ML.
And questions about how to use libav* should be sent to the libav-user ML.
signature.asc
Description: PGP signature
_______________________________________________ ffmpeg-devel mailing list [email protected] https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email [email protected] with subject "unsubscribe".
