PR #21259 opened by WyattBlue. URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21259 — Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21259.patch
This closes #20333 >From d387c186321ab5e8ebff92521c178dbd90475388 Mon Sep 17 00:00:00 2001 From: WyattBlue <[email protected]> Date: Sun, 21 Dec 2025 23:51:15 -0500 Subject: [PATCH] avfilter/af_whisper: Add max_len parameter --- libavfilter/af_whisper.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c index 3c0eba42f0..7e1b27e21b 100644 --- a/libavfilter/af_whisper.c +++ b/libavfilter/af_whisper.c @@ -52,6 +52,7 @@ typedef struct WhisperContext { int64_t queue; char *destination; char *format; + int max_len; struct whisper_context *ctx_wsp; struct whisper_vad_context *ctx_vad; @@ -204,6 +205,8 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples) params.print_progress = 0; params.print_realtime = 0; params.print_timestamps = 0; + params.max_len = wctx->max_len; + params.token_timestamps = (wctx->max_len > 0); if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) { av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n"); @@ -224,6 +227,14 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples) continue; } + // Skip segments that are parts of [BLANK_AUDIO] when max_len splits them + if (wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 || + strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 || + strcmp(text_cleaned, "AUDIO") == 0)) { + av_freep(&text_cleaned); + continue; + } + const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i); const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10; const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10; @@ -437,6 +448,7 @@ static const AVOption whisper_options[] = { { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS }, { "destination", "Output destination", OFFSET(destination), 
AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS }, { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS }, + { "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS }, { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS }, { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS }, { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS }, -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
