PR #20644 opened by Y0SH1M4S73R
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20644
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20644.patch

See commit messages for details.


>From e4590f078e26dcee8ff8ff3c48fd0d50a9f231fa Mon Sep 17 00:00:00 2001
From: Y0SH1M4S73R <[email protected]>
Date: Sat, 4 Oct 2025 01:10:54 -0400
Subject: [PATCH 1/2] avfilter/flite: add .flitevox file support

libflite allows users to load voices from .flitevox files. The flite filter can 
now load a voice from such a file using the voicefile option.

The global initialization logic for the flite filter has been updated to 
register language and lexicon initialization functions for the "eng" and 
"usenglish" language strings. This is necessary to allow .flitevox files to be 
loaded correctly.

Due to the relatively large size of loaded voices (~15 MB), I have chosen to 
cache them in `voice_entry` structs contained in a static array, looked up by 
voice file path. However, this implementation does not detect the voice file at 
a given path being changed while a voice previously loaded from that path is 
still in use; a later request for the same path reuses the already-cached 
voice. As such, changes to this implementation, and whether loaded voices 
should even be cached in the first place, should be discussed.
---
 libavfilter/asrc_flite.c | 141 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 136 insertions(+), 5 deletions(-)

diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c
index 33576beade..6f7ef467cd 100644
--- a/libavfilter/asrc_flite.c
+++ b/libavfilter/asrc_flite.c
@@ -31,6 +31,7 @@
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/thread.h"
+#include "libavutil/tree.h"
 #include "avfilter.h"
 #include "filters.h"
 #include "audio.h"
@@ -39,6 +40,7 @@
 typedef struct FliteContext {
     const AVClass *class;
     char *voice_str;
+    char *voice_file;
     char *textfile;
     char *text;
     char *text_p;
@@ -65,6 +67,7 @@ static const AVOption flite_options[] = {
     { "textfile",    "set filename of the text to speak", OFFSET(textfile),  
AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
     { "v",           "set voice",                         OFFSET(voice_str), 
AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
     { "voice",       "set voice",                         OFFSET(voice_str), 
AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
+    { "voicefile",   "set flitevox voice file",           OFFSET(voice_file), 
AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS},
     { NULL }
 };
 
@@ -74,7 +77,7 @@ static AVMutex flite_mutex = AV_MUTEX_INITIALIZER;
 
 static int flite_inited = 0;
 
-/* declare functions for all the supported voices */
+/* declare functions for all the built-in voices */
 #define DECLARE_REGISTER_VOICE_FN(name) \
     cst_voice *register_cmu_us_## name(const char *); \
     void     unregister_cmu_us_## name(cst_voice *)
@@ -84,8 +87,12 @@ DECLARE_REGISTER_VOICE_FN(kal16);
 DECLARE_REGISTER_VOICE_FN(rms);
 DECLARE_REGISTER_VOICE_FN(slt);
 
+void usenglish_init(cst_voice *);
+cst_lexicon *cmulex_init(void);
+
 struct voice_entry {
     const char *name;
+    char *voice_file;
     cst_voice * (*register_fn)(const char *);
     void (*unregister_fn)(cst_voice *);
     cst_voice *voice;
@@ -145,6 +152,106 @@ static int select_voice(struct voice_entry **entry_ret, 
const char *voice_name,
     return AVERROR(EINVAL);
 }
 
+static int loaded_voice_entry_count = 0;
+static int loaded_voice_entry_capacity = 0;
+static struct voice_entry **loaded_voice_entries = NULL;
+
+static int add_loaded_entry(struct voice_entry *entry) {
+    if (loaded_voice_entry_count == loaded_voice_entry_capacity) {
+        if(av_dynarray_add_nofree(&loaded_voice_entries, 
&loaded_voice_entry_capacity, entry) < 0)
+            return AVERROR(ENOMEM);
+        loaded_voice_entry_count++;
+    } else {
+        for (int i = 0; i < loaded_voice_entry_capacity; i++) {
+            if(!loaded_voice_entries[i]) {
+                loaded_voice_entries[i] = entry;
+                loaded_voice_entry_count++;
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+static struct voice_entry *get_loaded_entry(void *key, int (predicate)(const 
void *key, const struct voice_entry *entry)) {
+    for (int i = 0; i < loaded_voice_entry_capacity; i++) {
+        struct voice_entry *entry = loaded_voice_entries[i];
+        if (entry && predicate(key, entry))
+            return entry;
+    }
+    return NULL;
+}
+
+static int path_predicate(const void *key, const struct voice_entry *entry) {
+    return !strcmp((const char *)key, entry->voice_file);
+}
+
+static int voice_predicate(const void *key, const struct voice_entry *entry) {
+    return (const cst_voice *)key == entry->voice;
+}
+
+static void remove_loaded_entry(const struct voice_entry *removing_entry) {
+    for (int i = 0; i < loaded_voice_entry_capacity; i++) {
+        const struct voice_entry *entry = loaded_voice_entries[i];
+        if (entry == removing_entry) {
+            loaded_voice_entries[i] = NULL;
+            loaded_voice_entry_count--;
+        }
+    }
+}
+
+static void unregister_loaded_voice(cst_voice *voice) {
+    struct voice_entry *entry = get_loaded_entry(voice, voice_predicate);
+    if (!entry) {
+        av_log(NULL, AV_LOG_ERROR, "unregister_loaded_voice failed: no voice 
entry\n");
+        return;
+    }
+    if (entry->voice != voice) {
+        av_log(NULL, AV_LOG_ERROR, "unregister_loaded_voice failed: voice 
mismatch\n");
+        return;
+    }
+    remove_loaded_entry(entry);
+    delete_voice(entry->voice);
+    av_free((char *)entry->name);
+    av_free(entry->voice_file);
+    av_free(entry);
+    return;
+}
+
+static int load_voice(struct voice_entry **entry_ret, char *voice_path, void 
*log_ctx) {
+    pthread_mutex_lock(&flite_mutex);
+    struct voice_entry *entry = get_loaded_entry(voice_path, path_predicate);
+    if (!entry) {
+        cst_voice *voice;
+        if (!(voice = flite_voice_load(voice_path))) {
+            pthread_mutex_unlock(&flite_mutex);
+            av_log(log_ctx, AV_LOG_ERROR, "the voice file '%s' can not be 
read\n", voice_path);
+            return AVERROR_EXTERNAL;
+        }
+
+        entry = av_mallocz(sizeof(struct voice_entry));
+        entry->name = av_strdup(voice->name);
+        entry->voice_file = av_strdup(voice_path);
+        entry->unregister_fn = unregister_loaded_voice;
+        entry->voice = voice;
+
+        int ret;
+        if ((ret = add_loaded_entry(entry)) < 0) {
+            pthread_mutex_unlock(&flite_mutex);
+            delete_voice(voice);
+            av_free((char *)entry->name);
+            av_free(entry->voice_file);
+            av_free(entry);
+            return ret;
+        }
+    }
+    entry->usage_count++;
+    pthread_mutex_unlock(&flite_mutex);
+    *entry_ret = entry;
+    return 0;
+
+}
+
 static int audio_stream_chunk_by_word(const cst_wave *wave, int start, int 
size,
                                       int last, cst_audio_streaming_info *asi)
 {
@@ -164,6 +271,17 @@ static int audio_stream_chunk_by_word(const cst_wave 
*wave, int start, int size,
     return CST_AUDIO_STREAM_CONT;
 }
 
+static int perform_flite_initializations(void) {
+    int ret = 0;
+    if ((ret = flite_init()) < 0)
+        return ret;
+    if ((ret = flite_add_lang("eng", usenglish_init, cmulex_init)) < 0)
+        return ret;
+    if ((ret = flite_add_lang("usenglish", usenglish_init, cmulex_init)) < 0)
+        return ret;
+    return 0;
+}
+
 static av_cold int init(AVFilterContext *ctx)
 {
     FliteContext *flite = ctx->priv;
@@ -177,7 +295,7 @@ static av_cold int init(AVFilterContext *ctx)
 
     pthread_mutex_lock(&flite_mutex);
     if (!flite_inited) {
-        if ((ret = flite_init()) >= 0)
+        if ((ret = perform_flite_initializations()) >= 0)
             flite_inited = 1;
     }
     pthread_mutex_unlock(&flite_mutex);
@@ -186,8 +304,14 @@ static av_cold int init(AVFilterContext *ctx)
         return AVERROR_EXTERNAL;
     }
 
-    if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 0)
-        return ret;
+    if (flite->voice_file) {
+        if ((ret = load_voice(&flite->voice_entry, flite->voice_file, ctx)) < 
0)
+            return ret;
+    } else {
+        if ((ret = select_voice(&flite->voice_entry, flite->voice_str, ctx)) < 
0)
+            return ret;
+    }
+
     flite->voice = flite->voice_entry->voice;
 
     if (flite->textfile && flite->text) {
@@ -297,8 +421,15 @@ static int config_props(AVFilterLink *outlink)
     outlink->sample_rate = flite->sample_rate;
     outlink->time_base = (AVRational){1, flite->sample_rate};
 
+    const char *voice_name;
+    if (flite->voice_file) {
+        voice_name = av_asprintf("%s (%s)", flite->voice->name, 
flite->voice_file);
+    } else {
+        voice_name = flite->voice_str;
+    }
+
     av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n",
-           flite->voice_str,
+           voice_name,
            av_get_sample_fmt_name(outlink->format), outlink->sample_rate);
 
     return 0;
-- 
2.49.1


>From 7d7a83f78e78928f7c561dee06ace8ab7ee68f50 Mon Sep 17 00:00:00 2001
From: Y0SH1M4S73R <[email protected]>
Date: Sat, 4 Oct 2025 02:52:34 -0400
Subject: [PATCH 2/2] avfilter/flite: correct formatting mistakes

---
 libavfilter/asrc_flite.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavfilter/asrc_flite.c b/libavfilter/asrc_flite.c
index 6f7ef467cd..90936b9028 100644
--- a/libavfilter/asrc_flite.c
+++ b/libavfilter/asrc_flite.c
@@ -67,7 +67,7 @@ static const AVOption flite_options[] = {
     { "textfile",    "set filename of the text to speak", OFFSET(textfile),  
AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
     { "v",           "set voice",                         OFFSET(voice_str), 
AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
     { "voice",       "set voice",                         OFFSET(voice_str), 
AV_OPT_TYPE_STRING, {.str="kal"}, 0, 0, FLAGS },
-    { "voicefile",   "set flitevox voice file",           OFFSET(voice_file), 
AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS},
+    { "voicefile",   "set flitevox voice file",           OFFSET(voice_file), 
AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, FLAGS },
     { NULL }
 };
 
@@ -158,12 +158,12 @@ static struct voice_entry **loaded_voice_entries = NULL;
 
 static int add_loaded_entry(struct voice_entry *entry) {
     if (loaded_voice_entry_count == loaded_voice_entry_capacity) {
-        if(av_dynarray_add_nofree(&loaded_voice_entries, 
&loaded_voice_entry_capacity, entry) < 0)
+        if (av_dynarray_add_nofree(&loaded_voice_entries, 
&loaded_voice_entry_capacity, entry) < 0)
             return AVERROR(ENOMEM);
         loaded_voice_entry_count++;
     } else {
         for (int i = 0; i < loaded_voice_entry_capacity; i++) {
-            if(!loaded_voice_entries[i]) {
+            if (!loaded_voice_entries[i]) {
                 loaded_voice_entries[i] = entry;
                 loaded_voice_entry_count++;
                 break;
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to