PR #21178 opened by Zhao Zhili (quink) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21178 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21178.patch
Commit 8f48a62 extends tx to 2M, which takes 4M of bss section for tx_float: size libavutil/tx_float.o text data bss dec hex filename 35356 9160 4194796 4239312 40afd0 libavutil/tx_float.o This isn't an issue on devices with normal memory sizes and an OS supporting virtual memory. But it's a real issue for embedded devices with a real-time OS, which may not support virtual memory, e.g., NuttX. This 4M of bss section will occupy physical memory, which is a scarce resource on embedded devices. >From 060047d35bfd6a95e3a3051cc9dccc03b5f1db03 Mon Sep 17 00:00:00 2001 From: Zhao Zhili <[email protected]> Date: Fri, 12 Dec 2025 11:50:36 +0800 Subject: [PATCH 1/2] avutil/aarch64/tx_float: enable SIMD for sizes over 131072 --- libavutil/aarch64/tx_float_init.c | 4 ++-- libavutil/aarch64/tx_float_neon.S | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index 8300472c4c..e13dab5cf3 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -57,8 +57,8 @@ const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), - TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 128, neon_init, neon, NEON, 0, 0), - TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 128, neon_init, neon, NEON, 0, 0), + TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), NULL, }; diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index d00b3f9684..c20bcaafb0 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -1161,7 +1161,11 @@ SR_TRANSFORM_DEF 8192, 16384 SR_TRANSFORM_DEF 16384, 32768 SR_TRANSFORM_DEF 
32768, 65536 SR_TRANSFORM_DEF 65536, 131072 -SR_TRANSFORM_DEF 131072 +SR_TRANSFORM_DEF 131072, 262144 +SR_TRANSFORM_DEF 262144, 524288 +SR_TRANSFORM_DEF 524288, 1048576 +SR_TRANSFORM_DEF 1048576, 2097152 +SR_TRANSFORM_DEF 2097152 0: // general deinterleave loop SR_COMBINE_DINT -- 2.49.1 >From c68cb53f11be6e4f7670fea52119c85f09f1551f Mon Sep 17 00:00:00 2001 From: Zhao Zhili <[email protected]> Date: Fri, 12 Dec 2025 14:55:28 +0800 Subject: [PATCH 2/2] avutil/tx: make tx table size configurable at build time Commit 8f48a62 extends tx to 2M, which takes 4M of bss section for tx_float: size libavutil/tx_float.o text data bss dec hex filename 35356 9160 4194796 4239312 40afd0 libavutil/tx_float.o This isn't an issue on devices with normal memory sizes and an OS supporting virtual memory. But it's a real issue for embedded devices with a real-time OS, which may not support virtual memory, e.g., NuttX. This 4M of bss section will occupy physical memory, which is a scarce resource on embedded devices. 
--- configure | 6 +++ libavutil/aarch64/tx_float_init.c | 4 +- libavutil/aarch64/tx_float_neon.S | 28 ++++++++++ libavutil/tx_template.c | 88 ++++++++++++++++++++++++++++--- libavutil/x86/tx_float.asm | 28 ++++++++++ libavutil/x86/tx_float_init.c | 18 +++---- 6 files changed, 154 insertions(+), 18 deletions(-) diff --git a/configure b/configure index 9ec421cd23..40a4af50b7 100755 --- a/configure +++ b/configure @@ -447,6 +447,7 @@ Advanced options (experts only): crashes and arbitrary code execution, it may be faster, but should only be used with trusted input) --sws-max-filter-size=N the max filter size swscale uses [$sws_max_filter_size_default] + --tx-tab-max-size=N the max tab size tx uses, must be power of 2 in the range of [$tx_tab_max_size_min, $tx_tab_max_size_default] Optimization options (experts only): --disable-asm disable all assembly optimizations @@ -2166,6 +2167,7 @@ CONFIG_LIST=" ptx_compression resource_compression thumb + tx_tab_max_size valgrind_backtrace xmm_clobber_test $COMPONENT_LIST @@ -2796,6 +2798,7 @@ CMDLINE_SET=" target_samples tempprefix toolchain + tx_tab_max_size valgrind windres x86asmexe @@ -4300,7 +4303,10 @@ enable unstable enable valgrind_backtrace sws_max_filter_size_default=256 +tx_tab_max_size_min=16384 +tx_tab_max_size_default=2097152 set_default sws_max_filter_size +set_default tx_tab_max_size # internal components are enabled by default enable $EXTRALIBS_LIST diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index e13dab5cf3..df2f6643a3 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -57,8 +57,8 @@ const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), - TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 128, neon_init, neon, NEON, 0, 0), - TX_DEF(fft_sr_ns, FFT, 64, 
2097152, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft_sr, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 128, neon_init, neon, NEON, 0, 0), + TX_DEF(fft_sr_ns, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), NULL, }; diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index c20bcaafb0..b38d13740f 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -1158,14 +1158,42 @@ function ff_tx_fft_sr_\name\()_neon, export=1 SR_TRANSFORM_DEF 2048, 4096 SR_TRANSFORM_DEF 4096, 8192 SR_TRANSFORM_DEF 8192, 16384 +#if CONFIG_TX_TAB_MAX_SIZE >= 32768 SR_TRANSFORM_DEF 16384, 32768 +#else +SR_TRANSFORM_DEF 16384, 0 +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 65536 SR_TRANSFORM_DEF 32768, 65536 +#elif CONFIG_TX_TAB_MAX_SIZE >= 32768 +SR_TRANSFORM_DEF 32768, 0 +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 131072 SR_TRANSFORM_DEF 65536, 131072 +#elif CONFIG_TX_TAB_MAX_SIZE >= 65536 +SR_TRANSFORM_DEF 65536, 0 +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 262144 SR_TRANSFORM_DEF 131072, 262144 +#elif CONFIG_TX_TAB_MAX_SIZE >= 131072 +SR_TRANSFORM_DEF 131072, 0 +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 524288 SR_TRANSFORM_DEF 262144, 524288 +#elif CONFIG_TX_TAB_MAX_SIZE >= 262144 +SR_TRANSFORM_DEF 262144, 0 +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 1048576 SR_TRANSFORM_DEF 524288, 1048576 +#elif CONFIG_TX_TAB_MAX_SIZE >= 524288 +SR_TRANSFORM_DEF 524288, 0 +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 2097152 SR_TRANSFORM_DEF 1048576, 2097152 SR_TRANSFORM_DEF 2097152 +#elif CONFIG_TX_TAB_MAX_SIZE >= 1048576 +SR_TRANSFORM_DEF 1048576, 0 +#endif 0: // general deinterleave loop SR_COMBINE_DINT diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index ec630954b8..6aa33efe83 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -29,6 +29,52 @@ #define TABLE_DEF(name, size) \ DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size] 
+static_assert(CONFIG_TX_TAB_MAX_SIZE >= 16384, "CONFIG_TX_TAB_MAX_SIZE too small"); +static_assert(CONFIG_TX_TAB_MAX_SIZE <= 2097152, "CONFIG_TX_TAB_MAX_SIZE too large"); +static_assert((CONFIG_TX_TAB_MAX_SIZE & (CONFIG_TX_TAB_MAX_SIZE - 1)) == 0, "CONFIG_TX_TAB_MAX_SIZE isn't power of 2"); + +#if CONFIG_TX_TAB_MAX_SIZE >= 32768 +#define SR_TABLE_32768 SR_TABLE(32768) +#else +#define SR_TABLE_32768 +#endif + +#if CONFIG_TX_TAB_MAX_SIZE >= 65536 +#define SR_TABLE_65536 SR_TABLE(65536) +#else +#define SR_TABLE_65536 +#endif + +#if CONFIG_TX_TAB_MAX_SIZE >= 131072 +#define SR_TABLE_131072 SR_TABLE(131072) +#else +#define SR_TABLE_131072 +#endif + +#if CONFIG_TX_TAB_MAX_SIZE >= 262144 +#define SR_TABLE_262144 SR_TABLE(262144) +#else +#define SR_TABLE_262144 +#endif + +#if CONFIG_TX_TAB_MAX_SIZE >= 524288 +#define SR_TABLE_524288 SR_TABLE(524288) +#else +#define SR_TABLE_524288 +#endif + +#if CONFIG_TX_TAB_MAX_SIZE >= 1048576 +#define SR_TABLE_1048576 SR_TABLE(1048576) +#else +#define SR_TABLE_1048576 +#endif + +#if CONFIG_TX_TAB_MAX_SIZE >= 2097152 +#define SR_TABLE_2097152 SR_TABLE(2097152) +#else +#define SR_TABLE_2097152 +#endif + #define SR_POW2_TABLES \ SR_TABLE(8) \ SR_TABLE(16) \ @@ -42,13 +88,13 @@ SR_TABLE(4096) \ SR_TABLE(8192) \ SR_TABLE(16384) \ - SR_TABLE(32768) \ - SR_TABLE(65536) \ - SR_TABLE(131072) \ - SR_TABLE(262144) \ - SR_TABLE(524288) \ - SR_TABLE(1048576) \ - SR_TABLE(2097152) \ + SR_TABLE_32768 \ + SR_TABLE_65536 \ + SR_TABLE_131072 \ + SR_TABLE_262144 \ + SR_TABLE_524288 \ + SR_TABLE_1048576 \ + SR_TABLE_2097152 \ #define SR_TABLE(len) \ TABLE_DEF(len, len/4 + 1); @@ -721,13 +767,27 @@ DECL_SR_CODELET(2048,1024,512) DECL_SR_CODELET(4096,2048,1024) DECL_SR_CODELET(8192,4096,2048) DECL_SR_CODELET(16384,8192,4096) +#if CONFIG_TX_TAB_MAX_SIZE >= 32768 DECL_SR_CODELET(32768,16384,8192) +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 65536 DECL_SR_CODELET(65536,32768,16384) +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 131072 DECL_SR_CODELET(131072,65536,32768) 
+#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 262144 DECL_SR_CODELET(262144,131072,65536) +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 524288 DECL_SR_CODELET(524288,262144,131072) +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 1048576 DECL_SR_CODELET(1048576,524288,262144) +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 2097152 DECL_SR_CODELET(2097152,1048576,524288) +#endif static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s, const FFTXCodelet *cd, @@ -2157,13 +2217,27 @@ const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = { &TX_NAME(ff_tx_fft4096_ns_def), &TX_NAME(ff_tx_fft8192_ns_def), &TX_NAME(ff_tx_fft16384_ns_def), +#if CONFIG_TX_TAB_MAX_SIZE >= 32768 &TX_NAME(ff_tx_fft32768_ns_def), +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 65536 &TX_NAME(ff_tx_fft65536_ns_def), +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 131072 &TX_NAME(ff_tx_fft131072_ns_def), +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 262144 &TX_NAME(ff_tx_fft262144_ns_def), +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 524288 &TX_NAME(ff_tx_fft524288_ns_def), +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 1048576 &TX_NAME(ff_tx_fft1048576_ns_def), +#endif +#if CONFIG_TX_TAB_MAX_SIZE >= 2097152 &TX_NAME(ff_tx_fft2097152_ns_def), +#endif /* Prime factor codelets */ &TX_NAME(ff_tx_fft3_ns_def), diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index c030147ce8..024d2c9107 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -1382,14 +1382,42 @@ ALIGN 16 FFT_SPLIT_RADIX_DEF 2048, .4096pt FFT_SPLIT_RADIX_DEF 4096, .8192pt FFT_SPLIT_RADIX_DEF 8192, .16384pt +%if CONFIG_TX_TAB_MAX_SIZE >= 32768 FFT_SPLIT_RADIX_DEF 16384, .32768pt +%else +FFT_SPLIT_RADIX_DEF 16384 +%endif +%if CONFIG_TX_TAB_MAX_SIZE >= 65536 FFT_SPLIT_RADIX_DEF 32768, .65536pt +%elif CONFIG_TX_TAB_MAX_SIZE >= 32768 +FFT_SPLIT_RADIX_DEF 32768 +%endif +%if CONFIG_TX_TAB_MAX_SIZE >= 131072 FFT_SPLIT_RADIX_DEF 65536, .131072pt +%elif CONFIG_TX_TAB_MAX_SIZE >= 65536 +FFT_SPLIT_RADIX_DEF 65536 +%endif +%if CONFIG_TX_TAB_MAX_SIZE >= 262144 
FFT_SPLIT_RADIX_DEF 131072, .262144pt +%elif CONFIG_TX_TAB_MAX_SIZE >= 131072 +FFT_SPLIT_RADIX_DEF 131072 +%endif +%if CONFIG_TX_TAB_MAX_SIZE >= 524288 FFT_SPLIT_RADIX_DEF 262144, .524288pt +%elif CONFIG_TX_TAB_MAX_SIZE >= 262144 +FFT_SPLIT_RADIX_DEF 262144 +%endif +%if CONFIG_TX_TAB_MAX_SIZE >= 1048576 FFT_SPLIT_RADIX_DEF 524288, .1048576pt +%elif CONFIG_TX_TAB_MAX_SIZE >= 524288 +FFT_SPLIT_RADIX_DEF 524288 +%endif +%if CONFIG_TX_TAB_MAX_SIZE >= 2097152 FFT_SPLIT_RADIX_DEF 1048576, .2097152pt FFT_SPLIT_RADIX_DEF 2097152 +%elif CONFIG_TX_TAB_MAX_SIZE >= 1048576 +FFT_SPLIT_RADIX_DEF 1048576 +%endif ;=============================================================================== ; Final synthesis + deinterleaving code diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c index 3e99c21eac..c091e67f53 100644 --- a/libavutil/x86/tx_float_init.c +++ b/libavutil/x86/tx_float_init.c @@ -271,15 +271,15 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, + TX_DEF(fft_sr, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_asm, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + TX_DEF(fft_sr_ns, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, + 
TX_DEF(fft_sr, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW), + TX_DEF(fft_sr_asm, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + TX_DEF(fft_sr_ns, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2, @@ -287,11 +287,11 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), - TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx2, AVX2, 0, + TX_DEF(fft_sr, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 320, b8_i2, avx2, AVX2, 0, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), - TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, + TX_DEF(fft_sr_asm, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), - TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, + TX_DEF(fft_sr_ns, FFT, 64, CONFIG_TX_TAB_MAX_SIZE, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2, -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
