Re: [FFmpeg-devel] [Updated PATCH 2/4] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

2014-07-17 Thread Michael Niedermayer
On Fri, Jul 11, 2014 at 11:32:08AM +0100, Ben Avison wrote:
 The previous implementation targeted DTS Coherent Acoustics, which only
 requires nbits == 4 (fft16()). This case was (and still is) linked directly
 rather than being indirected through ff_fft_calc_vfp(), but now the full
 range from radix-4 up to radix-65536 is available. This benefits other codecs
 such as AAC and AC3.
 
 The implementation is based upon the C version, with each routine larger than
 radix-16 calling a hierarchy of smaller FFT functions, then performing a
 post-processing pass. This pass benefits a lot from loop unrolling to
 counter the long pipelines in the VFP. A relaxed calling standard also
 reduces the overhead of the call hierarchy, and avoiding the excessive
 inlining performed by GCC probably helps with I-cache utilisation too.
 
 I benchmarked the result by measuring the number of gperftools samples that
 hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
 specifically in the FFT routines (fft4() to fft512() and pass()) for the
 same sample AAC stream:
 
                Before           After
                Mean    StdDev   Mean    StdDev  Confidence  Change
  Audio decode  2245.5  53.1     1599.6  43.8    100.0%      +40.4%
  FFT routines  940.6   22.0     348.1   20.8    100.0%      +170.2%
 ---
  libavcodec/arm/fft_init_arm.c |    8 +-
  libavcodec/arm/fft_vfp.S      |  284 +++--
  2 files changed, 275 insertions(+), 17 deletions(-)

merged a variant of this patch

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Complexity theory is the science of finding the exact solution to an
approximation. Benchmarking OTOH is finding an approximation of the exact


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [Updated PATCH 2/4] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

2014-07-11 Thread Ben Avison
The previous implementation targeted DTS Coherent Acoustics, which only
requires nbits == 4 (fft16()). This case was (and still is) linked directly
rather than being indirected through ff_fft_calc_vfp(), but now the full
range from radix-4 up to radix-65536 is available. This benefits other codecs
such as AAC and AC3.

The implementation is based upon the C version, with each routine larger than
radix-16 calling a hierarchy of smaller FFT functions, then performing a
post-processing pass. This pass benefits a lot from loop unrolling to
counter the long pipelines in the VFP. A relaxed calling standard also
reduces the overhead of the call hierarchy, and avoiding the excessive
inlining performed by GCC probably helps with I-cache utilisation too.

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in the FFT routines (fft4() to fft512() and pass()) for the
same sample AAC stream:

              Before           After
              Mean    StdDev   Mean    StdDev  Confidence  Change
Audio decode  2245.5  53.1     1599.6  43.8    100.0%      +40.4%
FFT routines  940.6   22.0     348.1   20.8    100.0%      +170.2%
---
 libavcodec/arm/fft_init_arm.c |    8 +-
 libavcodec/arm/fft_vfp.S      |  284 +++--
 2 files changed, 275 insertions(+), 17 deletions(-)

diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 7e49b9c..5087f5f 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,6 +23,8 @@
 #include "libavcodec/rdft.h"
 #include "libavcodec/synth_filter.h"
 
+void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
+
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
@@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
 {
 int cpu_flags = av_get_cpu_flags();
 
-if (have_vfp(cpu_flags)) {
+if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+s->fft_calc = ff_fft_calc_vfp;
 #if CONFIG_MDCT
-if (!have_vfpv3(cpu_flags))
-s->imdct_half   = ff_imdct_half_vfp;
+s->imdct_half   = ff_imdct_half_vfp;
 #endif
 }
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index f1ab37c..205222e 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -21,8 +21,52 @@
 
 #include "libavutil/arm/asm.S"
 
-@ TODO: * FFTs wider than 16
-@   * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
+@ all single-precision VFP registers may be corrupted on exit.
+
+function ff_fft_calc_vfp, export=1
+ldr ip, [a1, #0]@ nbits
+mov a1, a2
+A   ldr ip, [pc, ip, lsl #2]
+A   bx  ip
+A   .word   0
+A   .word   0
+A   .word   fft4_vfp
+A   .word   fft8_vfp
+A   .word   ff_fft16_vfp@ this one alone is exported
+A   .word   fft32_vfp
+A   .word   fft64_vfp
+A   .word   fft128_vfp
+A   .word   fft256_vfp
+A   .word   fft512_vfp
+A   .word   fft1024_vfp
+A   .word   fft2048_vfp
+A   .word   fft4096_vfp
+A   .word   fft8192_vfp
+A   .word   fft16384_vfp
+A   .word   fft32768_vfp
+A   .word   fft65536_vfp
+T   tbh [pc, ip, lsl #1]
+T 0:.short  0
+T   .short  0
+T   .short  fft4_vfp - 0b
+T   .short  fft4_vfp - 0b
+T   .short  fft8_vfp - 0b
+T   .short  fft16_vfp - 0b
+T   .short  fft32_vfp - 0b
+T   .short  fft64_vfp - 0b
+T   .short  fft128_vfp - 0b
+T   .short  fft256_vfp - 0b
+T   .short  fft512_vfp - 0b
+T   .short  fft1024_vfp - 0b
+T   .short  fft2048_vfp - 0b
+T   .short  fft4096_vfp - 0b
+T   .short  fft8192_vfp - 0b
+T   .short  fft16384_vfp - 0b
+T   .short  fft32768_vfp - 0b
+T   .short  fft65536_vfp - 0b
+endfunc
 
 function fft4_vfp
 vldr d0, [a1, #0*2*4]   @ s0,s1   = z[0]
@@ -131,18 +175,22 @@ endfunc
  vstr d9, [a1, #3 * 2*4]
 .endm
 
+function fft8_internal_vfp
+macro_fft8_head
+macro_fft8_tail
+bx  lr
+endfunc
+
 function fft8_vfp
 ldr a3, =0x0303 @ RunFast mode, vector length 4, stride 1
 fmrx a2, FPSCR
 fmxr FPSCR, a3
 vpush   {s16-s31}
-
-macro_fft8_head
-macro_fft8_tail
-
+mov ip, lr
+bl  fft8_internal_vfp
 vpop {s16-s31}
 fmxr FPSCR, a2
-bx  lr
+bx  ip
 endfunc
 
 .align 3
@@ -153,12 +201,7 @@ cos1pi8:@ cos(1*pi/8) = sqrt(2+sqrt(2))/2
 cos3pi8:@ cos(2*pi/8) = sqrt(2-sqrt(2))/2
 .float  0.3826834261417388916015625
 
-function ff_fft16_vfp, export=1
-ldr a3, =0x0303 @ RunFast mode, vector length 4, stride 1
-fmrx a2, FPSCR
-fmxr FPSCR, a3
-vpush