Re: [libav-devel] [Updated PATCH 2/4] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

Martin Storsjö Sun, 13 Jul 2014 02:10:32 -0700

On Fri, 11 Jul 2014, Ben Avison wrote:

The previous implementation targeted DTS Coherent Acoustics, which only
requires nbits == 4 (fft16()). This case was (and still is) linked directly
rather than being indirected through ff_fft_calc_vfp(), but now the full
range from radix-4 up to radix-65536 is available. This benefits other codecs
such as AAC and AC3.


The implementaion is based upon the C version, with each routine larger than
radix-16 calling a hierarchy of smaller FFT functions, then performing a
post-processing pass. This pass benefits a lot from loop unrolling to
counter the long pipelines in the VFP. A relaxed calling standard also
reduces the overhead of the call hierarchy, and avoiding the excessive
inlining performed by GCC probably helps with I-cache utilisation too.

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in the FFT routines (fft4() to fft512() and pass()) for the
same sample AAC stream:

             Before          After
             Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode  2245.5 53.1     1599.6 43.8    100.0%      +40.4%
FFT routines  940.6  22.0     348.1  20.8    100.0%      +170.2%
---
libavcodec/arm/fft_init_arm.c |    8 +-
libavcodec/arm/fft_vfp.S      |  284 +++++++++++++++++++++++++++++++++++++++--
2 files changed, 275 insertions(+), 17 deletions(-)

I tried this code on most of our odd configurations, and ran into a bunchof issues, most of which I've been able to resolve in one way or another,but some parts of it is a bit ugly...

The other three patches don't seem to require any extra tweaks tobuild/run in these configurations though.

diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 3a3d1a7..bc143c1 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,6 +23,8 @@
#include "libavcodec/rdft.h"
#include "libavcodec/synth_filter.h"

+void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
+
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);

@@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

-    if (have_vfp(cpu_flags)) {
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+        s->fft_calc     = ff_fft_calc_vfp;
#if CONFIG_MDCT
-        if (!have_vfpv3(cpu_flags))
-            s->imdct_half   = ff_imdct_half_vfp;
+        s->imdct_half   = ff_imdct_half_vfp;
#endif
    }

diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index 7845ebb..e7d710b 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -21,8 +21,52 @@

#include "libavutil/arm/asm.S"

-@ TODO: * FFTs wider than 16
-@       * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and
+@ all single-precision VFP registers may be corrupted on exit.
+
+function ff_fft_calc_vfp, export=1
+        ldr     ip, [a1, #0]    @ nbits
+        mov     a1, a2
+A       ldr     ip, [pc, ip, lsl #2]
+A       bx      ip
+A       .word   0
+A       .word   0
+A       .word   fft4_vfp
+A       .word   fft8_vfp
+A       .word   ff_fft16_vfp    @ this one alone is exported


This reference needs to be wrapped in X()

+A       .word   fft32_vfp
+A       .word   fft64_vfp
+A       .word   fft128_vfp
+A       .word   fft256_vfp
+A       .word   fft512_vfp
+A       .word   fft1024_vfp
+A       .word   fft2048_vfp
+A       .word   fft4096_vfp
+A       .word   fft8192_vfp
+A       .word   fft16384_vfp
+A       .word   fft32768_vfp
+A       .word   fft65536_vfp
+T       tbh     [pc, ip, lsl #1]
+T 0:    .short  0
+T       .short  0
+T       .short  fft4_vfp - 0b
+T       .short  fft4_vfp - 0b

You've got one entry duplicated here, and these entries need to be dividedby 2 (just as they are in mlpdsp_armv5te.S).

+T       .short  fft8_vfp - 0b
+T       .short  fft16_vfp - 0b
+T       .short  fft32_vfp - 0b
+T       .short  fft64_vfp - 0b
+T       .short  fft128_vfp - 0b
+T       .short  fft256_vfp - 0b
+T       .short  fft512_vfp - 0b
+T       .short  fft1024_vfp - 0b
+T       .short  fft2048_vfp - 0b
+T       .short  fft4096_vfp - 0b
+T       .short  fft8192_vfp - 0b
+T       .short  fft16384_vfp - 0b
+T       .short  fft32768_vfp - 0b
+T       .short  fft65536_vfp - 0b
+endfunc

This jump table (the thumb version, after adding the division by two)fails to assemble with apple tools (both the current clang based ones, andthe old gcc based ones). It only considers these assemble-time constantsas long as there's no non-local labels between the two labels beingsubtracted.

To fix building, I renamed the fftX_vfp symbols into .LfftX_vfp, likewisefor the fftX_internal_vfp and the cosXpiY constants. This also forced meto move the ff_fft16_vfp public function to the bottom of the file - sothat there's no non-local labels between the jump table and all thefunctions it points to.

This does seem a bit overly complicated. Would it make sense to just turnit into a const table, just as fft_tab_neon in fft_neon.S? I guess itisn't significantly slower, although it generates more text relocations.But I think that would be a fair tradeoff here, if it fixes the buildissues on apple platforms (and probably also avoids the label differencebug in armasm).

function fft4_vfp
        vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
@@ -131,18 +175,22 @@ endfunc
             vstr    d9, [a1, #3 * 2*4]
.endm

+function fft8_internal_vfp
+        macro_fft8_head
+        macro_fft8_tail
+        bx      lr
+endfunc
+
function fft8_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
-
-        macro_fft8_head
-        macro_fft8_tail
-
+        mov     ip, lr
+        bl      fft8_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
-        bx      lr
+        bx      ip
endfunc

.align 3

Is there any particular reason why this is aligned to 8 bytes instead of 4- shouldn't 4 (aka .align 2) be enough for float constants? (Yes, I knowthis isn't added by this patch though.)

The extra alignment here triggered a bug in MS armasm where the jump tableoffsets were miscalculated, which seemed to be triggered by this extraalignment here. (I haven't reduced the bug to a reportable testcase yet.)

@@ -153,12 +201,7 @@ cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
cos3pi8:    @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
        .float  0.3826834261417388916015625

-function ff_fft16_vfp, export=1
-        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
-        fmrx    a2, FPSCR
-        fmxr    FPSCR, a3
-        vpush   {s16-s31}
-
+function fft16_internal_vfp
        macro_fft8_head
        @ FFT4(z+8)
        vldr    d10, [a1, #8 * 2*4]
@@ -292,7 +335,220 @@ function ff_fft16_vfp, export=1
              vstr    d8, [a1, #0 * 2*4]
              vstr    d9, [a1, #4 * 2*4]

+        bx      lr
+endfunc
+
+function ff_fft16_vfp, export=1
+T       b       fft16_vfp
+T endfunc
+T
+T function fft16_vfp
+        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
+        fmrx    a2, FPSCR
+        fmxr    FPSCR, a3
+        vpush   {s16-s31}
+        mov     ip, lr
+        bl      fft16_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
-        bx      lr
+        bx      ip
+endfunc
+
+.macro pass n, z0, z1, z2, z3
+        add     v6, v5, #4*2*\n
+        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
+            @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+                @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
+                    @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+            vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
+            vldmdb  v6!, {s2}
+            vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
+            vldmia  v5!, {s0,s1}                @ s0 is unused
+        vldr    s7, [\z2, #8*o2]            @ t1
+            vmul.f  s20, s16, s2                @ vector * scalar
+        vldr    s0, [\z3, #8*o3]            @ t5
+        vldr    s6, [\z2, #8*o2+4]          @ t2
+        vldr    s3, [\z3, #8*o3+4]          @ t6
+            vmul.f  s16, s16, s1                @ vector * scalar
+        ldr     a4, =\n-1
+1:      add     \z0, \z0, #8*2
+ .if \n*4*2 >= 512
+        add     \z1, \z1, #8*2
+ .endif
+ .if \n*4*2 >= 256
+        add     \z2, \z2, #8*2
+ .endif
+ .if \n*4*2 >= 512
+        add     \z3, \z3, #8*2
+ .endif
+        @ up to 2 stalls (VFP vector issuing / waiting for s0)
+        @ depending upon whether this is the first iteration and
+        @ how many add instructions are inserted above
+        vadd.f  s4, s0, s7                  @ t5
+        vadd.f  s5, s6, s3                  @ t6
+        vsub.f  s6, s6, s3                  @ t4
+        vsub.f  s7, s0, s7                  @ t3
+        vldr    d6, [\z0, #8*0-8*2]         @ s12,s13
+            vadd.f  s0, s16, s21                @ t1
+        vldr    d7, [\z1, #8*o1-8*2]        @ s14,s15
+            vsub.f  s1, s18, s23                @ t5
+        vadd.f  s8, s4, s12                 @ vector + vector
+        @ stall (VFP vector issuing)
+        @ stall (VFP vector issuing)
+        @ stall (VFP vector issuing)
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+            vsub.f  s2, s17, s20                @ t2
+            vadd.f  s3, s19, s22                @ t6
+        vstr    d4, [\z0, #8*0-8*2]         @ s8,s9
+        vstr    d5, [\z1, #8*o1-8*2]        @ s10,s11
+        @ stall (waiting for s5)
+        vstr    d2, [\z2, #8*o2-8*2]        @ s4,s5
+            vadd.f  s4, s1, s0                  @ t5
+        vstr    d3, [\z3, #8*o3-8*2]        @ s6,s7
+            vsub.f  s7, s1, s0                  @ t3
+            vadd.f  s5, s2, s3                  @ t6
+            vsub.f  s6, s2, s3                  @ t4
+            vldr    d6, [\z0, #8*1-8*2]         @ s12,s13
+            vldr    d7, [\z1, #8*(o1+1)-8*2]    @ s14,s15
+                vldr    d4, [\z2, #8*o2]            @ s8,s9
+                vldmdb  v6!, {s2,s3}
+                vldr    d5, [\z3, #8*o3]            @ s10,s11
+            vadd.f  s20, s4, s12                @ vector + vector
+                vldmia  v5!, {s0,s1}
+                    vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
+            @ stall (VFP vector issuing)
+            vsub.f  s4, s12, s4
+            vsub.f  s5, s13, s5
+            vsub.f  s6, s14, s6
+            vsub.f  s7, s15, s7
+                vmul.f  s12, s8, s3                 @ vector * scalar
+            vstr    d10, [\z0, #8*1-8*2]        @ s20,s21
+                    vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
+            vstr    d11, [\z1, #8*(o1+1)-8*2]   @ s22,s23
+                vmul.f  s8, s8, s0                  @ vector * scalar
+            vstr    d2, [\z2, #8*(o2+1)-8*2]    @ s4,s5
+            @ stall (waiting for s7)
+            vstr    d3, [\z3, #8*(o3+1)-8*2]    @ s6,s7
+                    vmul.f  s20, s16, s2                @ vector * scalar
+                @ stall (VFP vector issuing)
+                @ stall (VFP vector issuing)
+                @ stall (VFP vector issuing)
+                vadd.f  s7, s8, s13                 @ t1
+                vsub.f  s6, s9, s12                 @ t2
+                vsub.f  s0, s10, s15                @ t5
+                vadd.f  s3, s11, s14                @ t6
+                    vmul.f  s16, s16, s1                @ vector * scalar
+        subs    a4, a4, #1
+        bne     1b
+        @ What remains is identical to the first two indentations of
+        @ the above, but without the increment of z
+        vadd.f  s4, s0, s7                  @ t5
+        vadd.f  s5, s6, s3                  @ t6
+        vsub.f  s6, s6, s3                  @ t4
+        vsub.f  s7, s0, s7                  @ t3
+        vldr    d6, [\z0, #8*0]             @ s12,s13
+            vadd.f  s0, s16, s21                @ t1
+        vldr    d7, [\z1, #8*o1]            @ s14,s15
+            vsub.f  s1, s18, s23                @ t5
+        vadd.f  s8, s4, s12                 @ vector + vector
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+            vsub.f  s2, s17, s20                @ t2
+            vadd.f  s3, s19, s22                @ t6
+        vstr    d4, [\z0, #8*0]             @ s8,s9
+        vstr    d5, [\z1, #8*o1]            @ s10,s11
+        vstr    d2, [\z2, #8*o2]            @ s4,s5
+            vadd.f  s4, s1, s0                  @ t5
+        vstr    d3, [\z3, #8*o3]            @ s6,s7
+            vsub.f  s7, s1, s0                  @ t3
+            vadd.f  s5, s2, s3                  @ t6
+            vsub.f  s6, s2, s3                  @ t4
+            vldr    d6, [\z0, #8*1]             @ s12,s13
+            vldr    d7, [\z1, #8*(o1+1)]        @ s14,s15
+            vadd.f  s20, s4, s12                @ vector + vector
+            vsub.f  s4, s12, s4
+            vsub.f  s5, s13, s5
+            vsub.f  s6, s14, s6
+            vsub.f  s7, s15, s7
+            vstr    d10, [\z0, #8*1]            @ s20,s21
+            vstr    d11, [\z1, #8*(o1+1)]       @ s22,s23
+            vstr    d2, [\z2, #8*(o2+1)]        @ s4,s5
+            vstr    d3, [\z3, #8*(o3+1)]        @ s6,s7
+.endm
+
+.macro fft_internal_vfp name, name2, name4, costable, n
+function \name
+ .if \n >= 512
+        push    {v1-v6,lr}
+ .elseif \n >= 256
+        push    {v1-v2,v5-v6,lr}
+ .else
+        push    {v1,v5-v6,lr}
+ .endif
+        mov     v1, a1
+        bl      \name2
+        add     a1, v1, #8*(\n/4)*2
+        bl      \name4
+        ldr     v5, =\costable


Should this perhaps use the movrelx macro?

+        add     a1, v1, #8*(\n/4)*3
+        bl      \name4
+ .if \n >= 512
+  .set o1, 0*(\n/4/2)
+  .set o2, 0*(\n/4/2)
+  .set o3, 0*(\n/4/2)
+        add     v2, v1, #8*2*(\n/4/2)
+        add     v3, v1, #8*4*(\n/4/2)
+        add     v4, v1, #8*6*(\n/4/2)
+        pass    (\n/4/2), v1, v2, v3, v4
+        pop     {v1-v6,pc}
+ .elseif \n >= 256
+  .set o1, 2*(\n/4/2)
+  .set o2, 0*(\n/4/2)
+  .set o3, 2*(\n/4/2)
+        add     v2, v1, #8*4*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v2, v2
+        pop     {v1-v2,v5-v6,pc}
+ .else
+  .set o1, 2*(\n/4/2)
+  .set o2, 4*(\n/4/2)
+  .set o3, 6*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v1, v1
+        pop     {v1,v5-v6,pc}
+ .endif
endfunc
+.endm
+
+#define DECL_FFT(n,n2,n4)               \
+fft_internal_vfp  fft##n##_internal_vfp, fft##n2##_internal_vfp, 
fft##n4##_internal_vfp, ff_cos_##n, n ;\


You need to add X() around the ff_cos_N name here

+                                       ;\
+function fft##n##_vfp                  ;\
+        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ 
;\
+        fmrx    a2, FPSCR              ;\
+        fmxr    FPSCR, a3              ;\
+        vpush   {s16-s31}              ;\
+        mov     ip, lr                 ;\
+        bl      fft##n##_internal_vfp  ;\
+        vpop    {s16-s31}              ;\
+        fmxr    FPSCR, a2              ;\
+        bx      ip                     ;\
+endfunc                                ;\
+                                       ;\
+.ltorg

gas-preprocessor didn't previously support multiple instructionsconcatenated on one line separated with semicolons - I did write a patchfor doing that now though.

Nevertheless, wouldn't this be more readable written just as a normal gasmacro instead of as a cpp macro? That's how the corresponding macro isdone in fft_neon.S.


// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [Updated PATCH 2/4] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

Reply via email to