On Fri, 11 Jul 2014, Ben Avison wrote:
The previous implementation targeted DTS Coherent Acoustics, which only requires nbits == 4 (fft16()). This case was (and still is) linked directly rather than being indirected through ff_fft_calc_vfp(), but now the full range from radix-4 up to radix-65536 is available. This benefits other codecs such as AAC and AC3. The implementation is based upon the C version, with each routine larger than radix-16 calling a hierarchy of smaller FFT functions, then performing a post-processing pass. This pass benefits a lot from loop unrolling to counter the long pipelines in the VFP. A relaxed calling standard also reduces the overhead of the call hierarchy, and avoiding the excessive inlining performed by GCC probably helps with I-cache utilisation too. I benchmarked the result by measuring the number of gperftools samples that hit anywhere in the AAC decoder (starting from aac_decode_frame()) or specifically in the FFT routines (fft4() to fft512() and pass()) for the same sample AAC stream: Before After Mean StdDev Mean StdDev Confidence Change Audio decode 2245.5 53.1 1599.6 43.8 100.0% +40.4% FFT routines 940.6 22.0 348.1 20.8 100.0% +170.2% --- libavcodec/arm/fft_init_arm.c | 8 +- libavcodec/arm/fft_vfp.S | 284 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 275 insertions(+), 17 deletions(-)
I tried this code on most of our odd configurations, and ran into a bunch of issues, most of which I've been able to resolve in one way or another, but some parts of it are a bit ugly...
The other three patches don't seem to require any extra tweaks to build/run in these configurations though.
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 3a3d1a7..bc143c1 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,6 +23,8 @@
#include "libavcodec/rdft.h"
#include "libavcodec/synth_filter.h"
+void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
+
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
@@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
- if (have_vfp(cpu_flags)) {
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+ s->fft_calc = ff_fft_calc_vfp;
#if CONFIG_MDCT
- if (!have_vfpv3(cpu_flags))
- s->imdct_half = ff_imdct_half_vfp;
+ s->imdct_half = ff_imdct_half_vfp;
#endif
}
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index 7845ebb..e7d710b 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -21,8 +21,52 @@
#include "libavutil/arm/asm.S"
-@ TODO: * FFTs wider than 16
-@ * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
+@ all single-precision VFP registers may be corrupted on exit.
+
+function ff_fft_calc_vfp, export=1
+ ldr ip, [a1, #0] @ nbits
+ mov a1, a2
+A ldr ip, [pc, ip, lsl #2]
+A bx ip
+A .word 0
+A .word 0
+A .word fft4_vfp
+A .word fft8_vfp
+A .word ff_fft16_vfp @ this one alone is exported
This reference needs to be wrapped in X()
+A      .word   fft32_vfp
+A      .word   fft64_vfp
+A      .word   fft128_vfp
+A      .word   fft256_vfp
+A      .word   fft512_vfp
+A      .word   fft1024_vfp
+A      .word   fft2048_vfp
+A      .word   fft4096_vfp
+A      .word   fft8192_vfp
+A      .word   fft16384_vfp
+A      .word   fft32768_vfp
+A      .word   fft65536_vfp
+T      tbh     [pc, ip, lsl #1]
+T 0:   .short  0
+T      .short  0
+T      .short  fft4_vfp - 0b
+T      .short  fft4_vfp - 0b
You've got one entry duplicated here, and these entries need to be divided by 2 (just as they are in mlpdsp_armv5te.S).
+T      .short  fft8_vfp - 0b
+T      .short  fft16_vfp - 0b
+T      .short  fft32_vfp - 0b
+T      .short  fft64_vfp - 0b
+T      .short  fft128_vfp - 0b
+T      .short  fft256_vfp - 0b
+T      .short  fft512_vfp - 0b
+T      .short  fft1024_vfp - 0b
+T      .short  fft2048_vfp - 0b
+T      .short  fft4096_vfp - 0b
+T      .short  fft8192_vfp - 0b
+T      .short  fft16384_vfp - 0b
+T      .short  fft32768_vfp - 0b
+T      .short  fft65536_vfp - 0b
+endfunc
This jump table (the thumb version, after adding the division by two) fails to assemble with apple tools (both the current clang based ones, and the old gcc based ones). They only treat these label differences as assemble-time constants as long as there are no non-local labels between the two labels being subtracted.
To fix building, I renamed the fftX_vfp symbols into .LfftX_vfp, likewise for the fftX_internal_vfp and the cosXpiY constants. This also forced me to move the ff_fft16_vfp public function to the bottom of the file - so that there are no non-local labels between the jump table and all the functions it points to.
This does seem a bit overly complicated. Would it make sense to just turn it into a const table, just as fft_tab_neon in fft_neon.S? I guess it isn't significantly slower, although it generates more text relocations. But I think that would be a fair tradeoff here, if it fixes the build issues on apple platforms (and probably also avoids the label difference bug in armasm).
function fft4_vfp
vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
@@ -131,18 +175,22 @@ endfunc
vstr d9, [a1, #3 * 2*4]
.endm
+function fft8_internal_vfp
+ macro_fft8_head
+ macro_fft8_tail
+ bx lr
+endfunc
+
function fft8_vfp
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
-
- macro_fft8_head
- macro_fft8_tail
-
+ mov ip, lr
+ bl fft8_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
- bx lr
+ bx ip
endfunc
.align 3
Is there any particular reason why this is aligned to 8 bytes instead of 4 - shouldn't 4 (aka .align 2) be enough for float constants? (Yes, I know this isn't added by this patch though.)
The extra alignment here triggered a bug in MS armasm where the jump table offsets were miscalculated, which seemed to be triggered by this extra alignment here. (I haven't reduced the bug to a reportable testcase yet.)
@@ -153,12 +201,7 @@ cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
.float 0.3826834261417388916015625
-function ff_fft16_vfp, export=1
- ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
- fmrx a2, FPSCR
- fmxr FPSCR, a3
- vpush {s16-s31}
-
+function fft16_internal_vfp
macro_fft8_head
@ FFT4(z+8)
vldr d10, [a1, #8 * 2*4]
@@ -292,7 +335,220 @@ function ff_fft16_vfp, export=1
vstr d8, [a1, #0 * 2*4]
vstr d9, [a1, #4 * 2*4]
+ bx lr
+endfunc
+
+function ff_fft16_vfp, export=1
+T b fft16_vfp
+T endfunc
+T
+T function fft16_vfp
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
+ fmrx a2, FPSCR
+ fmxr FPSCR, a3
+ vpush {s16-s31}
+ mov ip, lr
+ bl fft16_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
- bx lr
+ bx ip
+endfunc
+
+.macro pass n, z0, z1, z2, z3
+ add v6, v5, #4*2*\n
+ @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
+ @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+ @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
+ @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+ vldr d8, [\z2, #8*(o2+1)] @ s16,s17
+ vldmdb v6!, {s2}
+ vldr d9, [\z3, #8*(o3+1)] @ s18,s19
+ vldmia v5!, {s0,s1} @ s0 is unused
+ vldr s7, [\z2, #8*o2] @ t1
+ vmul.f s20, s16, s2 @ vector * scalar
+ vldr s0, [\z3, #8*o3] @ t5
+ vldr s6, [\z2, #8*o2+4] @ t2
+ vldr s3, [\z3, #8*o3+4] @ t6
+ vmul.f s16, s16, s1 @ vector * scalar
+ ldr a4, =\n-1
+1: add \z0, \z0, #8*2
+ .if \n*4*2 >= 512
+ add \z1, \z1, #8*2
+ .endif
+ .if \n*4*2 >= 256
+ add \z2, \z2, #8*2
+ .endif
+ .if \n*4*2 >= 512
+ add \z3, \z3, #8*2
+ .endif
+ @ up to 2 stalls (VFP vector issuing / waiting for s0)
+ @ depending upon whether this is the first iteration and
+ @ how many add instructions are inserted above
+ vadd.f s4, s0, s7 @ t5
+ vadd.f s5, s6, s3 @ t6
+ vsub.f s6, s6, s3 @ t4
+ vsub.f s7, s0, s7 @ t3
+ vldr d6, [\z0, #8*0-8*2] @ s12,s13
+ vadd.f s0, s16, s21 @ t1
+ vldr d7, [\z1, #8*o1-8*2] @ s14,s15
+ vsub.f s1, s18, s23 @ t5
+ vadd.f s8, s4, s12 @ vector + vector
+ @ stall (VFP vector issuing)
+ @ stall (VFP vector issuing)
+ @ stall (VFP vector issuing)
+ vsub.f s4, s12, s4
+ vsub.f s5, s13, s5
+ vsub.f s6, s14, s6
+ vsub.f s7, s15, s7
+ vsub.f s2, s17, s20 @ t2
+ vadd.f s3, s19, s22 @ t6
+ vstr d4, [\z0, #8*0-8*2] @ s8,s9
+ vstr d5, [\z1, #8*o1-8*2] @ s10,s11
+ @ stall (waiting for s5)
+ vstr d2, [\z2, #8*o2-8*2] @ s4,s5
+ vadd.f s4, s1, s0 @ t5
+ vstr d3, [\z3, #8*o3-8*2] @ s6,s7
+ vsub.f s7, s1, s0 @ t3
+ vadd.f s5, s2, s3 @ t6
+ vsub.f s6, s2, s3 @ t4
+ vldr d6, [\z0, #8*1-8*2] @ s12,s13
+ vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
+ vldr d4, [\z2, #8*o2] @ s8,s9
+ vldmdb v6!, {s2,s3}
+ vldr d5, [\z3, #8*o3] @ s10,s11
+ vadd.f s20, s4, s12 @ vector + vector
+ vldmia v5!, {s0,s1}
+ vldr d8, [\z2, #8*(o2+1)] @ s16,s17
+ @ stall (VFP vector issuing)
+ vsub.f s4, s12, s4
+ vsub.f s5, s13, s5
+ vsub.f s6, s14, s6
+ vsub.f s7, s15, s7
+ vmul.f s12, s8, s3 @ vector * scalar
+ vstr d10, [\z0, #8*1-8*2] @ s20,s21
+ vldr d9, [\z3, #8*(o3+1)] @ s18,s19
+ vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
+ vmul.f s8, s8, s0 @ vector * scalar
+ vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
+ @ stall (waiting for s7)
+ vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
+ vmul.f s20, s16, s2 @ vector * scalar
+ @ stall (VFP vector issuing)
+ @ stall (VFP vector issuing)
+ @ stall (VFP vector issuing)
+ vadd.f s7, s8, s13 @ t1
+ vsub.f s6, s9, s12 @ t2
+ vsub.f s0, s10, s15 @ t5
+ vadd.f s3, s11, s14 @ t6
+ vmul.f s16, s16, s1 @ vector * scalar
+ subs a4, a4, #1
+ bne 1b
+ @ What remains is identical to the first two indentations of
+ @ the above, but without the increment of z
+ vadd.f s4, s0, s7 @ t5
+ vadd.f s5, s6, s3 @ t6
+ vsub.f s6, s6, s3 @ t4
+ vsub.f s7, s0, s7 @ t3
+ vldr d6, [\z0, #8*0] @ s12,s13
+ vadd.f s0, s16, s21 @ t1
+ vldr d7, [\z1, #8*o1] @ s14,s15
+ vsub.f s1, s18, s23 @ t5
+ vadd.f s8, s4, s12 @ vector + vector
+ vsub.f s4, s12, s4
+ vsub.f s5, s13, s5
+ vsub.f s6, s14, s6
+ vsub.f s7, s15, s7
+ vsub.f s2, s17, s20 @ t2
+ vadd.f s3, s19, s22 @ t6
+ vstr d4, [\z0, #8*0] @ s8,s9
+ vstr d5, [\z1, #8*o1] @ s10,s11
+ vstr d2, [\z2, #8*o2] @ s4,s5
+ vadd.f s4, s1, s0 @ t5
+ vstr d3, [\z3, #8*o3] @ s6,s7
+ vsub.f s7, s1, s0 @ t3
+ vadd.f s5, s2, s3 @ t6
+ vsub.f s6, s2, s3 @ t4
+ vldr d6, [\z0, #8*1] @ s12,s13
+ vldr d7, [\z1, #8*(o1+1)] @ s14,s15
+ vadd.f s20, s4, s12 @ vector + vector
+ vsub.f s4, s12, s4
+ vsub.f s5, s13, s5
+ vsub.f s6, s14, s6
+ vsub.f s7, s15, s7
+ vstr d10, [\z0, #8*1] @ s20,s21
+ vstr d11, [\z1, #8*(o1+1)] @ s22,s23
+ vstr d2, [\z2, #8*(o2+1)] @ s4,s5
+ vstr d3, [\z3, #8*(o3+1)] @ s6,s7
+.endm
+
+.macro fft_internal_vfp name, name2, name4, costable, n
+function \name
+ .if \n >= 512
+ push {v1-v6,lr}
+ .elseif \n >= 256
+ push {v1-v2,v5-v6,lr}
+ .else
+ push {v1,v5-v6,lr}
+ .endif
+ mov v1, a1
+ bl \name2
+ add a1, v1, #8*(\n/4)*2
+ bl \name4
+ ldr v5, =\costable
Should this perhaps use the movrelx macro?
+ add a1, v1, #8*(\n/4)*3
+ bl \name4
+ .if \n >= 512
+ .set o1, 0*(\n/4/2)
+ .set o2, 0*(\n/4/2)
+ .set o3, 0*(\n/4/2)
+ add v2, v1, #8*2*(\n/4/2)
+ add v3, v1, #8*4*(\n/4/2)
+ add v4, v1, #8*6*(\n/4/2)
+ pass (\n/4/2), v1, v2, v3, v4
+ pop {v1-v6,pc}
+ .elseif \n >= 256
+ .set o1, 2*(\n/4/2)
+ .set o2, 0*(\n/4/2)
+ .set o3, 2*(\n/4/2)
+ add v2, v1, #8*4*(\n/4/2)
+ pass (\n/4/2), v1, v1, v2, v2
+ pop {v1-v2,v5-v6,pc}
+ .else
+ .set o1, 2*(\n/4/2)
+ .set o2, 4*(\n/4/2)
+ .set o3, 6*(\n/4/2)
+ pass (\n/4/2), v1, v1, v1, v1
+ pop {v1,v5-v6,pc}
+ .endif
endfunc
+.endm
+
+#define DECL_FFT(n,n2,n4) \
+fft_internal_vfp fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\
You need to add X() around the ff_cos_N name here
+ ;\
+function fft##n##_vfp ;\
+ ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\
+ fmrx a2, FPSCR ;\
+ fmxr FPSCR, a3 ;\
+ vpush {s16-s31} ;\
+ mov ip, lr ;\
+ bl fft##n##_internal_vfp ;\
+ vpop {s16-s31} ;\
+ fmxr FPSCR, a2 ;\
+ bx ip ;\
+endfunc ;\
+ ;\
+.ltorg
gas-preprocessor didn't previously support multiple instructions concatenated on one line separated with semicolons - I did write a patch for doing that now though.
Nevertheless, wouldn't this be more readable written just as a normal gas macro instead of as a cpp macro? That's how the corresponding macro is done in fft_neon.S.
// Martin _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
