On 04/21/2011 08:19 PM, Loren Merritt wrote:
> avx version of `fft-test -n 4` errors out, and -n 3 crashes. Likewise
> for mdct up to -n 6.

All comments taken into account. I also had to write an AVX iMDCT version, since my trick of just calling mdct_sse for AVX no longer works now that the AVX and SSE permutations differ (see fft_perm_avx() in patch 3).

The results of a benchmark of my previous patch on actual SB (Sandy Bridge) hardware follow (thanks, Reinhard!).

-Vitor


siretart@sandy:~/libav/libav> master/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.5 us/transform [total time=1.52 s its=1048576]

siretart@sandy:~/libav/libav> master/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.5 us/transform [total time=1.52 s its=1048576]

siretart@sandy:~/libav/libav> master/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.5 us/transform [total time=1.53 s its=1048576]

siretart@sandy:~/libav/libav> master/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.5 us/transform [total time=1.52 s its=1048576]

Now the AVX variant (roughly 1.5x faster per transform):

siretart@sandy:~/libav/libav> avx/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.0 us/transform [total time=1.03 s its=1048576]

siretart@sandy:~/libav/libav> avx/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.0 us/transform [total time=1.03 s its=1048576]

siretart@sandy:~/libav/libav> avx/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.0 us/transform [total time=1.05 s its=1048576]

siretart@sandy:~/libav/libav> avx/libavcodec/fft-test -s
FFT 512 test
Checking...
max:0.000008 e:3.92148e-08
Speed test...
time: 1.0 us/transform [total time=1.04 s its=1048576]
From e01a7407a78fae25ace9613bf16985fe983c1e6d Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 23 Apr 2011 19:24:06 +0200
Subject: [PATCH 1/3] Update x86inc.asm from x264 to allow AVX emulation using SSE and MMX.

---
 libavcodec/x86/x86inc.asm |  249 ++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 248 insertions(+), 1 deletions(-)

diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm
index b7d1774..53091c1 100644
--- a/libavcodec/x86/x86inc.asm
+++ b/libavcodec/x86/x86inc.asm
@@ -1,10 +1,11 @@
 ;*****************************************************************************
 ;* x86inc.asm
 ;*****************************************************************************
-;* Copyright (C) 2005-2008 x264 project
+;* Copyright (C) 2005-2011 x264 project
 ;*
 ;* Authors: Loren Merritt <[email protected]>
 ;*          Anton Mitrofanov <[email protected]>
+;*          Jason Garrett-Glaser <[email protected]>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@@ -499,6 +500,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %endmacro
 
 %macro INIT_MMX 0
+    %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_MMX
     %define mmsize 8
     %define num_mmregs 8
@@ -520,6 +522,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %endmacro
 
 %macro INIT_XMM 0
+    %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_XMM
     %define mmsize 16
     %define num_mmregs 8
@@ -538,6 +541,31 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %endrep
 %endmacro
 
+%macro INIT_AVX 0
+    INIT_XMM
+    %assign avx_enabled 1
+    %define PALIGNR PALIGNR_SSSE3
+    %define RESET_MM_PERMUTATION INIT_AVX
+%endmacro
+
+%macro INIT_YMM 0
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM
+    %define mmsize 32
+    %define num_mmregs 8
+    %ifdef ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova vmovaps
+    %define movu vmovups
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, ymm %+ %%i
+    CAT_XDEFINE nymm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+%endmacro
+
 INIT_MMX
 
 ; I often want to use macros that permute their arguments. e.g. there's no
@@ -645,3 +673,222 @@ INIT_MMX
         sub %1, %2
     %endif
 %endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+    %if sizeof%5==32
+        v%1 %5, %6, %7
+    %else
+        %if sizeof%5==8
+            %define %%regmov movq
+        %elif %2
+            %define %%regmov movaps
+        %else
+            %define %%regmov movdqa
+        %endif
+
+        %if %4>=3+%3
+            %ifnidn %5, %6
+                %if avx_enabled && sizeof%5==16
+                    v%1 %5, %6, %7
+                %else
+                    %%regmov %5, %6
+                    %1 %5, %7
+                %endif
+            %else
+                %1 %5, %7
+            %endif
+        %elif %3
+            %1 %5, %6, %7
+        %else
+            %1 %5, %6
+        %endif
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
+%macro AVX_INSTR 3
+    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+        %else
+            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0
+AVX_INSTR addps, 1, 0
+AVX_INSTR addsd, 1, 0
+AVX_INSTR addss, 1, 0
+AVX_INSTR addsubpd, 1, 0
+AVX_INSTR addsubps, 1, 0
+AVX_INSTR andpd, 1, 0
+AVX_INSTR andps, 1, 0
+AVX_INSTR andnpd, 1, 0
+AVX_INSTR andnps, 1, 0
+AVX_INSTR blendpd, 1, 0
+AVX_INSTR blendps, 1, 0
+AVX_INSTR blendvpd, 1, 0
+AVX_INSTR blendvps, 1, 0
+AVX_INSTR cmppd, 1, 0
+AVX_INSTR cmpps, 1, 0
+AVX_INSTR cmpsd, 1, 0
+AVX_INSTR cmpss, 1, 0
+AVX_INSTR divpd, 1, 0
+AVX_INSTR divps, 1, 0
+AVX_INSTR divsd, 1, 0
+AVX_INSTR divss, 1, 0
+AVX_INSTR dppd, 1, 0
+AVX_INSTR dpps, 1, 0
+AVX_INSTR haddpd, 1, 0
+AVX_INSTR haddps, 1, 0
+AVX_INSTR hsubpd, 1, 0
+AVX_INSTR hsubps, 1, 0
+AVX_INSTR maxpd, 1, 0
+AVX_INSTR maxps, 1, 0
+AVX_INSTR maxsd, 1, 0
+AVX_INSTR maxss, 1, 0
+AVX_INSTR minpd, 1, 0
+AVX_INSTR minps, 1, 0
+AVX_INSTR minsd, 1, 0
+AVX_INSTR minss, 1, 0
+AVX_INSTR mpsadbw, 0, 1
+AVX_INSTR mulpd, 1, 0
+AVX_INSTR mulps, 1, 0
+AVX_INSTR mulsd, 1, 0
+AVX_INSTR mulss, 1, 0
+AVX_INSTR orpd, 1, 0
+AVX_INSTR orps, 1, 0
+AVX_INSTR packsswb, 0, 0
+AVX_INSTR packssdw, 0, 0
+AVX_INSTR packuswb, 0, 0
+AVX_INSTR packusdw, 0, 0
+AVX_INSTR paddb, 0, 0
+AVX_INSTR paddw, 0, 0
+AVX_INSTR paddd, 0, 0
+AVX_INSTR paddq, 0, 0
+AVX_INSTR paddsb, 0, 0
+AVX_INSTR paddsw, 0, 0
+AVX_INSTR paddusb, 0, 0
+AVX_INSTR paddusw, 0, 0
+AVX_INSTR palignr, 0, 1
+AVX_INSTR pand, 0, 0
+AVX_INSTR pandn, 0, 0
+AVX_INSTR pavgb, 0, 0
+AVX_INSTR pavgw, 0, 0
+AVX_INSTR pblendvb, 0, 0
+AVX_INSTR pblendw, 0, 1
+AVX_INSTR pcmpestri, 0, 0
+AVX_INSTR pcmpestrm, 0, 0
+AVX_INSTR pcmpistri, 0, 0
+AVX_INSTR pcmpistrm, 0, 0
+AVX_INSTR pcmpeqb, 0, 0
+AVX_INSTR pcmpeqw, 0, 0
+AVX_INSTR pcmpeqd, 0, 0
+AVX_INSTR pcmpeqq, 0, 0
+AVX_INSTR pcmpgtb, 0, 0
+AVX_INSTR pcmpgtw, 0, 0
+AVX_INSTR pcmpgtd, 0, 0
+AVX_INSTR pcmpgtq, 0, 0
+AVX_INSTR phaddw, 0, 0
+AVX_INSTR phaddd, 0, 0
+AVX_INSTR phaddsw, 0, 0
+AVX_INSTR phsubw, 0, 0
+AVX_INSTR phsubd, 0, 0
+AVX_INSTR phsubsw, 0, 0
+AVX_INSTR pmaddwd, 0, 0
+AVX_INSTR pmaddubsw, 0, 0
+AVX_INSTR pmaxsb, 0, 0
+AVX_INSTR pmaxsw, 0, 0
+AVX_INSTR pmaxsd, 0, 0
+AVX_INSTR pmaxub, 0, 0
+AVX_INSTR pmaxuw, 0, 0
+AVX_INSTR pmaxud, 0, 0
+AVX_INSTR pminsb, 0, 0
+AVX_INSTR pminsw, 0, 0
+AVX_INSTR pminsd, 0, 0
+AVX_INSTR pminub, 0, 0
+AVX_INSTR pminuw, 0, 0
+AVX_INSTR pminud, 0, 0
+AVX_INSTR pmulhuw, 0, 0
+AVX_INSTR pmulhrsw, 0, 0
+AVX_INSTR pmulhw, 0, 0
+AVX_INSTR pmullw, 0, 0
+AVX_INSTR pmulld, 0, 0
+AVX_INSTR pmuludq, 0, 0
+AVX_INSTR pmuldq, 0, 0
+AVX_INSTR por, 0, 0
+AVX_INSTR psadbw, 0, 0
+AVX_INSTR pshufb, 0, 0
+AVX_INSTR psignb, 0, 0
+AVX_INSTR psignw, 0, 0
+AVX_INSTR psignd, 0, 0
+AVX_INSTR psllw, 0, 0
+AVX_INSTR pslld, 0, 0
+AVX_INSTR psllq, 0, 0
+AVX_INSTR pslldq, 0, 0
+AVX_INSTR psraw, 0, 0
+AVX_INSTR psrad, 0, 0
+AVX_INSTR psrlw, 0, 0
+AVX_INSTR psrld, 0, 0
+AVX_INSTR psrlq, 0, 0
+AVX_INSTR psrldq, 0, 0
+AVX_INSTR psubb, 0, 0
+AVX_INSTR psubw, 0, 0
+AVX_INSTR psubd, 0, 0
+AVX_INSTR psubq, 0, 0
+AVX_INSTR psubsb, 0, 0
+AVX_INSTR psubsw, 0, 0
+AVX_INSTR psubusb, 0, 0
+AVX_INSTR psubusw, 0, 0
+AVX_INSTR punpckhbw, 0, 0
+AVX_INSTR punpckhwd, 0, 0
+AVX_INSTR punpckhdq, 0, 0
+AVX_INSTR punpckhqdq, 0, 0
+AVX_INSTR punpcklbw, 0, 0
+AVX_INSTR punpcklwd, 0, 0
+AVX_INSTR punpckldq, 0, 0
+AVX_INSTR punpcklqdq, 0, 0
+AVX_INSTR pxor, 0, 0
+AVX_INSTR shufps, 0, 1
+AVX_INSTR subpd, 1, 0
+AVX_INSTR subps, 1, 0
+AVX_INSTR subsd, 1, 0
+AVX_INSTR subss, 1, 0
+AVX_INSTR unpckhpd, 1, 0
+AVX_INSTR unpckhps, 1, 0
+AVX_INSTR unpcklpd, 1, 0
+AVX_INSTR unpcklps, 1, 0
+AVX_INSTR xorpd, 1, 0
+AVX_INSTR xorps, 1, 0
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0
+AVX_INSTR pfsub, 1, 0
+AVX_INSTR pfmul, 1, 0
-- 
1.7.1

From 9c37b476fa578579e0cce6757532b8cdf62285d3 Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 23 Apr 2011 19:24:31 +0200
Subject: [PATCH 2/3] Increase alignment of av_malloc() as needed by AVX ASM.

---
 libavutil/mem.c |   16 +++++++---------
 1 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/libavutil/mem.c b/libavutil/mem.c
index 7a54bd0..7e3f9f0 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -69,21 +69,21 @@ void *av_malloc(FF_INTERNAL_MEM_TYPE size)
 #endif
 
     /* let's disallow possible ambiguous cases */
-    if(size > (INT_MAX-16) )
+    if(size > (INT_MAX-32) )
         return NULL;
 
 #if CONFIG_MEMALIGN_HACK
-    ptr = malloc(size+16);
+    ptr = malloc(size+32);
     if(!ptr)
         return ptr;
-    diff= ((-(long)ptr - 1)&15) + 1;
+    diff= ((-(long)ptr - 1)&31) + 1;
     ptr = (char*)ptr + diff;
     ((char*)ptr)[-1]= diff;
 #elif HAVE_POSIX_MEMALIGN
-    if (posix_memalign(&ptr,16,size))
+    if (posix_memalign(&ptr,32,size))
         ptr = NULL;
 #elif HAVE_MEMALIGN
-    ptr = memalign(16,size);
+    ptr = memalign(32,size);
     /* Why 64?
        Indeed, we should align it:
          on 4 for 386
@@ -93,10 +93,8 @@ void *av_malloc(FF_INTERNAL_MEM_TYPE size)
        Because L1 and L2 caches are aligned on those values.
        But I don't want to code such logic here!
      */
-     /* Why 16?
-        Because some CPUs need alignment, for example SSE2 on P4, & most RISC CPUs
-        it will just trigger an exception and the unaligned load will be done in the
-        exception handler or it will just segfault (SSE2 on P4).
+     /* Why 32?
+        For AVX ASM. SSE / NEON need only 16.
         Why not larger? Because I did not see a difference in benchmarks ...
      */
      /* benchmarks with P3
-- 
1.7.1

From 17abe6bfc02ef5e40581de2777e15defe78086de Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 23 Apr 2011 19:24:57 +0200
Subject: [PATCH 3/3] Add AVX FFT implementation.

---
 libavcodec/aac.h           |   10 +-
 libavcodec/aacenc.h        |    2 +-
 libavcodec/ac3dec.h        |   10 +-
 libavcodec/ac3enc.c        |    2 +-
 libavcodec/atrac1.c        |   20 +-
 libavcodec/atrac3.c        |    6 +-
 libavcodec/binkaudio.c     |    2 +-
 libavcodec/cook.c          |    2 +-
 libavcodec/dca.c           |   10 +-
 libavcodec/fft.c           |   53 +++++-
 libavcodec/fft.h           |    3 +-
 libavcodec/imc.c           |    2 +-
 libavcodec/nellymoserdec.c |    4 +-
 libavcodec/nellymoserenc.c |    6 +-
 libavcodec/qdm2.c          |    2 +-
 libavcodec/wma.h           |    8 +-
 libavcodec/wmaprodec.c     |    4 +-
 libavcodec/wmavoice.c      |    6 +-
 libavcodec/x86/fft.c       |    9 +-
 libavcodec/x86/fft.h       |    2 +
 libavcodec/x86/fft_mmx.asm |  478 ++++++++++++++++++++++++++++++--------------
 libavcodec/x86/fft_sse.c   |    8 +-
 22 files changed, 443 insertions(+), 206 deletions(-)

diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index e3385e2..bbe7912 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -223,9 +223,9 @@ typedef struct {
     float sf[120];                                  ///< scalefactors
     int sf_idx[128];                                ///< scalefactor indices (used by encoder)
     uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
-    DECLARE_ALIGNED(16, float,   coeffs)[1024];     ///< coefficients for IMDCT
-    DECLARE_ALIGNED(16, float,   saved)[1024];      ///< overlap
-    DECLARE_ALIGNED(16, float,   ret)[2048];        ///< PCM output
+    DECLARE_ALIGNED(32, float,   coeffs)[1024];     ///< coefficients for IMDCT
+    DECLARE_ALIGNED(32, float,   saved)[1024];      ///< overlap
+    DECLARE_ALIGNED(32, float,   ret)[2048];        ///< PCM output
     DECLARE_ALIGNED(16, int16_t, ltp_state)[3072];  ///< time signal for LTP
     PredictorState predictor_state[MAX_PREDICTORS];
 } SingleChannelElement;
@@ -272,7 +272,7 @@ typedef struct {
      * @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
      * @{
      */
-    DECLARE_ALIGNED(16, float, buf_mdct)[1024];
+    DECLARE_ALIGNED(32, float, buf_mdct)[1024];
     /** @} */
 
     /**
@@ -296,7 +296,7 @@ typedef struct {
     int sf_offset;                                    ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
     /** @} */
 
-    DECLARE_ALIGNED(16, float, temp)[128];
+    DECLARE_ALIGNED(32, float, temp)[128];
 
     enum OCStatus output_configured;
 } AACContext;
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index 3d584d2..7e08db2 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -64,7 +64,7 @@ typedef struct AACEncContext {
     int last_frame;
     float lambda;
     DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
-    DECLARE_ALIGNED(16, float, scoefs)[1024];    ///< scaled coefficients
+    DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
 } AACEncContext;
 
 #endif /* AVCODEC_AACENC_H */
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 3459441..6cba95b 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -200,11 +200,11 @@ typedef struct {
 
 ///@defgroup arrays aligned arrays
     DECLARE_ALIGNED(16, int,   fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];       ///> fixed-point transform coefficients
-    DECLARE_ALIGNED(16, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
-    DECLARE_ALIGNED(16, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
-    DECLARE_ALIGNED(16, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
-    DECLARE_ALIGNED(16, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
-    DECLARE_ALIGNED(16, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
+    DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS];   ///< transform coefficients
+    DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];             ///< delay - added to the next block
+    DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE];                              ///< window coefficients
+    DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE];                          ///< temporary storage for output before windowing
+    DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE];            ///< output after imdct transform and windowing
 ///@}
 } AC3DecodeContext;
 
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 04e8b4f..77647d4 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
 
     uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
 
-    DECLARE_ALIGNED(16, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
+    DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
 } AC3EncodeContext;
 
 typedef struct AC3Mant {
diff --git a/libavcodec/atrac1.c b/libavcodec/atrac1.c
index d6c7053..0ba2cf6 100644
--- a/libavcodec/atrac1.c
+++ b/libavcodec/atrac1.c
@@ -60,11 +60,11 @@ typedef struct {
     int                 log2_block_count[AT1_QMF_BANDS];    ///< log2 number of blocks in a band
     int                 num_bfus;                           ///< number of Block Floating Units
     float*              spectrum[2];
-    DECLARE_ALIGNED(16, float, spec1)[AT1_SU_SAMPLES];     ///< mdct buffer
-    DECLARE_ALIGNED(16, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
-    DECLARE_ALIGNED(16, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
-    DECLARE_ALIGNED(16, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
-    DECLARE_ALIGNED(16, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
+    DECLARE_ALIGNED(32, float, spec1)[AT1_SU_SAMPLES];     ///< mdct buffer
+    DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES];     ///< mdct buffer
+    DECLARE_ALIGNED(32, float, fst_qmf_delay)[46];         ///< delay line for the 1st stacked QMF filter
+    DECLARE_ALIGNED(32, float, snd_qmf_delay)[46];         ///< delay line for the 2nd stacked QMF filter
+    DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23];    ///< delay line for the last stacked QMF filter
 } AT1SUCtx;
 
 /**
@@ -72,13 +72,13 @@ typedef struct {
  */
 typedef struct {
     AT1SUCtx            SUs[AT1_MAX_CHANNELS];              ///< channel sound unit
-    DECLARE_ALIGNED(16, float, spec)[AT1_SU_SAMPLES];      ///< the mdct spectrum buffer
+    DECLARE_ALIGNED(32, float, spec)[AT1_SU_SAMPLES];      ///< the mdct spectrum buffer
 
-    DECLARE_ALIGNED(16, float,  low)[256];
-    DECLARE_ALIGNED(16, float,  mid)[256];
-    DECLARE_ALIGNED(16, float, high)[512];
+    DECLARE_ALIGNED(32, float,  low)[256];
+    DECLARE_ALIGNED(32, float,  mid)[256];
+    DECLARE_ALIGNED(32, float, high)[512];
     float*              bands[3];
-    DECLARE_ALIGNED(16, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
+    DECLARE_ALIGNED(32, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
     FFTContext          mdct_ctx[3];
     int                 channels;
     DSPContext          dsp;
diff --git a/libavcodec/atrac3.c b/libavcodec/atrac3.c
index 5633520..5c8b87d 100644
--- a/libavcodec/atrac3.c
+++ b/libavcodec/atrac3.c
@@ -74,8 +74,8 @@ typedef struct {
     int               gcBlkSwitch;
     gain_block        gainBlock[2];
 
-    DECLARE_ALIGNED(16, float, spectrum)[1024];
-    DECLARE_ALIGNED(16, float, IMDCT_buf)[1024];
+    DECLARE_ALIGNED(32, float, spectrum)[1024];
+    DECLARE_ALIGNED(32, float, IMDCT_buf)[1024];
 
     float             delayBuf1[46]; ///<qmf delay buffers
     float             delayBuf2[46];
@@ -122,7 +122,7 @@ typedef struct {
     FFTContext          mdct_ctx;
 } ATRAC3Context;
 
-static DECLARE_ALIGNED(16, float,mdct_window)[512];
+static DECLARE_ALIGNED(32, float,mdct_window)[512];
 static VLC              spectral_coeff_tab[7];
 static float            gain_tab1[16];
 static float            gain_tab2[31];
diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c
index 77ce6b9..a05b0b5 100644
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -55,7 +55,7 @@ typedef struct {
     int num_bands;
     unsigned int *bands;
     float root;
-    DECLARE_ALIGNED(16, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
     DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16];  ///< coeffs from previous audio block
     float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave
     union {
diff --git a/libavcodec/cook.c b/libavcodec/cook.c
index 7717c4b..42ebc08 100644
--- a/libavcodec/cook.c
+++ b/libavcodec/cook.c
@@ -153,7 +153,7 @@ typedef struct cook {
     /* data buffers */
 
     uint8_t*            decoded_bytes_buffer;
-    DECLARE_ALIGNED(16, float,mono_mdct_output)[2048];
+    DECLARE_ALIGNED(32, float,mono_mdct_output)[2048];
     float               decode_buffer_1[1024];
     float               decode_buffer_2[1024];
     float               decode_buffer_0[1060]; /* static allocation for joint decode */
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index e3c6466..cd13285 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -321,16 +321,16 @@ typedef struct {
 
     /* Subband samples history (for ADPCM) */
     float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4];
-    DECLARE_ALIGNED(16, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
-    DECLARE_ALIGNED(16, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
+    DECLARE_ALIGNED(32, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
+    DECLARE_ALIGNED(32, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
     int hist_index[DCA_PRIM_CHANNELS_MAX];
-    DECLARE_ALIGNED(16, float, raXin)[32];
+    DECLARE_ALIGNED(32, float, raXin)[32];
 
     int output;                 ///< type of output
     float scale_bias;           ///< output scale
 
-    DECLARE_ALIGNED(16, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
-    DECLARE_ALIGNED(16, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
+    DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
+    DECLARE_ALIGNED(32, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
     const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1];
 
     uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index 077f471..5bd8463 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
 #endif
 }
 
+static const int avx_tab[] = {
+    0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
+};
+
+static int is_second_half_of_fft32(int i, int n)
+{
+    if (n <= 32)
+        return i >= 16;
+    else if (i < n/2)
+        return is_second_half_of_fft32(i, n/2);
+    else if (i < 3*n/4)
+        return is_second_half_of_fft32(i - n/2, n/4);
+    else
+        return is_second_half_of_fft32(i - 3*n/4, n/4);
+}
+
+static av_cold void fft_perm_avx(FFTContext *s)
+{
+    int i;
+    int n = 1 << s->nbits;
+
+    for(i = 0; i < n; i += 16) {
+        int k;
+        if (is_second_half_of_fft32(i, n)) {
+            for (k=0; k < 16; k++)
+                s->revtab[-split_radix_permutation(i+k, n, s->inverse) & (n-1)] =
+                    i + avx_tab[k];
+
+        } else {
+            for (k=0; k < 16; k++) {
+                int j = i + k;
+                j = (j&~7) | ((j>>1)&3) | ((j<<2)&4);
+                s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n-1)] = j;
+            }
+        }
+    }
+}
+
 av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 {
     int i, j, n;
@@ -132,11 +170,16 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
     for(j=4; j<=nbits; j++) {
         ff_init_ff_cos_tabs(j);
     }
-    for(i=0; i<n; i++) {
-        int j = i;
-        if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
-            j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
-        s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
+
+    if (s->fft_permutation == FF_FFT_PERM_AVX) {
+        fft_perm_avx(s);
+    } else {
+        for(i=0; i<n; i++) {
+            int j = i;
+            if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
+                j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
+            s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
+        }
     }
 
     return 0;
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index a4fee00..dc3c190 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -85,6 +85,7 @@ struct FFTContext {
     int fft_permutation;
 #define FF_FFT_PERM_DEFAULT   0
 #define FF_FFT_PERM_SWAP_LSBS 1
+#define FF_FFT_PERM_AVX       2
     int mdct_permutation;
 #define FF_MDCT_PERM_NONE       0
 #define FF_MDCT_PERM_INTERLEAVE 1
@@ -97,7 +98,7 @@ struct FFTContext {
 #endif
 
 #define COSTABLE(size) \
-    COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
+    COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
 
 extern COSTABLE(16);
 extern COSTABLE(32);
diff --git a/libavcodec/imc.c b/libavcodec/imc.c
index e48a709..07d6cad 100644
--- a/libavcodec/imc.c
+++ b/libavcodec/imc.c
@@ -88,7 +88,7 @@ typedef struct {
 
     DSPContext dsp;
     FFTContext fft;
-    DECLARE_ALIGNED(16, FFTComplex, samples)[COEFFS/2];
+    DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS/2];
     float *out_samples;
 } IMCContext;
 
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index 5ad49ab..12aaed7 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -47,7 +47,7 @@
 
 typedef struct NellyMoserDecodeContext {
     AVCodecContext* avctx;
-    DECLARE_ALIGNED(16, float,float_buf)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float,float_buf)[NELLY_SAMPLES];
     float           state[128];
     AVLFG           random_state;
     GetBitContext   gb;
@@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
     DSPContext      dsp;
     FFTContext      imdct_ctx;
     FmtConvertContext fmt_conv;
-    DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
+    DECLARE_ALIGNED(32, float,imdct_out)[NELLY_BUF_LEN * 2];
 } NellyMoserDecodeContext;
 
 static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in)
diff --git a/libavcodec/nellymoserenc.c b/libavcodec/nellymoserenc.c
index 586443d..ef88ff8 100644
--- a/libavcodec/nellymoserenc.c
+++ b/libavcodec/nellymoserenc.c
@@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
     int             have_saved;
     DSPContext      dsp;
     FFTContext      mdct_ctx;
-    DECLARE_ALIGNED(16, float, mdct_out)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(16, float, in_buff)[NELLY_SAMPLES];
-    DECLARE_ALIGNED(16, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
+    DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
+    DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN];     ///< sample buffer
     float           (*opt )[NELLY_BANDS];
     uint8_t         (*path)[NELLY_BANDS];
 } NellyMoserEncodeContext;
diff --git a/libavcodec/qdm2.c b/libavcodec/qdm2.c
index 3ef712c..198f11f 100644
--- a/libavcodec/qdm2.c
+++ b/libavcodec/qdm2.c
@@ -120,7 +120,7 @@ typedef struct {
 } FFTCoefficient;
 
 typedef struct {
-    DECLARE_ALIGNED(16, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
+    DECLARE_ALIGNED(32, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
 } QDM2FFT;
 
 /**
diff --git a/libavcodec/wma.h b/libavcodec/wma.h
index d12c55c..f11d550 100644
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -113,15 +113,15 @@ typedef struct WMACodecContext {
     uint8_t ms_stereo;                      ///< true if mid/side stereo mode
     uint8_t channel_coded[MAX_CHANNELS];    ///< true if channel is coded
     int exponents_bsize[MAX_CHANNELS];      ///< log2 ratio frame/exp. length
-    DECLARE_ALIGNED(16, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
     float max_exponent[MAX_CHANNELS];
     WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    DECLARE_ALIGNED(16, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
-    DECLARE_ALIGNED(16, FFTSample, output)[BLOCK_MAX_SIZE * 2];
+    DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
+    DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
     FFTContext mdct_ctx[BLOCK_NB_SIZES];
     float *windows[BLOCK_NB_SIZES];
     /* output buffer for one frame and the last for IMDCT windowing */
-    DECLARE_ALIGNED(16, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
+    DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
     /* last frame info */
     uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
     int last_bitoffset;
diff --git a/libavcodec/wmaprodec.c b/libavcodec/wmaprodec.c
index c9048a9..ab2cd5c 100644
--- a/libavcodec/wmaprodec.c
+++ b/libavcodec/wmaprodec.c
@@ -145,7 +145,7 @@ typedef struct {
     uint8_t  table_idx;                               ///< index in sf_offsets for the scale factor reference block
     float*   coeffs;                                  ///< pointer to the subframe decode buffer
     uint16_t num_vec_coeffs;                          ///< number of vector coded coefficients
-    DECLARE_ALIGNED(16, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
+    DECLARE_ALIGNED(32, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
 } WMAProChannelCtx;
 
 /**
@@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
                       FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
     PutBitContext    pb;                            ///< context for filling the frame_data buffer
     FFTContext       mdct_ctx[WMAPRO_BLOCK_SIZES];  ///< MDCT context per block size
-    DECLARE_ALIGNED(16, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
+    DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
     float*           windows[WMAPRO_BLOCK_SIZES];   ///< windows for the different block sizes
 
     /* frame size dependent frame information (set during initialization) */
diff --git a/libavcodec/wmavoice.c b/libavcodec/wmavoice.c
index ea8260c..1bf9f27 100644
--- a/libavcodec/wmavoice.c
+++ b/libavcodec/wmavoice.c
@@ -275,11 +275,11 @@ typedef struct {
                                   ///< by postfilter
     float denoise_filter_cache[MAX_FRAMESIZE];
     int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
-    DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
+    DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
                                   ///< aligned buffer for LPC tilting
-    DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
+    DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
                                   ///< aligned buffer for denoise coefficients
-    DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
+    DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
                                   ///< aligned buffer for postfilter speech
                                   ///< synthesis
     /**
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 2426a3d..80cc896 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
 {
 #if HAVE_YASM
     int has_vectors = av_get_cpu_flags();
-    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
+    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
+        /* AVX for Sandy Bridge */
+        s->imdct_calc  = ff_imdct_calc_sse;
+        s->imdct_half  = ff_imdct_half_avx;
+        s->fft_permute = ff_fft_permute_sse;
+        s->fft_calc    = ff_fft_calc_avx;
+        s->fft_permutation = FF_FFT_PERM_AVX;
+    } else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
         /* SSE for P3/P4/K8 */
         s->imdct_calc  = ff_imdct_calc_sse;
         s->imdct_half  = ff_imdct_half_sse;
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 073d408..e6eace2 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -22,6 +22,7 @@
 #include "libavcodec/fft.h"
 
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
@@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
 
 #endif
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index e3829b8..b4dd51d 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -1,6 +1,7 @@
 ;******************************************************************************
 ;* FFT transform with SSE/3DNow optimizations
 ;* Copyright (c) 2008 Loren Merritt
+;* Copyright (c) 2011 Vitor Sessak
 ;*
 ;* This algorithm (though not any of the implementation details) is
 ;* based on libdjbfft by D. J. Bernstein.
@@ -49,11 +50,22 @@ endstruc
 SECTION_RODATA
 
 %define M_SQRT1_2 0.70710678118654752440
-ps_root2: times 4 dd M_SQRT1_2
-ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
-ps_p1p1m1p1: dd 0, 0, 1<<31, 0
+%define M_COS_PI_1_8 0.923879532511287
+%define M_COS_PI_3_8 0.38268343236509
+
+ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
+ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
+
+ps_root2: times 8 dd M_SQRT1_2
+ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
 ps_m1p1: dd 1<<31, 0
 
+perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
+perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
+ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
+ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
+
 %assign i 16
 %rep 13
 cextern cos_ %+ i
@@ -96,51 +108,80 @@ section .text align=16
     SWAP     %3, %6
 %endmacro
 
+;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
+;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
+;      %3, %4, %5 tmp
+; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
+;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
+%macro T8_AVX 5
+    vsubps     %5, %1, %2  ; v  = %1 - %2
+    vaddps     %3, %1, %2  ; w  = %1 + %2
+    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
+    vpermilps  %2, %2, [perm1]
+    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
+    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
+    vsubps     %4, %5, %1  ; s = r - q
+    vaddps     %1, %5, %1  ; u = r + q
+    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
+    vshufps    %5, %4, %1, 0xbb
+    vshufps    %3, %4, %1, 0xee
+    vperm2f128 %3, %3, %5, 0x13
+    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
+    vshufps    %2, %1, %4, 0xdd
+    vshufps    %1, %1, %4, 0x88
+    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
+    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
+    vsubps     %5, %1, %3
+    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
+    vsubps     %2, %4, %1  ; %2 = v - w
+    vaddps     %1, %4, %1  ; %1 = v + w
+%endmacro
+
+; In SSE mode do one fft4 transform
 ; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
+;
+; In AVX mode do two fft4 transforms
+; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
+; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
 %macro T4_SSE 3
-    mova     %3, %1
-    addps    %1, %2       ; {t1,t2,t6,t5}
-    subps    %3, %2       ; {t3,t4,-t8,t7}
-    xorps    %3, [ps_p1p1m1p1]
-    mova     %2, %1
-    shufps   %1, %3, 0x44 ; {t1,t2,t3,t4}
-    shufps   %2, %3, 0xbe ; {t6,t5,t7,t8}
-    mova     %3, %1
-    addps    %1, %2       ; {r0,i0,r1,i1}
-    subps    %3, %2       ; {r2,i2,r3,i3}
-    mova     %2, %1
-    shufps   %1, %3, 0x88 ; {r0,r1,r2,r3}
-    shufps   %2, %3, 0xdd ; {i0,i1,i2,i3}
+    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
+    addps    %1, %1, %2       ; {t1,t2,t6,t5}
+    xorps    %3, %3, [ps_p1p1m1p1]
+    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
+    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
+    subps    %3, %1, %2       ; {r2,i2,r3,i3}
+    addps    %1, %1, %2       ; {r0,i0,r1,i1}
+    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
+    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
 %endmacro
 
+; In SSE mode do one FFT8 transform
 ; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
+;
+; In AVX mode do two FFT8 transforms
+; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
+;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
+; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
+;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} 
 %macro T8_SSE 6
-    mova     %6, %3
-    subps    %3, %4       ; {r5,i5,r7,i7}
-    addps    %6, %4       ; {t1,t2,t3,t4}
-    mova     %4, %3
-    shufps   %4, %4, 0xb1 ; {i5,r5,i7,r7}
-    mulps    %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
-    mulps    %4, [ps_root2]
-    addps    %3, %4       ; {t8,t7,ta,t9}
-    mova     %4, %6
-    shufps   %6, %3, 0x36 ; {t3,t2,t9,t8}
-    shufps   %4, %3, 0x9c ; {t1,t4,t7,ta}
-    mova     %3, %6
-    addps    %6, %4       ; {t1,t2,t9,ta}
-    subps    %3, %4       ; {t6,t5,tc,tb}
-    mova     %4, %6
-    shufps   %6, %3, 0xd8 ; {t1,t9,t5,tb}
-    shufps   %4, %3, 0x8d ; {t2,ta,t6,tc}
-    mova     %3, %1
-    mova     %5, %2
-    addps    %1, %6       ; {r0,r1,r2,r3}
-    addps    %2, %4       ; {i0,i1,i2,i3}
-    subps    %3, %6       ; {r4,r5,r6,r7}
-    subps    %5, %4       ; {i4,i5,i6,i7}
-    SWAP     %4, %5
+    addps    %6, %3, %4       ; {t1,t2,t3,t4}
+    subps    %3, %3, %4       ; {r5,i5,r7,i7}
+    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
+    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
+    mulps    %4, %4, [ps_root2]
+    addps    %3, %3, %4       ; {t8,t7,ta,t9}
+    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
+    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
+    subps    %3, %6, %4       ; {t6,t5,tc,tb}
+    addps    %6, %6, %4       ; {t1,t2,t9,ta}
+    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
+    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
+    subps    %3, %1, %6       ; {r4,r5,r6,r7}
+    addps    %1, %1, %6       ; {r0,r1,r2,r3}
+    subps    %4, %2, %5       ; {i4,i5,i6,i7}
+    addps    %2, %2, %5       ; {i0,i1,i2,i3}
 %endmacro
 
 ; scheduled for cpu-bound sizes
@@ -148,52 +189,44 @@ section .text align=16
 IF%1 mova    m4, Z(4)
 IF%1 mova    m5, Z(5)
     mova     m0, %2 ; wre
-    mova     m2, m4
     mova     m1, %3 ; wim
-    mova     m3, m5
-    mulps    m2, m0 ; r2*wre
+    mulps    m2, m4, m0 ; r2*wre
 IF%1 mova    m6, Z2(6)
-    mulps    m3, m1 ; i2*wim
+    mulps    m3, m5, m1 ; i2*wim
 IF%1 mova    m7, Z2(7)
-    mulps    m4, m1 ; r2*wim
-    mulps    m5, m0 ; i2*wre
-    addps    m2, m3 ; r2*wre + i2*wim
-    mova     m3, m1
-    mulps    m1, m6 ; r3*wim
-    subps    m5, m4 ; i2*wre - r2*wim
-    mova     m4, m0
-    mulps    m3, m7 ; i3*wim
-    mulps    m4, m6 ; r3*wre
-    mulps    m0, m7 ; i3*wre
-    subps    m4, m3 ; r3*wre - i3*wim
+    mulps    m4, m4, m1 ; r2*wim
+    mulps    m5, m5, m0 ; i2*wre
+    addps    m2, m2, m3 ; r2*wre + i2*wim
+    mulps    m3, m1, m7 ; i3*wim
+    subps    m5, m5, m4 ; i2*wre - r2*wim
+    mulps    m1, m1, m6 ; r3*wim
+    mulps    m4, m0, m6 ; r3*wre
+    mulps    m0, m0, m7 ; i3*wre
+    subps    m4, m4, m3 ; r3*wre - i3*wim
     mova     m3, Z(0)
-    addps    m0, m1 ; i3*wre + r3*wim
-    mova     m1, m4
-    addps    m4, m2 ; t5
-    subps    m1, m2 ; t3
-    subps    m3, m4 ; r2
-    addps    m4, Z(0) ; r0
+    addps    m0, m0, m1 ; i3*wre + r3*wim
+    subps    m1, m4, m2 ; t3
+    addps    m4, m4, m2 ; t5
+    subps    m3, m3, m4 ; r2
+    addps    m4, m4, Z(0) ; r0
     mova     m6, Z(2)
     mova   Z(4), m3
     mova   Z(0), m4
-    mova     m3, m5
-    subps    m5, m0 ; t4
-    mova     m4, m6
-    subps    m6, m5 ; r3
-    addps    m5, m4 ; r1
-    mova  Z2(6), m6
-    mova   Z(2), m5
+    subps    m3, m5, m0 ; t4
+    subps    m4, m6, m3 ; r3
+    addps    m3, m3, m6 ; r1
+    mova  Z2(6), m4
+    mova   Z(2), m3
     mova     m2, Z(3)
-    addps    m3, m0 ; t6
-    subps    m2, m1 ; i3
+    addps    m3, m5, m0 ; t6
+    subps    m2, m2, m1 ; i3
     mova     m7, Z(1)
-    addps    m1, Z(3) ; i1
+    addps    m1, m1, Z(3) ; i1
     mova  Z2(7), m2
     mova   Z(3), m1
-    mova     m4, m7
-    subps    m7, m3 ; i2
-    addps    m3, m4 ; i0
-    mova   Z(5), m7
+    subps    m4, m7, m3 ; i2
+    addps    m3, m3, m7 ; i0
+    mova   Z(5), m4
     mova   Z(1), m3
 %endmacro
 
@@ -201,77 +234,55 @@ IF%1 mova    m7, Z2(7)
 %macro PASS_BIG 1 ; (!interleave)
     mova     m4, Z(4) ; r2
     mova     m5, Z(5) ; i2
-    mova     m2, m4
     mova     m0, [wq] ; wre
-    mova     m3, m5
     mova     m1, [wq+o1q] ; wim
-    mulps    m2, m0 ; r2*wre
+    mulps    m2, m4, m0 ; r2*wre
     mova     m6, Z2(6) ; r3
-    mulps    m3, m1 ; i2*wim
+    mulps    m3, m5, m1 ; i2*wim
     mova     m7, Z2(7) ; i3
-    mulps    m4, m1 ; r2*wim
-    mulps    m5, m0 ; i2*wre
-    addps    m2, m3 ; r2*wre + i2*wim
-    mova     m3, m1
-    mulps    m1, m6 ; r3*wim
-    subps    m5, m4 ; i2*wre - r2*wim
-    mova     m4, m0
-    mulps    m3, m7 ; i3*wim
-    mulps    m4, m6 ; r3*wre
-    mulps    m0, m7 ; i3*wre
-    subps    m4, m3 ; r3*wre - i3*wim
+    mulps    m4, m4, m1 ; r2*wim
+    mulps    m5, m5, m0 ; i2*wre
+    addps    m2, m2, m3 ; r2*wre + i2*wim
+    mulps    m3, m1, m7 ; i3*wim
+    mulps    m1, m1, m6 ; r3*wim
+    subps    m5, m5, m4 ; i2*wre - r2*wim
+    mulps    m4, m0, m6 ; r3*wre
+    mulps    m0, m0, m7 ; i3*wre
+    subps    m4, m4, m3 ; r3*wre - i3*wim
     mova     m3, Z(0)
-    addps    m0, m1 ; i3*wre + r3*wim
-    mova     m1, m4
-    addps    m4, m2 ; t5
-    subps    m1, m2 ; t3
-    subps    m3, m4 ; r2
-    addps    m4, Z(0) ; r0
+    addps    m0, m0, m1 ; i3*wre + r3*wim
+    subps    m1, m4, m2 ; t3
+    addps    m4, m4, m2 ; t5
+    subps    m3, m3, m4 ; r2
+    addps    m4, m4, Z(0) ; r0
     mova     m6, Z(2)
     mova   Z(4), m3
     mova   Z(0), m4
-    mova     m3, m5
-    subps    m5, m0 ; t4
-    mova     m4, m6
-    subps    m6, m5 ; r3
-    addps    m5, m4 ; r1
-IF%1 mova Z2(6), m6
-IF%1 mova  Z(2), m5
+    subps    m3, m5, m0 ; t4
+    subps    m4, m6, m3 ; r3
+    addps    m3, m3, m6 ; r1
+IF%1 mova Z2(6), m4
+IF%1 mova  Z(2), m3
     mova     m2, Z(3)
-    addps    m3, m0 ; t6
-    subps    m2, m1 ; i3
+    addps    m5, m5, m0 ; t6
+    subps    m2, m2, m1 ; i3
     mova     m7, Z(1)
-    addps    m1, Z(3) ; i1
+    addps    m1, m1, Z(3) ; i1
 IF%1 mova Z2(7), m2
 IF%1 mova  Z(3), m1
-    mova     m4, m7
-    subps    m7, m3 ; i2
-    addps    m3, m4 ; i0
-IF%1 mova  Z(5), m7
-IF%1 mova  Z(1), m3
+    subps    m6, m7, m5 ; i2
+    addps    m5, m5, m7 ; i0
+IF%1 mova  Z(5), m6
+IF%1 mova  Z(1), m5
 %if %1==0
-    mova     m4, m5 ; r1
-    mova     m0, m6 ; r3
-    unpcklps m5, m1
-    unpckhps m4, m1
-    unpcklps m6, m2
-    unpckhps m0, m2
+    INTERL m1, m3, m7, Z, 2
+    INTERL m2, m4, m0, Z2, 6
+
     mova     m1, Z(0)
     mova     m2, Z(4)
-    mova   Z(2), m5
-    mova   Z(3), m4
-    mova  Z2(6), m6
-    mova  Z2(7), m0
-    mova     m5, m1 ; r0
-    mova     m4, m2 ; r2
-    unpcklps m1, m3
-    unpckhps m5, m3
-    unpcklps m2, m7
-    unpckhps m4, m7
-    mova   Z(0), m1
-    mova   Z(1), m5
-    mova   Z(4), m2
-    mova   Z(5), m4
+
+    INTERL m5, m1, m3, Z, 0
+    INTERL m6, m2, m7, Z, 4
 %endif
 %endmacro
 
@@ -281,13 +292,106 @@ IF%1 mova  Z(1), m3
     punpckhdq %3, %2
 %endmacro
 
-INIT_XMM
-%define mova movaps
-
 %define Z(x) [r0+mmsize*x]
 %define Z2(x) [r0+mmsize*x]
+%define ZH(x) [r0+mmsize*x+mmsize/2]
+
+INIT_YMM
+
+align 16
+fft8_avx:
+    mova      m0, Z(0)
+    mova      m1, Z(1)
+    T8_AVX    m0, m1, m2, m3, m4
+    mova      Z(0), m0
+    mova      Z(1), m1
+    ret
+
 
 align 16
+fft16_avx:
+    mova       m2, Z(2)
+    mova       m3, Z(3)
+    T4_SSE     m2, m3, m7
+
+    mova       m0, Z(0)
+    mova       m1, Z(1)
+    T8_AVX     m0, m1, m4, m5, m7
+
+    mova       m4, [ps_cos16_1]
+    mova       m5, [ps_cos16_2]
+    vmulps     m6, m2, m4
+    vmulps     m7, m3, m5
+    vaddps     m7, m7, m6
+    vmulps     m2, m2, m5
+    vmulps     m3, m3, m4
+    vsubps     m3, m3, m2
+    vblendps   m2, m7, m3, 0xf0
+    vperm2f128 m3, m7, m3, 0x21
+    vaddps     m4, m2, m3
+    vsubps     m2, m3, m2
+    vperm2f128 m2, m2, m2, 0x01
+    vsubps     m3, m1, m2
+    vaddps     m1, m1, m2
+    vsubps     m5, m0, m4
+    vaddps     m0, m0, m4
+    vextractf128   Z(0), m0, 0
+    vextractf128  ZH(0), m1, 0
+    vextractf128   Z(1), m0, 1
+    vextractf128  ZH(1), m1, 1
+    vextractf128   Z(2), m5, 0
+    vextractf128  ZH(2), m3, 0
+    vextractf128   Z(3), m5, 1
+    vextractf128  ZH(3), m3, 1
+    ret
+
+align 16
+fft32_avx:
+    call fft16_avx
+
+    mova m0, Z(4)
+    mova m1, Z(5)
+
+    T4_SSE      m0, m1, m4
+
+    mova m2, Z(6)
+    mova m3, Z(7)
+
+    T8_SSE      m0, m1, m2, m3, m4, m6
+    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
+    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
+
+    vperm2f128  m4, m0, m2, 0x20
+    vperm2f128  m5, m1, m3, 0x20
+    vperm2f128  m6, m0, m2, 0x31
+    vperm2f128  m7, m1, m3, 0x31
+
+    PASS_SMALL 0, [cos_32], [cos_32+32]
+
+    ret
+
+fft32_interleave_avx:
+    call fft32_avx
+    mov r2d, 32
+.deint_loop:
+    mova     m2, Z(0)
+    mova     m3, Z(1)
+    vunpcklps      m0, m2, m3
+    vunpckhps      m1, m2, m3
+    vextractf128   Z(0), m0, 0
+    vextractf128  ZH(0), m1, 0
+    vextractf128   Z(1), m0, 1
+    vextractf128  ZH(1), m1, 1
+    add r0, mmsize*2
+    sub r2d, mmsize/4
+    jg .deint_loop
+    ret
+
+INIT_XMM
+%define mova    movaps
+
+align 16
+fft4_avx:
 fft4_sse:
     mova     m0, Z(0)
     mova     m1, Z(1)
@@ -406,6 +510,8 @@ FFT48_3DN _3dn
 
 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
 %define Z2(x) [zq + o3q + mmsize*(x&1)]
+%define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
+%define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
 
 %macro DECL_PASS 2+ ; name, payload
 align 16
@@ -423,8 +529,35 @@ DEFINE_ARGS z, w, n, o1, o3
     rep ret
 %endmacro
 
+INIT_YMM
+
+%macro INTERL_AVX 5
+    vunpckhps      %3, %2, %1
+    vunpcklps      %2, %2, %1
+    vextractf128   %4(%5), %2, 0
+    vextractf128  %4 %+ H(%5), %3, 0
+    vextractf128   %4(%5 + 1), %2, 1
+    vextractf128  %4 %+ H(%5 + 1), %3, 1
+%endmacro
+
+%define INTERL INTERL_AVX
+
+DECL_PASS pass_avx, PASS_BIG 1
+DECL_PASS pass_interleave_avx, PASS_BIG 0
+
 INIT_XMM
-%define mova movaps
+%define mova    movaps
+
+%macro INTERL_SSE 5
+    mova     %3, %2
+    unpcklps %2, %1
+    unpckhps %3, %1
+    mova  %4(%5), %2
+    mova  %4(%5+1), %3
+%endmacro
+
+%define INTERL INTERL_SSE
+
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0
 
@@ -457,9 +590,12 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
 
 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
 %xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
-%if %1==5
+%if %1>=5
 %xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
 %endif
+%if %1>=6
+%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
+%endif
 
 %assign n 1<<%1
 %rep 17-%1
@@ -495,6 +631,8 @@ cglobal fft_dispatch%3%2, 2,5,8, z, nbits
     RET
 %endmacro ; DECL_FFT
 
+DECL_FFT 6, _avx
+DECL_FFT 6, _avx, _interleave
 DECL_FFT 5, _sse
 DECL_FFT 5, _sse, _interleave
 DECL_FFT 4, _3dn
@@ -533,21 +671,53 @@ INIT_XMM
 %endmacro
 
 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
-    movaps   xmm6, [%4+%1*2]
-    movaps   %2,   [%4+%1*2+0x10]
-    movaps   %3,   xmm6
-    movaps   xmm7, %2
-    mulps    xmm6, [%5+%1]
-    mulps    %2,   [%6+%1]
-    mulps    %3,   [%6+%1]
-    mulps    xmm7, [%5+%1]
-    subps    %2,   xmm6
-    addps    %3,   xmm7
+    mulps      m6, %3, [%5+%1]
+    mulps      m7, %2, [%5+%1]
+    mulps      %2, %2, [%6+%1]
+    mulps      %3, %3, [%6+%1]
+    subps      %2, %2, m6
+    addps      %3, %3, m7
+%endmacro
+
+%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
+.post:
+    vmovaps      ymm1,   [%3+%1*2]
+    vmovaps      ymm0,   [%3+%1*2+0x20]
+    vmovaps      ymm3,   [%3+%2*2]
+    vmovaps      ymm2,   [%3+%2*2+0x20]
+
+    CMUL         %1, ymm0, ymm1, %3, %4, %5
+    CMUL         %2, ymm2, ymm3, %3, %4, %5
+    vshufps      ymm1, ymm1, ymm1, 0x1b
+    vshufps      ymm3, ymm3, ymm3, 0x1b
+    vperm2f128   ymm1, ymm1, ymm1, 0x01
+    vperm2f128   ymm3, ymm3, ymm3, 0x01
+    vunpcklps    ymm6, ymm2, ymm1
+    vunpckhps    ymm4, ymm2, ymm1
+    vunpcklps    ymm7, ymm0, ymm3
+    vunpckhps    ymm5, ymm0, ymm3
+
+    vextractf128 [%3+%1*2],      ymm7, 0
+    vextractf128 [%3+%1*2+0x10], ymm5, 0
+    vextractf128 [%3+%1*2+0x20], ymm7, 1
+    vextractf128 [%3+%1*2+0x30], ymm5, 1
+
+    vextractf128 [%3+%2*2],      ymm6, 0
+    vextractf128 [%3+%2*2+0x10], ymm4, 0
+    vextractf128 [%3+%2*2+0x20], ymm6, 1
+    vextractf128 [%3+%2*2+0x30], ymm4, 1
+    sub      %2,   0x20
+    add      %1,   0x20
+    jl       .post
 %endmacro
 
 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
 .post:
+    movaps   xmm6, [%3+%1*2]
+    movaps   xmm7, [%3+%1*2+0x10]
     CMUL     %1,   xmm0, xmm1, %3, %4, %5
+    movaps   xmm6, [%3+%2*2]
+    movaps   xmm7, [%3+%2*2+0x10]
     CMUL     %2,   xmm4, xmm5, %3, %4, %5
     shufps   xmm1, xmm1, 0x1b
     shufps   xmm5, xmm5, 0x1b
@@ -566,7 +736,8 @@ INIT_XMM
     jl       .post
 %endmacro
 
-cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%macro DECL_IMDCT 2
+cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
 %ifdef ARCH_X86_64
 %define rrevtab r10
 %define rtcos   r11
@@ -641,7 +812,7 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
     mov  r0, r1
     mov  r1d, [r5+FFTContext.nbits]
 
-    FFT_DISPATCH _sse, r1
+    FFT_DISPATCH %1, r1
 
     mov  r0d, [r5+FFTContext.mdctsize]
     add  r6, r0
@@ -653,9 +824,9 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
     mov  rtsin, [esp+4]
 %endif
     neg  r0
-    mov  r1, -16
+    mov  r1, -mmsize
     sub  r1, r0
-    POSROTATESHUF r0, r1, r6, rtcos, rtsin
+    %2 r0, r1, r6, rtcos, rtsin
 %ifdef ARCH_X86_64
     pop  r14
     pop  r13
@@ -664,3 +835,10 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
     add esp, 12
 %endif
     RET
+%endmacro
+
+DECL_IMDCT _sse, POSROTATESHUF
+
+INIT_YMM
+
+DECL_IMDCT _avx, POSROTATESHUF_AVX
diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c
index 9de4e4c..f068718 100644
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
 
 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
+
+void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
+{
+    ff_fft_dispatch_interleave_avx(z, s->nbits);
+}
 
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
 {
@@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
     long n = s->mdct_size;
     long n4 = n >> 2;
 
-    ff_imdct_half_sse(s, output+n4, input);
+    s->imdct_half(s, output+n4, input);
 
     j = -n;
     k = n-16;
-- 
1.7.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel
