On 05/13/2012 11:40 PM, Diego Elio Pettenò wrote:
Il 12/05/2012 01:15, Vitor Sessak ha scritto:

Here is a version with 128-bit regs (again, completely untested).

The patch seems to require a new series that factors "x86: use more
standard construct for setting ASM functions in FFT code" into it.
Probably you should resend the series.

If you do, I can test it, as I have an XOP-capable box at hand.

Nice! Here is the full patchset.

-Vitor
>From 5db214fead3250f3f96022036a362d229b36f571 Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Sat, 12 May 2012 08:57:36 +0200
Subject: [PATCH 1/4] x86: use more standard construct for setting ASM
 functions in FFT code

---
 libavcodec/x86/fft.c |   45 +++++++++++++++++++++++----------------------
 1 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 3e0c42f..5495821 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -25,30 +25,31 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
 {
 #if HAVE_YASM
     int has_vectors = av_get_cpu_flags();
-    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
-        /* AVX for SB */
-        s->imdct_calc      = ff_imdct_calc_sse;
-        s->imdct_half      = ff_imdct_half_avx;
-        s->fft_permute     = ff_fft_permute_sse;
-        s->fft_calc        = ff_fft_calc_avx;
-        s->fft_permutation = FF_FFT_PERM_AVX;
-    } else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
+    if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+        /* 3DNow! for K6-2/3 */
+        s->imdct_calc = ff_imdct_calc_3dn;
+        s->imdct_half = ff_imdct_half_3dn;
+        s->fft_calc   = ff_fft_calc_3dn;
+    }
+    if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
+        /* 3DNowEx for K7 */
+        s->imdct_calc = ff_imdct_calc_3dn2;
+        s->imdct_half = ff_imdct_half_3dn2;
+        s->fft_calc   = ff_fft_calc_3dn2;
+    }
+    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
         /* SSE for P3/P4/K8 */
         s->imdct_calc  = ff_imdct_calc_sse;
         s->imdct_half  = ff_imdct_half_sse;
         s->fft_permute = ff_fft_permute_sse;
         s->fft_calc    = ff_fft_calc_sse;
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
-    } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
-        /* 3DNowEx for K7 */
-        s->imdct_calc = ff_imdct_calc_3dn2;
-        s->imdct_half = ff_imdct_half_3dn2;
-        s->fft_calc   = ff_fft_calc_3dn2;
-    } else if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
-        /* 3DNow! for K6-2/3 */
-        s->imdct_calc = ff_imdct_calc_3dn;
-        s->imdct_half = ff_imdct_half_3dn;
-        s->fft_calc   = ff_fft_calc_3dn;
+    }
+    if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
+        /* AVX for SB */
+        s->imdct_half      = ff_imdct_half_avx;
+        s->fft_calc        = ff_fft_calc_avx;
+        s->fft_permutation = FF_FFT_PERM_AVX;
     }
 #endif
 }
@@ -58,12 +59,12 @@ av_cold void ff_dct_init_mmx(DCTContext *s)
 {
 #if HAVE_YASM
     int has_vectors = av_get_cpu_flags();
+    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
+        s->dct32 = ff_dct32_float_sse;
+    if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
+        s->dct32 = ff_dct32_float_sse2;
     if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX)
         s->dct32 = ff_dct32_float_avx;
-    else if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE)
-        s->dct32 = ff_dct32_float_sse2;
-    else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE)
-        s->dct32 = ff_dct32_float_sse;
 #endif
 }
 #endif
-- 
1.7.5.4

>From dc41c1a0c90b3ac3dbe17675cca218a33fcc10ff Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Thu, 10 May 2012 20:53:08 +0200
Subject: [PATCH 2/4] x86: use new schema for ASM macros

---
 libavcodec/x86/fft.c       |   12 +++---
 libavcodec/x86/fft.h       |   12 +++---
 libavcodec/x86/fft_3dn2.c  |   26 ++++++------
 libavcodec/x86/fft_mmx.asm |  101 ++++++++++++++++++++++---------------------
 4 files changed, 77 insertions(+), 74 deletions(-)

diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 5495821..6349c23 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -27,15 +27,15 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
     int has_vectors = av_get_cpu_flags();
     if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
         /* 3DNow! for K6-2/3 */
-        s->imdct_calc = ff_imdct_calc_3dn;
-        s->imdct_half = ff_imdct_half_3dn;
-        s->fft_calc   = ff_fft_calc_3dn;
+        s->imdct_calc = ff_imdct_calc_3dnow;
+        s->imdct_half = ff_imdct_half_3dnow;
+        s->fft_calc   = ff_fft_calc_3dnow;
     }
     if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) {
         /* 3DNowEx for K7 */
-        s->imdct_calc = ff_imdct_calc_3dn2;
-        s->imdct_half = ff_imdct_half_3dn2;
-        s->fft_calc   = ff_fft_calc_3dn2;
+        s->imdct_calc = ff_imdct_calc_3dnow2;
+        s->imdct_half = ff_imdct_half_3dnow2;
+        s->fft_calc   = ff_fft_calc_3dnow2;
     }
     if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
         /* SSE for P3/P4/K8 */
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 9d68d5b..1cefe7a 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -24,13 +24,13 @@
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z);
 
-void ff_imdct_calc_3dn(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dn(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
diff --git a/libavcodec/x86/fft_3dn2.c b/libavcodec/x86/fft_3dn2.c
index ce3c9da..e684cc7 100644
--- a/libavcodec/x86/fft_3dn2.c
+++ b/libavcodec/x86/fft_3dn2.c
@@ -30,30 +30,30 @@ DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 };
     "movq "#s","#d"\n"\
     "psrlq $32,"#d"\n"\
     "punpckldq "#s","#d"\n"
-#define ff_fft_calc_3dn2 ff_fft_calc_3dn
-#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
-#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
-#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
-#define ff_imdct_half_3dn2 ff_imdct_half_3dn
+#define ff_fft_calc_3dnow2 ff_fft_calc_3dnow
+#define ff_fft_dispatch_3dnow2 ff_fft_dispatch_3dnow
+#define ff_fft_dispatch_interleave_3dnow2 ff_fft_dispatch_interleave_3dnow
+#define ff_imdct_calc_3dnow2 ff_imdct_calc_3dnow
+#define ff_imdct_half_3dnow2 ff_imdct_half_3dnow
 #else
 #define PSWAPD(s,d) "pswapd "#s","#d"\n"
 #endif
 
-void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
-void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
+void ff_fft_dispatch_3dnow2(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_3dnow2(FFTComplex *z, int nbits);
 
-void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
+void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z)
 {
     int n = 1<<s->nbits;
     int i;
-    ff_fft_dispatch_interleave_3dn2(z, s->nbits);
+    ff_fft_dispatch_interleave_3dnow2(z, s->nbits);
     __asm__ volatile("femms");
     if(n <= 8)
         for(i=0; i<n; i+=2)
             FFSWAP(FFTSample, z[i].im, z[i+1].re);
 }
 
-void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
+void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
     x86_reg j, k;
     long n = s->mdct_size;
@@ -101,7 +101,7 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
         );
     }
 
-    ff_fft_dispatch_3dn2(z, s->nbits);
+    ff_fft_dispatch_3dnow2(z, s->nbits);
 
 #define CMUL(j,mm0,mm1)\
         "movq  (%2,"#j",2), %%mm6 \n"\
@@ -144,13 +144,13 @@ void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
     __asm__ volatile("femms");
 }
 
-void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
+void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
     x86_reg j, k;
     long n = s->mdct_size;
     long n4 = n >> 2;
 
-    ff_imdct_half_3dn2(s, output+n4, input);
+    ff_imdct_half_3dnow2(s, output+n4, input);
 
     j = -n;
     k = n-8;
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 225c666..7120d1e 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -297,7 +297,7 @@ IF%1 mova  Z(1), m5
 %define Z2(x) [r0+mmsize*x]
 %define ZH(x) [r0+mmsize*x+mmsize/2]
 
-INIT_YMM
+INIT_YMM avx
 
 %if HAVE_AVX
 align 16
@@ -390,7 +390,7 @@ fft32_interleave_avx:
     ret
 %endif
 
-INIT_XMM
+INIT_XMM sse
 %define movdqa  movaps
 
 align 16
@@ -439,8 +439,6 @@ fft16_sse:
     ret
 
 
-INIT_MMX
-
 %macro FFT48_3DN 1
 align 16
 fft4%1:
@@ -495,7 +493,8 @@ fft8%1:
     ret
 %endmacro
 
-FFT48_3DN _3dn2
+INIT_MMX 3dnow2
+FFT48_3DN _3dnow2
 
 %macro pswapd 2
 %ifidn %1, %2
@@ -508,7 +507,8 @@ FFT48_3DN _3dn2
 %endif
 %endmacro
 
-FFT48_3DN _3dn
+INIT_MMX 3dnow
+FFT48_3DN _3dnow
 
 
 %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
@@ -532,7 +532,7 @@ DEFINE_ARGS z, w, n, o1, o3
     rep ret
 %endmacro
 
-INIT_YMM
+INIT_YMM avx
 
 %if HAVE_AVX
 %macro INTERL_AVX 5
@@ -550,7 +550,7 @@ DECL_PASS pass_avx, PASS_BIG 1
 DECL_PASS pass_interleave_avx, PASS_BIG 0
 %endif
 
-INIT_XMM
+INIT_XMM sse
 
 %macro INTERL_SSE 5
     mova     %3, %2
@@ -565,16 +565,16 @@ INIT_XMM
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0
 
-INIT_MMX
+INIT_MMX 3dnow
 %define mulps pfmul
 %define addps pfadd
 %define subps pfsub
 %define unpcklps punpckldq
 %define unpckhps punpckhdq
-DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
-DECL_PASS pass_interleave_3dn, PASS_BIG 0
-%define pass_3dn2 pass_3dn
-%define pass_interleave_3dn2 pass_interleave_3dn
+DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
+DECL_PASS pass_interleave_3dnow, PASS_BIG 0
+%define pass_3dnow2 pass_3dnow
+%define pass_interleave_3dnow2 pass_interleave_3dnow
 
 %ifdef PIC
 %define SECTION_REL - $$
@@ -592,67 +592,70 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
     call r2
 %endmacro ; FFT_DISPATCH
 
-%macro DECL_FFT 2-3 ; nbits, cpu, suffix
-%xdefine list_of_fft fft4%2 SECTION_REL, fft8%2 SECTION_REL
+%macro DECL_FFT 1-2 ; nbits, suffix (cpu name is taken from cpuname)
+%xdefine cpusuffix _ %+ cpuname
+%xdefine fullsuffix %2_ %+ cpuname
+%xdefine list_of_fft fft4 %+ cpusuffix SECTION_REL, fft8 %+ cpusuffix SECTION_REL
 %if %1>=5
-%xdefine list_of_fft list_of_fft, fft16%2 SECTION_REL
+%xdefine list_of_fft list_of_fft, fft16 %+ cpusuffix SECTION_REL
 %endif
 %if %1>=6
-%xdefine list_of_fft list_of_fft, fft32%3%2 SECTION_REL
+%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
 %endif
 
 %assign n 1<<%1
 %rep 17-%1
 %assign n2 n/2
 %assign n4 n/4
-%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 SECTION_REL
+%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
 
 align 16
-fft %+ n %+ %3%2:
-    call fft %+ n2 %+ %2
+fft %+ n %+ fullsuffix:
+    call fft %+ n2 %+ cpusuffix
     add r0, n*4 - (n&(-2<<%1))
-    call fft %+ n4 %+ %2
+    call fft %+ n4 %+ cpusuffix
     add r0, n*2 - (n2&(-2<<%1))
-    call fft %+ n4 %+ %2
+    call fft %+ n4 %+ cpusuffix
     sub r0, n*6 + (n2&(-2<<%1))
     lea r1, [cos_ %+ n]
     mov r2d, n4/2
-    jmp pass%3%2
+    jmp pass %+ fullsuffix
 
 %assign n n*2
 %endrep
 %undef n
 
 align 8
-dispatch_tab%3%2: pointer list_of_fft
+dispatch_tab %+ fullsuffix: pointer list_of_fft
 
 section .text
 
 ; On x86_32, this function does the register saving and restoring for all of fft.
 ; The others pass args in registers and don't spill anything.
-cglobal fft_dispatch%3%2, 2,5,8, z, nbits
-    FFT_DISPATCH %3%2, nbits
-%ifidn %2, _avx
+cglobal fft_dispatch%2, 2,5,8, z, nbits
+    FFT_DISPATCH fullsuffix, nbits
+%if mmsize == 32
     vzeroupper
 %endif
     RET
 %endmacro ; DECL_FFT
 
 %if HAVE_AVX
-INIT_YMM
-DECL_FFT 6, _avx
-DECL_FFT 6, _avx, _interleave
+INIT_YMM avx
+DECL_FFT 6
+DECL_FFT 6, _interleave
 %endif
-INIT_XMM
-DECL_FFT 5, _sse
-DECL_FFT 5, _sse, _interleave
-INIT_MMX
-DECL_FFT 4, _3dn
-DECL_FFT 4, _3dn, _interleave
-DECL_FFT 4, _3dn2
-DECL_FFT 4, _3dn2, _interleave
-
-INIT_XMM
+INIT_XMM sse
+DECL_FFT 5
+DECL_FFT 5, _interleave
+INIT_MMX 3dnow
+DECL_FFT 4
+DECL_FFT 4, _interleave
+INIT_MMX 3dnow2
+DECL_FFT 4
+DECL_FFT 4, _interleave
+
+INIT_XMM sse
 %undef mulps
 %undef addps
 %undef subps
@@ -748,8 +751,8 @@ INIT_XMM
     jl       .post
 %endmacro
 
-%macro DECL_IMDCT 2
-cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
+%macro DECL_IMDCT 1
+cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
 %if ARCH_X86_64
 %define rrevtab r7
 %define rtcos   r8
@@ -821,7 +824,7 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
     mov  r0, r1
     mov  r1d, [r5+FFTContext.nbits]
 
-    FFT_DISPATCH %1, r1
+    FFT_DISPATCH _ %+ cpuname, r1
 
     mov  r0d, [r5+FFTContext.mdctsize]
     add  r6, r0
@@ -835,20 +838,20 @@ cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample
     neg  r0
     mov  r1, -mmsize
     sub  r1, r0
-    %2 r0, r1, r6, rtcos, rtsin
+    %1 r0, r1, r6, rtcos, rtsin
 %if ARCH_X86_64 == 0
     add esp, 12
 %endif
-%ifidn avx_enabled, 1
+%if mmsize == 32
     vzeroupper
 %endif
     RET
 %endmacro
 
-DECL_IMDCT _sse, POSROTATESHUF
-
-INIT_YMM
+DECL_IMDCT POSROTATESHUF
 
+INIT_YMM avx
+	
 %if HAVE_AVX
-DECL_IMDCT _avx, POSROTATESHUF_AVX
+DECL_IMDCT POSROTATESHUF_AVX
 %endif
-- 
1.7.5.4

>From 5ac607e34a49b6be0c883613e4565bc4b81365a1 Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Fri, 11 May 2012 22:22:16 +0200
Subject: [PATCH 3/4] build: add XOP instruction set support

---
 configure |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/configure b/configure
index 4b1e551..7a5c788 100755
--- a/configure
+++ b/configure
@@ -243,6 +243,7 @@ Optimization options (experts only):
   --disable-sse            disable SSE optimizations
   --disable-ssse3          disable SSSE3 optimizations
   --disable-avx            disable AVX optimizations
+  --disable-xop            disable XOP optimizations
   --disable-armv5te        disable armv5te optimizations
   --disable-armv6          disable armv6 optimizations
   --disable-armv6t2        disable armv6t2 optimizations
@@ -1032,6 +1033,7 @@ ARCH_EXT_LIST='
     armv6t2
     armvfp
     avx
+    xop
     mmi
     mmx
     mmx2
@@ -1254,6 +1256,7 @@ mmx2_deps="mmx"
 sse_deps="mmx"
 ssse3_deps="sse"
 avx_deps="ssse3"
+xop_deps="avx"
 
 aligned_stack_if_any="ppc x86"
 fast_64bit_if_any="alpha ia64 mips64 parisc64 ppc64 sparc64 x86_64"
@@ -2817,6 +2820,7 @@ EOF
         check_yasm "pextrd [eax], xmm0, 1" && enable yasm ||
             die "yasm not found, use --disable-yasm for a crippled build"
         check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx
+        check_yasm "vfmaddps xmm0, xmm0, xmm0, xmm0" || disable xop
     fi
 
     case "$cpu" in
@@ -3213,6 +3217,7 @@ if enabled x86; then
     echo "SSE enabled               ${sse-no}"
     echo "SSSE3 enabled             ${ssse3-no}"
     echo "AVX enabled               ${avx-no}"
+    echo "XOP enabled               ${xop-no}"
     echo "CMOV enabled              ${cmov-no}"
     echo "CMOV is fast              ${fast_cmov-no}"
     echo "EBX available             ${ebx_available-no}"
-- 
1.7.5.4

>From ad5ae3b20a7192dba1581d8ae0196da470194481 Mon Sep 17 00:00:00 2001
From: Vitor Sessak <[email protected]>
Date: Fri, 11 May 2012 22:26:45 +0200
Subject: [PATCH 4/4] x86: add XOP code for FFT

---
 libavcodec/x86/fft.c       |    6 ++
 libavcodec/x86/fft.h       |    2 +
 libavcodec/x86/fft_mmx.asm |  112 +++++++++++++++++++++++++-------------------
 libavcodec/x86/fft_sse.c   |    7 +++
 libavutil/x86/x86inc.asm   |    4 +-
 5 files changed, 81 insertions(+), 50 deletions(-)

diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 6349c23..8005a5c 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -51,6 +51,12 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
         s->fft_calc        = ff_fft_calc_avx;
         s->fft_permutation = FF_FFT_PERM_AVX;
     }
+    if (has_vectors & AV_CPU_FLAG_XOP && HAVE_XOP && s->nbits >= 5) {
+        /* XOP for Bulldozer */
+        s->imdct_half      = ff_imdct_half_xop;
+        s->fft_calc        = ff_fft_calc_xop;
+        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+    }
 #endif
 }
 
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 1cefe7a..59ef2fb 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -22,6 +22,7 @@
 #include "libavcodec/fft.h"
 
 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_xop(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
@@ -34,6 +35,7 @@ void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *inp
 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_xop(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
 void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
 void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 7120d1e..07ea30e 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -170,9 +170,8 @@ SECTION_TEXT
     addps    %6, %3, %4       ; {t1,t2,t3,t4}
     subps    %3, %3, %4       ; {r5,i5,r7,i7}
     shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
-    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
     mulps    %4, %4, [ps_root2]
-    addps    %3, %3, %4       ; {t8,t7,ta,t9}
+    fmaddps  %3, %3, [ps_root2mppm], %4
     shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
     shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
     subps    %3, %6, %4       ; {t6,t5,tc,tb}
@@ -191,25 +190,21 @@ IF%1 mova    m4, Z(4)
 IF%1 mova    m5, Z(5)
     mova     m0, %2 ; wre
     mova     m1, %3 ; wim
-    mulps    m2, m4, m0 ; r2*wre
 IF%1 mova    m6, Z2(6)
     mulps    m3, m5, m1 ; i2*wim
 IF%1 mova    m7, Z2(7)
-    mulps    m4, m4, m1 ; r2*wim
-    mulps    m5, m5, m0 ; i2*wre
-    addps    m2, m2, m3 ; r2*wre + i2*wim
+    mulps    m2, m4, m1 ; r2*wim
+    fmaddps  m4, m4, m0, m3
     mulps    m3, m1, m7 ; i3*wim
-    subps    m5, m5, m4 ; i2*wre - r2*wim
     mulps    m1, m1, m6 ; r3*wim
-    mulps    m4, m0, m6 ; r3*wre
-    mulps    m0, m0, m7 ; i3*wre
-    subps    m4, m4, m3 ; r3*wre - i3*wim
+    fmsubps  m5, m5, m0, m2
+    fmsubps  m6, m6, m0, m3
+    fmaddps  m0, m0, m7, m1
     mova     m3, Z(0)
-    addps    m0, m0, m1 ; i3*wre + r3*wim
-    subps    m1, m4, m2 ; t3
-    addps    m4, m4, m2 ; t5
-    subps    m3, m3, m4 ; r2
-    addps    m4, m4, Z(0) ; r0
+    subps    m1, m6, m4 ; t3
+    addps    m6, m6, m4 ; t5
+    subps    m3, m3, m6 ; r2
+    addps    m4, m6, Z(0) ; r0
     mova     m6, Z(2)
     mova   Z(4), m3
     mova   Z(0), m4
@@ -233,25 +228,21 @@ IF%1 mova    m7, Z2(7)
 
 ; scheduled to avoid store->load aliasing
 %macro PASS_BIG 1 ; (!interleave)
-    mova     m4, Z(4) ; r2
+    mova     m2, Z(4) ; r2
     mova     m5, Z(5) ; i2
-    mova     m0, [wq] ; wre
-    mova     m1, [wq+o1q] ; wim
-    mulps    m2, m4, m0 ; r2*wre
+    mova     m4, [wq] ; wre
+    mova     m0, [wq+o1q] ; wim
     mova     m6, Z2(6) ; r3
-    mulps    m3, m5, m1 ; i2*wim
+    mulps    m3, m5, m0 ; i2*wim
     mova     m7, Z2(7) ; i3
-    mulps    m4, m4, m1 ; r2*wim
-    mulps    m5, m5, m0 ; i2*wre
-    addps    m2, m2, m3 ; r2*wre + i2*wim
-    mulps    m3, m1, m7 ; i3*wim
-    mulps    m1, m1, m6 ; r3*wim
-    subps    m5, m5, m4 ; i2*wre - r2*wim
-    mulps    m4, m0, m6 ; r3*wre
-    mulps    m0, m0, m7 ; i3*wre
-    subps    m4, m4, m3 ; r3*wre - i3*wim
+    mulps    m1, m2, m0 ; r2*wim
+    fmaddps  m2, m2, m4, m3
+    fmsubps  m5, m5, m4, m1
+    mulps    m3, m0, m7 ; i3*wim
+    mulps    m1, m4, m7 ; i3*wre
+    fmsubps  m4, m4, m6, m3
+    fmaddps  m0, m0, m6, m1
     mova     m3, Z(0)
-    addps    m0, m0, m1 ; i3*wre + r3*wim
     subps    m1, m4, m2 ; t3
     addps    m4, m4, m2 ; t5
     subps    m3, m3, m4 ; r2
@@ -310,24 +301,23 @@ fft8_avx:
     ret
 
 
+%macro FFT_DECL_16_32 0
 align 16
-fft16_avx:
+fft16_ %+ cpuname:
     mova       m2, Z(2)
     mova       m3, Z(3)
     T4_SSE     m2, m3, m7
 
     mova       m0, Z(0)
     mova       m1, Z(1)
-    T8_AVX     m0, m1, m4, m5, m7
+    T8_AVX     m0, m1, m4, m7, m5
 
     mova       m4, [ps_cos16_1]
-    mova       m5, [ps_cos16_2]
+    mova       m7, [ps_cos16_2]
     vmulps     m6, m2, m4
-    vmulps     m7, m3, m5
-    vaddps     m7, m7, m6
-    vmulps     m2, m2, m5
-    vmulps     m3, m3, m4
-    vsubps     m3, m3, m2
+    vmulps     m2, m2, m7
+    fmaddps    m7, m7, m3, m6
+    fmsubps    m3, m3, m4, m2
     vblendps   m2, m7, m3, 0xf0
     vperm2f128 m3, m7, m3, 0x21
     vaddps     m4, m2, m3
@@ -348,8 +338,8 @@ fft16_avx:
     ret
 
 align 16
-fft32_avx:
-    call fft16_avx
+fft32_ %+ cpuname:
+    call fft16_ %+ cpuname
 
     mova m0, Z(4)
     mova m1, Z(5)
@@ -372,8 +362,8 @@ fft32_avx:
 
     ret
 
-fft32_interleave_avx:
-    call fft32_avx
+fft32_interleave_ %+ cpuname:
+    call fft32_ %+ cpuname
     mov r2d, 32
 .deint_loop:
     mova     m2, Z(0)
@@ -388,12 +378,16 @@ fft32_interleave_avx:
     sub r2d, mmsize/4
     jg .deint_loop
     ret
+%endmacro
+
+FFT_DECL_16_32
 %endif
 
 INIT_XMM sse
 %define movdqa  movaps
 
 align 16
+fft4_xop:
 fft4_avx:
 fft4_sse:
     mova     m0, Z(0)
@@ -403,8 +397,9 @@ fft4_sse:
     mova   Z(1), m1
     ret
 
+%macro FFT8_16_XMM 0
 align 16
-fft8_sse:
+fft8_ %+ cpuname:
     mova     m0, Z(0)
     mova     m1, Z(1)
     T4_SSE   m0, m1, m2
@@ -418,7 +413,7 @@ fft8_sse:
     ret
 
 align 16
-fft16_sse:
+fft16_ %+ cpuname:
     mova     m0, Z(0)
     mova     m1, Z(1)
     T4_SSE   m0, m1, m2
@@ -437,7 +432,12 @@ fft16_sse:
     T4_SSE   m6, m7, m0
     PASS_SMALL 0, [cos_16], [cos_16+16]
     ret
+%endmacro
 
+INIT_XMM sse
+FFT8_16_XMM
+INIT_XMM xop
+FFT8_16_XMM
 
 %macro FFT48_3DN 1
 align 16
@@ -565,6 +565,12 @@ INIT_XMM sse
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0
 
+%if HAVE_XOP
+INIT_XMM xop
+DECL_PASS pass_xop, PASS_BIG 1
+DECL_PASS pass_interleave_xop, PASS_BIG 0
+%endif
+
 INIT_MMX 3dnow
 %define mulps pfmul
 %define addps pfadd
@@ -640,6 +646,11 @@ cglobal fft_dispatch%2, 2,5,8, z, nbits
     RET
 %endmacro ; DECL_FFT
 
+%if HAVE_XOP
+INIT_XMM xop
+DECL_FFT 5
+DECL_FFT 5, _interleave
+%endif
 %if HAVE_AVX
 INIT_YMM avx
 DECL_FFT 6
@@ -688,10 +699,8 @@ INIT_XMM sse
 %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
     mulps      m6, %3, [%5+%1]
     mulps      m7, %2, [%5+%1]
-    mulps      %2, %2, [%6+%1]
-    mulps      %3, %3, [%6+%1]
-    subps      %2, %2, m6
-    addps      %3, %3, m7
+    fmsubps    %2, %2, [%6+%1], m6
+    fmaddps    %3, %3, [%6+%1], m7
 %endmacro
 
 %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
@@ -850,8 +859,13 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
 
 DECL_IMDCT POSROTATESHUF
 
+%if HAVE_XOP
+INIT_XMM xop
+DECL_IMDCT POSROTATESHUF
+%endif
+
 INIT_YMM avx
-	
+
 %if HAVE_AVX
 DECL_IMDCT POSROTATESHUF_AVX
 %endif
diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c
index 13b992f..0591b58 100644
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@@ -30,6 +30,7 @@ DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_xop(FFTComplex *z, int nbits);
 
 #if HAVE_AVX
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
@@ -37,6 +38,12 @@ void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
     ff_fft_dispatch_interleave_avx(z, s->nbits);
 }
 #endif
+#if HAVE_XOP
+void ff_fft_calc_xop(FFTContext *s, FFTComplex *z)
+{
+    ff_fft_dispatch_interleave_xop(z, s->nbits);
+}
+#endif
 
 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
 {
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c167057..85fca76 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1085,7 +1085,7 @@ AVX_INSTR pfmul, 1, 0, 1
             v%5 %1, %2, %3, %4
         %else
             %6 %1, %2, %3
-            %7 %1, %4
+            %7 %1, %1, %4
         %endif
     %endmacro
 %endmacro
@@ -1093,3 +1093,5 @@ AVX_INSTR pfmul, 1, 0, 1
 FMA_INSTR  pmacsdd,  pmulld, paddd
 FMA_INSTR  pmacsww,  pmullw, paddw
 FMA_INSTR pmadcswd, pmaddwd, paddd
+FMA_INSTR fmaddps,   mulps, addps
+FMA_INSTR fmsubps,   mulps, subps
-- 
1.7.5.4

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to