---
libavcodec/x86/fft.c | 9 +++-
libavcodec/x86/fft.h | 2 +
libavcodec/x86/fft_mmx.asm | 108 +++++++++++++++++++++++++-------------------
libavcodec/x86/fft_sse.c | 7 +++
libavutil/x86/x86inc.asm | 4 +-
5 files changed, 81 insertions(+), 49 deletions(-)
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 90315e6..a03b386 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{
#if HAVE_YASM
int has_vectors = av_get_cpu_flags();
- if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
+ if (has_vectors & AV_CPU_FLAG_XOP && HAVE_XOP && s->nbits >= 5) {
+ /* XOP: XOP versions of fft_calc and imdct_half; the rest is shared with the AVX branch */
+ s->imdct_calc = ff_imdct_calc_sse;
+ s->imdct_half = ff_imdct_half_xop;
+ s->fft_permute = ff_fft_permute_sse;
+ s->fft_calc = ff_fft_calc_xop;
+ s->fft_permutation = FF_FFT_PERM_AVX;
+ } else if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
/* AVX for SB */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_avx;
diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h
index 1cefe7a..59ef2fb 100644
--- a/libavcodec/x86/fft.h
+++ b/libavcodec/x86/fft.h
@@ -22,6 +22,7 @@
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_xop(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
@@ -34,6 +35,7 @@ void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *inp
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_xop(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 7120d1e..f449f5f 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -170,9 +170,8 @@ SECTION_TEXT
addps %6, %3, %4 ; {t1,t2,t3,t4}
subps %3, %3, %4 ; {r5,i5,r7,i7}
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
- mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %4, %4, [ps_root2]
- addps %3, %3, %4 ; {t8,t7,ta,t9}
+ fmaddps %3, %3, [ps_root2mppm], %4
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
subps %3, %6, %4 ; {t6,t5,tc,tb}
@@ -191,25 +190,21 @@ IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
mova m0, %2 ; wre
mova m1, %3 ; wim
- mulps m2, m4, m0 ; r2*wre
IF%1 mova m6, Z2(6)
mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7)
- mulps m4, m4, m1 ; r2*wim
- mulps m5, m5, m0 ; i2*wre
- addps m2, m2, m3 ; r2*wre + i2*wim
+ mulps m2, m4, m1 ; r2*wim
+ fmaddps m4, m4, m0, m3
mulps m3, m1, m7 ; i3*wim
- subps m5, m5, m4 ; i2*wre - r2*wim
mulps m1, m1, m6 ; r3*wim
- mulps m4, m0, m6 ; r3*wre
- mulps m0, m0, m7 ; i3*wre
- subps m4, m4, m3 ; r3*wre - i3*wim
+ fmsubps m5, m5, m0, m2
+ fmsubps m6, m6, m0, m3
+ fmaddps m0, m0, m7, m1
mova m3, Z(0)
- addps m0, m0, m1 ; i3*wre + r3*wim
- subps m1, m4, m2 ; t3
- addps m4, m4, m2 ; t5
- subps m3, m3, m4 ; r2
- addps m4, m4, Z(0) ; r0
+ subps m1, m6, m4 ; t3
+ addps m6, m6, m4 ; t5
+ subps m3, m3, m6 ; r2
+ addps m4, m6, Z(0) ; r0
mova m6, Z(2)
mova Z(4), m3
mova Z(0), m4
@@ -233,25 +228,21 @@ IF%1 mova m7, Z2(7)
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
- mova m4, Z(4) ; r2
+ mova m2, Z(4) ; r2
mova m5, Z(5) ; i2
- mova m0, [wq] ; wre
- mova m1, [wq+o1q] ; wim
- mulps m2, m4, m0 ; r2*wre
+ mova m4, [wq] ; wre
+ mova m0, [wq+o1q] ; wim
mova m6, Z2(6) ; r3
- mulps m3, m5, m1 ; i2*wim
+ mulps m3, m5, m0 ; i2*wim
mova m7, Z2(7) ; i3
- mulps m4, m4, m1 ; r2*wim
- mulps m5, m5, m0 ; i2*wre
- addps m2, m2, m3 ; r2*wre + i2*wim
- mulps m3, m1, m7 ; i3*wim
- mulps m1, m1, m6 ; r3*wim
- subps m5, m5, m4 ; i2*wre - r2*wim
- mulps m4, m0, m6 ; r3*wre
- mulps m0, m0, m7 ; i3*wre
- subps m4, m4, m3 ; r3*wre - i3*wim
+ mulps m1, m2, m0 ; r2*wim
+ fmaddps m2, m2, m4, m3
+ fmsubps m5, m5, m4, m1
+ mulps m3, m0, m7 ; i3*wim
+ mulps m1, m4, m7 ; i3*wre
+ fmsubps m4, m4, m6, m3
+ fmaddps m0, m0, m6, m1
mova m3, Z(0)
- addps m0, m0, m1 ; i3*wre + r3*wim
subps m1, m4, m2 ; t3
addps m4, m4, m2 ; t5
subps m3, m3, m4 ; r2
@@ -302,6 +293,7 @@ INIT_YMM avx
%if HAVE_AVX
align 16
fft8_avx:
+fft8_xop:
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m2, m3, m4
@@ -310,24 +302,23 @@ fft8_avx:
ret
+%macro FFT_DECL_16_32 0
align 16
-fft16_avx:
+fft16_ %+ cpuname:
mova m2, Z(2)
mova m3, Z(3)
T4_SSE m2, m3, m7
mova m0, Z(0)
mova m1, Z(1)
- T8_AVX m0, m1, m4, m5, m7
+ T8_AVX m0, m1, m4, m7, m5
mova m4, [ps_cos16_1]
- mova m5, [ps_cos16_2]
+ mova m7, [ps_cos16_2]
vmulps m6, m2, m4
- vmulps m7, m3, m5
- vaddps m7, m7, m6
- vmulps m2, m2, m5
- vmulps m3, m3, m4
- vsubps m3, m3, m2
+ vmulps m2, m2, m7
+ fmaddps m7, m7, m3, m6
+ fmsubps m3, m3, m4, m2
vblendps m2, m7, m3, 0xf0
vperm2f128 m3, m7, m3, 0x21
vaddps m4, m2, m3
@@ -348,8 +339,8 @@ fft16_avx:
ret
align 16
-fft32_avx:
- call fft16_avx
+fft32_ %+ cpuname:
+ call fft16_ %+ cpuname
mova m0, Z(4)
mova m1, Z(5)
@@ -372,8 +363,8 @@ fft32_avx:
ret
-fft32_interleave_avx:
- call fft32_avx
+fft32_interleave_ %+ cpuname:
+ call fft32_ %+ cpuname
mov r2d, 32
.deint_loop:
mova m2, Z(0)
@@ -388,12 +379,21 @@ fft32_interleave_avx:
sub r2d, mmsize/4
jg .deint_loop
ret
+%endmacro
+
+FFT_DECL_16_32
+%endif
+
+%if HAVE_XOP
+INIT_YMM xop
+FFT_DECL_16_32
%endif
INIT_XMM sse
%define movdqa movaps
align 16
+fft4_xop:
fft4_avx:
fft4_sse:
mova m0, Z(0)
@@ -550,6 +550,12 @@ DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
%endif
+%if HAVE_XOP
+INIT_YMM xop
+DECL_PASS pass_xop, PASS_BIG 1
+DECL_PASS pass_interleave_xop, PASS_BIG 0
+%endif
+
INIT_XMM sse
%macro INTERL_SSE 5
@@ -640,6 +646,11 @@ cglobal fft_dispatch%2, 2,5,8, z, nbits
RET
%endmacro ; DECL_FFT
+%if HAVE_XOP
+INIT_YMM xop
+DECL_FFT 6
+DECL_FFT 6, _interleave
+%endif
%if HAVE_AVX
INIT_YMM avx
DECL_FFT 6
@@ -688,10 +699,8 @@ INIT_XMM sse
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
mulps m6, %3, [%5+%1]
mulps m7, %2, [%5+%1]
- mulps %2, %2, [%6+%1]
- mulps %3, %3, [%6+%1]
- subps %2, %2, m6
- addps %3, %3, m7
+ fmsubps %2, %2, [%6+%1], m6
+ fmaddps %3, %3, [%6+%1], m7
%endmacro
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
@@ -851,7 +860,12 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
DECL_IMDCT POSROTATESHUF
INIT_YMM avx
-
+
%if HAVE_AVX
DECL_IMDCT POSROTATESHUF_AVX
%endif
+
+%if HAVE_XOP
+INIT_YMM xop
+DECL_IMDCT POSROTATESHUF_AVX
+%endif
diff --git a/libavcodec/x86/fft_sse.c b/libavcodec/x86/fft_sse.c
index 13b992f..0591b58 100644
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@@ -30,6 +30,7 @@ DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
+void ff_fft_dispatch_interleave_xop(FFTComplex *z, int nbits);
#if HAVE_AVX
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
@@ -37,6 +38,12 @@ void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
ff_fft_dispatch_interleave_avx(z, s->nbits);
}
#endif
+#if HAVE_XOP
+void ff_fft_calc_xop(FFTContext *s, FFTComplex *z) /* XOP counterpart of ff_fft_calc_avx */
+{
+ ff_fft_dispatch_interleave_xop(z, s->nbits); /* pass z and the context's nbits to the XOP interleave dispatcher */
+}
+#endif /* HAVE_XOP */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c167057..85fca76 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -1085,7 +1085,7 @@ AVX_INSTR pfmul, 1, 0, 1
v%5 %1, %2, %3, %4
%else
%6 %1, %2, %3
- %7 %1, %4
+ %7 %1, %1, %4
%endif
%endmacro
%endmacro
@@ -1093,3 +1093,5 @@ AVX_INSTR pfmul, 1, 0, 1
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
+FMA_INSTR fmaddps, mulps, addps
+FMA_INSTR fmsubps, mulps, subps
--
1.7.5.4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel