The AVX version of `fft-test -n 4` errors out, and `-n 3` crashes. Likewise for
the MDCT tests up to `-n 6`.
--Loren Merritt
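
A quick standalone sanity check of the fft.c change (not part of the patch):
since n is always a power of two here, n < 32 implies i < 16, so the single
`i >= 16` test covers both of the old `n < 32` and `n == 32` branches. The
sketch below compares the old and new recursions exhaustively; the final
`else` branch, which the hunk does not show, is assumed to recurse with
`i - 3*n/4`.

/* Standalone check (not part of the patch): old vs. new is_second_half_of_fft32() */
#include <stdio.h>

/* old version, as removed by the patch */
static int old_f(int i, int n)
{
    if (n < 32)
        return 0;
    if (n == 32)
        return i < 16 ? 0 : 1;
    if (i < n/2)
        return old_f(i, n/2);
    else if (i < 3*n/4)
        return old_f(i - n/2, n/4);
    else
        return old_f(i - 3*n/4, n/4); /* assumed final branch */
}

/* new version, as added by the patch */
static int new_f(int i, int n)
{
    if (n <= 32)
        return i >= 16;
    else if (i < n/2)
        return new_f(i, n/2);
    else if (i < 3*n/4)
        return new_f(i - n/2, n/4);
    else
        return new_f(i - 3*n/4, n/4); /* assumed final branch */
}

int main(void)
{
    /* exhaustive comparison over power-of-two sizes and all indices i < n */
    for (int n = 4; n <= 1 << 16; n <<= 1)
        for (int i = 0; i < n; i++)
            if (old_f(i, n) != new_f(i, n)) {
                printf("mismatch: i=%d n=%d\n", i, n);
                return 1;
            }
    printf("old and new versions agree for all power-of-two n up to 65536\n");
    return 0;
}
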
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index a5c9b39..5bd8463 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -99,13 +99,9 @@ static const int avx_tab[] = {
static int is_second_half_of_fft32(int i, int n)
{
- if (n < 32)
- return 0;
-
- if (n == 32)
- return i < 16 ? 0 : 1;
-
- if (i < n/2)
+ if (n <= 32)
+ return i >= 16;
+ else if (i < n/2)
return is_second_half_of_fft32(i, n/2);
else if (i < 3*n/4)
return is_second_half_of_fft32(i - n/2, n/4);
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 1fb78a6..6fb81c7 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -145,15 +145,15 @@ section .text align=16
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
- vsubps %3, %1, %2 ; {t3,t4,-t8,t7}
- vaddps %1, %1, %2 ; {t1,t2,t6,t5}
- vxorps %3, %3, [ps_p1p1m1p1]
- vshufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
- vshufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
- vsubps %3, %1, %2 ; {r2,i2,r3,i3}
- vaddps %1, %1, %2 ; {r0,i0,r1,i1}
- vshufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
- vshufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
+ subps %3, %1, %2 ; {t3,t4,-t8,t7}
+ addps %1, %1, %2 ; {t1,t2,t6,t5}
+ xorps %3, %3, [ps_p1p1m1p1]
+ shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
+ shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
+ subps %3, %1, %2 ; {r2,i2,r3,i3}
+ addps %1, %1, %2 ; {r0,i0,r1,i1}
+ shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
+ shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
; In SSE mode do one FFT8
@@ -166,121 +166,120 @@ section .text align=16
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
- vaddps %6, %3, %4 ; {t1,t2,t3,t4}
- vsubps %3, %3, %4 ; {r5,i5,r7,i7}
- vshufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
- vmulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
- vmulps %4, %4, [ps_root2]
- vaddps %3, %3, %4 ; {t8,t7,ta,t9}
- vshufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
- vshufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
- vsubps %3, %6, %4 ; {t6,t5,tc,tb}
- vaddps %6, %6, %4 ; {t1,t2,t9,ta}
- vshufps %4, %6, %3, 0x8d ; {t2,ta,t6,tc}
- vshufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
- vsubps %3, %1, %6 ; {r4,r5,r6,r7}
- vaddps %1, %1, %6 ; {r0,r1,r2,r3}
- vsubps %5, %2, %4 ; {i4,i5,i6,i7}
- vaddps %2, %2, %4 ; {i0,i1,i2,i3}
- SWAP %4, %5
+ addps %6, %3, %4 ; {t1,t2,t3,t4}
+ subps %3, %3, %4 ; {r5,i5,r7,i7}
+ shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
+ mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
+ mulps %4, %4, [ps_root2]
+ addps %3, %3, %4 ; {t8,t7,ta,t9}
+ shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
+ shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
+ subps %3, %6, %4 ; {t6,t5,tc,tb}
+ addps %6, %6, %4 ; {t1,t2,t9,ta}
+ shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
+ shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
+ subps %3, %1, %6 ; {r4,r5,r6,r7}
+ addps %1, %1, %6 ; {r0,r1,r2,r3}
+ subps %4, %2, %5 ; {i4,i5,i6,i7}
+ addps %2, %2, %5 ; {i0,i1,i2,i3}
%endmacro
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
-IF%1 mova m4, Z(4)
-IF%1 mova m5, Z(5)
- mova m0, %2 ; wre
- mova m1, %3 ; wim
- vmulps m2, m4, m0 ; r2*wre
-IF%1 mova m6, Z2(6)
- vmulps m3, m5, m1 ; i2*wim
-IF%1 mova m7, Z2(7)
- vmulps m4, m4, m1 ; r2*wim
- vmulps m5, m5, m0 ; i2*wre
- vaddps m2, m2, m3 ; r2*wre + i2*wim
- vmulps m3, m1, m7 ; i3*wim
- vsubps m5, m5, m4 ; i2*wre - r2*wim
- vmulps m1, m1, m6 ; r3*wim
- vmulps m4, m0, m6 ; r3*wre
- vmulps m0, m0, m7 ; i3*wre
- vsubps m4, m4, m3 ; r3*wre - i3*wim
- mova m3, Z(0)
- vaddps m0, m0, m1 ; i3*wre + r3*wim
- vsubps m1, m4, m2 ; t3
- vaddps m4, m4, m2 ; t5
- vsubps m3, m3, m4 ; r2
- vaddps m4, m4, Z(0) ; r0
- mova m6, Z(2)
- mova Z(4), m3
- mova Z(0), m4
- vsubps m3, m5, m0 ; t4
- vsubps m4, m6, m3 ; r3
- vaddps m3, m3, m6 ; r1
- mova Z2(6), m4
- mova Z(2), m3
- mova m2, Z(3)
- vaddps m3, m5, m0 ; t6
- vsubps m2, m2, m1 ; i3
- mova m7, Z(1)
- vaddps m1, m1, Z(3) ; i1
- mova Z2(7), m2
- mova Z(3), m1
- vsubps m4, m7, m3 ; i2
- vaddps m3, m3, m7 ; i0
- mova Z(5), m4
- mova Z(1), m3
+IF%1 mova m4, Z(4)
+IF%1 mova m5, Z(5)
+ mova m0, %2 ; wre
+ mova m1, %3 ; wim
+ mulps m2, m4, m0 ; r2*wre
+IF%1 mova m6, Z2(6)
+ mulps m3, m5, m1 ; i2*wim
+IF%1 mova m7, Z2(7)
+ mulps m4, m4, m1 ; r2*wim
+ mulps m5, m5, m0 ; i2*wre
+ addps m2, m2, m3 ; r2*wre + i2*wim
+ mulps m3, m1, m7 ; i3*wim
+ subps m5, m5, m4 ; i2*wre - r2*wim
+ mulps m1, m1, m6 ; r3*wim
+ mulps m4, m0, m6 ; r3*wre
+ mulps m0, m0, m7 ; i3*wre
+ subps m4, m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m0, m1 ; i3*wre + r3*wim
+ subps m1, m4, m2 ; t3
+ addps m4, m4, m2 ; t5
+ subps m3, m3, m4 ; r2
+ addps m4, m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ subps m3, m5, m0 ; t4
+ subps m4, m6, m3 ; r3
+ addps m3, m3, m6 ; r1
+ mova Z2(6), m4
+ mova Z(2), m3
+ mova m2, Z(3)
+ addps m3, m5, m0 ; t6
+ subps m2, m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, m1, Z(3) ; i1
+ mova Z2(7), m2
+ mova Z(3), m1
+ subps m4, m7, m3 ; i2
+ addps m3, m3, m7 ; i0
+ mova Z(5), m4
+ mova Z(1), m3
%endmacro
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
- mova m4, Z(4) ; r2
- mova m5, Z(5) ; i2
- mova m0, [wq] ; wre
- mova m1, [wq+o1q] ; wim
- vmulps m2, m4, m0 ; r2*wre
- mova m6, Z2(6) ; r3
- vmulps m3, m5, m1 ; i2*wim
- mova m7, Z2(7) ; i3
- vmulps m4, m4, m1 ; r2*wim
- vmulps m5, m5, m0 ; i2*wre
- vaddps m2, m2, m3 ; r2*wre + i2*wim
- vmulps m3, m1, m7 ; i3*wim
- vmulps m1, m1, m6 ; r3*wim
- vsubps m5, m5, m4 ; i2*wre - r2*wim
- vmulps m4, m0, m6 ; r3*wre
- vmulps m0, m0, m7 ; i3*wre
- vsubps m4, m4, m3 ; r3*wre - i3*wim
- mova m3, Z(0)
- vaddps m0, m0, m1 ; i3*wre + r3*wim
- vsubps m1, m4, m2 ; t3
- vaddps m4, m4, m2 ; t5
- vsubps m3, m3, m4 ; r2
- vaddps m4, m4, Z(0) ; r0
- mova m6, Z(2)
- mova Z(4), m3
- mova Z(0), m4
- vsubps m3, m5, m0 ; t4
- vsubps m4, m6, m3 ; r3
- vaddps m3, m3, m6 ; r1
-IF%1 mova Z2(6), m4
-IF%1 mova Z(2), m3
- mova m2, Z(3)
- vaddps m5, m5, m0 ; t6
- vsubps m2, m2, m1 ; i3
- mova m7, Z(1)
- vaddps m1, m1, Z(3) ; i1
-IF%1 mova Z2(7), m2
-IF%1 mova Z(3), m1
- vsubps m6, m7, m5 ; i2
- vaddps m5, m5, m7 ; i0
-IF%1 mova Z(5), m6
-IF%1 mova Z(1), m5
+ mova m4, Z(4) ; r2
+ mova m5, Z(5) ; i2
+ mova m0, [wq] ; wre
+ mova m1, [wq+o1q] ; wim
+ mulps m2, m4, m0 ; r2*wre
+ mova m6, Z2(6) ; r3
+ mulps m3, m5, m1 ; i2*wim
+ mova m7, Z2(7) ; i3
+ mulps m4, m4, m1 ; r2*wim
+ mulps m5, m5, m0 ; i2*wre
+ addps m2, m2, m3 ; r2*wre + i2*wim
+ mulps m3, m1, m7 ; i3*wim
+ mulps m1, m1, m6 ; r3*wim
+ subps m5, m5, m4 ; i2*wre - r2*wim
+ mulps m4, m0, m6 ; r3*wre
+ mulps m0, m0, m7 ; i3*wre
+ subps m4, m4, m3 ; r3*wre - i3*wim
+ mova m3, Z(0)
+ addps m0, m0, m1 ; i3*wre + r3*wim
+ subps m1, m4, m2 ; t3
+ addps m4, m4, m2 ; t5
+ subps m3, m3, m4 ; r2
+ addps m4, m4, Z(0) ; r0
+ mova m6, Z(2)
+ mova Z(4), m3
+ mova Z(0), m4
+ subps m3, m5, m0 ; t4
+ subps m4, m6, m3 ; r3
+ addps m3, m3, m6 ; r1
+IF%1 mova Z2(6), m4
+IF%1 mova Z(2), m3
+ mova m2, Z(3)
+ addps m5, m5, m0 ; t6
+ subps m2, m2, m1 ; i3
+ mova m7, Z(1)
+ addps m1, m1, Z(3) ; i1
+IF%1 mova Z2(7), m2
+IF%1 mova Z(3), m1
+ subps m6, m7, m5 ; i2
+ addps m5, m5, m7 ; i0
+IF%1 mova Z(5), m6
+IF%1 mova Z(1), m5
%if %1==0
INTERL m1, m3, m7, Z, 2
INTERL m2, m4, m0, Z2, 6
- mova m1, Z(0)
- mova m2, Z(4)
+ mova m1, Z(0)
+ mova m2, Z(4)
INTERL m5, m1, m3, Z, 0
INTERL m6, m2, m7, Z, 4
@@ -390,11 +389,6 @@ fft32_interleave_avx:
INIT_XMM
%define mova movaps
-%define vmulps mulps
-%define vaddps addps
-%define vsubps subps
-%define vxorps xorps
-%define vshufps shufps
align 16
fft4_avx:
@@ -536,11 +530,6 @@ DEFINE_ARGS z, w, n, o1, o3
%endmacro
INIT_YMM
-%define vmulps vmulps
-%define vaddps vaddps
-%define vsubps vsubps
-%define vxorps vxorps
-%define vshufps vshufps
%macro INTERL_AVX 5
vunpckhps %3, %2, %1
@@ -558,11 +547,6 @@ DECL_PASS pass_interleave_avx, PASS_BIG 0
INIT_XMM
%define mova movaps
-%define vmulps mulps
-%define vaddps addps
-%define vsubps subps
-%define vxorps xorps
-%define vshufps shufps
%macro INTERL_SSE 5
mova %3, %2
diff --git a/libavcodec/x86/x86inc.asm b/libavcodec/x86/x86inc.asm
index 4ba536f..53091c1 100644
--- a/libavcodec/x86/x86inc.asm
+++ b/libavcodec/x86/x86inc.asm
@@ -678,30 +678,16 @@ INIT_MMX
; AVX abstraction layer
;=============================================================================
-%define sizeofmm0 8
-%define sizeofmm1 8
-%define sizeofmm2 8
-%define sizeofmm3 8
-%define sizeofmm4 8
-%define sizeofmm5 8
-%define sizeofmm6 8
-%define sizeofmm7 8
-%define sizeofxmm0 16
-%define sizeofxmm1 16
-%define sizeofxmm2 16
-%define sizeofxmm3 16
-%define sizeofxmm4 16
-%define sizeofxmm5 16
-%define sizeofxmm6 16
-%define sizeofxmm7 16
-%define sizeofxmm8 16
-%define sizeofxmm9 16
-%define sizeofxmm10 16
-%define sizeofxmm11 16
-%define sizeofxmm12 16
-%define sizeofxmm13 16
-%define sizeofxmm14 16
-%define sizeofxmm15 16
+%assign i 0
+%rep 16
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
;%1 == instruction
;%2 == 1 if float, 0 if int
@@ -709,29 +695,33 @@ INIT_MMX
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
- %if sizeof%5==8
- %define %%regmov movq
- %elif %2
- %define %%regmov movaps
+ %if sizeof%5==32
+ v%1 %5, %6, %7
%else
- %define %%regmov movdqa
- %endif
+ %if sizeof%5==8
+ %define %%regmov movq
+ %elif %2
+ %define %%regmov movaps
+ %else
+ %define %%regmov movdqa
+ %endif
- %if %4>=3+%3
- %ifnidn %5, %6
- %if avx_enabled && sizeof%5==16
- v%1 %5, %6, %7
+ %if %4>=3+%3
+ %ifnidn %5, %6
+ %if avx_enabled && sizeof%5==16
+ v%1 %5, %6, %7
+ %else
+ %%regmov %5, %6
+ %1 %5, %7
+ %endif
%else
- %%regmov %5, %6
%1 %5, %7
%endif
+ %elif %3
+ %1 %5, %6, %7
%else
- %1 %5, %7
+ %1 %5, %6
%endif
- %elif %3
- %1 %5, %6, %7
- %else
- %1 %5, %6
%endif
%endmacro