[FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-10-02 Thread Ronald S. Bultje
---
 libavcodec/x86/constants.c|2 +
 libavcodec/x86/constants.h|1 +
 libavcodec/x86/h264_qpel_10bit.asm|4 +-
 libavcodec/x86/vp9dsp_init.h  |4 +
 libavcodec/x86/vp9dsp_init_16bpp.c|   33 +
 libavcodec/x86/vp9intrapred_16bpp.asm | 1520 +
 6 files changed, 1562 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 19345f5..3f3ee0f 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -85,3 +85,5 @@ DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)   = { 
0x0000001000000010ULL, 0x000
 0x0000001000000010ULL, 
0x0000001000000010ULL };
 DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)   = { 0x0000002000000020ULL, 
0x0000002000000020ULL,
 0x0000002000000020ULL, 
0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_65535)= { 0x0000ffff0000ffffULL, 
0x0000ffff0000ffffULL,
+0x0000ffff0000ffffULL, 
0x0000ffff0000ffffULL };
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 4a2451d..ee8422e 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -65,5 +65,6 @@ extern const xmm_reg  ff_ps_neg;
 extern const ymm_reg  ff_pd_1;
 extern const ymm_reg  ff_pd_16;
 extern const ymm_reg  ff_pd_32;
+extern const ymm_reg  ff_pd_65535;
 
 #endif /* AVCODEC_X86_CONSTANTS_H */
diff --git a/libavcodec/x86/h264_qpel_10bit.asm 
b/libavcodec/x86/h264_qpel_10bit.asm
index 7e9be36..8722683 100644
--- a/libavcodec/x86/h264_qpel_10bit.asm
+++ b/libavcodec/x86/h264_qpel_10bit.asm
@@ -26,6 +26,7 @@
 
 SECTION_RODATA 32
 
+cextern pd_65535
 cextern pw_1023
 %define pw_pixel_max pw_1023
 cextern pw_16
@@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
 tap1: times 4 dw  1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5,  1
-pd_0f: times 4 dd 0xffff
 
 SECTION .text
 
@@ -708,7 +708,7 @@ h%1_loop_op:
 psrad  m1, 10
 psrad  m2, 10
 pslld  m2, 16
-pand   m1, [pd_0f]
+pand   m1, [pd_65535]
 por    m1, m2
 %if num_mmregs <= 8
 pxor   m0, m0
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 47d2246..5842282 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
 init_ipred_func(type, enum, 16, bpp, opt); \
 init_ipred_func(type, enum, 32, bpp, opt)
 
+#define init_ipred_funcs(type, enum, bpp, opt) \
+init_ipred_func(type, enum,  4, bpp, opt); \
+init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
 void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index f4a4a5d..4ceb4d4 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -51,6 +51,18 @@ decl_ipred_fns(h,   16, mmxext, sse2);
 decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
 init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
 init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+init_ipred_funcs(hu, HOR_UP, 16, sse2);
+init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+}
+
+if (EXTERNAL_SSSE3(cpu_flags)) {
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
 }
 
 if (EXTERNAL_AVX_FAST(cpu_flags)) {
 init_fpel_func(2, 0,  32, put, , avx);
 init_fpel_func(1, 0,  64, put, , avx);
 init_fpel_func(0, 0, 128, put, , avx);
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);

Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-10-01 Thread Henrik Gramner
On Wed, Sep 30, 2015 at 9:36 PM, Ronald S. Bultje  wrote:
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm
b/libavcodec/x86/vp9intrapred_16bpp.asm

+pd_65535: times 8 dd 0xffff

Duplicate of pd_0f from h264_qpel_10bit.asm

+%if cpuflag(ssse3)
+; FIXME this can be done without three-op-instr by doing pshfhw m1, m0
+; but then interleaving decreases, measure which is faster
+pshufb  m1, m0, [pb_2to15_14_15]; bcdefghh
+%else
+psrldq  m1, m0, 2   ; bcdefgh.
+%endif
+pshufhw m0, m0, q3310   ; abcdefhh
+%if notcpuflag(ssse3)
+pshufhw m1, m1, q2210   ; bcdefghh
+%endif

Move pshufhw into the else part. There's also a typo (pshfhw) in the comment.

+%if cpuflag(ssse3)
+pshufb  m0, m4
+%else
+psrldq  m0, 2   ; CDEFGHh.
+%endif
+pshuflw m1, m1, q3321   ; GHhh
+%if notcpuflag(ssse3)
+pshufhw m0, m0, q2210   ; CDEFGHhh
+%endif

Ditto

+%if cpuflag(ssse3)
+pshufb  m1, m3
+pshufb  m2, m3
+%else
+psrldq  m1, 2
+psrldq  m2, 2
+pshufhw m1, m1, q2210
+pshufhw m2, m2, q2210
+%endif
+mova  [dstq+strideq*2], m1
+mova  [dstq+stride3q ], m2
+lea   dstq, [dstq+strideq*4]
+%if cpuflag(ssse3)
+pshufb  m1, m3
+pshufb  m2, m3
+%else
+psrldq  m1, 2
+psrldq  m2, 2
+pshufhw m1, m1, q2210
+pshufhw m2, m2, q2210
+%endif
+mova  [dstq+strideq*0], m1
+mova  [dstq+strideq*1], m2
+%if cpuflag(ssse3)
+pshufb  m1, m3
+pshufb  m2, m3
+%else
+psrldq  m1, 2
+psrldq  m2, 2
+pshufhw m1, m1, q2210
+pshufhw m2, m2, q2210
+%endif
+mova  [dstq+strideq*2], m1
+mova  [dstq+stride3q ], m2

Possibly some deduplication here. There are a few very similar
segments in more places as well, might be possible to turn them into a
macro.

+%if cpuflag(ssse3)
+pshufb  m2, [pb_4_5_8to13_8x0]
+%else
+pshuflw m2, m2, q2222
+%endif
+psrldq  m0, 6
+%if notcpuflag(ssse3)
+psrldq  m2, 6
+%endif

Move psrldq into the else part.

It's quite a large patch so I mostly just skimmed through it fairly
quickly, but the rest looks fine to me.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Ronald S. Bultje
---
 libavcodec/x86/vp9dsp_init.h  |4 +
 libavcodec/x86/vp9dsp_init_16bpp.c|   33 +
 libavcodec/x86/vp9intrapred_16bpp.asm | 1613 +
 3 files changed, 1650 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 47d2246..5842282 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
 init_ipred_func(type, enum, 16, bpp, opt); \
 init_ipred_func(type, enum, 32, bpp, opt)
 
+#define init_ipred_funcs(type, enum, bpp, opt) \
+init_ipred_func(type, enum,  4, bpp, opt); \
+init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
 void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index f4a4a5d..4ceb4d4 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -51,6 +51,18 @@ decl_ipred_fns(h,   16, mmxext, sse2);
 decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
 init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
 init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+init_ipred_funcs(hu, HOR_UP, 16, sse2);
+init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+}
+
+if (EXTERNAL_SSSE3(cpu_flags)) {
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
 }
 
 if (EXTERNAL_AVX_FAST(cpu_flags)) {
 init_fpel_func(2, 0,  32, put, , avx);
 init_fpel_func(1, 0,  64, put, , avx);
 init_fpel_func(0, 0, 128, put, , avx);
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+init_ipred_funcs(hu, HOR_UP, 16, avx);
+init_ipred_funcs(hd, HOR_DOWN, 16, avx);
 }
 
 if (EXTERNAL_AVX2(cpu_flags)) {
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 6da42cf..807ba21 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -27,6 +27,11 @@ SECTION_RODATA 32
 pd_2: times 8 dd 2
 pd_4: times 8 dd 4
 pd_8: times 8 dd 8
+pd_65535: times 8 dd 0xffff
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
 
 cextern pw_1
 cextern pw_1023
@@ -34,8 +39,48 @@ cextern pw_4095
 cextern pd_16
 cextern pd_32
 
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
 SECTION .text
 
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+SWAP  %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+mova  [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+SWAP  %1, %2
+%else
+mova   m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+mova   m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
 INIT_MMX mmx
 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
 movifnidn   aq, amp
@@ -669,3 +714,1571 @@ cglobal vp9_ipred_tm_32x32_10, 4, 5, 10, 32 * 
ARCH_X86_32, dst, stride, l, a
 cglobal vp9_ipred_tm_32x32_12, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a
 mova    m0, [pw_4095]
 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra 

Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Henrik Gramner
On Wed, Sep 30, 2015 at 11:22 PM, Ronald S. Bultje  wrote:
> On Wed, Sep 30, 2015 at 5:01 PM, Henrik Gramner  wrote:
>> I just wanted to make sure that my patch fixes this first before
>> posting it, but I'm unable to apply your patch. Is it based on top of
>> another patch?
>
> Yes, see https://github.com/rbultje/ffmpeg/commits/vp9-16bpp-asm

Ok, my patch fixes it so I posted it to the ML.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Ronald S. Bultje
---
 libavcodec/x86/vp9dsp_init.h  |4 +
 libavcodec/x86/vp9dsp_init_16bpp.c|   33 +
 libavcodec/x86/vp9intrapred_16bpp.asm | 1613 +
 3 files changed, 1650 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 47d2246..5842282 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
 init_ipred_func(type, enum, 16, bpp, opt); \
 init_ipred_func(type, enum, 32, bpp, opt)
 
+#define init_ipred_funcs(type, enum, bpp, opt) \
+init_ipred_func(type, enum,  4, bpp, opt); \
+init_8_16_32_ipred_funcs(type, enum, bpp, opt)
+
 void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index f4a4a5d..4ceb4d4 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -51,6 +51,18 @@ decl_ipred_fns(h,   16, mmxext, sse2);
 decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
+
+#define decl_ipred_dir_funcs(type) \
+decl_ipred_fns(type, 16, sse2,  sse2); \
+decl_ipred_fns(type, 16, ssse3, ssse3); \
+decl_ipred_fns(type, 16, avx,   avx)
+
+decl_ipred_dir_funcs(dl);
+decl_ipred_dir_funcs(dr);
+decl_ipred_dir_funcs(vl);
+decl_ipred_dir_funcs(vr);
+decl_ipred_dir_funcs(hu);
+decl_ipred_dir_funcs(hd);
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
 init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
 init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
+init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
+init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
+init_ipred_funcs(hu, HOR_UP, 16, sse2);
+init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
+}
+
+if (EXTERNAL_SSSE3(cpu_flags)) {
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
+init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
+init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
+init_ipred_funcs(hu, HOR_UP, 16, ssse3);
+init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
 }
 
 if (EXTERNAL_AVX_FAST(cpu_flags)) {
 init_fpel_func(2, 0,  32, put, , avx);
 init_fpel_func(1, 0,  64, put, , avx);
 init_fpel_func(0, 0, 128, put, , avx);
+init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
+init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
+init_ipred_funcs(vl, VERT_LEFT, 16, avx);
+init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
+init_ipred_funcs(hu, HOR_UP, 16, avx);
+init_ipred_funcs(hd, HOR_DOWN, 16, avx);
 }
 
 if (EXTERNAL_AVX2(cpu_flags)) {
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 6da42cf..8abee89 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -27,6 +27,11 @@ SECTION_RODATA 32
 pd_2: times 8 dd 2
 pd_4: times 8 dd 4
 pd_8: times 8 dd 8
+pd_65535: times 8 dd 0xffff
+
+pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
+pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
+pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7
 
 cextern pw_1
 cextern pw_1023
@@ -34,8 +39,48 @@ cextern pw_4095
 cextern pd_16
 cextern pd_32
 
+; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
+; only 3 registers on x86-32, which would make it one cycle faster, but that
+; would make the code quite a bit uglier...
+
 SECTION .text
 
+%macro SCRATCH 3-4
+%if ARCH_X86_64
+SWAP  %1, %2
+%if %0 == 4
+%define reg_%4 m%2
+%endif
+%else
+mova  [%3], m%1
+%if %0 == 4
+%define reg_%4 [%3]
+%endif
+%endif
+%endmacro
+
+%macro UNSCRATCH 3-4
+%if ARCH_X86_64
+SWAP  %1, %2
+%else
+mova   m%1, [%3]
+%endif
+%if %0 == 4
+%undef reg_%4
+%endif
+%endmacro
+
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+mova   m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
 INIT_MMX mmx
 cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
 movifnidn   aq, amp
@@ -669,3 +714,1571 @@ cglobal vp9_ipred_tm_32x32_10, 4, 5, 10, 32 * 
ARCH_X86_32, dst, stride, l, a
 cglobal vp9_ipred_tm_32x32_12, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a
 mova    m0, [pw_4095]
 jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body
+
+; Directional intra 

Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Henrik Gramner
On Wed, Sep 30, 2015 at 9:45 PM, James Almer  wrote:
> Guess notcpuflag(ssse3) does not work here like ARCH_ or HAVE_ defines do.
I actually have a patch locally somewhere that makes that work.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Ronald S. Bultje
Hi,

On Wed, Sep 30, 2015 at 4:02 PM, Henrik Gramner  wrote:

> On Wed, Sep 30, 2015 at 9:45 PM, James Almer  wrote:
> > Guess notcpuflag(ssse3) does not work here like ARCH_ or HAVE_ defines
> do.
> I actually have a patch locally somewhere that makes that work.


Oh that's great! I guess I'll just do nothing then...?

Ronald
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Henrik Gramner
On Wed, Sep 30, 2015 at 10:49 PM, Ronald S. Bultje  wrote:
> On Wed, Sep 30, 2015 at 4:02 PM, Henrik Gramner  wrote:
>> I actually have a patch locally somewhere that makes that work.
>
> Oh that's great! I guess I'll just do nothing then...?

I just wanted to make sure that my patch fixes this first before
posting it, but I'm unable to apply your patch. Is it based on top of
another patch?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread Ronald S. Bultje
Hi,

On Wed, Sep 30, 2015 at 5:01 PM, Henrik Gramner  wrote:

> On Wed, Sep 30, 2015 at 10:49 PM, Ronald S. Bultje 
> wrote:
> > On Wed, Sep 30, 2015 at 4:02 PM, Henrik Gramner 
> wrote:
> >> I actually have a patch locally somewhere that makes that work.
> >
> > Oh that's great! I guess I'll just do nothing then...?
>
> I just wanted to make sure that my patch fixes this first before
> posting it, but I'm unable to apply your patch. Is it based on top of
> another patch?


Yes, see https://github.com/rbultje/ffmpeg/commits/vp9-16bpp-asm

Ronald
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

2015-09-30 Thread James Almer
On 9/30/2015 4:36 PM, Ronald S. Bultje wrote:
> +cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \

/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1141: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1143: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1145: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1922: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1922: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1924: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1924: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1926: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1926: error: 
(WIN64_SPILL_XMM:1) expecting `)'
/ffmpeg/src/library.mak:30: recipe for target 
'libavcodec/x86/vp9intrapred_16bpp.o' failed
make: *** [libavcodec/x86/vp9intrapred_16bpp.o] Error 1

Guess notcpuflag(ssse3) does not work here like ARCH_ or HAVE_ defines do.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel