[FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
--- libavcodec/x86/constants.c|2 + libavcodec/x86/constants.h|1 + libavcodec/x86/h264_qpel_10bit.asm|4 +- libavcodec/x86/vp9dsp_init.h |4 + libavcodec/x86/vp9dsp_init_16bpp.c| 33 + libavcodec/x86/vp9intrapred_16bpp.asm | 1520 + 6 files changed, 1562 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 19345f5..3f3ee0f 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -85,3 +85,5 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x00100010ULL, 0x000 0x00100010ULL, 0x00100010ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x00200020ULL, 0x00200020ULL, 0x00200020ULL, 0x00200020ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0xULL, 0xULL, +0xULL, 0xULL }; diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 4a2451d..ee8422e 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -65,5 +65,6 @@ extern const xmm_reg ff_ps_neg; extern const ymm_reg ff_pd_1; extern const ymm_reg ff_pd_16; extern const ymm_reg ff_pd_32; +extern const ymm_reg ff_pd_65535; #endif /* AVCODEC_X86_CONSTANTS_H */ diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index 7e9be36..8722683 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -26,6 +26,7 @@ SECTION_RODATA 32 +cextern pd_65535 cextern pw_1023 %define pw_pixel_max pw_1023 cextern pw_16 @@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16 tap1: times 4 dw 1, -5 tap2: times 4 dw 20, 20 tap3: times 4 dw -5, 1 -pd_0f: times 4 dd 0x SECTION .text @@ -708,7 +708,7 @@ h%1_loop_op: psrad m1, 10 psrad m2, 10 pslld m2, 16 -pand m1, [pd_0f] +pand m1, [pd_65535] porm1, m2 %if num_mmregs <= 8 pxor m0, m0 diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h index 47d2246..5842282 100644 --- a/libavcodec/x86/vp9dsp_init.h +++ b/libavcodec/x86/vp9dsp_init.h @@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, 
align, bpp, bytes, opt4, f_opt) init_ipred_func(type, enum, 16, bpp, opt); \ init_ipred_func(type, enum, 32, bpp, opt) +#define init_ipred_funcs(type, enum, bpp, opt) \ +init_ipred_func(type, enum, 4, bpp, opt); \ +init_8_16_32_ipred_funcs(type, enum, bpp, opt) + void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp); diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index f4a4a5d..4ceb4d4 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -51,6 +51,18 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); + +#define decl_ipred_dir_funcs(type) \ +decl_ipred_fns(type, 16, sse2, sse2); \ +decl_ipred_fns(type, 16, ssse3, ssse3); \ +decl_ipred_fns(type, 16, avx, avx) + +decl_ipred_dir_funcs(dl); +decl_ipred_dir_funcs(dr); +decl_ipred_dir_funcs(vl); +decl_ipred_dir_funcs(vr); +decl_ipred_dir_funcs(hu); +decl_ipred_dir_funcs(hd); #endif /* HAVE_YASM */ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) @@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_8_16_32_ipred_funcs(dc, DC, 16, sse2); init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2); init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2); +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2); +init_ipred_funcs(vl, VERT_LEFT, 16, sse2); +init_ipred_funcs(vr, VERT_RIGHT, 16, sse2); +init_ipred_funcs(hu, HOR_UP, 16, sse2); +init_ipred_funcs(hd, HOR_DOWN, 16, sse2); +} + +if (EXTERNAL_SSSE3(cpu_flags)) { +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3); +init_ipred_funcs(vl, VERT_LEFT, 16, ssse3); +init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3); +init_ipred_funcs(hu, HOR_UP, 16, ssse3); +init_ipred_funcs(hd, 
HOR_DOWN, 16, ssse3); } if (EXTERNAL_AVX_FAST(cpu_flags)) { init_fpel_func(2, 0, 32, put, , avx); init_fpel_func(1, 0, 64, put, , avx); init_fpel_func(0, 0, 128, put, , avx); +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
On Wed, Sep 30, 2015 at 9:36 PM, Ronald S. Bultje wrote: diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm +pd_65535: times 8 dd 0xffff Duplicate of pd_0f from h264_qpel_10bit.asm +%if cpuflag(ssse3) +; FIXME this can be done without three-op-instr by doing pshfhw m1, m0 +; but then interleaving decreases, measure which is faster +pshufb m1, m0, [pb_2to15_14_15]; bcdefghh +%else +psrldq m1, m0, 2 ; bcdefgh. +%endif +pshufhw m0, m0, q3310 ; abcdefhh +%if notcpuflag(ssse3) +pshufhw m1, m1, q2210 ; bcdefghh +%endif Move pshufhw into the else part. There's also a typo (pshfhw) in the comment. +%if cpuflag(ssse3) +pshufb m0, m4 +%else +psrldq m0, 2 ; CDEFGHh. +%endif +pshuflw m1, m1, q3321 ; GHhh +%if notcpuflag(ssse3) +pshufhw m0, m0, q2210 ; CDEFGHhh +%endif Ditto +%if cpuflag(ssse3) +pshufb m1, m3 +pshufb m2, m3 +%else +psrldq m1, 2 +psrldq m2, 2 +pshufhw m1, m1, q2210 +pshufhw m2, m2, q2210 +%endif +mova [dstq+strideq*2], m1 +mova [dstq+stride3q ], m2 +lea dstq, [dstq+strideq*4] +%if cpuflag(ssse3) +pshufb m1, m3 +pshufb m2, m3 +%else +psrldq m1, 2 +psrldq m2, 2 +pshufhw m1, m1, q2210 +pshufhw m2, m2, q2210 +%endif +mova [dstq+strideq*0], m1 +mova [dstq+strideq*1], m2 +%if cpuflag(ssse3) +pshufb m1, m3 +pshufb m2, m3 +%else +psrldq m1, 2 +psrldq m2, 2 +pshufhw m1, m1, q2210 +pshufhw m2, m2, q2210 +%endif +mova [dstq+strideq*2], m1 +mova [dstq+stride3q ], m2 Possibly some deduplication here. There are a few very similar segments in more places as well, might be possible to turn them into a macro. +%if cpuflag(ssse3) +pshufb m2, [pb_4_5_8to13_8x0] +%else +pshuflw m2, m2, q +%endif +psrldq m0, 6 +%if notcpuflag(ssse3) +psrldq m2, 6 +%endif Move psrldq into the else part. It's quite a large patch so I mostly just skimmed through it fairly quickly, but the rest looks fine to me. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
--- libavcodec/x86/vp9dsp_init.h |4 + libavcodec/x86/vp9dsp_init_16bpp.c| 33 + libavcodec/x86/vp9intrapred_16bpp.asm | 1613 + 3 files changed, 1650 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h index 47d2246..5842282 100644 --- a/libavcodec/x86/vp9dsp_init.h +++ b/libavcodec/x86/vp9dsp_init.h @@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) init_ipred_func(type, enum, 16, bpp, opt); \ init_ipred_func(type, enum, 32, bpp, opt) +#define init_ipred_funcs(type, enum, bpp, opt) \ +init_ipred_func(type, enum, 4, bpp, opt); \ +init_8_16_32_ipred_funcs(type, enum, bpp, opt) + void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp); diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index f4a4a5d..4ceb4d4 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -51,6 +51,18 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); + +#define decl_ipred_dir_funcs(type) \ +decl_ipred_fns(type, 16, sse2, sse2); \ +decl_ipred_fns(type, 16, ssse3, ssse3); \ +decl_ipred_fns(type, 16, avx, avx) + +decl_ipred_dir_funcs(dl); +decl_ipred_dir_funcs(dr); +decl_ipred_dir_funcs(vl); +decl_ipred_dir_funcs(vr); +decl_ipred_dir_funcs(hu); +decl_ipred_dir_funcs(hd); #endif /* HAVE_YASM */ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) @@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_8_16_32_ipred_funcs(dc, DC, 16, sse2); init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2); init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2); +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2); +init_ipred_funcs(vl, VERT_LEFT, 16, sse2); +init_ipred_funcs(vr, VERT_RIGHT, 16, sse2); 
+init_ipred_funcs(hu, HOR_UP, 16, sse2); +init_ipred_funcs(hd, HOR_DOWN, 16, sse2); +} + +if (EXTERNAL_SSSE3(cpu_flags)) { +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3); +init_ipred_funcs(vl, VERT_LEFT, 16, ssse3); +init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3); +init_ipred_funcs(hu, HOR_UP, 16, ssse3); +init_ipred_funcs(hd, HOR_DOWN, 16, ssse3); } if (EXTERNAL_AVX_FAST(cpu_flags)) { init_fpel_func(2, 0, 32, put, , avx); init_fpel_func(1, 0, 64, put, , avx); init_fpel_func(0, 0, 128, put, , avx); +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx); +init_ipred_funcs(vl, VERT_LEFT, 16, avx); +init_ipred_funcs(vr, VERT_RIGHT, 16, avx); +init_ipred_funcs(hu, HOR_UP, 16, avx); +init_ipred_funcs(hd, HOR_DOWN, 16, avx); } if (EXTERNAL_AVX2(cpu_flags)) { diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 6da42cf..807ba21 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -27,6 +27,11 @@ SECTION_RODATA 32 pd_2: times 8 dd 2 pd_4: times 8 dd 4 pd_8: times 8 dd 8 +pd_65535: times 8 dd 0x + +pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 +pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 +pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 cextern pw_1 cextern pw_1023 @@ -34,8 +39,48 @@ cextern pw_4095 cextern pd_16 cextern pd_32 +; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take +; only 3 registers on x86-32, which would make it one cycle faster, but that +; would make the code quite a bit uglier... 
+ SECTION .text +%macro SCRATCH 3-4 +%if ARCH_X86_64 +SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else +mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 +SWAP %1, %2 +%else +mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 +mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + INIT_MMX mmx cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a movifnidn aq, amp @@ -669,3 +714,1571 @@ cglobal vp9_ipred_tm_32x32_10, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a cglobal vp9_ipred_tm_32x32_12, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a mova m0, [pw_4095] jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body + +; Directional intra
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
On Wed, Sep 30, 2015 at 11:22 PM, Ronald S. Bultje wrote: > On Wed, Sep 30, 2015 at 5:01 PM, Henrik Gramner wrote: >> I just wanted to make sure that my patch fixes this first before >> posting it, but I'm unable to apply your patch. Is it based on top of >> another patch? > > Yes, see https://github.com/rbultje/ffmpeg/commits/vp9-16bpp-asm Ok, my patch fixes it so I posted it to the ML. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
--- libavcodec/x86/vp9dsp_init.h |4 + libavcodec/x86/vp9dsp_init_16bpp.c| 33 + libavcodec/x86/vp9intrapred_16bpp.asm | 1613 + 3 files changed, 1650 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h index 47d2246..5842282 100644 --- a/libavcodec/x86/vp9dsp_init.h +++ b/libavcodec/x86/vp9dsp_init.h @@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt) init_ipred_func(type, enum, 16, bpp, opt); \ init_ipred_func(type, enum, 32, bpp, opt) +#define init_ipred_funcs(type, enum, bpp, opt) \ +init_ipred_func(type, enum, 4, bpp, opt); \ +init_8_16_32_ipred_funcs(type, enum, bpp, opt) + void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp); void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp); diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index f4a4a5d..4ceb4d4 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -51,6 +51,18 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); + +#define decl_ipred_dir_funcs(type) \ +decl_ipred_fns(type, 16, sse2, sse2); \ +decl_ipred_fns(type, 16, ssse3, ssse3); \ +decl_ipred_fns(type, 16, avx, avx) + +decl_ipred_dir_funcs(dl); +decl_ipred_dir_funcs(dr); +decl_ipred_dir_funcs(vl); +decl_ipred_dir_funcs(vr); +decl_ipred_dir_funcs(hu); +decl_ipred_dir_funcs(hd); #endif /* HAVE_YASM */ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) @@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_8_16_32_ipred_funcs(dc, DC, 16, sse2); init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2); init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2); +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2); +init_ipred_funcs(vl, VERT_LEFT, 16, sse2); +init_ipred_funcs(vr, VERT_RIGHT, 16, sse2); 
+init_ipred_funcs(hu, HOR_UP, 16, sse2); +init_ipred_funcs(hd, HOR_DOWN, 16, sse2); +} + +if (EXTERNAL_SSSE3(cpu_flags)) { +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3); +init_ipred_funcs(vl, VERT_LEFT, 16, ssse3); +init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3); +init_ipred_funcs(hu, HOR_UP, 16, ssse3); +init_ipred_funcs(hd, HOR_DOWN, 16, ssse3); } if (EXTERNAL_AVX_FAST(cpu_flags)) { init_fpel_func(2, 0, 32, put, , avx); init_fpel_func(1, 0, 64, put, , avx); init_fpel_func(0, 0, 128, put, , avx); +init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx); +init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx); +init_ipred_funcs(vl, VERT_LEFT, 16, avx); +init_ipred_funcs(vr, VERT_RIGHT, 16, avx); +init_ipred_funcs(hu, HOR_UP, 16, avx); +init_ipred_funcs(hd, HOR_DOWN, 16, avx); } if (EXTERNAL_AVX2(cpu_flags)) { diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 6da42cf..8abee89 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -27,6 +27,11 @@ SECTION_RODATA 32 pd_2: times 8 dd 2 pd_4: times 8 dd 4 pd_8: times 8 dd 8 +pd_65535: times 8 dd 0x + +pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15 +pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0 +pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7 cextern pw_1 cextern pw_1023 @@ -34,8 +39,48 @@ cextern pw_4095 cextern pd_16 cextern pd_32 +; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take +; only 3 registers on x86-32, which would make it one cycle faster, but that +; would make the code quite a bit uglier... 
+ SECTION .text +%macro SCRATCH 3-4 +%if ARCH_X86_64 +SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else +mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 +SWAP %1, %2 +%else +mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 +mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + INIT_MMX mmx cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a movifnidn aq, amp @@ -669,3 +714,1571 @@ cglobal vp9_ipred_tm_32x32_10, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a cglobal vp9_ipred_tm_32x32_12, 4, 5, 10, 32 * ARCH_X86_32, dst, stride, l, a mova m0, [pw_4095] jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body + +; Directional intra
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
On Wed, Sep 30, 2015 at 9:45 PM, James Almer wrote: > Guess notcpuflag(ssse3) does not work here like ARCH_ or HAVE_ defines do. I actually have a patch locally somewhere that makes that work. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
Hi, On Wed, Sep 30, 2015 at 4:02 PM, Henrik Gramner wrote: > On Wed, Sep 30, 2015 at 9:45 PM, James Almer wrote: > > Guess notcpuflag(ssse3) does not work here like ARCH_ or HAVE_ defines > do. > I actually have a patch locally somewhere that makes that work. Oh that's great! I guess I'll just do nothing then...? Ronald ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
On Wed, Sep 30, 2015 at 10:49 PM, Ronald S. Bultje wrote: > On Wed, Sep 30, 2015 at 4:02 PM, Henrik Gramner wrote: >> I actually have a patch locally somewhere that makes that work. > > Oh that's great! I guess I'll just do nothing then...? I just wanted to make sure that my patch fixes this first before posting it, but I'm unable to apply your patch. Is it based on top of another patch? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
Hi, On Wed, Sep 30, 2015 at 5:01 PM, Henrik Gramner wrote: > On Wed, Sep 30, 2015 at 10:49 PM, Ronald S. Bultje > wrote: > On Wed, Sep 30, 2015 at 4:02 PM, Henrik Gramner > wrote: > >> I actually have a patch locally somewhere that makes that work. > > > > Oh that's great! I guess I'll just do nothing then...? > > I just wanted to make sure that my patch fixes this first before > posting it, but I'm unable to apply your patch. Is it based on top of > another patch? Yes, see https://github.com/rbultje/ffmpeg/commits/vp9-16bpp-asm Ronald ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.
On 9/30/2015 4:36 PM, Ronald S. Bultje wrote: > +cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \ /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1141: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1143: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1145: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1922: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1922: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1924: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1924: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1926: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/libavcodec/x86/vp9intrapred_16bpp.asm:1926: error: (WIN64_SPILL_XMM:1) expecting `)' /ffmpeg/src/library.mak:30: recipe for target 'libavcodec/x86/vp9intrapred_16bpp.o' failed make: *** [libavcodec/x86/vp9intrapred_16bpp.o] Error 1 Guess notcpuflag(ssse3) does not work here like ARCH_ or HAVE_ defines do. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel