Re: [FFmpeg-devel] [PATCH] x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format

2016-02-14 Thread Timothy Gu
On Sun, Feb 14, 2016 at 01:21:57PM +0100, Christophe Gisquet wrote:
> Hi,
> 
> 2016-02-14 6:49 GMT+01:00 Timothy Gu :
> >  %if HAVE_MMX_INLINE
> 
> Isn't that macro meant for C code (and without much of a purpose in
> config.asm)?

Yes, but this code isn't used unless inline asm is enabled, so I don't want to
fill the binary up with bloat.
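
(For reference: configure emits the same HAVE_* symbols into config.asm as
into config.h, so the guard works on the asm side as well. In a generated
config.asm the relevant line is simply

    %define HAVE_MMX_INLINE 1

so no extra plumbing is needed.)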

> >  ; Compute the rounder 32-r or 8-r and unpacks it to m7
> >  %macro LOAD_ROUNDER_MMX 1 ; round
> > -movd  m7, %1
> > +movh  m7, %1
> 
> Same here: until there's an SSE2 version of some of those functions,
> which will require specific code (like a shufps), that's unrelated.

Okay, removed.

> 
> > +cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
> 
> Do you intend to port more?

Yes, when I get the time.

> Because in that case, having an SSE2
> version of this one would be possible, and the wider version would be
> straightforward in SSE2. And that would allow you to at last put a (c)
> on this code you've spent so much time on.

I do intend to write an SSE2 version, but I'd rather port them all first.

> 
> Otherwise looks OK, but I haven't tested.

FATE passes, so pushed.

Timothy


Re: [FFmpeg-devel] [PATCH] x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format

2016-02-14 Thread Christophe Gisquet
Hi,

2016-02-14 6:49 GMT+01:00 Timothy Gu :
>  %if HAVE_MMX_INLINE

Isn't that macro meant for C code (and without much of a purpose in config.asm)?

I suspect it is not useful, but I haven't dug into that.

> +; XXX some of these macros are not used right now, but they will be in
> +; the future when more functions are ported.

I would still recommend only porting the ones needed in this commit,
but I'm not going to bother you further with this.

>  ; Compute the rounder 32-r or 8-r and unpacks it to m7
>  %macro LOAD_ROUNDER_MMX 1 ; round
> -movd  m7, %1
> +movh  m7, %1

Same here: until there's an SSE2 version of some of those functions,
which will require specific code (like a shufps), that's unrelated.
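
Something along these lines should do for the rounder broadcast (untested
sketch, the macro name is hypothetical):

%macro LOAD_ROUNDER_SSE2 1 ; round
    movd      m7, %1    ; rounder in the low word
    punpcklwd m7, m7    ; duplicate it to the low dword
    shufps    m7, m7, 0 ; broadcast that dword to all four lanes
%endmacro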

> +cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h

Do you intend to port more? Because in that case, having an SSE2
version of this one would be possible, and the wider version would be
straightforward in SSE2. And that would allow you to at last put a (c)
on this code you've spent so much time on.

Otherwise looks OK, but I haven't tested.

-- 
Christophe


[FFmpeg-devel] [PATCH] x86/vc1dsp: Port vc1_*_hor_16b_shift2 to NASM format

2016-02-13 Thread Timothy Gu
---
 libavcodec/x86/vc1dsp.asm   | 92 -
 libavcodec/x86/vc1dsp_mmx.c | 61 --
 2 files changed, 99 insertions(+), 54 deletions(-)

diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 91a1991..2e9f067 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -25,6 +25,7 @@
 cextern pw_4
 cextern pw_5
 cextern pw_9
+cextern pw_128
 
 section .text
 
@@ -319,9 +320,47 @@ cglobal vc1_h_loop_filter8, 3,5,8
 RET
 
 %if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will be in
+; the future when more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+pavgb   %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+paddw   m3, m7 ; +bias-r
+paddw   m4, m7 ; +bias-r
+psraw   m3, %1
+psraw   m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+packuswb   m3, m4
+%1  m3, [%2]
+mova  [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+%1  m3, [%2]
+%1  m4, [%2 + mmsize]
+mova  [%2], m3
+mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+punpcklbw   %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
 ; Compute the rounder 32-r or 8-r and unpacks it to m7
 %macro LOAD_ROUNDER_MMX 1 ; round
-movd  m7, %1
+movh  m7, %1
 punpcklwd m7, m7
 punpckldq m7, m7
 %endmacro
@@ -394,6 +433,57 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
 dec i
 jnz .loop
 REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+;  const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can directly be made from
+; memory.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+mov  hq, 8
+sub  srcq, 2
+sub  rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+LOAD_ROUNDER_MMX rndq
+mova   m5, [pw_9]
+mova   m6, [pw_128]
+pxor   m0, m0
+
+.loop:
+mova   m1, [srcq + 2 * 0]
+mova   m2, [srcq + 2 * 0 + mmsize]
+mova   m3, [srcq + 2 * 1]
+mova   m4, [srcq + 2 * 1 + mmsize]
+paddw  m3, [srcq + 2 * 2]
+paddw  m4, [srcq + 2 * 2 + mmsize]
+paddw  m1, [srcq + 2 * 3]
+paddw  m2, [srcq + 2 * 3 + mmsize]
+pmullw m3, m5
+pmullw m4, m5
+psubw  m3, m1
+psubw  m4, m2
+NORMALIZE_MMX  7
+; remove bias
+paddw  m3, m6
+paddw  m4, m6
+TRANSFER_DO_PACK   %1, dstq
+add  srcq, 24
+add  dstq, strideq
+dec  hq
+jnz .loop
+
+RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE
 
 %macro INV_TRANS_INIT 0
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index ff13d9b..8325648 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -38,6 +38,10 @@
 void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+   const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+  const int16_t *src, int rnd);
 
 #define OP_PUT(S,D)
 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -71,55 +75,6 @@ void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
  "punpckldq %%mm7, %%mm7   \n\t"
 
 /**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
- const int16_t *src, int rnd)\
-{\
-int h = 8;\
-\
-src -= 1;\
-rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
-__asm__ volatile(\
-LOAD_ROUNDER_MMX("%4")\
-"movq  "MANGLE(ff_pw_128)", %%mm6\n\t"\
-"movq  "MANGLE(ff_pw_9)", %%mm5 \n\t"\
-"1:\n\t"\
-"movq  2*0+0(%1), %%mm1\n\t"\
-"movq  2*0+8(%1), %%mm2\n\t"\
-"movq  2*1+0(%1), %%mm3\n\t"\
-"movq  2*1+8(%1), %%mm4\n\t"\
-"paddw