---
libavcodec/x86/vc1dsp.asm | 92 -
libavcodec/x86/vc1dsp_mmx.c | 61 --
2 files changed, 99 insertions(+), 54 deletions(-)
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 91a1991..2e9f067 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -25,6 +25,7 @@
cextern pw_4
cextern pw_5
cextern pw_9
+cextern pw_128
section .text
@@ -319,9 +320,47 @@ cglobal vc1_h_loop_filter8, 3,5,8
RET
%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will be
+; once more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+    pavgb %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+    paddw m3, m7 ; +bias-r
+    paddw m4, m7 ; +bias-r
+    psraw m3, %1
+    psraw m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+    packuswb m3, m4
+    %1       m3, [%2]
+    mova     [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+    %1   m3, [%2]
+    %1   m4, [%2 + mmsize]
+    mova [%2], m3
+    mova [%2 + mmsize], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+    punpcklbw %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
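
For reference, here is a rough scalar C model of what these per-lane helpers do (illustrative names, not part of the patch): DO_UNPACK zero-extends bytes to words via punpcklbw against a zeroed m0, TRANSFER_DO_PACK narrows back with unsigned saturation via packuswb, and OP_AVG's pavgb is a rounding byte average.

#include <stdint.h>

/* punpcklbw %1, m0 (with m0 == 0): widen an unsigned byte to a word. */
static inline int16_t unpack_u8(uint8_t b) { return (int16_t)b; }

/* packuswb: narrow a signed word to a byte, clamping to [0, 255]. */
static inline uint8_t pack_sat_u8(int16_t w)
{
    return w < 0 ? 0 : w > 255 ? 255 : (uint8_t)w;
}

/* pavgb: rounding average of two unsigned bytes, (a + b + 1) >> 1. */
static inline uint8_t avg_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}
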
+
; Compute the rounder 32-r or 8-r and unpack it to m7
%macro LOAD_ROUNDER_MMX 1 ; round
-movd m7, %1
+movh m7, %1
punpcklwd m7, m7
punpckldq m7, m7
%endmacro
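
A scalar model of the broadcast this macro performs (illustrative only, not part of the patch): after movh puts the rounder in the low word, the two punpck steps replicate it across all four word lanes.

#include <stdint.h>

/* LOAD_ROUNDER_MMX, per-register view (illustrative only). */
static inline uint64_t broadcast_rounder(uint16_t r)
{
    uint64_t v = r;    /* movh      m7, %1 -> low word = r        */
    v |= v << 16;      /* punpcklwd m7, m7 -> r in both low words */
    v |= v << 32;      /* punpckldq m7, m7 -> r in all four words */
    return v;
}
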
@@ -394,6 +433,57 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
dec i
jnz .loop
REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+;                                  const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can be performed directly
+; on memory operands.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+    mov     hq, 8
+    sub   srcq, 2
+    sub   rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+    LOAD_ROUNDER_MMX rndq
+    mova    m5, [pw_9]
+    mova    m6, [pw_128]
+    pxor    m0, m0
+
+.loop:
+    mova    m1, [srcq + 2 * 0]
+    mova    m2, [srcq + 2 * 0 + mmsize]
+    mova    m3, [srcq + 2 * 1]
+    mova    m4, [srcq + 2 * 1 + mmsize]
+    paddw   m3, [srcq + 2 * 2]
+    paddw   m4, [srcq + 2 * 2 + mmsize]
+    paddw   m1, [srcq + 2 * 3]
+    paddw   m2, [srcq + 2 * 3 + mmsize]
+    pmullw  m3, m5
+    pmullw  m4, m5
+    psubw   m3, m1
+    psubw   m4, m2
+    NORMALIZE_MMX 7
+    ; remove bias
+    paddw   m3, m6
+    paddw   m4, m6
+    TRANSFER_DO_PACK %1, dstq
+    add   srcq, 24
+    add   dstq, strideq
+    dec     hq
+    jnz .loop
+
+    RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
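
To cross-check the asm, here is my rough scalar C sketch of the put variant (illustrative only, not part of the patch; it assumes rnd already holds the 64-r style rounder the callers pass in). The -1024 bias folded into rnd and the +128 (pw_128) correction cancel out after the 7-bit shift; they only keep the 16-bit intermediates inside the range packuswb expects, so the sketch omits them.

#include <stddef.h>
#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* Scalar sketch of vc1_put_hor_16b_shift2 (illustrative only). */
static void put_hor_16b_shift2_c(uint8_t *dst, ptrdiff_t stride,
                                 const int16_t *src, int rnd)
{
    int x, y;

    src -= 1;                     /* mirrors "sub srcq, 2" (one word) */
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            /* (-1, 9, 9, -1) second-pass filter, 7-bit normalization */
            int v = 9 * (src[x + 1] + src[x + 2])
                    - (src[x] + src[x + 3]) + rnd;
            dst[x] = av_clip_uint8(v >> 7);
        }
        src += 12;                /* "add srcq, 24": 12 words per row */
        dst += stride;
    }
}
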
%endif ; HAVE_MMX_INLINE
%macro INV_TRANS_INIT 0
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index ff13d9b..8325648 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -38,6 +38,10 @@
void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift);
+void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+                                   const int16_t *src, int rnd);
+void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
+                                      const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -71,55 +75,6 @@ void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
"punpckldq %%mm7, %%mm7 \n\t"
-/**
- * Data is already unpacked, so some operations can directly be made from
- * memory.
- */
-#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
-                                             const int16_t *src, int rnd)\
-{\
-    int h = 8;\
-\
-    src -= 1;\
-    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
-    __asm__ volatile(\
-        LOAD_ROUNDER_MMX("%4")\
-        "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
-        "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
-        "1:\n\t"\
-        "movq      2*0+0(%1), %%mm1\n\t"\
-        "movq      2*0+8(%1), %%mm2\n\t"\
-        "movq      2*1+0(%1), %%mm3\n\t"\
-        "movq      2*1+8(%1), %%mm4\n\t"\
-        "paddw