---
 libavcodec/x86/h264_intrapred.asm |  129 ++++++------------------------------
 libavcodec/x86/vp8dsp.asm         |   21 ------
 libavutil/x86/x86util.asm         |   41 +++++++++++-
 3 files changed, 59 insertions(+), 132 deletions(-)

diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index 87d32c4..5984454 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -103,15 +103,8 @@ cglobal pred16x16_horizontal, 2,3
 %else
     punpcklbw m0, m0
     punpcklbw m1, m1
-%if cpuflag(mmx2)
-    pshufw    m0, m0, 0xff
-    pshufw    m1, m1, 0xff
-%else
-    punpckhwd m0, m0
-    punpckhwd m1, m1
-    punpckhdq m0, m0
-    punpckhdq m1, m1
-%endif
+    SPLATW    m0, m0, 3
+    SPLATW    m1, m1, 3
     mova [r0+r1*0+8], m0
     mova [r0+r1*1+8], m1
 %endif
@@ -162,18 +155,8 @@ cglobal pred16x16_dc, 2,7
     shr       r2d, 5
 %if cpuflag(ssse3)
     pxor       m1, m1
-    movd       m0, r2d
-    pshufb     m0, m1
-%elif cpuflag(sse2)
-    movd       m0, r2d
-    punpcklbw  m0, m0
-    pshuflw    m0, m0, 0
-    punpcklqdq m0, m0
-%elif cpuflag(mmx2)
-    movd       m0, r2d
-    punpcklbw  m0, m0
-    pshufw     m0, m0, 0
 %endif
+    SPLATB_REG m0, r2, m1          ; macro pastes "d" onto its 2nd arg (r2 -> r2d); r2d would become r2dd
 
 %if mmsize==8
     mov       r3d, 8
@@ -227,12 +210,7 @@ cglobal pred16x16_tm_vp8, 2,5
     movzx     r2d, byte [r0+r1-1]
     sub       r2d, r3d
     movd      mm4, r2d
-%if cpuflag(mmx2)
-    pshufw    mm4, mm4, 0
-%else
-    punpcklwd mm4, mm4
-    punpckldq mm4, mm4
-%endif
+    SPLATW    mm4, mm4, 0
     movq      mm5, mm4
     movq      mm6, mm4
     movq      mm7, mm4
@@ -332,19 +310,15 @@ cglobal pred16x16_plane_%1, 2,9,7
     movhlps      m1, m0
 %endif
     paddw        m0, m1
-%if cpuflag(sse2)
-    pshuflw      m1, m0, 0xE
-%elif cpuflag(mmx2)
-    pshufw       m1, m0, 0xE
+%if cpuflag(mmx2)
+    PSHUFLW      m1, m0, 0xE
 %elif cpuflag(mmx)
     mova         m1, m0
     psrlq        m1, 32
 %endif
     paddw        m0, m1
-%if cpuflag(sse2)
-    pshuflw      m1, m0, 0x1
-%elif cpuflag(mmx2)
-    pshufw       m1, m0, 0x1
+%if cpuflag(mmx2)
+    PSHUFLW      m1, m0, 0x1
 %elif cpuflag(mmx)
     mova         m1, m0
     psrlq        m1, 16
@@ -483,25 +457,9 @@ cglobal pred16x16_plane_%1, 2,9,7
 
     movd         m1, r5d
     movd         m3, r3d
-%if cpuflag(sse2)
-    pshuflw      m0, m0, 0x0
-    pshuflw      m1, m1, 0x0
-    pshuflw      m3, m3, 0x0
-    punpcklqdq   m0, m0           ; splat H (words)
-    punpcklqdq   m1, m1           ; splat V (words)
-    punpcklqdq   m3, m3           ; splat a (words)
-%elif cpuflag(mmx2)
-    pshufw       m0, m0, 0x0
-    pshufw       m1, m1, 0x0
-    pshufw       m3, m3, 0x0
-%elif cpuflag(mmx)
-    punpcklwd    m0, m0
-    punpcklwd    m1, m1
-    punpcklwd    m3, m3
-    punpckldq    m0, m0
-    punpckldq    m1, m1
-    punpckldq    m3, m3
-%endif
+    SPLATW       m0, m0, 0        ; H
+    SPLATW       m1, m1, 0        ; V
+    SPLATW       m3, m3, 0        ; a
 %ifidn %1, svq3
     SWAP          0, 1
 %endif
@@ -626,10 +584,8 @@ cglobal pred8x8_plane, 2,9,7
     paddw        m0, m1
 
 %if notcpuflag(ssse3)
-%if cpuflag(sse2) ; mmsize == 16
-    pshuflw      m1, m0, 0xE
-%elif cpuflag(mmx2)
-    pshufw       m1, m0, 0xE
+%if cpuflag(mmx2)
+    PSHUFLW      m1, m0, 0xE
 %elif cpuflag(mmx)
     mova         m1, m0
     psrlq        m1, 32
@@ -637,10 +593,8 @@ cglobal pred8x8_plane, 2,9,7
     paddw        m0, m1
 %endif ; !ssse3
 
-%if cpuflag(sse2)
-    pshuflw      m1, m0, 0x1
-%elif cpuflag(mmx2)
-    pshufw       m1, m0, 0x1
+%if cpuflag(mmx2)
+    PSHUFLW      m1, m0, 0x1
 %elif cpuflag(mmx)
     mova         m1, m0
     psrlq        m1, 16
@@ -711,25 +665,9 @@ cglobal pred8x8_plane, 2,9,7
 
     movd         m1, r5d
     movd         m3, r3d
-%if cpuflag(sse2)
-    pshuflw      m0, m0, 0x0
-    pshuflw      m1, m1, 0x0
-    pshuflw      m3, m3, 0x0
-    punpcklqdq   m0, m0           ; splat H (words)
-    punpcklqdq   m1, m1           ; splat V (words)
-    punpcklqdq   m3, m3           ; splat a (words)
-%elif cpuflag(mmx2)
-    pshufw       m0, m0, 0x0
-    pshufw       m1, m1, 0x0
-    pshufw       m3, m3, 0x0
-%elif cpuflag(mmx)
-    punpcklwd    m0, m0
-    punpcklwd    m1, m1
-    punpcklwd    m3, m3
-    punpckldq    m0, m0
-    punpckldq    m1, m1
-    punpckldq    m3, m3
-%endif
+    SPLATW       m0, m0, 0        ; H
+    SPLATW       m1, m1, 0        ; V
+    SPLATW       m3, m3, 0        ; a
 %if mmsize == 8
     mova         m2, m0
 %endif
@@ -815,24 +753,8 @@ cglobal pred8x8_horizontal, 2,3
     mova      m2, [pb_3]
 %endif
 .loop:
-    movd      m0, [r0+r1*0-4]
-    movd      m1, [r0+r1*1-4]
-%if cpuflag(ssse3)
-    pshufb    m0, m2
-    pshufb    m1, m2
-%else
-    punpcklbw m0, m0
-    punpcklbw m1, m1
-%if cpuflag(mmx2)
-    pshufw    m0, m0, 0xff
-    pshufw    m1, m1, 0xff
-%else
-    punpckhwd m0, m0
-    punpckhwd m1, m1
-    punpckhdq m0, m0
-    punpckhdq m1, m1
-%endif
-%endif
+    SPLATB_LOAD m0, r0+r1*0-1, m2
+    SPLATB_LOAD m1, r0+r1*1-1, m2
     mova [r0+r1*0], m0
     mova [r0+r1*1], m1
     lea       r0, [r0+r1*2]
@@ -1000,15 +922,8 @@ cglobal pred8x8_tm_vp8, 2,6
     sub       r3d, r4d
     movd      mm2, r2d
     movd      mm4, r3d
-%if cpuflag(mmx2)
-    pshufw    mm2, mm2, 0
-    pshufw    mm4, mm4, 0
-%else
-    punpcklwd mm2, mm2
-    punpcklwd mm4, mm4
-    punpckldq mm2, mm2
-    punpckldq mm4, mm4
-%endif
+    SPLATW    mm2, mm2, 0
+    SPLATW    mm4, mm4, 0
     movq      mm3, mm2
     movq      mm5, mm4
     paddw     mm2, mm0
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 82f21fe..531b205 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1465,27 +1465,6 @@ VP8_DC_WHT
 %endif
 %endmacro
 
-%macro SPLATB_REG 2-3
-%if cpuflag(ssse3)
-    movd           %1, %2d
-    pshufb         %1, %3
-%elif cpuflag(sse2)
-    movd           %1, %2d
-    punpcklbw      %1, %1
-    pshuflw        %1, %1, 0x0
-    punpcklqdq     %1, %1
-%elif cpuflag(mmx2)
-    movd           %1, %2d
-    punpcklbw      %1, %1
-    pshufw         %1, %1, 0x0
-%else
-    movd           %1, %2d
-    punpcklbw      %1, %1
-    punpcklwd      %1, %1
-    punpckldq      %1, %1
-%endif
-%endmacro
-
 %macro SIMPLE_LOOPFILTER 2
 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 %if mmsize == 8 ; mmx/mmxext
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 066384b..941ec76 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -256,15 +256,26 @@
 %define ABSB ABSB_MMX
 %define ABSB2 ABSB2_MMX
 
-%macro SPLATB_MMX 3
+%macro SPLATB_LOAD 3 ; dst, address of the byte to splat, pshufb mask (only read on ssse3)
+%if cpuflag(ssse3)
+    movd      %1, [%2-3] ; dword load whose last byte is the one at %2
+    pshufb    %1, %3     ; byte selection is entirely up to the caller's mask
+%else
     movd      %1, [%2-3] ;to avoid crossing a cacheline
     punpcklbw %1, %1
     SPLATW    %1, %1, 3
+%endif
 %endmacro
 
-%macro SPLATB_SSSE3 3
-    movd      %1, [%2-3]
+%macro SPLATB_REG 3 ; dst, gpr name WITHOUT size suffix, pshufb mask (only read on ssse3)
+%if cpuflag(ssse3)
+    movd      %1, %2d ; "d" is pasted onto %2 here, so pass e.g. r2, not r2d
     pshufb    %1, %3
+%else
+    movd      %1, %2d ; same suffix pasting as the ssse3 branch
+    punpcklbw %1, %1  ; byte 0 duplicated into both halves of word 0
+    SPLATW    %1, %1, 0 ; then word 0 is broadcast across the register
+%endif
 %endmacro
 
 %macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp
@@ -296,6 +307,14 @@
 %endif
 %endmacro
 
+%macro PSHUFLW 1+ ; operand list is forwarded verbatim (callers pass dst, src, imm8)
+    %if mmsize == 8
+        pshufw %1 ; 8-byte registers: use the MMX-register word shuffle
+    %else
+        pshuflw %1 ; otherwise: use the low-quadword word shuffle
+    %endif
+%endmacro
+
 %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
 %ifnum %5
     pand   m%3, m%5, m%4 ; src .. y6 .. y4
@@ -521,8 +540,22 @@
 %if mmsize == 16
     pshuflw    %1, %2, (%3)*0x55
     punpcklqdq %1, %1
-%else
+%elif cpuflag(mmx2)
     pshufw     %1, %2, (%3)*0x55
+%else
+    %ifnidn %1, %2
+        mova       %1, %2 ; the unpacks below operate in place on %1
+    %endif
+    %if %3 & 2
+        punpckhwd  %1, %1 ; index 2 or 3: w0 w1 w2 w3 -> w2 w2 w3 w3
+    %else
+        punpcklwd  %1, %1 ; index 0 or 1: w0 w1 w2 w3 -> w0 w0 w1 w1
+    %endif
+    %if %3 & 1
+        punpckhwd  %1, %1 ; odd index: upper pair is the wanted word, spread it to all four
+    %else
+        punpcklwd  %1, %1 ; even index: lower pair is the wanted word, spread it to all four
+    %endif
 %endif
 %endmacro
 
-- 
1.7.4.1

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to