The original macro has been split into three separate macros, since they share
only a few lines of code; the split makes the code simpler and easier to
understand.
---
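A rough C model of what these kernels compute, for reference while reviewing
(the helper name and the bitexact flag below are illustrative only, not the
actual dsputil.c code): the _ba variants add 1<<14 before the >>15 shift and
stay bit-exact with C, while the plain mmx2/sse2 variants truncate, rebuilding
(a*b)>>15 from the pmulhw/pmullw halves as described in the MUL16FIXED comment.

/* illustrative sketch only, not part of the patch */
static void apply_window_int16_model(int16_t *output, const int16_t *input,
                                     const int16_t *window, unsigned int len,
                                     int bitexact)
{
    unsigned int i, len2 = len >> 1;
    for (i = 0; i < len2; i++) {
        /* the same window sample is used for both halves; the asm gets this
         * by reversing the window words for the second half */
        int w  = window[i];
        int lo = input[i]           * w;
        int hi = input[len - i - 1] * w;
        if (bitexact) {             /* _ba: round to nearest */
            lo += 1 << 14;
            hi += 1 << 14;
        }
        output[i]           = lo >> 15;
        output[len - i - 1] = hi >> 15;
    }
}
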
 libavcodec/x86/dsputil_mmx.c    |   12 ++--
 libavcodec/x86/dsputil_yasm.asm |  114 +++++++++++++++++++++------------------
 2 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index be0ac2e..0fba8ed 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2396,13 +2396,13 @@ int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, con
 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 
-void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_mmx2      (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_ba_mmx2   (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_ba_sse2   (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
@@ -2850,9 +2850,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
             if (avctx->flags & CODEC_FLAG_BITEXACT) {
-                c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+                c->apply_window_int16 = ff_apply_window_int16_ba_mmx2;
             } else {
-                c->apply_window_int16 = ff_apply_window_int16_mmxext;
+                c->apply_window_int16 = ff_apply_window_int16_mmx2;
             }
 #endif
         }
@@ -2886,7 +2886,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
             }
             if (avctx->flags & CODEC_FLAG_BITEXACT) {
-                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+                c->apply_window_int16 = ff_apply_window_int16_ba_sse2;
             } else {
                 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
                     c->apply_window_int16 = ff_apply_window_int16_sse2;
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 373e430..9801573 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -210,24 +210,20 @@ SCALARPRODUCT_LOOP 0
 ;                            const int16_t *window, unsigned int len)
 ;-----------------------------------------------------------------------------
 
-%macro REVERSE_WORDS_MMXEXT 1-2
-    pshufw   %1, %1, 0x1B
-%endmacro
-
-%macro REVERSE_WORDS_SSE2 1-2
+%macro REVERSE_WORDS 1
+%if sizeof%1 == 16
     pshuflw  %1, %1, 0x1B
     pshufhw  %1, %1, 0x1B
     pshufd   %1, %1, 0x4E
-%endmacro
-
-%macro REVERSE_WORDS_SSSE3 2
-    pshufb  %1, %2
+%else
+    pshufw   %1, %1, 0x1B
+%endif
 %endmacro
 
 ; dst = (dst * src) >> 15
 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
 ; in from the pmullw result.
-%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
+%macro MUL16FIXED 3 ; dst, src, temp
     mova    %3, %1
     pmulhw  %1, %2
     pmullw  %3, %2
@@ -236,25 +232,15 @@ SCALARPRODUCT_LOOP 0
     por     %1, %3
 %endmacro
 
-; dst = ((dst * src) + (1<<14)) >> 15
-%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
-    pmulhrsw   %1, %2
-%endmacro
-
-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+; This version expands 16-bit to 32-bit, multiplies by the window,
+; adds 16384 for rounding, right shifts 15, then repacks back to words to
+; save to the output. The window is reversed for the second half.
+; This is bit-identical to the C version.
+%macro APPLY_WINDOW_INT16_BA 0
+cglobal apply_window_int16_ba, 4,5,6, output, input, window, offset, offset2
     lea     offset2q, [offsetq-mmsize]
-%if %2
     mova          m5, [pd_16384]
-%elifidn %1, ssse3
-    mova          m5, [pb_revwords]
-    ALIGN 16
-%endif
 .loop:
-%if %2
-    ; This version expands 16-bit to 32-bit, multiplies by the window,
-    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
-    ; save to the output. The window is reversed for the second half.
     mova          m3, [windowq+offset2q]
     mova          m4, [ inputq+offset2q]
     pxor          m0, m0
@@ -287,20 +273,19 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
     psrad         m2, 15
     packssdw      m0, m2
     mova  [outputq+offsetq], m0
-%elif %3
-    ; This version does the 16x16->16 multiplication in-place without expanding
-    ; to 32-bit. The ssse3 version is bit-identical.
-    mova          m0, [windowq+offset2q]
-    mova          m1, [ inputq+offset2q]
-    pmulhrsw      m1, m0
-    REVERSE_WORDS m0, m5
-    pmulhrsw      m0, [ inputq+offsetq ]
-    mova  [outputq+offset2q], m1
-    mova  [outputq+offsetq ], m0
-%else
-    ; This version does the 16x16->16 multiplication in-place without expanding
-    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
-    ; therefore are not bit-identical to the C version.
+    add      offsetd, mmsize
+    sub     offset2d, mmsize
+    jae .loop
+    REP_RET
+%endmacro
+
+; This version does the 16x16->16 multiplication in-place without expanding
+; to 32-bit. It does not use rounding, and therefore is not bit-identical
+; to the C version.
+%macro APPLY_WINDOW_INT16 0
+cglobal apply_window_int16, 4,5,4, output, input, window, offset, offset2
+    lea     offset2q, [offsetq-mmsize]
+.loop:
     mova          m0, [windowq+offset2q]
     mova          m1, [ inputq+offset2q]
     mova          m2, [ inputq+offsetq ]
@@ -309,28 +294,53 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
     MUL16FIXED    m2, m0, m3
     mova  [outputq+offset2q], m1
     mova  [outputq+offsetq ], m2
+    add      offsetd, mmsize
+    sub     offset2d, mmsize
+    jae .loop
+    REP_RET
+%endmacro
+
+; This version does the 16x16->16 multiplication in-place without expanding
+; to 32-bit. It is bit-identical to the C version.
+%macro APPLY_WINDOW_INT16_SSSE3 0
+cglobal apply_window_int16, 4,5,3, output, input, window, offset, offset2
+    lea     offset2q, [offsetq-mmsize]
+%if notcpuflag(atom)
+    mova          m2, [pb_revwords]
+%endif
+    ALIGN 16
+.loop:
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    pmulhrsw      m1, m0
+%if cpuflag(atom)
+    REVERSE_WORDS m0
+%else
+    pshufb        m0, m2
 %endif
+    pmulhrsw      m0, [ inputq+offsetq ]
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m0
     add      offsetd, mmsize
     sub     offset2d, mmsize
     jae .loop
     REP_RET
 %endmacro
 
-INIT_MMX
-%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
-%define MUL16FIXED MUL16FIXED_MMXEXT
-APPLY_WINDOW_INT16 mmxext,     0, 0
-APPLY_WINDOW_INT16 mmxext_ba,  1, 0
-INIT_XMM
-%define REVERSE_WORDS REVERSE_WORDS_SSE2
-APPLY_WINDOW_INT16 sse2,       0, 0
-APPLY_WINDOW_INT16 sse2_ba,    1, 0
-APPLY_WINDOW_INT16 ssse3_atom, 0, 1
-%define REVERSE_WORDS REVERSE_WORDS_SSSE3
-APPLY_WINDOW_INT16 ssse3,      0, 1
+INIT_MMX mmx2
+APPLY_WINDOW_INT16
+APPLY_WINDOW_INT16_BA
+INIT_XMM sse2
+APPLY_WINDOW_INT16
+APPLY_WINDOW_INT16_BA
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16_SSSE3
+INIT_XMM ssse3
+APPLY_WINDOW_INT16_SSSE3
 
 
 ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
+INIT_CPUFLAGS
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
     movq    mm0, [topq]
     movq    mm2, mm0
-- 
1.7.1
