The original macro has been split into three separate macros because the
variants share only a few lines of code; splitting them makes the code simpler
and easier to understand.
---
libavcodec/x86/dsputil_mmx.c | 12 +++---
libavcodec/x86/dsputil_yasm.asm | 85 +++++++++++++++++++++++---------------
2 files changed, 57 insertions(+), 40 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index ed80ab6..8be2c01 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2396,13 +2396,13 @@ int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, con
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3, int order, int mul);
-void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_mmx2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_ba_mmx2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_ba_sse2 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
@@ -2852,9 +2852,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
c->scalarproduct_and_madd_int16 =
ff_scalarproduct_and_madd_int16_mmx2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
- c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+ c->apply_window_int16 = ff_apply_window_int16_ba_mmx2;
} else {
- c->apply_window_int16 = ff_apply_window_int16_mmxext;
+ c->apply_window_int16 = ff_apply_window_int16_mmx2;
}
#endif
}
@@ -2888,7 +2888,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
- c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+ c->apply_window_int16 = ff_apply_window_int16_ba_sse2;
} else {
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 217a1ac..c07c1f3 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -241,20 +241,15 @@ SCALARPRODUCT_LOOP 0
pmulhrsw %1, %2
%endmacro
-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+; This version expands 16-bit to 32-bit, multiplies by the window,
+; adds 16384 for rounding, right shifts 15, then repacks back to words to
+; save to the output. The window is reversed for the second half.
+; This is bit-identical to the C version.
+%macro APPLY_WINDOW_INT16_BA 0
+cglobal apply_window_int16_ba, 4,5,6, output, input, window, offset, offset2
lea offset2q, [offsetq-mmsize]
-%if %2
mova m5, [pd_16384]
-%elifidn %1, ssse3
- mova m5, [pb_revwords]
- ALIGN 16
-%endif
.loop:
-%if %2
- ; This version expands 16-bit to 32-bit, multiplies by the window,
- ; adds 16384 for rounding, right shifts 15, then repacks back to words to
- ; save to the output. The window is reversed for the second half.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
pxor m0, m0
@@ -287,20 +282,19 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
-%elif %3
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The ssse3 version is bit-identical.
- mova m0, [windowq+offset2q]
- mova m1, [ inputq+offset2q]
- pmulhrsw m1, m0
- REVERSE_WORDS m0, m5
- pmulhrsw m0, [ inputq+offsetq ]
- mova [outputq+offset2q], m1
- mova [outputq+offsetq ], m0
-%else
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
- ; therefore are not bit-identical to the C version.
+ add offsetd, mmsize
+ sub offset2d, mmsize
+ jae .loop
+ REP_RET
+%endmacro
+
+; This version does the 16x16->16 multiplication in-place without expanding
+; to 32-bit. It does not use rounding, and therefore is not bit-identical
+; to the C version.
+%macro APPLY_WINDOW_INT16 0
+cglobal apply_window_int16, 4,5,4, output, input, window, offset, offset2
+ lea offset2q, [offsetq-mmsize]
+.loop:
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
@@ -309,28 +303,51 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
-%endif
add offsetd, mmsize
sub offset2d, mmsize
jae .loop
REP_RET
%endmacro
-INIT_MMX
+; This version does the 16x16->16 multiplication in-place without expanding
+; to 32-bit. It is bit-identical to the C version.
+%macro APPLY_WINDOW_INT16_SSSE3 0
+cglobal apply_window_int16, 4,5,3, output, input, window, offset, offset2
+ lea offset2q, [offsetq-mmsize]
+ mova m2, [pb_revwords]
+ ALIGN 16
+.loop:
+ mova m0, [windowq+offset2q]
+ mova m1, [ inputq+offset2q]
+ pmulhrsw m1, m0
+ REVERSE_WORDS m0, m2
+ pmulhrsw m0, [ inputq+offsetq ]
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m0
+ add offsetd, mmsize
+ sub offset2d, mmsize
+ jae .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx2
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
-APPLY_WINDOW_INT16 mmxext, 0, 0
-APPLY_WINDOW_INT16 mmxext_ba, 1, 0
-INIT_XMM
+APPLY_WINDOW_INT16
+APPLY_WINDOW_INT16_BA
+INIT_XMM sse2
%define REVERSE_WORDS REVERSE_WORDS_SSE2
-APPLY_WINDOW_INT16 sse2, 0, 0
-APPLY_WINDOW_INT16 sse2_ba, 1, 0
-APPLY_WINDOW_INT16 ssse3_atom, 0, 1
+APPLY_WINDOW_INT16
+APPLY_WINDOW_INT16_BA
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16_SSSE3
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
-APPLY_WINDOW_INT16 ssse3, 0, 1
+INIT_XMM ssse3
+APPLY_WINDOW_INT16_SSSE3
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
+INIT_CPUFLAGS
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
--
1.7.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel