On 6/11/2016 5:55 PM, Martin Storsjö wrote: > On Sat, 11 Jun 2016, James Almer wrote: > >> On 6/11/2016 5:32 PM, Martin Storsjö wrote: >>> --- >>> libavcodec/x86/h264_idct.asm | 5 +++++ >>> libavcodec/x86/h264_idct_10bit.asm | 4 ++++ >>> 2 files changed, 9 insertions(+) >>> >>> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm >>> index 313791a..9abed3c 100644 >>> --- a/libavcodec/x86/h264_idct.asm >>> +++ b/libavcodec/x86/h264_idct.asm >>> @@ -82,6 +82,7 @@ SECTION .text >>> INIT_MMX mmx >>> ; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride) >>> cglobal h264_idct_add_8, 3, 3, 0 >>> + movsxd r2, r2d >>> IDCT4_ADD r0, r1, r2 >>> RET >>> >>> @@ -204,6 +205,7 @@ cglobal h264_idct_add_8, 3, 3, 0 >>> INIT_MMX mmx >>> ; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride) >>> cglobal h264_idct8_add_8, 3, 4, 0 >>> + movsxd r2, r2d >>> %assign pad 128+4-(stack_offset&7) >>> SUB rsp, pad >>> >>> @@ -272,6 +274,7 @@ cglobal h264_idct8_add_8, 3, 4, 0 >>> INIT_XMM sse2 >>> ; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride) >>> cglobal h264_idct8_add_8, 3, 4, 10 >>> + movsxd r2, r2d >>> IDCT8_ADD_SSE r0, r1, r2, r3 >>> RET >>> >>> @@ -310,6 +313,7 @@ INIT_MMX mmxext >>> ; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int >>> stride) >>> %if ARCH_X86_64 >>> cglobal h264_idct_dc_add_8, 3, 4, 0 >>> + movsxd r2, r2d >>> movsx r3, word [r1] >>> mov dword [r1], 0 >>> DC_ADD_MMXEXT_INIT r3, r2 >>> @@ -318,6 +322,7 @@ cglobal h264_idct_dc_add_8, 3, 4, 0 >>> >>> ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int >>> stride) >>> cglobal h264_idct8_dc_add_8, 3, 4, 0 >>> + movsxd r2, r2d >>> movsx r3, word [r1] >>> mov dword [r1], 0 >>> DC_ADD_MMXEXT_INIT r3, r2 >>> diff --git a/libavcodec/x86/h264_idct_10bit.asm >>> b/libavcodec/x86/h264_idct_10bit.asm >>> index b7d5105..a5bfb34 100644 >>> --- a/libavcodec/x86/h264_idct_10bit.asm >>> +++ b/libavcodec/x86/h264_idct_10bit.asm >>> @@ -77,6 +77,7 @@ SECTION .text >>> >>> %macro IDCT_ADD_10 0 >>> cglobal h264_idct_add_10, 3,3 >>> + movsxd r2, r2d >>> IDCT4_ADD_10 r0, r1, r2 >>> RET >>> %endmacro >>> @@ -190,6 +191,7 @@ IDCT_ADD16_10 >>> >>> INIT_MMX mmxext >>> cglobal h264_idct_dc_add_10,3,3 >>> + movsxd r2, r2d >>> movd m0, [r1] >>> mov dword [r1], 0 >>> paddd m0, [pd_32] >>> @@ -205,6 +207,7 @@ cglobal h264_idct_dc_add_10,3,3 >>> >>> ;----------------------------------------------------------------------------- >>> %macro IDCT8_DC_ADD 0 >>> cglobal h264_idct8_dc_add_10,3,4,7 >>> + movsxd r2, r2d >>> movd m0, [r1] >>> mov dword[r1], 0 >>> paddd m0, [pd_32] >>> @@ -438,6 +441,7 @@ IDCT_ADD8 >>> >>> %macro IDCT8_ADD 0 >>> cglobal h264_idct8_add_10, 3,4,16 >>> + movsxd r2, r2d >>> %if UNIX64 == 0 >>> %assign pad 16-gprsize-(stack_offset&15) >>> sub rsp, pad >>> >> >> Use the movsxdifnidn macro, to avoid emitting a movsxd on x86_32. > > Thanks, I can change that. > > Some of the codepaths are within an %if ARCH_X86_64 (like > ff_h264_idct_dc_add_8_mmxext); is it still preferred to use that macro form > there, or an explicit movsxd?
I don't have a preference. movsxd would make it clear to whoever reads the code that it's an x86_64 codepath whereas using movsxdifnidn everywhere is more consistent in a way. It's mostly a cosmetic choice, i guess. > > // Martin > _______________________________________________ > libav-devel mailing list > [email protected] > https://lists.libav.org/mailman/listinfo/libav-devel _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
