1.12x faster than mmxext (mmxext: 638±12.7 vs. avx: 568±4.3 decicycles) --- libavcodec/x86/h264_idct.asm | 11 +++++++++++ libavcodec/x86/h264dsp_init.c | 2 ++ 2 files changed, 13 insertions(+)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index c4b6e55..a74e095 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -1188,3 +1188,14 @@ cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ DC_ADD_INIT r3 DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 RET + +; Not any faster +cglobal h264_idct8_dc_add_8, 3, 4, 0 + movsxdifnidn stride_q, stride_d + movsx r3d, word [block_q] + mov dword [block_q], 0 + DC_ADD_INIT r3 + DC_ADD_MMXEXT_OP movq, dst_q, stride_q, r3 + lea dst_q, [dst_q + stride_q*4] + DC_ADD_MMXEXT_OP movq, dst_q, stride_q, r3 +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 1aa66a8..de7becf 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -38,6 +38,7 @@ IDCT_ADD_FUNC(_dc, 8, mmxext) IDCT_ADD_FUNC(_dc, 8, avx) IDCT_ADD_FUNC(_dc, 10, mmxext) IDCT_ADD_FUNC(8_dc, 8, mmxext) +IDCT_ADD_FUNC(8_dc, 8, avx) IDCT_ADD_FUNC(8_dc, 10, sse2) IDCT_ADD_FUNC(8, 8, mmx) IDCT_ADD_FUNC(8, 8, sse2) @@ -344,6 +345,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_idct_add = ff_h264_idct_add_8_avx; c->h264_idct8_add = ff_h264_idct8_add_8_avx; c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel