1.01x faster (1069±1.9 vs. 1060±0.7 decicycles) compared with sse2
---
 libavcodec/x86/h264_idct.asm  | 5 +++++
 libavcodec/x86/h264dsp_init.c | 2 ++
 2 files changed, 7 insertions(+)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 24fb4d2..ca8ffdb 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -1162,3 +1162,8 @@ cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ movsxdifnidn stride_q, stride_d IDCT4_ADD dst_q, block_q, stride_q RET + +cglobal h264_idct8_add_8, 3, 4, 10, dst_, block_, stride_ + movsxdifnidn stride_q, stride_d + IDCT8_ADD_SSE dst_q, block_q, stride_q, r3 +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 8ba085f..2172a71 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -40,6 +40,7 @@ IDCT_ADD_FUNC(8_dc, 8, mmxext) IDCT_ADD_FUNC(8_dc, 10, sse2) IDCT_ADD_FUNC(8, 8, mmx) IDCT_ADD_FUNC(8, 8, sse2) +IDCT_ADD_FUNC(8, 8, avx) IDCT_ADD_FUNC(8, 10, sse2) IDCT_ADD_FUNC(, 10, avx) IDCT_ADD_FUNC(8_dc, 10, avx) @@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, } c->h264_idct_add = ff_h264_idct_add_8_avx; + c->h264_idct8_add = ff_h264_idct8_add_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel