Broken: fails FATE.

1.02x faster (1580±4.8 vs. 1555±3.9 decicycles) compared with sse2.
---
 libavcodec/x86/h264_idct.asm  | 43 +++++++++++++++++++++++++++++++++++++++++--
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index 1515ea5..16998dc 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -895,7 +895,7 @@ REP_RET %else add r0, r0m %endif - call h264_add8x4_idct_sse2 + call h264_add8x4_idct_ %+ cpuname jmp %%skip %%trydc: movsx r0, word [r2 ] @@ -907,13 +907,15 @@ REP_RET %else add r0, r0m %endif - call h264_idct_dc_add8_mmxext + call h264_idct_dc_add8_ %+ cpuname %%skip: %if %1 < 7 add r2, 64 %endif %endmacro +%define h264_idct_dc_add8_sse2 h264_idct_dc_add8_mmxext + ; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset, ; int16_t *block, int stride, ; const uint8_t nnzc[6 * 8]) @@ -1193,6 +1195,27 @@ ret packuswb m1, m1 %endmacro +ALIGN 16 +; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered +; FIXME: I produce incorrect output +h264_idct_dc_add8_avx: + movsxdifnidn r3, r3d + movd m0, [r2 ] ; 0 0 X D + mov word [r2+ 0], 0 + punpcklwd m0, [r2+32] ; x X d D + mov word [r2+32], 0 + paddsw m0, [pw_32] + psraw m0, 6 + punpcklwd m0, m0 ; d d D D + pxor m1, m1 ; 0 0 0 0 + psubw m1, m0 ; -d-d-D-D + packuswb m0, m1 ; -d-d-D-D d d D D + pshuflw m1, m0, q3322 ; -d-d-d-d-D-D-D-D + punpcklwd m0, m0 ; d d d d D D D D + lea r6, [r3*3] + DC_ADD_MMXEXT_OP movq, r0, r3, r6 +ret + cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ movsxdifnidn stride_q, stride_d IDCT4_ADD dst_q, block_q, stride_q @@ -1238,6 +1261,22 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, s add16_sse2_cycle 7, 0x26 RET +; FIXME: I produce incorrect output +cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8, dst_, block_offset_, block_, stride_, nnzc_ + movsxdifnidn stride_q, stride_d + %if ARCH_X86_64 + mov r7, r0 + %endif + add16intra_sse2_cycle 0, 0xc + add16intra_sse2_cycle 1, 0x14 + add16intra_sse2_cycle 2, 0xe + add16intra_sse2_cycle 3, 0x16 + add16intra_sse2_cycle 4, 0x1c + add16intra_sse2_cycle 5, 0x24 + add16intra_sse2_cycle 6, 
0x1e + add16intra_sse2_cycle 7, 0x26 +RET + ; dst, block_offset, block, stride, nnzc, counter, coeff, dst2, picreg ; 0 1 2 3 4 5 6 7 8 cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst_, block_offset_, block_, stride_, nnzc_, counter_, coeff_, dst2_, picreg diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 4050276..e09566d 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -68,6 +68,7 @@ IDCT_ADD_REP_FUNC(, 16, 10, sse2) IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext) IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) +IDCT_ADD_REP_FUNC(, 16intra, 8, avx) IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) IDCT_ADD_REP_FUNC(, 16, 10, avx) IDCT_ADD_REP_FUNC(, 16intra, 10, avx) @@ -350,6 +351,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx; c->h264_idct_add16 = ff_h264_idct_add16_8_avx; c->h264_idct8_add4 = ff_h264_idct8_add4_8_avx; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel