+%macro IDCT_ADD16_10 1
+cglobal h264_idct_add16_10_%1, 5,7
+    xor       r5, r5
+%ifdef PIC
+    lea       r11, [scan8_mem]
+%endif
+.nextblock
+    movzx     r6, byte [scan8+r5]
+    movzx     r6, byte [r4+r6]
+    test      r6, r6
cmp byte [r4+r6], 0
+    jz .skipblock
+    mov       r6d, dword [r1+r5*4]
+    lea       r6, [r0+r6]
add r6, r0
+    IDCT4_ADD_10 r6, r2, r3
+.skipblock
+    inc       r5
+    add       r2, 64
+    cmp       r5, 16
+    jl .nextblock
+    REP_RET
+%endmacro
Are you sure you don't want to deinline the idct part and unroll the loop over blocks? If not, what's different about h264_idct_add16_sse2?
+%macro IDCT_ADD16INTRA_10 1
+cglobal h264_idct_add16intra_10_%1,5,7
+    xor       r5, r5
+%ifdef PIC
+    lea       r11, [scan8_mem]
+%endif
+.nextblock
+    movzx     r6, byte [scan8+r5]
+    movzx     r6, byte [r4+r6]
+    or        r6d, dword [r2]
+    test      r6, r6
The `or` already sets flags, so the following `test` is redundant. Also, should this check for the dc-only case, or is that rarer in 10-bit?
+cglobal h264_idct_dc_add_10_mmx2,3,3
+    mov       r1d, dword [r1]
+    add       r1, 32
+    sar       r1, 6
+    movd      m0, r1d
I would expect a plain (non-mmx) version of that to be faster, even though no simd is possible. Especially on AMD, where `movd mm, r32` is slow.
+cglobal h264_idct8_add_10_%1, 3,4,8
+    %assign pad 256+16-gprsize-(stack_offset&15)
+    SUB       rsp, pad
+
+    add dword [r1], 32
+    IDCT8_ADD_SSE_START r1   , rsp
+    IDCT8_ADD_SSE_START r1+16, rsp+128
+    lea       r3, [r0+8]
+    IDCT8_ADD_SSE_END   r0  , rsp,    r2
+    IDCT8_ADD_SSE_END   r3  , rsp+16, r2
In a previous patch you had deinlined IDCT8. Did you decide that it's ok to spend 2kb on this function? Or 4kb since h264_idct8_add4_10 doesn't call h264_idct8_add_10?
--Loren Merritt _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
