+%macro IDCT_ADD16_10 1
+cglobal h264_idct_add16_10_%1, 5,7
+    xor          r5, r5
+%ifdef PIC
+    lea         r11, [scan8_mem]
+%endif
+.nextblock
+    movzx        r6, byte [scan8+r5]
+    movzx        r6, byte [r4+r6]
+    test         r6, r6

cmp byte [r4+r6], 0

+    jz .skipblock
+    mov         r6d, dword [r1+r5*4]
+    lea          r6, [r0+r6]

add r6, r0

+    IDCT4_ADD_10 r6, r2, r3
+.skipblock
+    inc          r5
+    add          r2, 64
+    cmp          r5, 16
+    jl .nextblock
+    REP_RET
+%endmacro

Are you sure you don't want to deinline the idct part and unroll the loop over blocks? If not, what's different about h264_idct_add16_sse2?

+%macro IDCT_ADD16INTRA_10 1
+cglobal h264_idct_add16intra_10_%1,5,7
+    xor          r5, r5
+%ifdef PIC
+    lea         r11, [scan8_mem]
+%endif
+.nextblock
+    movzx        r6, byte [scan8+r5]
+    movzx        r6, byte [r4+r6]
+    or          r6d, dword [r2]
+    test         r6, r6

`or` already sets the flags, so the `test` that follows it is redundant.
Should this check for dc-only blocks, or is that case rarer in 10-bit?

+cglobal h264_idct_dc_add_10_mmx2,3,3
+    mov      r1d, dword [r1]
+    add       r1, 32
+    sar       r1, 6
+    movd      m0, r1d

I would expect that to be faster in mmx, even though no simd is possible, especially on AMD, where movd mm,r32 is slow.

+cglobal h264_idct8_add_10_%1, 3,4,8
+    %assign pad 256+16-gprsize-(stack_offset&15)
+    SUB         rsp, pad
+
+    add   dword [r1], 32
+    IDCT8_ADD_SSE_START r1   , rsp
+    IDCT8_ADD_SSE_START r1+16, rsp+128
+    lea          r3, [r0+8]
+    IDCT8_ADD_SSE_END   r0  , rsp,    r2
+    IDCT8_ADD_SSE_END   r3  , rsp+16, r2

In a previous patch you had deinlined IDCT8. Did you decide that it's ok to spend 2kb on this function? Or 4kb since h264_idct8_add4_10 doesn't call h264_idct8_add_10?

--Loren Merritt
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to