On Tue, 28 Jun 2011, Daniel Kang wrote:

> @@ -2649,6 +2650,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
> *avctx)
>              SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
>              SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
>              }
> +#if HAVE_YASM
> +#define SET_QPEL_FUNCS_10(PFX, IDX, SIZE, CPU) \
> +            c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## SIZE ## _mc00_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 1] = ff_ ## PFX ## SIZE ## _mc10_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## SIZE ## _mc20_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 3] = ff_ ## PFX ## SIZE ## _mc30_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## SIZE ## _mc01_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 5] = ff_ ## PFX ## SIZE ## _mc11_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 6] = ff_ ## PFX ## SIZE ## _mc21_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 7] = ff_ ## PFX ## SIZE ## _mc31_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## SIZE ## _mc02_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][ 9] = ff_ ## PFX ## SIZE ## _mc12_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][10] = ff_ ## PFX ## SIZE ## _mc22_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][11] = ff_ ## PFX ## SIZE ## _mc32_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## SIZE ## _mc03_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][13] = ff_ ## PFX ## SIZE ## _mc13_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][14] = ff_ ## PFX ## SIZE ## _mc23_10_ 
> ## CPU; \
> +            c->PFX ## _pixels_tab[IDX][15] = ff_ ## PFX ## SIZE ## _mc33_10_ 
> ## CPU
> +            else if (bit_depth == 10) {
> +                SET_QPEL_FUNCS_10(put_h264_qpel, 0, 16, mmxext);

Useless macro.
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext);

> @@ -2777,7 +2805,26 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
> *avctx)
>              H264_QPEL_FUNCS(3, 3, sse2);
>              }
>  #if HAVE_YASM
> +#define H264_QPEL_FUNCS_10(x, y, CPU)\
> +            c->put_h264_qpel_pixels_tab[0][x+y*4] = 
> ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
> +            c->put_h264_qpel_pixels_tab[1][x+y*4] = 
> ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
> +            c->avg_h264_qpel_pixels_tab[0][x+y*4] = 
> ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
> +            c->avg_h264_qpel_pixels_tab[1][x+y*4] = 
> ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
>              if (bit_depth == 10) {
> +                H264_QPEL_FUNCS_10(0, 0, sse2);
> +                H264_QPEL_FUNCS_10(0, 1, sse2);
> +                H264_QPEL_FUNCS_10(0, 2, sse2);
> +                H264_QPEL_FUNCS_10(0, 3, sse2);
> +                H264_QPEL_FUNCS_10(1, 1, sse2);
> +                H264_QPEL_FUNCS_10(1, 2, sse2);
> +                H264_QPEL_FUNCS_10(1, 3, sse2);
> +                H264_QPEL_FUNCS_10(2, 1, sse2);
> +                H264_QPEL_FUNCS_10(2, 2, sse2);
> +                H264_QPEL_FUNCS_10(2, 3, sse2);
> +                H264_QPEL_FUNCS_10(3, 1, sse2);
> +                H264_QPEL_FUNCS_10(3, 2, sse2);
> +                H264_QPEL_FUNCS_10(3, 3, sse2);

Missing some subpels (mc10, mc20, mc30).
Use SET_QPEL_FUNCS.

> +cglobal %1_h264_qpel8_mc00_10_sse2, 3,3

3,4

> +cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
> +    mov     r3d, %3
> +    pxor     m0, m0
> +    mova     m1, [pw_pixel_max]
> +    mova     m6, [pw_16]
> +.nextrow
> +    movu     m2, [r1-4]
> +    movu     m3, [r1-2]
> +    movu     m4, [r1+0]
> +    ADDW     m2, [r1+6], m5
> +    ADDW     m3, [r1+4], m5
> +    ADDW     m4, [r1+2], m5

palignr

> +%macro MC11 3
> +; this REALLY needs x86_64

yes

> +cglobal %2_h264_qpel%3_mc11_10_%1, 3,6,8
> +    mov      r4, r1
> +.body
> +    PRELOAD_V
> +
> +    sub      r0, r2
> +    sub      r4, r2
> +    mov      r5, r2
> +    neg      r5
> +%assign j 0
> +%rep %3
> +    %assign i (j % 6)
> +    call v_filt%3_ %+ i %+ _10_%1
> +    call h_filt%3_ %+ i %+ _10_%1
> +    OP_MOV [r0], m0
> +%if j<%3-1
> +;reload m5
> +    movu     m5, [r1+r5]

Factor movu into h_filt (i.e. into all the phases of h_filt that aren't
at the end of the loop). Dunno if this will be useful after the x86_64
opt of having only 1 phase.

> +put_hv%2_10_%1:
> +    add     rsp, gprsize

Puts the return address below the stack pointer.
lea r4, [rsp+PAD+gprsize]

> +%macro H_LOOP 2
> +%if num_mmregs > 8
> +    %define s1 m8
> +    %define s2 m9
> +    %define s3 m10
> +    %define d1 m11
> +%else
> +    %define s1 [tap1]
> +    %define s2 [tap2]
> +    %define s3 [tap3]
> +    %define d1 [depad]

Try loading these only once per iteration.
Increases number of instructions, but should reduce both uops and code size.

> +#define QPEL16_OPMC(OP, MC, MMX)\
> +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t 
> *src, int stride){\
> +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
> +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
> +    src += 8*stride;\
> +    dst += 8*stride;\
> +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
> +    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
> +}

These wrappers are ginormous. asm could do much better by abusing calling
convention.
x86_64 gcc-4.5.2:
h264_qpel_10bit.o contains 8160 bytes of code
sse2 wrappers are 3360 bytes
mmx wrappers are 14336 bytes

There are some redundant computations in the v pass of 16x16 hv filter.

--Loren Merritt
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to