On Tue, 28 Jun 2011, Daniel Kang wrote:
> @@ -2649,6 +2650,33 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext
> *avctx)
> SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
> SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
> }
> +#if HAVE_YASM
> +#define SET_QPEL_FUNCS_10(PFX, IDX, SIZE, CPU) \
> + c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## SIZE ## _mc00_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 1] = ff_ ## PFX ## SIZE ## _mc10_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## SIZE ## _mc20_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 3] = ff_ ## PFX ## SIZE ## _mc30_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## SIZE ## _mc01_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 5] = ff_ ## PFX ## SIZE ## _mc11_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 6] = ff_ ## PFX ## SIZE ## _mc21_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 7] = ff_ ## PFX ## SIZE ## _mc31_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## SIZE ## _mc02_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][ 9] = ff_ ## PFX ## SIZE ## _mc12_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][10] = ff_ ## PFX ## SIZE ## _mc22_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][11] = ff_ ## PFX ## SIZE ## _mc32_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## SIZE ## _mc03_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][13] = ff_ ## PFX ## SIZE ## _mc13_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][14] = ff_ ## PFX ## SIZE ## _mc23_10_
> ## CPU; \
> + c->PFX ## _pixels_tab[IDX][15] = ff_ ## PFX ## SIZE ## _mc33_10_
> ## CPU
> + else if (bit_depth == 10) {
> + SET_QPEL_FUNCS_10(put_h264_qpel, 0, 16, mmxext);
Useless macro — the existing generic SET_QPEL_FUNCS already covers this case:
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext);
> @@ -2777,7 +2805,26 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext
> *avctx)
> H264_QPEL_FUNCS(3, 3, sse2);
> }
> #if HAVE_YASM
> +#define H264_QPEL_FUNCS_10(x, y, CPU)\
> + c->put_h264_qpel_pixels_tab[0][x+y*4] =
> ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
> + c->put_h264_qpel_pixels_tab[1][x+y*4] =
> ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
> + c->avg_h264_qpel_pixels_tab[0][x+y*4] =
> ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
> + c->avg_h264_qpel_pixels_tab[1][x+y*4] =
> ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
> if (bit_depth == 10) {
> + H264_QPEL_FUNCS_10(0, 0, sse2);
> + H264_QPEL_FUNCS_10(0, 1, sse2);
> + H264_QPEL_FUNCS_10(0, 2, sse2);
> + H264_QPEL_FUNCS_10(0, 3, sse2);
> + H264_QPEL_FUNCS_10(1, 1, sse2);
> + H264_QPEL_FUNCS_10(1, 2, sse2);
> + H264_QPEL_FUNCS_10(1, 3, sse2);
> + H264_QPEL_FUNCS_10(2, 1, sse2);
> + H264_QPEL_FUNCS_10(2, 2, sse2);
> + H264_QPEL_FUNCS_10(2, 3, sse2);
> + H264_QPEL_FUNCS_10(3, 1, sse2);
> + H264_QPEL_FUNCS_10(3, 2, sse2);
> + H264_QPEL_FUNCS_10(3, 3, sse2);
Missing some subpels: the (1,0), (2,0) and (3,0) positions are never set here.
Use SET_QPEL_FUNCS, which fills in all 16 entries.
> +cglobal %1_h264_qpel8_mc00_10_sse2, 3,3
The register count should be 3,4 here, not 3,3 — the function uses one more
GPR than it declares.
> +cglobal %2_h264_qpel%3_mc20_10_%1, 3,4,7
> + mov r3d, %3
> + pxor m0, m0
> + mova m1, [pw_pixel_max]
> + mova m6, [pw_16]
> +.nextrow
> + movu m2, [r1-4]
> + movu m3, [r1-2]
> + movu m4, [r1+0]
> + ADDW m2, [r1+6], m5
> + ADDW m3, [r1+4], m5
> + ADDW m4, [r1+2], m5
Consider palignr (SSSE3) to derive the shifted source vectors from fewer
loads, instead of issuing a separate unaligned load per shift.
> +%macro MC11 3
> +; this REALLY needs x86_64
Agreed — this would benefit significantly from the extra registers available
on x86_64.
> +cglobal %2_h264_qpel%3_mc11_10_%1, 3,6,8
> + mov r4, r1
> +.body
> + PRELOAD_V
> +
> + sub r0, r2
> + sub r4, r2
> + mov r5, r2
> + neg r5
> +%assign j 0
> +%rep %3
> + %assign i (j % 6)
> + call v_filt%3_ %+ i %+ _10_%1
> + call h_filt%3_ %+ i %+ _10_%1
> + OP_MOV [r0], m0
> +%if j<%3-1
> +;reload m5
> + movu m5, [r1+r5]
Factor the movu into h_filt (i.e. into all the phases of h_filt that aren't
at the end of the loop). Though I'm not sure this will still be useful after
the x86_64 optimization of collapsing it down to a single phase.
> +put_hv%2_10_%1:
> + add rsp, gprsize
This leaves the return address below the stack pointer, where it can be
clobbered (e.g. by a signal handler). Instead of adjusting rsp, address past
it directly:
lea r4, [rsp+PAD+gprsize]
> +%macro H_LOOP 2
> +%if num_mmregs > 8
> + %define s1 m8
> + %define s2 m9
> + %define s3 m10
> + %define d1 m11
> +%else
> + %define s1 [tap1]
> + %define s2 [tap2]
> + %define s3 [tap3]
> + %define d1 [depad]
Try loading these from memory only once per iteration. That increases the
instruction count, but should reduce both uops and code size.
> +#define QPEL16_OPMC(OP, MC, MMX)\
> +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t
> *src, int stride){\
> + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
> + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
> + src += 8*stride;\
> + dst += 8*stride;\
> + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
> + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
> +}
These wrappers are ginormous. asm could do much better by abusing calling
convention.
x86_64 gcc-4.5.2:
h264_qpel_10bit.o contains 8160 bytes of code
sse2 wrappers are 3360 bytes
mmx wrappers are 14336 bytes
There are some redundant computations in the v pass of 16x16 hv filter.
--Loren Merritt
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel