Hi, On Tue, May 24, 2011 at 8:15 AM, Daniel Kang <[email protected]> wrote: > diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h [..] > @@ -66,7 +66,6 @@ typedef struct H264DSPContext{ > void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align > 16*/, int stride); > void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align > 16*/, int stride); > > - void (*h264_dct)(DCTELEM block[4][4]); > void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int > *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t > nnzc[6*8]); > void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int > *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t > nnzc[6*8]); > void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int > *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t > nnzc[6*8]); [..] > diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm [..] > @@ -125,7 +125,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0 > SUMSUB_BA w, 0, 4 > SUMSUB_BA w, 3, 2 > SUMSUB_BA w, 1, 5 > - SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 > + SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 > %endmacro > > %macro IDCT8_1D_FULL 1 [..] > @@ -261,7 +261,7 @@ cglobal h264_idct8_add_sse2, 3, 4, 10 > packuswb m1, m1 > %endmacro > > -%macro DC_ADD_MMX2_OP 3-4 > +%macro DC_ADD_MMX2_OP 4 > %1 m2, [%2 ] > %1 m3, [%2+%3 ] > %1 m4, [%2+%3*2]
All unrelated, but fine as a separate patch. > @@ -840,7 +840,7 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8 > > ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, > ; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) > -cglobal h264_idct_add8_sse2, 5, 7, 8 > +cglobal h264_idct_add8_8_sse2, 5, 7, 8 > add r2, 512 > %ifdef ARCH_X86_64 > mov r10, r0 I'd recommend doing the 8bit to _8 renames in a separate patch also, makes the actual patch a little smaller… > diff --git a/libavcodec/x86/h264_idct_10bit.asm > b/libavcodec/x86/h264_idct_10bit.asm [..] > +;----------------------------------------------------------------------------- > +; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) > +;----------------------------------------------------------------------------- > +%macro IDCT_DC_ADD_OP_10 0 > + mova m1, [r0+ 0] > + mova m2, [r0+r2] > + mova m3, [r1+ 0] > + mova m4, [r1+r2] > + pxor m5, m5 > + paddw m1, m0 > + paddw m2, m0 > + paddw m3, m0 > + paddw m4, m0 > + CLIPW m1, m5, m6 > + CLIPW m2, m5, m6 > + CLIPW m3, m5, m6 > + CLIPW m4, m5, m6 Strictly speaking, CLIPW is not a single instruction, so I'm assuming that interleaving this (that's why STORE_DIFFx2 exists, for example) in a CLIPWx2 or even CLIPWx4 instruction might help on Atoms… Rest looks pretty good, nice work! Ronald _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
