Hi,

On Tue, May 24, 2011 at 8:15 AM, Daniel Kang <[email protected]> wrote:
> diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
[..]
> @@ -66,7 +66,6 @@ typedef struct H264DSPContext{
>      void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 
> 16*/, int stride);
>      void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 
> 16*/, int stride);
>
> -    void (*h264_dct)(DCTELEM block[4][4]);
>      void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int 
> *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t 
> nnzc[6*8]);
>      void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int 
> *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t 
> nnzc[6*8]);
>      void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int 
> *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t 
> nnzc[6*8]);
[..]
> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
[..]
> @@ -125,7 +125,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0
>      SUMSUB_BA    w, 0, 4
>      SUMSUB_BA    w, 3, 2
>      SUMSUB_BA    w, 1, 5
> -    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
> +    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
>  %endmacro
>
>  %macro IDCT8_1D_FULL 1
[..]
> @@ -261,7 +261,7 @@ cglobal h264_idct8_add_sse2, 3, 4, 10
>      packuswb     m1, m1
>  %endmacro
>
> -%macro DC_ADD_MMX2_OP 3-4
> +%macro DC_ADD_MMX2_OP 4
>      %1           m2, [%2     ]
>      %1           m3, [%2+%3  ]
>      %1           m4, [%2+%3*2]

All unrelated, but fine as a separate patch.

> @@ -840,7 +840,7 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
>
>  ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
>  ;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
> -cglobal h264_idct_add8_sse2, 5, 7, 8
> +cglobal h264_idct_add8_8_sse2, 5, 7, 8
>      add          r2, 512
>  %ifdef ARCH_X86_64
>      mov         r10, r0

I'd recommend doing the 8bit to _8 renames in a separate patch also,
makes the actual patch a little smaller…

> diff --git a/libavcodec/x86/h264_idct_10bit.asm 
> b/libavcodec/x86/h264_idct_10bit.asm
[..]
> +;-----------------------------------------------------------------------------
> +; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
> +;-----------------------------------------------------------------------------
> +%macro IDCT_DC_ADD_OP_10 0
> +    mova      m1, [r0+ 0]
> +    mova      m2, [r0+r2]
> +    mova      m3, [r1+ 0]
> +    mova      m4, [r1+r2]
> +    pxor      m5, m5
> +    paddw     m1, m0
> +    paddw     m2, m0
> +    paddw     m3, m0
> +    paddw     m4, m0
> +    CLIPW     m1, m5, m6
> +    CLIPW     m2, m5, m6
> +    CLIPW     m3, m5, m6
> +    CLIPW     m4, m5, m6

Strictly speaking, CLIPW is not a single instruction, so I'm assuming
that interleaving this (that's why STORE_DIFFx2 exists, for example)
in a CLIPWx2 or even CLIPWx4 instruction might help on Atoms…

Rest looks pretty good, nice work!

Ronald
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to