Re: [libav-devel] [PATCH 16/29] x86: hpeldsp: Move half-pel assembly from dsputil to hpeldsp

Diego Biurrun Fri, 12 Apr 2013 01:16:10 -0700

On Wed, Apr 10, 2013 at 07:24:30PM +0300, Martin Storsjö wrote:
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -270,30 +189,20 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t 
> *dst, uint8_t *src,
>  #if HAVE_YASM
>  
>  /***********************************/
> -/* 3Dnow specific */
> -
> -#define DEF(x) x ## _3dnow
> -
> -#include "dsputil_avg_template.c"
> -
> -#undef DEF
> -
> -/***********************************/
>  /* MMXEXT specific */
>  
> -#define DEF(x) x ## _mmxext
> -
> -#include "dsputil_avg_template.c"
> -
> -#undef DEF
> +//FIXME the following could be optimized too ...
> +static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
> +                                   int line_size, int h)
> +{
> +    ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
> +    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
> +}


This is an awkward place for that function and it also uses the
wrong type for the line_size parameter, should be ptrdiff_t.
The comment is also misleading; it should warn that this function is
duplicated.

> diff --git a/libavcodec/x86/dsputil_rnd_template.c 
> b/libavcodec/x86/hpeldsp_rnd_template.c
> similarity index 72%
> copy from libavcodec/x86/dsputil_rnd_template.c
> copy to libavcodec/x86/hpeldsp_rnd_template.c
> index 6ce926c..69e9353 100644

> diff --git a/libavcodec/x86/dsputil_rnd_template.c 
> b/libavcodec/x86/dsputil_rnd_template.c
> index 6ce926c..d441950 100644
> --- a/libavcodec/x86/dsputil_rnd_template.c
> +++ b/libavcodec/x86/dsputil_rnd_template.c

There now is quite a bit of duplication between these two files:

static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h)
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
line_size, int h)
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
line_size, int h)
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h)
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h)
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, 
ptrdiff_t line_size, int h)

All these functions are duplicated.

> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_init.c
> @@ -0,0 +1,415 @@
> +
> +//#undef NDEBUG
> +//#include <assert.h>

I'd suggest not to carry cruft like this forward.

> +#if HAVE_INLINE_ASM
> +
> +#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
> +#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
> +
> +#define MOVQ_BFE(regd)                                  \
> +    __asm__ volatile (                                  \
> +        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
> +        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
> +
> +#ifndef PIC
> +#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: 
> "m"(ff_bone))
> +#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: 
> "m"(ff_wtwo))
> +#else
> +// for shared library it's better to use this way for accessing constants
> +// pcmpeqd -> -1
> +#define MOVQ_BONE(regd)                                 \
> +    __asm__ volatile (                                  \
> +        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
> +        "psrlw          $15, %%"#regd"  \n\t"           \
> +        "packuswb %%"#regd", %%"#regd"  \n\t" ::)
> +
> +#define MOVQ_WTWO(regd)                                 \
> +    __asm__ volatile (                                  \
> +        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
> +        "psrlw         $15, %%"#regd"   \n\t"           \
> +        "psllw          $1, %%"#regd"   \n\t"::)
> +
> +#endif
> +
> +// using regr as temporary and for the output result
> +// first argument is unmodifed and second is trashed
> +// regfe is supposed to contain 0xfefefefefefefefe
> +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
> +    "movq   "#rega", "#regr"            \n\t"                    \
> +    "pand   "#regb", "#regr"            \n\t"                    \
> +    "pxor   "#rega", "#regb"            \n\t"                    \
> +    "pand  "#regfe", "#regb"            \n\t"                    \
> +    "psrlq       $1, "#regb"            \n\t"                    \
> +    "paddb  "#regb", "#regr"            \n\t"
> +
> +#define PAVGB_MMX(rega, regb, regr, regfe)                       \
> +    "movq   "#rega", "#regr"            \n\t"                    \
> +    "por    "#regb", "#regr"            \n\t"                    \
> +    "pxor   "#rega", "#regb"            \n\t"                    \
> +    "pand  "#regfe", "#regb"            \n\t"                    \
> +    "psrlq       $1, "#regb"            \n\t"                    \
> +    "psubb  "#regb", "#regr"            \n\t"
> +
> +// mm6 is supposed to contain 0xfefefefefefefefe
> +#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
> +    "movq  "#rega", "#regr"             \n\t"                    \
> +    "movq  "#regc", "#regp"             \n\t"                    \
> +    "pand  "#regb", "#regr"             \n\t"                    \
> +    "pand  "#regd", "#regp"             \n\t"                    \
> +    "pxor  "#rega", "#regb"             \n\t"                    \
> +    "pxor  "#regc", "#regd"             \n\t"                    \
> +    "pand    %%mm6, "#regb"             \n\t"                    \
> +    "pand    %%mm6, "#regd"             \n\t"                    \
> +    "psrlq      $1, "#regb"             \n\t"                    \
> +    "psrlq      $1, "#regd"             \n\t"                    \
> +    "paddb "#regb", "#regr"             \n\t"                    \
> +    "paddb "#regd", "#regp"             \n\t"
> +
> +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
> +    "movq  "#rega", "#regr"             \n\t"                    \
> +    "movq  "#regc", "#regp"             \n\t"                    \
> +    "por   "#regb", "#regr"             \n\t"                    \
> +    "por   "#regd", "#regp"             \n\t"                    \
> +    "pxor  "#rega", "#regb"             \n\t"                    \
> +    "pxor  "#regc", "#regd"             \n\t"                    \
> +    "pand    %%mm6, "#regb"             \n\t"                    \
> +    "pand    %%mm6, "#regd"             \n\t"                    \
> +    "psrlq      $1, "#regd"             \n\t"                    \
> +    "psrlq      $1, "#regb"             \n\t"                    \
> +    "psubb "#regb", "#regr"             \n\t"                    \
> +    "psubb "#regd", "#regp"             \n\t"

All of this is duplicated from dsputil_mmx.c.  It could be shared through
dsputil_mmx.h for lack of a better place.

> +static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
> +{
> +#if HAVE_YASM
> +    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
> +    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
> +
> +    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
> +    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
> +    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
> +
> +    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
> +    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
> +
> +    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
> +    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
> +    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
> +
> +    if (!(flags & CODEC_FLAG_BITEXACT)) {
> +        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
> +        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
> +        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
> +        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
> +
> +        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
> +        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
> +    }
> +#endif /* HAVE_YASM */
> +
> +#if HAVE_MMXEXT_EXTERNAL
> +    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
> +        c->put_no_rnd_pixels_tab[1][1] = 
> ff_put_no_rnd_pixels8_x2_exact_mmxext;
> +        c->put_no_rnd_pixels_tab[1][2] = 
> ff_put_no_rnd_pixels8_y2_exact_mmxext;
> +    }
> +#endif /* HAVE_MMXEXT_EXTERNAL */
> +}

Just HAVE_YASM around the function body is enough.

> +static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
> +{
> +#if HAVE_SSE2_EXTERNAL
> +    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
> +        // these functions are slower than mmx on AMD, but faster on Intel
> +        c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
> +        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
> +        c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
> +    }
> +#endif /* HAVE_SSE2_EXTERNAL */
> +}

same

Diego
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 16/29] x86: hpeldsp: Move half-pel assembly from dsputil to hpeldsp

Reply via email to