On Sun, Oct 30, 2011 at 05:15:47AM -0400, Justin Ruggles wrote:
> ---
> Benchmarks for the two calls combined.
> 
> Athlon64:
> C   - 11851
> SSE -  6603
> 
> SandyBridge:
> C   -  5655
> SSE -  2380
> 
>  libavcodec/dsputil.c            |   13 +++++++++++++
>  libavcodec/dsputil.h            |   17 +++++++++++++++++
>  libavcodec/twinvq.c             |   34 ++++++++++++++++------------------
>  libavcodec/x86/dsputil_mmx.c    |    4 ++++
>  libavcodec/x86/dsputil_yasm.asm |   31 +++++++++++++++++++++++++++++++
>  5 files changed, 81 insertions(+), 18 deletions(-)
> 
> diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
> index 182063c..9123857 100644
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, 
> float *restrict v2,
>      }
>  }
>  
> +static void butterflies_float_interleave_c(float *dst, const float *src0,
> +                                           const float *src1, int len)
> +{
> +    int i;
> +    for (i = 0; i < len; i++) {
> +        float f1 = src0[i];
> +        float f2 = src1[i];
> +        dst[2*i    ] = f1 + f2;
> +        dst[2*i + 1] = f1 - f2;
> +    }
> +}
> +
>  static float scalarproduct_float_c(const float *v1, const float *v2, int len)
>  {
>      float p = 0.0;
> @@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext 
> *avctx)
>      c->vector_clip_int32 = vector_clip_int32_c;
>      c->scalarproduct_float = scalarproduct_float_c;
>      c->butterflies_float = butterflies_float_c;
> +    c->butterflies_float_interleave = butterflies_float_interleave_c;
>      c->vector_fmul_scalar = vector_fmul_scalar_c;
>      c->vector_fmac_scalar = vector_fmac_scalar_c;
>  
> diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
> index acb2041..587a54d 100644
> --- a/libavcodec/dsputil.h
> +++ b/libavcodec/dsputil.h
> @@ -453,6 +453,23 @@ typedef struct DSPContext {
>       */
>      void (*butterflies_float)(float *restrict v1, float *restrict v2, int 
> len);
>  
> +    /**
> +     * Calculate the sum and difference of two vectors of floats and 
> interleave
> +     * results into a separate output vector of floats, with each sum
> +     * positioned before the corresponding difference.
> +     *
> +     * @param dst  output vector
> +     *             constraints: 16-byte aligned
> +     * @param src0 first input vector
> +     *             constraints: 16-byte aligned
> +     * @param src1 second input vector
> +     *             constraints: 16-byte aligned
> +     * @param len  number of elements in the input
> +     *             constraints: multiple of 4
> +     */
> +    void (*butterflies_float_interleave)(float *dst, const float *src0,
> +                                         const float *src1, int len);
> +
>      /* (I)DCT */
>      void (*fdct)(DCTELEM *block/* align 16*/);
>      void (*fdct248)(DCTELEM *block/* align 16*/);
> diff --git a/libavcodec/twinvq.c b/libavcodec/twinvq.c
> index 73eb7c1..a285156 100644
> --- a/libavcodec/twinvq.c
> +++ b/libavcodec/twinvq.c
> @@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum 
> FrameType ftype, int wtype,
>                           float *out)
>  {
>      const ModeTab *mtab = tctx->mtab;
> +    int size1, size2;
>      float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0];
> -    int i, j;
> +    int i;
>  
>      for (i = 0; i < tctx->avctx->channels; i++) {
>          imdct_and_window(tctx, ftype, wtype,
> @@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum 
> FrameType ftype, int wtype,
>                           i);
>      }
>  
> +    size2 = tctx->last_block_pos[0];
> +    size1 = mtab->size - size2;
>      if (tctx->avctx->channels == 2) {
> -        for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
> -            float f1 = prev_buf[               i];
> -            float f2 = prev_buf[2*mtab->size + i];
> -            out[2*i    ] = f1 + f2;
> -            out[2*i + 1] = f1 - f2;
> -        }
> -        for (j = 0; i < mtab->size; j++,i++) {
> -            float f1 = tctx->curr_frame[               j];
> -            float f2 = tctx->curr_frame[2*mtab->size + j];
> -            out[2*i    ] = f1 + f2;
> -            out[2*i + 1] = f1 - f2;
> -        }
> +        tctx->dsp.butterflies_float_interleave(out, prev_buf,
> +                                               &prev_buf[2*mtab->size],
> +                                               size1);
> +
> +        out += 2 * size1;
> +
> +        tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame,
> +                                               
> &tctx->curr_frame[2*mtab->size],
> +                                               size2);
>      } else {
> -        memcpy(out, prev_buf,
> -               (mtab->size - tctx->last_block_pos[0]) * sizeof(*out));
> +        memcpy(out, prev_buf, size1 * sizeof(*out));
>  
> -        out +=  mtab->size - tctx->last_block_pos[0];
> +        out += size1;
>  
> -        memcpy(out, tctx->curr_frame,
> -               (tctx->last_block_pos[0]) * sizeof(*out));
> +        memcpy(out, tctx->curr_frame, size2 * sizeof(*out));
>      }
>  
>  }
> diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
> index 959a2c2..8e0376d 100644
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -2424,6 +2424,9 @@ void ff_vector_clip_int32_sse2_int(int32_t *dst, const 
> int32_t *src, int32_t min
>  void ff_vector_clip_int32_sse41   (int32_t *dst, const int32_t *src, int32_t 
> min,
>                                     int32_t max, unsigned int len);
>  
> +extern void ff_butterflies_float_interleave_sse(float *dst, const float 
> *src0,
> +                                                const float *src1, int len);
> +
>  void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
>  {
>      int mm_flags = av_get_cpu_flags();
> @@ -2868,6 +2871,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
> *avctx)
>              c->vector_clipf = vector_clipf_sse;
>  #if HAVE_YASM
>              c->scalarproduct_float = ff_scalarproduct_float_sse;
> +            c->butterflies_float_interleave = 
> ff_butterflies_float_interleave_sse;
>  #endif
>          }
>          if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
> diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
> index fe96d8b..d1b9ce3 100644
> --- a/libavcodec/x86/dsputil_yasm.asm
> +++ b/libavcodec/x86/dsputil_yasm.asm
> @@ -1123,3 +1123,34 @@ VECTOR_CLIP_INT32 sse41, 11, 1, 1
>  %else
>  VECTOR_CLIP_INT32 sse41, 6, 1, 0
>  %endif
> +
> +;-----------------------------------------------------------------------------
> +;  void ff_butterflies_float_interleave(float *dst, const float *src0,
> +;                                       const float *src1, int len);
> +;-----------------------------------------------------------------------------
> +
> +INIT_XMM
> +cglobal butterflies_float_interleave_sse, 4,4,4, dst, src0, src1, len
> +    test      lenq, lenq
> +    jz .end
> +    shl       lenq, 2
> +    lea      src0q, [src0q +   lenq]
> +    lea      src1q, [src1q +   lenq]
> +    lea       dstq, [ dstq + 2*lenq]
> +    neg       lenq
> +.loop:
> +    mova        m0, [src0q + lenq]
> +    mova        m2, [src1q + lenq]
> +    mova        m1, m0
> +    mova        m3, m2
> +    addps       m0, m2
> +    subps       m1, m3

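Use "subps m1, m2" here and drop the "mova m3, m2" above: addps only writes
m0, so m2 still holds the src1 values and the extra copy into m3 is unnecessary.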

> +    mova        m2, m0
> +    unpcklps    m0, m1
> +    unpckhps    m2, m1
> +    mova [dstq + 2*lenq         ], m0
> +    mova [dstq + 2*lenq + mmsize], m2
> +    add       lenq, mmsize
> +    jl .loop
> +.end:
> +    REP_RET
> -- 

Apart from that, the patch looks good to me (LGTM).
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to