[libav-devel] [PATCH 3/4] ac3enc: move int32_t array clipping function to DSPUtil and add x86 versions.

Justin Ruggles Sat, 04 Jun 2011 13:40:29 -0700

---
 libavcodec/ac3enc.c             |   22 ++------
 libavcodec/dsputil.c            |   17 ++++++
 libavcodec/dsputil.h            |   14 +++++
 libavcodec/x86/dsputil_mmx.c    |   24 ++++++++
 libavcodec/x86/dsputil_yasm.asm |  118 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 178 insertions(+), 17 deletions(-)

diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index bcdc183..17a2b16 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -525,19 +525,6 @@ static void compute_coupling_strategy(AC3EncodeContext *s)
 
 
 /**
- * Clip an array of integers to a specified range.
- * TODO: move to dsputil
- */
-static void vector_clip_int32(int32_t *dst, int32_t *src, int32_t min,
-                              int32_t max, unsigned int len)
-{
-    int i;
-    for (i = 0; i < len; i++)
-        dst[i] = av_clip(src[i], min, max);
-}
-
-
-/**
  * Calculate a single coupling coordinate.
  */
 static inline float calc_cpl_coord(float energy_ch, float energy_cpl)
@@ -720,8 +707,8 @@ static void apply_channel_coupling(AC3EncodeContext *s)
         s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
                                    cpl_coords[blk][1],
                                    s->fbw_channels * 16);
-        vector_clip_int32(fixed_cpl_coords[blk][1], fixed_cpl_coords[blk][1],
-                          -16777215, 16777215, s->fbw_channels * 16);
+        s->dsp.vector_clip_int32(fixed_cpl_coords[blk][1], fixed_cpl_coords[blk][1],
+                                 -16777215, 16777215, s->fbw_channels * 16);
         s->ac3dsp.extract_exponents(block->cpl_coord_exp[1],
                                     fixed_cpl_coords[blk][1],
                                     s->fbw_channels * 16);
@@ -919,8 +906,9 @@ static void extract_exponents(AC3EncodeContext *s)
     int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS * (s->channels - ch + 1);
     AC3Block *block = &s->blocks[0];
 
-    vector_clip_int32(block->fixed_coef[ch], block->fixed_coef[ch],
-                      -16777215, 16777215, chan_size);
+    s->dsp.vector_clip_int32(block->fixed_coef[ch], block->fixed_coef[ch],
+                             -16777215, 16777215, chan_size);
+
     s->ac3dsp.extract_exponents(block->exp[ch], block->fixed_coef[ch], chan_size);
 }
 
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 4389289..4f17b43 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2676,6 +2676,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
     }
 }
 
+/* C fallback; len is a non-zero multiple of 16, so 8 per pass is safe. */
+static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
+                                int32_t max, unsigned int len)
+{
+    do {
+        int k;
+
+        /* read src[k] before writing dst[k] so dst == src keeps working */
+        for (k = 0; k < 8; k++)
+            dst[k] = av_clip(src[k], min, max);
+        dst += 8;
+        src += 8;
+        len -= 8;
+    } while (len > 0);
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -3122,6 +3138,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->apply_window_int16 = apply_window_int16_c;
+    c->vector_clip_int32 = vector_clip_int32_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index cfc574a..cff8406 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -555,6 +555,20 @@ typedef struct DSPContext {
     void (*apply_window_int16)(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
 
+    /**
+     * Clip each element in an array of int32_t to a given minimum and maximum value.
+     * @param dst  destination array
+     *             constraints: 16-byte aligned, may be the same array as src
+     * @param src  source array
+     *             constraints: 16-byte aligned
+     * @param min  minimum value
+     * @param max  maximum value
+     * @param len  number of elements in the array
+     *             constraints: multiple of 16 greater than zero
+     */
+    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
+                              int32_t max, unsigned int len);
+
     /* rv30 functions */
     qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
     qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 1cc6991..83671c0 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2416,6 +2416,15 @@ int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i
 
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 
+void ff_vector_clip_int32_cmov  (int32_t *dst, const int32_t *src, int32_t min,
+                                 int32_t max, unsigned int len);
+void ff_vector_clip_int32_mmx   (int32_t *dst, const int32_t *src, int32_t min,
+                                 int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse2  (int32_t *dst, const int32_t *src, int32_t min,
+                                 int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
+                                 int32_t max, unsigned int len);
+
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
@@ -2556,6 +2565,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 
         c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
         c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
+
+        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
 #endif
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
@@ -2809,6 +2820,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             }
 #endif
         }
+        if (mm_flags & AV_CPU_FLAG_SSE2SLOW && HAVE_FAST_CMOV) {
+#if HAVE_YASM
+            c->vector_clip_int32 = ff_vector_clip_int32_cmov;
+#endif
+        }
         if(mm_flags & AV_CPU_FLAG_SSE){
             c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
             c->ac3_downmix = ac3_downmix_sse;
@@ -2829,6 +2845,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+            if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+                c->vector_clip_int32 = ff_vector_clip_int32_sse2;
+            }
             if (avctx->flags & CODEC_FLAG_BITEXACT) {
                 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
             } else {
@@ -2854,6 +2873,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             }
 #endif
         }
+        if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
+#if HAVE_YASM
+            c->vector_clip_int32 = ff_vector_clip_int32_sse41;
+#endif
+        }
     }
 
     if (CONFIG_ENCODERS)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 8b19cc1..44b5717 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1048,3 +1048,121 @@ emu_edge sse
 %ifdef ARCH_X86_32
 emu_edge mmx
 %endif
+
+;-----------------------------------------------------------------------------
+; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
+;                           int32_t max, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro PMINSD_MMX 3 ; dst, src, tmp: dst = signed dword min(dst, src); tmp is clobbered
+    mova      %3, %2            ; tmp = src
+    pcmpgtd   %3, %1            ; tmp = all-ones in each dword where src > dst
+    pxor      %1, %2            ; dst ^= src
+    pand      %1, %3            ; keep (dst ^ src) only where dst is the smaller one
+    pxor      %1, %2            ; dst = (src > dst) ? dst : src
+%endmacro
+
+%macro PMAXSD_MMX 3 ; dst, src, tmp: dst = signed dword max(dst, src); tmp is clobbered
+    mova      %3, %1            ; tmp = dst
+    pcmpgtd   %3, %2            ; tmp = all-ones in each dword where dst > src
+    pand      %1, %3            ; dst = dst where dst > src, else 0
+    pandn     %3, %2            ; tmp = src where dst <= src, else 0
+    por       %1, %3            ; merge the two selections
+%endmacro
+
+%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp: clip packed dwords with the emulated min/max
+    PMINSD_MMX %1, %3, %4       ; upper bound first: dst = min(dst, max)
+    PMAXSD_MMX %1, %2, %4       ; then lower bound: dst = max(dst, min)
+%endmacro
+
+%macro CLIPD_SSE41 3-4 ;  src/dst, min, max, unused (4th arg accepted so it can stand in for CLIPD_MMX)
+    pminsd  %1, %3              ; dst = min(dst, max), signed dwords
+    pmaxsd  %1, %2              ; dst = max(dst, min), signed dwords
+%endmacro
+
+%macro SPLATD_MMX 1 ; reg: copy the low dword into both dwords of the register
+    punpckldq  %1, %1           ; interleave the low dword with itself
+%endmacro
+
+%macro SPLATD_SSE2 1 ; reg: copy the low dword into all four dwords of the register
+    pshufd  %1, %1, 0           ; shuffle selector 0 picks dword 0 for every lane
+%endmacro
+
+%macro VECTOR_CLIP_INT32 1 ; %1 = function name suffix (mmx, sse2 or sse41)
+cglobal vector_clip_int32_%1, 5,5,7, dst, src, min, max, len
+    movd      m4, mind          ; m4 = min in the low dword
+    movd      m5, maxd          ; m5 = max in the low dword
+    SPLATD    m4                ; replicate min across the register
+    SPLATD    m5                ; replicate max across the register
+.loop:
+    mova      m0, [srcq         ] ; aligned loads: four registers of input per pass
+    mova      m1, [srcq+mmsize  ]
+    mova      m2, [srcq+mmsize*2]
+    mova      m3, [srcq+mmsize*3]
+    CLIPD  m0, m4, m5, m6       ; m6 is the scratch register for the emulated min/max
+    CLIPD  m1, m4, m5, m6
+    CLIPD  m2, m4, m5, m6
+    CLIPD  m3, m4, m5, m6
+    mova  [dstq         ], m0   ; aligned stores of the clipped values
+    mova  [dstq+mmsize  ], m1
+    mova  [dstq+mmsize*2], m2
+    mova  [dstq+mmsize*3], m3
+    add     srcq, mmsize*4      ; advance by the bytes consumed in this pass
+    add     dstq, mmsize*4
+    sub     lend, mmsize        ; 4 registers * (mmsize/4) dwords = mmsize elements per pass
+    ja .loop                    ; repeat while the unsigned count is still above zero
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define SPLATD SPLATD_MMX
+%define CLIPD CLIPD_MMX
+VECTOR_CLIP_INT32 mmx           ; MMX registers, emulated min/max
+INIT_XMM
+%define SPLATD SPLATD_SSE2
+VECTOR_CLIP_INT32 sse2          ; XMM registers; CLIPD is still the emulated CLIPD_MMX here
+%define CLIPD CLIPD_SSE41
+VECTOR_CLIP_INT32 sse41         ; XMM registers, native pminsd/pmaxsd
+
+; This is faster on Athlon64 where cmov is fast and SSE2 is slow.
+; GCC generates similar but slower code. For some reason it wants to use
+; branching for the max value clipping instead of cmovg.
+
+%macro CLIPD_CMOV 3 ;  src/dst, min, max: branchless scalar clip of one signed dword
+    cmp    %1, %3               ; compare the value against max
+    cmovg  %1, %3               ; value = max if value > max (signed)
+    cmp    %1, %2               ; compare the (possibly updated) value against min
+    cmovl  %1, %2               ; value = min if value < min (signed)
+%endmacro
+
+cglobal vector_clip_int32_cmov, 5,6,0, dst, src, min, max, len, tmp
+.loop:                          ; each pass clips 8 dwords through the scalar tmp register
+    mov         tmpd, [srcq]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov       [dstq], tmpd
+    mov         tmpd, [srcq+4]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov     [dstq+4], tmpd
+    mov         tmpd, [srcq+8]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov     [dstq+8], tmpd
+    mov         tmpd, [srcq+12]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov    [dstq+12], tmpd
+    mov         tmpd, [srcq+16]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov    [dstq+16], tmpd
+    mov         tmpd, [srcq+20]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov    [dstq+20], tmpd
+    mov         tmpd, [srcq+24]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov    [dstq+24], tmpd
+    mov         tmpd, [srcq+28]
+    CLIPD_CMOV  tmpd, mind, maxd
+    mov    [dstq+28], tmpd
+    add         srcq, 32        ; 8 dwords * 4 bytes
+    add         dstq, 32
+    sub         lend, 8         ; len is a 32-bit unsigned int; the upper half of lenq is undefined on x86-64
+    ja .loop                    ; repeat while the unsigned count is still above zero
+    REP_RET

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 3/4] ac3enc: move int32_t array clipping function to DSPUtil and add x86 versions.

Reply via email to