Hi

since then, it appears it is simpler to extend the buffer, and
whenever the order change, pad the end of the coeff buffer with 0.
This way, no need to handle the tail of the loop.

As the case where an unusual order is encountered (it should still be
handled fine), a sample is requested.

The changes pass fate-lossless-wma for win32 and win64 and for
CPUFLAGS in mmx/sse2/sse4.2

-- 
Christophe
From ce54b92673f2c57e50f664036d25a93db94746bc Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <[email protected]>
Date: Sat, 24 Nov 2012 15:55:49 +0100
Subject: [PATCH 2/2] wma lossless: reuse scalarproduct_and_madd_int16

This is done by padding the coefficient buffer with 0s, because the order
may be only a multiple of 4, and the DSP function requires batches of 8.

However, no sample with such a case was found, so request one if it uses
that kind of order.

Approximate relative speedup depending on instruction set:
plain C: -6%
mmxext:  51%
sse2:    54%
---
 libavcodec/wmalosslessdec.c | 62 +++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c
index 2f341c0..4035092 100644
--- a/libavcodec/wmalosslessdec.c
+++ b/libavcodec/wmalosslessdec.c
@@ -29,6 +29,7 @@
 #include "internal.h"
 #include "get_bits.h"
 #include "put_bits.h"
+#include "dsputil.h"
 #include "wma.h"
 #include "wma_common.h"
 
@@ -44,6 +45,7 @@
 #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS)    ///< maximum block size
 #define WMALL_BLOCK_SIZES    (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes
 
+#define WMALL_COEFF_PAD_SIZE   16                       ///< pad coef buffers with 0 for use with SIMD functions
 
 /**
  * @brief frame-specific decoder context for a single channel
@@ -66,6 +68,7 @@ typedef struct {
 typedef struct WmallDecodeCtx {
     /* generic decoder variables */
     AVCodecContext  *avctx;
+    DSPContext      dsp;                           ///< accelerated DSP functions
     AVFrame         *frame;
     uint8_t         frame_data[MAX_FRAMESIZE + FF_INPUT_BUFFER_PADDING_SIZE];  ///< compressed frame data
     PutBitContext   pb;                             ///< context for filling the frame_data buffer
@@ -141,9 +144,9 @@ typedef struct WmallDecodeCtx {
         int scaling;
         int coefsend;
         int bitsend;
-        int16_t coefs[MAX_ORDER];
-        int16_t lms_prevvalues[MAX_ORDER * 2];
-        int16_t lms_updates[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER];
+        DECLARE_ALIGNED(16, int16_t, lms_prevvalues)[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2];
         int recent;
     } cdlms[2][9];
 
@@ -179,6 +182,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int i, log2_max_num_subframes;
 
     s->avctx = avctx;
+    ff_dsputil_init(&s->dsp, avctx);
     init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
 
     if (avctx->extradata_size >= 18) {
@@ -452,6 +456,13 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 s->cdlms[0][0].order = 0;
                 return AVERROR_INVALIDDATA;
             }
+            if(s->cdlms[c][i].order & 8) {
+                static int warned;
+                if(!warned)
+                    avpriv_request_sample(s->avctx, "CDLMS of order %d",
+                                          s->cdlms[c][i].order);
+                warned = 1;
+            }
         }
 
         for (i = 0; i < s->cdlms_ttl[c]; i++)
@@ -477,6 +488,12 @@ static int decode_cdlms(WmallDecodeCtx *s)
                         (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
             }
         }
+
+        for (i = 0; i < s->cdlms_ttl[c]; i++) {
+            memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order, 0,
+                   (s->cdlms[c][i].order & (WMALL_COEFF_PAD_SIZE-1))
+                   * sizeof(s->cdlms[c][i].coefs));
+        }
     }
 
     return 0;
@@ -686,35 +703,11 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
     }
 }
 
-static int lms_predict(WmallDecodeCtx *s, int ich, int ilms)
+static void lms_update(WmallDecodeCtx *s, int ich, int ilms, int input)
 {
-    int pred = 0, icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-
-    for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-        pred += s->cdlms[ich][ilms].coefs[icoef] *
-                s->cdlms[ich][ilms].lms_prevvalues[icoef + recent];
-
-    return pred;
-}
-
-static void lms_update(WmallDecodeCtx *s, int ich, int ilms,
-                       int input, int residue)
-{
-    int icoef;
     int recent = s->cdlms[ich][ilms].recent;
     int range  = 1 << s->bits_per_sample - 1;
 
-    if (residue < 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] -=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    } else if (residue > 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] +=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    }
-
     if (recent)
         recent--;
     else {
@@ -775,6 +768,9 @@ static void use_normal_update_speed(WmallDecodeCtx *s, int ich)
     s->update_speed[ich] = 8;
 }
 
+/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */
+#define WMASIGN(x) ((x > 0) - (x < 0))
+
 static void revert_cdlms(WmallDecodeCtx *s, int ch,
                          int coef_begin, int coef_end)
 {
@@ -785,9 +781,15 @@ static void revert_cdlms(WmallDecodeCtx *s, int ch,
         for (icoef = coef_begin; icoef < coef_end; icoef++) {
             pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
             residue = s->channel_residues[ch][icoef];
-            pred += lms_predict(s, ch, ilms);
+            pred += s->dsp.scalarproduct_and_madd_int16(s->cdlms[ch][ilms].coefs,
+                                                        s->cdlms[ch][ilms].lms_prevvalues
+                                                            + s->cdlms[ch][ilms].recent,
+                                                        s->cdlms[ch][ilms].lms_updates
+                                                            + s->cdlms[ch][ilms].recent,
+                                                        s->cdlms[ch][ilms].order,
+                                                        WMASIGN(residue));
             input = residue + (pred >> s->cdlms[ch][ilms].scaling);
-            lms_update(s, ch, ilms, input, residue);
+            lms_update(s, ch, ilms, input);
             s->channel_residues[ch][icoef] = input;
         }
     }
-- 
1.8.0.msysgit.0

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to