Re: [libav-devel] [PATCHES] scalarproduct_and_madd_int16 and wma lossless

Christophe Gisquet Wed, 12 Feb 2014 06:45:42 -0800

Hi,

2014-02-10 9:03 GMT+01:00 Kostya Shishkov <[email protected]>:
>> What you suggest is still valid, and would catch things even if they
>> changed, though.
>
> Maybe it's not worth warning about it at all.


Well, the format allows 8N coefficients with N>=1, so a file could
theoretically use an order that is not a multiple of 16. Whether such a
file is crafted to crash the decoder is another issue. And I was
following Luca's recommendation, which makes sense to me.


> I'd simply zero all coeffs before decoding but that's me.

I understand that this means zeroing, as you mentioned, up to the end of
the buffer. But that means zeroing up to (256-8) excess coefficients.
Not a big deal for an infrequently called function, but still.

I took an intermediate, still safe solution: I allocate extra space
in the coeff buffer and always zero as many entries as the DSP
implementation could overread, whatever the order.

-- 
Christophe

From e9b05761b196eb81d7a3536ccabaaeb523adec37 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <[email protected]>
Date: Sat, 24 Nov 2012 15:55:49 +0100
Subject: [PATCH 2/2] wma lossless: reuse scalarproduct_and_madd_int16

This is done by padding the coefficient buffers with 0s: since the DSP
function processes batches of 16 elements, the padding guarantees that
it will neither overread nor generate invalid results.

However, no sample whose order is not a multiple of 16 is known, so
request one if such an order is encountered.

Approximate relative speedup depending on instruction set:
plain C: -6%
mmxext:  51%
sse2:    54%
---
 libavcodec/wmalosslessdec.c | 61 +++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c
index 2f341c0..0280edf 100644
--- a/libavcodec/wmalosslessdec.c
+++ b/libavcodec/wmalosslessdec.c
@@ -29,6 +29,7 @@
 #include "internal.h"
 #include "get_bits.h"
 #include "put_bits.h"
+#include "dsputil.h"
 #include "wma.h"
 #include "wma_common.h"
 
@@ -44,6 +45,7 @@
 #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS)    ///< maximum block size
 #define WMALL_BLOCK_SIZES    (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes
 
+#define WMALL_COEFF_PAD_SIZE   16                       ///< pad coef buffers with 0 for use with SIMD functions
 
 /**
  * @brief frame-specific decoder context for a single channel
@@ -66,6 +68,7 @@ typedef struct {
 typedef struct WmallDecodeCtx {
     /* generic decoder variables */
     AVCodecContext  *avctx;
+    DSPContext      dsp;                           ///< accelerated DSP functions
     AVFrame         *frame;
     uint8_t         frame_data[MAX_FRAMESIZE + FF_INPUT_BUFFER_PADDING_SIZE];  ///< compressed frame data
     PutBitContext   pb;                             ///< context for filling the frame_data buffer
@@ -141,9 +144,9 @@ typedef struct WmallDecodeCtx {
         int scaling;
         int coefsend;
         int bitsend;
-        int16_t coefs[MAX_ORDER];
-        int16_t lms_prevvalues[MAX_ORDER * 2];
-        int16_t lms_updates[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER + WMALL_COEFF_PAD_SIZE];
+        DECLARE_ALIGNED(16, int16_t, lms_prevvalues)[MAX_ORDER * 2];
+        DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2];
         int recent;
     } cdlms[2][9];
 
@@ -179,6 +182,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
     int i, log2_max_num_subframes;
 
     s->avctx = avctx;
+    ff_dsputil_init(&s->dsp, avctx);
     init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
 
     if (avctx->extradata_size >= 18) {
@@ -452,6 +456,13 @@ static int decode_cdlms(WmallDecodeCtx *s)
                 s->cdlms[0][0].order = 0;
                 return AVERROR_INVALIDDATA;
             }
+            if (s->cdlms[c][i].order & (WMALL_COEFF_PAD_SIZE-1)) {
+                static int warned;
+                if (!warned)
+                    avpriv_request_sample(s->avctx, "CDLMS of order %d",
+                                          s->cdlms[c][i].order);
+                warned = 1;
+            }
         }
 
         for (i = 0; i < s->cdlms_ttl[c]; i++)
@@ -477,6 +488,11 @@ static int decode_cdlms(WmallDecodeCtx *s)
                         (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
             }
         }
+
+        // zero the padding after each filter's coefs (SIMD may read past order)
+        for (i = 0; i < s->cdlms_ttl[c]; i++)
+            memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order, 0,
+                   WMALL_COEFF_PAD_SIZE * sizeof(s->cdlms[c][i].coefs[0]));
     }
 
     return 0;
@@ -686,35 +702,11 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
     }
 }
 
-static int lms_predict(WmallDecodeCtx *s, int ich, int ilms)
-{
-    int pred = 0, icoef;
-    int recent = s->cdlms[ich][ilms].recent;
-
-    for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-        pred += s->cdlms[ich][ilms].coefs[icoef] *
-                s->cdlms[ich][ilms].lms_prevvalues[icoef + recent];
-
-    return pred;
-}
-
-static void lms_update(WmallDecodeCtx *s, int ich, int ilms,
-                       int input, int residue)
+static void lms_update(WmallDecodeCtx *s, int ich, int ilms, int input)
 {
-    int icoef;
     int recent = s->cdlms[ich][ilms].recent;
     int range  = 1 << s->bits_per_sample - 1;
 
-    if (residue < 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] -=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    } else if (residue > 0) {
-        for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
-            s->cdlms[ich][ilms].coefs[icoef] +=
-                s->cdlms[ich][ilms].lms_updates[icoef + recent];
-    }
-
     if (recent)
         recent--;
     else {
@@ -775,6 +767,9 @@ static void use_normal_update_speed(WmallDecodeCtx *s, int ich)
     s->update_speed[ich] = 8;
 }
 
+/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */
+#define WMASIGN(x) ((x > 0) - (x < 0))
+
 static void revert_cdlms(WmallDecodeCtx *s, int ch,
                          int coef_begin, int coef_end)
 {
@@ -785,9 +780,15 @@ static void revert_cdlms(WmallDecodeCtx *s, int ch,
         for (icoef = coef_begin; icoef < coef_end; icoef++) {
             pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
             residue = s->channel_residues[ch][icoef];
-            pred += lms_predict(s, ch, ilms);
+            pred += s->dsp.scalarproduct_and_madd_int16(s->cdlms[ch][ilms].coefs,
+                                                        s->cdlms[ch][ilms].lms_prevvalues
+                                                            + s->cdlms[ch][ilms].recent,
+                                                        s->cdlms[ch][ilms].lms_updates
+                                                            + s->cdlms[ch][ilms].recent,
+                                                        s->cdlms[ch][ilms].order,
+                                                        WMASIGN(residue));
             input = residue + (pred >> s->cdlms[ch][ilms].scaling);
-            lms_update(s, ch, ilms, input, residue);
+            lms_update(s, ch, ilms, input);
             s->channel_residues[ch][icoef] = input;
         }
     }
-- 
1.8.0.msysgit.0

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCHES] scalarproduct_and_madd_int16 and wma lossless

Reply via email to