On Sun, Feb 09, 2014 at 11:21:33PM +0100, Christophe Gisquet wrote: > Hi > > since then, it appears it is simpler to extend the buffer, and > whenever the order change, pad the end of the coeff buffer with 0. > This way, no need to handle the tail of the loop. > > As the case where an unusual order is encountered (it should still be > handled fine), a sample is requested. > > The changes pass fate-lossless-wma for win32 and win64 and for > CPUFLAGS in mmx/sse2/sse4.2
The approach looks good to me, implementation has some issues though. > -- > Christophe > From ce54b92673f2c57e50f664036d25a93db94746bc Mon Sep 17 00:00:00 2001 > From: Christophe Gisquet <[email protected]> > Date: Sat, 24 Nov 2012 15:55:49 +0100 > Subject: [PATCH 2/2] wma lossless: reuse scalarproduct_and_madd_int16 > > This is done by padding the coefficient buffer with 0s, because the order > may be only a multiple of 4, and the DSP function requires batches of 8. > > However, no sample with such a case was found, so request one if it uses > that kind of order. > > Approximate relative speedup depending on instruction set: > plain C: -6% > mmxext: 51% > sse2: 54% > --- > libavcodec/wmalosslessdec.c | 62 > +++++++++++++++++++++++---------------------- > 1 file changed, 32 insertions(+), 30 deletions(-) > > diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c > index 2f341c0..4035092 100644 > --- a/libavcodec/wmalosslessdec.c > +++ b/libavcodec/wmalosslessdec.c > @@ -29,6 +29,7 @@ > #include "internal.h" > #include "get_bits.h" > #include "put_bits.h" > +#include "dsputil.h" > #include "wma.h" > #include "wma_common.h" > > @@ -44,6 +45,7 @@ > #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS) ///< maximum > block size > #define WMALL_BLOCK_SIZES (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + > 1) ///< possible block sizes > > +#define WMALL_COEFF_PAD_SIZE 16 ///< pad coef > buffers with 0 for use with SIMD functions > > /** > * @brief frame-specific decoder context for a single channel > @@ -66,6 +68,7 @@ typedef struct { > typedef struct WmallDecodeCtx { > /* generic decoder variables */ > AVCodecContext *avctx; > + DSPContext dsp; ///< accelerated DSP > functions > AVFrame *frame; > uint8_t frame_data[MAX_FRAMESIZE + > FF_INPUT_BUFFER_PADDING_SIZE]; ///< compressed frame data > PutBitContext pb; ///< context for filling > the frame_data buffer > @@ -141,9 +144,9 @@ typedef struct WmallDecodeCtx { > int scaling; > int coefsend; > int bitsend; > - int16_t coefs[MAX_ORDER]; > - int16_t lms_prevvalues[MAX_ORDER * 2]; > - int16_t lms_updates[MAX_ORDER * 2]; > + DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER]; > + DECLARE_ALIGNED(16, int16_t, lms_prevvalues)[MAX_ORDER * 2]; > + DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2]; > int recent; > } cdlms[2][9]; > > @@ -179,6 +182,7 @@ static av_cold int decode_init(AVCodecContext *avctx) > int i, log2_max_num_subframes; > > s->avctx = avctx; > + ff_dsputil_init(&s->dsp, avctx); > init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE); > > if (avctx->extradata_size >= 18) { > @@ -452,6 +456,13 @@ static int decode_cdlms(WmallDecodeCtx *s) > s->cdlms[0][0].order = 0; > return AVERROR_INVALIDDATA; > } > + if(s->cdlms[c][i].order & 8) { > + static int warned; > + if(!warned) > + avpriv_request_sample(s->avctx, "CDLMS of order %d", > + s->cdlms[c][i].order); > + warned = 1; > + } & 7 maybe ? > } > > for (i = 0; i < s->cdlms_ttl[c]; i++) > @@ -477,6 +488,12 @@ static int decode_cdlms(WmallDecodeCtx *s) > (get_bits(&s->gb, s->cdlms[c][i].bitsend) << > shift_l) >> shift_r; > } > } > + > + for (i = 0; i < s->cdlms_ttl[c]; i++) { > + memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order, 0, > + (s->cdlms[c][i].order & (WMALL_COEFF_PAD_SIZE-1)) wouldn't that be too much on large orders (e.g. order = 15)? I'd simply zero to the end of coefs array. > + * sizeof(s->cdlms[c][i].coefs)); not sizeof(s->cdlms[c][i].coefs[0]) ? > + } > } > > return 0; the rest maybe OK _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
