On Tue, Jan 24, 2012 at 4:55 AM, Mans Rullgard <[email protected]> wrote:
> This prepares for assembly optimisations by moving the most
> time-consuming loops to functions called through pointers
> in a new context.
>
> Signed-off-by: Mans Rullgard <[email protected]>
> ---
>  libavcodec/Makefile     |    3 +-
>  libavcodec/aacsbr.c     |  183 ++++++++++---------------------------
>  libavcodec/aacsbrdata.h |    2 +-
>  libavcodec/sbr.h        |    2 +
>  libavcodec/sbrdsp.c     |  237 +++++++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/sbrdsp.h     |   47 +++++++++
>  6 files changed, 337 insertions(+), 137 deletions(-)
>  create mode 100644 libavcodec/sbrdsp.c
>  create mode 100644 libavcodec/sbrdsp.h

>  av_cold void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr)
> @@ -1139,33 +1141,21 @@ static void sbr_dequant(SpectralBandReplication *sbr, 
> int id_aac)
>  * @param   x       pointer to the beginning of the first sample window
>  * @param   W       array of complex-valued samples split into subbands
>  */
> -static void sbr_qmf_analysis(DSPContext *dsp, FFTContext *mdct, const float 
> *in, float *x,
> +static void sbr_qmf_analysis(DSPContext *dsp, FFTContext *mdct,
> +                             SBRDSPContext *sbrdsp, const float *in, float 
> *x,
>                              float z[320], float W[2][32][32][2])
>  {
> -    int i, k;
> +    int i;
>     memcpy(W[0], W[1], sizeof(W[0]));
>     memcpy(x    , x+1024, (320-32)*sizeof(x[0]));
>     memcpy(x+288, in,         1024*sizeof(x[0]));
>     for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample 
> frames
>                                // are not supported
>         dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
> -        for (k = 0; k < 64; k++) {
> -            float f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k + 
> 256];
> -            z[k] = f;
> -        }
> -        //Shuffle to IMDCT
> -        z[64] = z[0];
> -        for (k = 1; k < 32; k++) {
> -            z[64+2*k-1] =  z[   k];
> -            z[64+2*k  ] = -z[64-k];
> -        }
> -        z[64+63] = z[32];
> -
> +        sbrdsp->sum64x5(z);
> +        sbrdsp->qmf_pre_shuffle(z);
>         mdct->imdct_half(mdct, z, z+64);
> -        for (k = 0; k < 32; k++) {
> -            W[1][i][k][0] = -z[63-k];
> -            W[1][i][k][1] = z[k];
> -        }
> +        sbrdsp->qmf_post_shuffle(W[1][i], z);
>         x += 32;
>     }
>  }
> @@ -1175,6 +1165,7 @@ static void sbr_qmf_analysis(DSPContext *dsp, 
> FFTContext *mdct, const float *in,
>  * (14496-3 sp04 p206)
>  */
>  static void sbr_qmf_synthesis(DSPContext *dsp, FFTContext *mdct,
> +                              SBRDSPContext *sbrdsp,
>                               float *out, float X[2][38][64],
>                               float mdct_buf[2][64],
>                               float *v0, int *v_off, const unsigned int div)
> @@ -1198,20 +1189,12 @@ static void sbr_qmf_synthesis(DSPContext *dsp, 
> FFTContext *mdct,
>                 X[0][i][32+n] =  X[1][i][31-n];
>             }
>             mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
> -            for (n = 0; n < 32; n++) {
> -                v[     n] =  mdct_buf[0][63 - 2*n];
> -                v[63 - n] = -mdct_buf[0][62 - 2*n];
> -            }
> +            sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
>         } else {
> -            for (n = 1; n < 64; n+=2) {
> -                X[1][i][n] = -X[1][i][n];
> -            }
> +            sbrdsp->neg_odd_64(X[1][i]);
>             mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
>             mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
> -            for (n = 0; n < 64; n++) {
> -                v[      n] = -mdct_buf[0][63 -   n] + mdct_buf[1][  n    ];
> -                v[127 - n] =  mdct_buf[0][63 -   n] + mdct_buf[1][  n    ];
> -            }
> +            sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
>         }
>         dsp->vector_fmul_add(out, v                , sbr_qmf_window           
>     , zero64, 64 >> div);
>         dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >> 
> div), out   , 64 >> div);

I think these QMFs can be computed with substantially less work
(smaller transforms with a similar amount of pre-/post-processing).
I can send the relevant paper to anyone interested (yes, I know
papers are a dime a dozen, but this particular one seems
promising).
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to