On Tue, Jan 24, 2012 at 4:55 AM, Mans Rullgard <[email protected]> wrote:
> This prepares for assembly optimisations by moving the most
> time-consuming loops to functions called through pointers
> in a new context.
>
> Signed-off-by: Mans Rullgard <[email protected]>
> ---
> libavcodec/Makefile | 3 +-
> libavcodec/aacsbr.c | 183 ++++++++++---------------------------
> libavcodec/aacsbrdata.h | 2 +-
> libavcodec/sbr.h | 2 +
> libavcodec/sbrdsp.c | 237
> +++++++++++++++++++++++++++++++++++++++++++++++
> libavcodec/sbrdsp.h | 47 +++++++++
> 6 files changed, 337 insertions(+), 137 deletions(-)
> create mode 100644 libavcodec/sbrdsp.c
> create mode 100644 libavcodec/sbrdsp.h
> av_cold void ff_aac_sbr_ctx_close(SpectralBandReplication *sbr)
> @@ -1139,33 +1141,21 @@ static void sbr_dequant(SpectralBandReplication *sbr,
> int id_aac)
> * @param x pointer to the beginning of the first sample window
> * @param W array of complex-valued samples split into subbands
> */
> -static void sbr_qmf_analysis(DSPContext *dsp, FFTContext *mdct, const float
> *in, float *x,
> +static void sbr_qmf_analysis(DSPContext *dsp, FFTContext *mdct,
> + SBRDSPContext *sbrdsp, const float *in, float
> *x,
> float z[320], float W[2][32][32][2])
> {
> - int i, k;
> + int i;
> memcpy(W[0], W[1], sizeof(W[0]));
> memcpy(x , x+1024, (320-32)*sizeof(x[0]));
> memcpy(x+288, in, 1024*sizeof(x[0]));
> for (i = 0; i < 32; i++) { // numTimeSlots*RATE = 16*2 as 960 sample
> frames
> // are not supported
> dsp->vector_fmul_reverse(z, sbr_qmf_window_ds, x, 320);
> - for (k = 0; k < 64; k++) {
> - float f = z[k] + z[k + 64] + z[k + 128] + z[k + 192] + z[k +
> 256];
> - z[k] = f;
> - }
> - //Shuffle to IMDCT
> - z[64] = z[0];
> - for (k = 1; k < 32; k++) {
> - z[64+2*k-1] = z[ k];
> - z[64+2*k ] = -z[64-k];
> - }
> - z[64+63] = z[32];
> -
> + sbrdsp->sum64x5(z);
> + sbrdsp->qmf_pre_shuffle(z);
> mdct->imdct_half(mdct, z, z+64);
> - for (k = 0; k < 32; k++) {
> - W[1][i][k][0] = -z[63-k];
> - W[1][i][k][1] = z[k];
> - }
> + sbrdsp->qmf_post_shuffle(W[1][i], z);
> x += 32;
> }
> }
> @@ -1175,6 +1165,7 @@ static void sbr_qmf_analysis(DSPContext *dsp,
> FFTContext *mdct, const float *in,
> * (14496-3 sp04 p206)
> */
> static void sbr_qmf_synthesis(DSPContext *dsp, FFTContext *mdct,
> + SBRDSPContext *sbrdsp,
> float *out, float X[2][38][64],
> float mdct_buf[2][64],
> float *v0, int *v_off, const unsigned int div)
> @@ -1198,20 +1189,12 @@ static void sbr_qmf_synthesis(DSPContext *dsp,
> FFTContext *mdct,
> X[0][i][32+n] = X[1][i][31-n];
> }
> mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
> - for (n = 0; n < 32; n++) {
> - v[ n] = mdct_buf[0][63 - 2*n];
> - v[63 - n] = -mdct_buf[0][62 - 2*n];
> - }
> + sbrdsp->qmf_deint_neg(v, mdct_buf[0]);
> } else {
> - for (n = 1; n < 64; n+=2) {
> - X[1][i][n] = -X[1][i][n];
> - }
> + sbrdsp->neg_odd_64(X[1][i]);
> mdct->imdct_half(mdct, mdct_buf[0], X[0][i]);
> mdct->imdct_half(mdct, mdct_buf[1], X[1][i]);
> - for (n = 0; n < 64; n++) {
> - v[ n] = -mdct_buf[0][63 - n] + mdct_buf[1][ n ];
> - v[127 - n] = mdct_buf[0][63 - n] + mdct_buf[1][ n ];
> - }
> + sbrdsp->qmf_deint_bfly(v, mdct_buf[1], mdct_buf[0]);
> }
> dsp->vector_fmul_add(out, v , sbr_qmf_window
> , zero64, 64 >> div);
> dsp->vector_fmul_add(out, v + ( 192 >> div), sbr_qmf_window + ( 64 >>
> div), out , 64 >> div);
I think these QMFs can be performed with substantially less
computation (smaller transforms with a similar amount of
pre-/post-processing). I can send the relevant paper to anyone who is
interested (yes, I know papers are a dime a dozen, but this particular
paper seems promising).
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel