Hi all,
        I never get around to optimizing the bit I'm working on, so I optimized
quantize_xrpow instead of finalising the mdct thing.

        This is a speedup for quantize_xrpow that is sort of taken from my old
quantize() speedup [see lame3.11 or previous].
        This will only speed up quantize_xrpow if you aren't on x86 using the
asm. (the x86 asm loop still beats this).  It's about 25% faster than before.

        Basically, about 75% of the time, xr and istep will give ix=0, so just
do a little check to see if it does, and then just set ix=0 without actually
calculating it.
        Depending on cpu/fpu architecture, it may be worthwhile doing more
comparisons.  Since this is the 1st or 2nd most intensive CPU routine in LAME
I'd suggest that those on non-x86 architectures add an extra compareval and see
if it's productive.

later
mike

---------------

/*
 * quantize_xrpow: turn the 576 values of xr[] into the integers ix[].
 *
 * The quantization works on xr^(3/4) instead of xr, i.e. the caller is
 * expected to pass the already-powered values in xr[].  Every output is
 *
 *     ix[i] = (int)( istep * xr[i] + 0.4054 )
 *
 * with istep = 2^(-0.1875 * cod_info->quantizerStepSize).
 *
 * When cod_info->block_type == 2 the values are handled in groups of
 * three, and the k-th value of each group uses istep additionally scaled
 * by 2^(1.5 * cod_info->subblock_gain[k]).
 *
 * With GCC on i386 the conversion is done by the x87 "fistpl" instruction.
 * It converts st(0) using the FPU's current rounding mode, so the code
 * subtracts 0.0946 (= 0.5 - 0.4054) first; this is meant to match the
 * truncating C expression above, assuming the FPU is left in its default
 * round-to-nearest mode.  The "st" clobber tells GCC that the input
 * operand is popped by the instruction.  __asm__ is used rather than asm
 * so the code also builds with -ansi / -std=c99 and later.
 *
 * xr  : input values, 576 entries (read only)
 * ix  : output, 576 entries
 * cod_info : supplies quantizerStepSize, block_type and subblock_gain[]
 */
void quantize_xrpow( double xr[576], int ix[576], gr_info *cod_info )
{
  int i;
  double quantizerStepSize = cod_info->quantizerStepSize;
  double istep_l = pow ( 2.0, quantizerStepSize * -0.1875 );

  if (cod_info->block_type == 2)
    {
      /* one inverse step per position inside each group of three */
      double istep0 = istep_l * pow(2.0, 1.5 * (double) cod_info->subblock_gain[0]);
      double istep1 = istep_l * pow(2.0, 1.5 * (double) cod_info->subblock_gain[1]);
      double istep2 = istep_l * pow(2.0, 1.5 * (double) cod_info->subblock_gain[2]);

      /* 192 groups of 3 values = 576 values */
      for (i = 0; i < 576; i += 3)
        {
#if defined(__GNUC__) && defined(__i386__)
          __asm__ ("fistpl %0 " : "=m"(ix[i])
                   : "t"(istep0 * xr[i] - 0.0946) : "st");
          __asm__ ("fistpl %0 " : "=m"(ix[i + 1])
                   : "t"(istep1 * xr[i + 1] - 0.0946) : "st");
          __asm__ ("fistpl %0 " : "=m"(ix[i + 2])
                   : "t"(istep2 * xr[i + 2] - 0.0946) : "st");
#else
          ix[i]     = (int)( istep0 * xr[i]     + 0.4054 );
          ix[i + 1] = (int)( istep1 * xr[i + 1] + 0.4054 );
          ix[i + 2] = (int)( istep2 * xr[i + 2] + 0.4054 );
#endif
        }
    }
  else
    {
#if defined(__GNUC__) && defined(__i386__)
      for (i = 0; i < 576; i++)
        __asm__ ("fistpl %0 " : "=m"(ix[i])
                 : "t"(istep_l * xr[i] - 0.0946) : "st");
#else
      /* Any xr below this threshold satisfies istep_l*xr + 0.4054 < 1.0,
         so it quantizes to 0 and the multiply/convert can be skipped. */
      double compareval0 = (1.0 - 0.4054) / istep_l;

      /* Depending on architecture, it may be worth calculating a few more
         comparevals, e.g.
              compareval1 = (2.0 - 0.4054) / istep_l;
              ... and then, after the first compare has failed ...
              if (compareval1 > xr[i]) ix[i] = 1;
         On a Pentium 166 it is only worth doing the one compare (as done
         here), because the second compare becomes more expensive than
         just calculating the value.  Architectures with slow FP
         operations may want to add some more comparevals.
         Statistically speaking,
              73% of all xr*istep_l values give ix=0
              16% will give 1
               4% will give 2
      */
      for (i = 0; i < 576; i++)
        {
          if (compareval0 > xr[i])
            ix[i] = 0;
          else
            ix[i] = (int)( istep_l * xr[i] + 0.4054 );
        }
#endif
    }
}

--
MP3 ENCODER mailing list ( http://geek.rcc.se/mp3encoder/ )

Reply via email to