> From: "Mark Taylor" <[EMAIL PROTECTED]>
>
> The code enabled by
>
> #define USE_GNUC_ASM
>
> is currently broken. Takehiro was trying some improvements,
> (Takehiro, maybe you could revert CVS back to the original until
> these are working? )
I got Takehiro's code working (the FP stack was out of order), but it seems
to be slightly slower than the original. Here's the fixed code anyway - this
is the main loop in quantize_xrpow:
{
    /*
     * Main loop of quantize_xrpow: 576 values, four per iteration
     * (GCC extended asm, i386 x87, AT&T syntax).
     *
     * For each lane k = 0..3 of an iteration:
     *     t     = xr[k] * istep
     *     ix[k] = (int) t                      (fistl, current x87 rounding)
     *     ix[k] = (int) (t + adj43asm[ix[k]])  (fistpl)
     *
     * xr and adj43asm elements are F8size bytes wide (F8type is the matching
     * mnemonic suffix); ix elements are 4 bytes wide.
     *
     * Stack comments list st(0) first: digits are lanes, "i" is istep.
     * istep enters in st(0) and is still there, alone, when the loop exits.
     *
     * Changes relative to the previous version of this block:
     *  - xr, ix and the counter are advanced by the asm, so they are now
     *    read-write ("+r") operands on local copies; GCC does not allow an
     *    input-only operand to be modified.  The caller's xr and ix keep
     *    their values.
     *  - The loop label is the local numeric label "1" instead of the
     *    global "loop1", which would be defined twice if this asm were
     *    ever emitted more than once in one object file.
     *  - %edx replaces %ebx as the second scratch register: %ebx is
     *    callee-saved and is the i386 PIC base register.
     *
     * NOTE(review): the loop pushes four temporaries on top of istep; confirm
     * that the surrounding code leaves that many x87 slots free here.
     */
    __typeof__(&xr[0]) xr_p = xr;      /* running source pointer      */
    __typeof__(&ix[0]) ix_p = ix;      /* running destination pointer */
    int blocks = 576 / 4;              /* iterations left             */

    __asm__ __volatile__(
        "\n\n1:\n\t"
        /* t[k] = xr[k] * istep */
        "fld" F8type " 0*" F8size "(%[xr])\n\t"            // 0 i
        "fmul %%st(1)\n\t"
        "fld" F8type " 1*" F8size "(%[xr])\n\t"            // 1 0 i
        "fmul %%st(2)\n\t"
        "fld" F8type " 2*" F8size "(%[xr])\n\t"            // 2 1 0 i
        "fmul %%st(3)\n\t"
        "fld" F8type " 3*" F8size "(%[xr])\n\t"            // 3 2 1 0 i
        "fmul %%st(4)\n\t"
        /* first pass: ix[k] = (int) t[k], used below as table index */
        "fxch %%st(3)\n\t"                                 // 0 2 1 3 i
        "fistl (%[ix])\n\t"
        "fxch %%st(2)\n\t"                                 // 1 2 0 3 i
        "fistl 4(%[ix])\n\t"
        "fxch %%st(1)\n\t"                                 // 2 1 0 3 i
        "fistl 8(%[ix])\n\t"
        "fxch %%st(3)\n\t"                                 // 3 1 0 2 i
        "fistl 12(%[ix])\n\t"
        /* advance both pointers; the four ints are now at -16..-4(ix) */
        "addl $4*" F8size ", %[xr]\n\t"
        "addl $16, %[ix]\n\t"
        /* dec sets ZF for the jnz at the bottom; the mov and x87
           instructions in between do not modify EFLAGS */
        "dec %[n]\n\t"
        /* t[k] += adj43asm[ix[k]] */
        "movl -16(%[ix]), %%eax\n\t"
        "movl -12(%[ix]), %%edx\n\t"
        "fxch %%st(2)\n\t"                                 // 0 1 3 2 i
        "fadd" F8type " (%[adj],%%eax," F8size ")\n\t"
        "fxch %%st(1)\n\t"                                 // 1 0 3 2 i
        "fadd" F8type " (%[adj],%%edx," F8size ")\n\t"
        "movl -8(%[ix]), %%eax\n\t"
        "movl -4(%[ix]), %%edx\n\t"
        "fxch %%st(3)\n\t"                                 // 2 0 3 1 i
        "fadd" F8type " (%[adj],%%eax," F8size ")\n\t"
        "fxch %%st(2)\n\t"                                 // 3 0 2 1 i
        "fadd" F8type " (%[adj],%%edx," F8size ")\n\t"
        /* second pass: store the adjusted values and pop the temporaries */
        "fxch %%st(1)\n\t"                                 // 0 3 2 1 i
        "fistpl -16(%[ix])\n\t"                            // 3 2 1 i
        "fxch %%st(2)\n\t"                                 // 1 2 3 i
        "fistpl -12(%[ix])\n\t"                            // 2 3 i
        "fistpl -8(%[ix])\n\t"                             // 3 i
        "fistpl -4(%[ix])\n\t"                             // i
        "jnz 1b\n\n"
        : [xr] "+r" (xr_p), [ix] "+r" (ix_p), [n] "+r" (blocks)
        : "t" (istep), [adj] "r" (adj43asm)
        : "%eax", "%edx", "memory", "cc"
    );
}
-- Mat.
--
MP3 ENCODER mailing list ( http://geek.rcc.se/mp3encoder/ )