On 10/17/2016 6:24 PM, Rostislav Pehlivanov wrote: > diff --git a/libavcodec/aacenc_utils.h b/libavcodec/aacenc_utils.h > index ff9188a..f5cf77d 100644 > --- a/libavcodec/aacenc_utils.h > +++ b/libavcodec/aacenc_utils.h > @@ -37,7 +37,7 @@ > #define ROUND_TO_ZERO 0.1054f > #define C_QUANT 0.4054f > > -static inline void abs_pow34_v(float *out, const float *in, const int size) > +static inline void abs_pow34_v(float *out, const float *in, const int64_t > size)
Why int64_t? There's no need for values that big... > { > int i; > for (i = 0; i < size; i++) { ...And it's certainly not correct, seeing this for loop here. > diff --git a/libavcodec/x86/aacencdsp.asm b/libavcodec/x86/aacencdsp.asm > new file mode 100644 > index 0000000..ff4019f > --- /dev/null > +++ b/libavcodec/x86/aacencdsp.asm > @@ -0,0 +1,87 @@ > +;****************************************************************************** > +;* SIMD optimized AAC encoder DSP functions > +;* > +;* Copyright (C) 2016 Rostislav Pehlivanov <atomnu...@gmail.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +float_abs_mask: times 4 dd 0x7fffffff > + > +SECTION .text > + > +;******************************************************************* > +;void ff_abs_pow34_sse(float *out, const float *in, const int64_t size); > +;******************************************************************* > +INIT_XMM sse > +cglobal abs_pow34, 3, 3, 3, out, in, size > + mova m2, [float_abs_mask] > + shl sizeq, 2 > + add inq, sizeq > + add outq, sizeq > + neg sizeq > +.loop: > + mova m0, [inq+sizeq] > + andps m0, m2 > + sqrtps m1, m0 > + mulps m0, m1 > + sqrtps m0, m0 > + mova [outq+sizeq], m0 > + add sizeq, mmsize > + jl .loop > + RET > + > +;******************************************************************* > +;void ff_aac_quantize_bands_sse2(int *out, const float *in, const float > *scaled, > +; int size, int is_signed, int maxval, const > float Q34, > +; const float rounding) > +;******************************************************************* > +INIT_XMM sse2 > +cglobal aac_quantize_bands, 6, 6, 6, out, in, scaled, size, is_signed, > maxval, Q34, rounding 5, 5, 6 No need to load maxval into a gpr on x86_32 and win64 when cvtsi2ss can also load it directly from memory. > +%if UNIX64 == 0 > + movss m0, Q34m > + movss m1, roundingm > +%endif > + SPLATD m0 > + SPLATD m1 shufps m0, m0, 0 shufps m1, m1, 0 On sse2, SPLATD will expand to pshufd, so better stay in float domain. If you add an AVX/FMA3 version of this function, you could instead use vbroadcastss in them, and save yourself the movss. > + cvtsi2ss m3, maxvald > + SPLATD m3 %if UNIX64 cvtsi2ss m3, maxvald %else cvtsi2ss m3, dword maxvalm %endif shufps m3, m3, 0 After you made the function 5, 5, 6. > + shl is_signedd, 31 > + movd m4, is_signedd > + SPLATD m4 Use shufps here since the instruction using this register in the loop should be a float one. > + shl sizeq, 2 Even if it doesn't come from stack on win64, better use "sized" anyway. > + add inq, sizeq > + add outq, sizeq > + add scaledq, sizeq > + neg sizeq > +.loop: > + mova m2, [scaledq+sizeq] > + mulps m2, m0 mulps m2, m0, [scaledq+sizeq] > + addps m2, m1 You could combine the mulps and addps into a single fmaddps if you add an FMA3 version. Something like movaps m2, [scaledq+sizeq] fmaddps m2, m2, m0, m1 The movaps is needed because, unlike FMA4's fmaddps which is non-destructive, FMA3 needs dst to be the same as one of the three src registers. > + minps m2, m3 > + mova m5, [inq+sizeq] movaps m5, [inq+sizeq] > + pand m5, m4 andps m5, m4 > + orps m2, m5 > + cvttps2dq m2, m2 > + mova [outq+sizeq], m2 > + add sizeq, mmsize > + jl .loop > + RET Can't you make these function also process eight floats per loop? YMM versions would be nice, seeing how much these functions affect overall encoding speed. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel