On Sat, Jul 28, 2012 at 8:55 PM, Loren Merritt <[email protected]> wrote: > 13% faster on penryn, 16% on sandybridge, 15% on bulldozer > Not simd; a compiler should have generated this, but gcc didn't. > --- > libavfilter/vf_hqdn3d.c | 27 ++++++++++-- > libavfilter/x86/Makefile | 1 + > libavfilter/x86/hqdn3d.asm | 106 > ++++++++++++++++++++++++++++++++++++++++++++ > libavutil/x86/x86inc.asm | 1 + > 4 files changed, 131 insertions(+), 4 deletions(-) > create mode 100644 libavfilter/x86/hqdn3d.asm > > diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c > index d263cff..ef59691 100644 > --- a/libavfilter/vf_hqdn3d.c > +++ b/libavfilter/vf_hqdn3d.c > @@ -40,8 +40,14 @@ typedef struct { > double strength[4]; > int hsub, vsub; > int depth; > + void (*denoise_row[17])(uint8_t *src, uint8_t *dst, uint16_t *line_ant, > uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); > } HQDN3DContext; > > +void ff_hqdn3d_row_8_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, > uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); > +void ff_hqdn3d_row_9_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, > uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); > +void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, > uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); > +void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, > uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal); > + > #define LUT_BITS (depth==16 ? 8 : 4) > #define RIGHTSHIFT(a,b) (((a)+(((1<<(b))-1)>>1))>>(b)) > #define LOAD(x) ((depth==8 ? 
src[x] : AV_RN16A(src+(x)*2)) << (16-depth)) > @@ -78,7 +84,8 @@ static void denoise_temporal(uint8_t *src, uint8_t *dst, > } > > av_always_inline > -static void denoise_spatial(uint8_t *src, uint8_t *dst, > +static void denoise_spatial(HQDN3DContext *hqdn3d, > + uint8_t *src, uint8_t *dst, > uint16_t *line_ant, uint16_t *frame_ant, > int w, int h, int sstride, int dstride, > int16_t *spatial, int16_t *temporal, int depth) > @@ -103,6 +110,10 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst, > src += sstride; > dst += dstride; > frame_ant += w; > + if (hqdn3d->denoise_row[depth]) { > + hqdn3d->denoise_row[depth](src, dst, line_ant, frame_ant, w, > spatial, temporal); > + continue; > + } > pixel_ant = LOAD(0); > for (x = 0; x < w-1; x++) { > line_ant[x] = tmp = lowpass(line_ant[x], pixel_ant, spatial, > depth); > @@ -117,7 +128,8 @@ static void denoise_spatial(uint8_t *src, uint8_t *dst, > } > > av_always_inline > -static void denoise_depth(uint8_t *src, uint8_t *dst, > +static void denoise_depth(HQDN3DContext *hqdn3d, > + uint8_t *src, uint8_t *dst, > uint16_t *line_ant, uint16_t **frame_ant_ptr, > int w, int h, int sstride, int dstride, > int16_t *spatial, int16_t *temporal, int depth) > @@ -137,7 +149,7 @@ static void denoise_depth(uint8_t *src, uint8_t *dst, > } > > if (spatial[0]) > - denoise_spatial(src, dst, line_ant, frame_ant, > + denoise_spatial(hqdn3d, src, dst, line_ant, frame_ant, > w, h, sstride, dstride, spatial, temporal, depth); > else > denoise_temporal(src, dst, frame_ant, > @@ -297,6 +309,13 @@ static int config_input(AVFilterLink *inlink) > } > } > > +#if HAVE_YASM > + hqdn3d->denoise_row[ 8] = ff_hqdn3d_row_8_x86; > + hqdn3d->denoise_row[ 9] = ff_hqdn3d_row_9_x86; > + hqdn3d->denoise_row[10] = ff_hqdn3d_row_10_x86; > + hqdn3d->denoise_row[16] = ff_hqdn3d_row_16_x86; > +#endif > + > return 0; > } > > @@ -314,7 +333,7 @@ static int end_frame(AVFilterLink *inlink) > int ret, c; > > for (c = 0; c < 3; c++) { > - denoise(inpic->data[c], 
outpic->data[c], > + denoise(hqdn3d, inpic->data[c], outpic->data[c], > hqdn3d->line, &hqdn3d->frame_prev[c], > inpic->video->w >> (!!c * hqdn3d->hsub), > inpic->video->h >> (!!c * hqdn3d->vsub), > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index e98693d..46fc84f 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -1,2 +1,3 @@ > MMX-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o > MMX-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o > +YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/hqdn3d.o > diff --git a/libavfilter/x86/hqdn3d.asm b/libavfilter/x86/hqdn3d.asm > new file mode 100644 > index 0000000..7254194 > --- /dev/null > +++ b/libavfilter/x86/hqdn3d.asm > @@ -0,0 +1,106 @@ > +;****************************************************************************** > +;* Copyright (c) 2012 Loren Merritt > +;* > +;* This file is part of Libav. > +;* > +;* Libav is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* Libav is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. 
> +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with Libav; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "x86inc.asm" > + > +SECTION .text > + > +%macro LOWPASS 3 ; prevsample, cursample, lut > + sub %1q, %2q > +%if lut_bits != 8 > + sar %1q, 8-lut_bits > +%endif > + movsx %1d, word [%3q+%1q*2] > + add %1d, %2d > +%endmacro > + > +%macro LOAD 3 ; dstreg, x, bitdepth > +%if %3 == 8 > + movzx %1, byte [srcq+%2] > +%else > + movzx %1, word [srcq+(%2)*2] > +%endif > +%if %3 != 16 > + shl %1, 16-%3 > +%endif > +%endmacro > + > +%macro HQDN3D_ROW 1 ; bitdepth > +%if ARCH_X86_64 > +cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, > spatial, temporal, pixelant, t0, t1 > +%else > +cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, > spatial, temporal > +%endif > + %assign bytedepth (%1+7)>>3 > + %assign lut_bits 4+4*(%1/16) > + dec widthq > + lea srcq, [srcq+widthq*bytedepth] > + lea dstq, [dstq+widthq*bytedepth] > + lea frameantq, [frameantq+widthq*2] > + lea lineantq, [lineantq+widthq*2] > + neg widthq > + %define xq widthq > +%if ARCH_X86_32 > + mov dstmp, dstq > + mov srcmp, srcq > + mov frameantmp, frameantq > + mov lineantmp, lineantq > + %define dstq r0 > + %define frameantq r0 > + %define lineantq r0 > + %define pixelantq r1 > + %define pixelantd r1d > + DECLARE_REG_TMP 2,3 > +%endif > + LOAD pixelantd, xq, %1 > +ALIGN 16 > +.loop: > + movifnidn srcq, srcmp > + LOAD t0d, xq+1, %1 ; skip on the last iteration to avoid overread > +.loop2: > + movifnidn lineantq, lineantmp > + movzx t1d, word [lineantq+xq*2] > + LOWPASS t1, pixelant, spatial > + mov [lineantq+xq*2], t1w > + LOWPASS pixelant, t0, spatial > + movifnidn frameantq, frameantmp > + movzx t0d, word [frameantq+xq*2] > + LOWPASS t0, t1, temporal > + 
mov [frameantq+xq*2], t0w > + movifnidn dstq, dstmp > +%if %1 != 16 > + add t0d, (1<<(15-%1))-1 > + shr t0d, 16-%1 ; could eliminate this by storing from t0h, but only > with some constraints on register allocation > +%endif > +%if %1 == 8 > + mov [dstq+xq], t0b > +%else > + mov [dstq+xq*2], t0w > +%endif > + inc xq > + jl .loop > + je .loop2 > + REP_RET > +%endmacro ; HQDN3D_ROW > + > +HQDN3D_ROW 8 > +HQDN3D_ROW 9 > +HQDN3D_ROW 10 > +HQDN3D_ROW 16 > diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm > index b76a10c..4206f14 100644 > --- a/libavutil/x86/x86inc.asm > +++ b/libavutil/x86/x86inc.asm > @@ -137,6 +137,7 @@ CPU amdnop > %define r%1d %3 > %define r%1w %4 > %define r%1b %5 > + %define %2q %2 > %if %0 == 5 > %define r%1m %3 > %define r%1mp %2 > -- > 1.7.4.1 > > _______________________________________________ > libav-devel mailing list > [email protected] > https://lists.libav.org/mailman/listinfo/libav-devel
Looks ok to me. _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
