On Mon, Jan 18, 2016 at 4:38 PM, Diego Biurrun <[email protected]> wrote: > --- > libavcodec/x86/Makefile | 2 + > libavcodec/x86/hpeldsp.asm | 89 ------------------------------ > libavcodec/x86/hpeldsp.h | 4 ++ > libavcodec/x86/hpeldsp_init.c | 25 ++------- > libavcodec/x86/hpeldsp_vp3.asm | 111 > ++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/hpeldsp_vp3_init.c | 54 +++++++++++++++++++ > 6 files changed, 174 insertions(+), 111 deletions(-) > create mode 100644 libavcodec/x86/hpeldsp_vp3.asm > create mode 100644 libavcodec/x86/hpeldsp_vp3_init.c > > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index 4afd0a7..1ccea035 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -50,6 +50,7 @@ OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o > OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc_init.o > OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o > OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o > +OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3_init.o > OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o > OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o > > @@ -120,5 +121,6 @@ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o > YASM-OBJS-$(CONFIG_V210_ENCODER) += x86/v210enc.o > YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o > YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o > +YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/hpeldsp_vp3.o > YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o > YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o > diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm > index b8929b9..8e21114 100644 > --- a/libavcodec/x86/hpeldsp.asm > +++ b/libavcodec/x86/hpeldsp.asm > @@ -142,53 +142,6 @@ INIT_MMX 3dnow > PUT_NO_RND_PIXELS8_X2 > > > -; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h) > -%macro PUT_NO_RND_PIXELS8_X2_EXACT 0 > -cglobal put_no_rnd_pixels8_x2_exact, 4,5 > - lea r4, [r2*3] > - pcmpeqb m6, m6 > -.loop: > - mova m0, [r1] > - 
mova m2, [r1+r2] > - mova m1, [r1+1] > - mova m3, [r1+r2+1] > - pxor m0, m6 > - pxor m2, m6 > - pxor m1, m6 > - pxor m3, m6 > - PAVGB m0, m1 > - PAVGB m2, m3 > - pxor m0, m6 > - pxor m2, m6 > - mova [r0], m0 > - mova [r0+r2], m2 > - mova m0, [r1+r2*2] > - mova m1, [r1+r2*2+1] > - mova m2, [r1+r4] > - mova m3, [r1+r4+1] > - pxor m0, m6 > - pxor m1, m6 > - pxor m2, m6 > - pxor m3, m6 > - PAVGB m0, m1 > - PAVGB m2, m3 > - pxor m0, m6 > - pxor m2, m6 > - mova [r0+r2*2], m0 > - mova [r0+r4], m2 > - lea r1, [r1+r2*4] > - lea r0, [r0+r2*4] > - sub r3d, 4 > - jg .loop > - REP_RET > -%endmacro > - > -INIT_MMX mmxext > -PUT_NO_RND_PIXELS8_X2_EXACT > -INIT_MMX 3dnow > -PUT_NO_RND_PIXELS8_X2_EXACT > - > - > ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t > line_size, int h) > %macro PUT_PIXELS8_Y2 0 > cglobal put_pixels8_y2, 4,5 > @@ -260,48 +213,6 @@ INIT_MMX 3dnow > PUT_NO_RND_PIXELS8_Y2 > > > -; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h) > -%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0 > -cglobal put_no_rnd_pixels8_y2_exact, 4,5 > - lea r4, [r2*3] > - mova m0, [r1] > - pcmpeqb m6, m6 > - add r1, r2 > - pxor m0, m6 > -.loop: > - mova m1, [r1] > - mova m2, [r1+r2] > - pxor m1, m6 > - pxor m2, m6 > - PAVGB m0, m1 > - PAVGB m1, m2 > - pxor m0, m6 > - pxor m1, m6 > - mova [r0], m0 > - mova [r0+r2], m1 > - mova m1, [r1+r2*2] > - mova m0, [r1+r4] > - pxor m1, m6 > - pxor m0, m6 > - PAVGB m2, m1 > - PAVGB m1, m0 > - pxor m2, m6 > - pxor m1, m6 > - mova [r0+r2*2], m2 > - mova [r0+r4], m1 > - lea r1, [r1+r2*4] > - lea r0, [r0+r2*4] > - sub r3d, 4 > - jg .loop > - REP_RET > -%endmacro > - > -INIT_MMX mmxext > -PUT_NO_RND_PIXELS8_Y2_EXACT > -INIT_MMX 3dnow > -PUT_NO_RND_PIXELS8_Y2_EXACT > - > - > ; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t > line_size, int h) > %macro AVG_PIXELS8 0 > cglobal avg_pixels8, 4,5 > diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h > index 
47b0b8b..d624ed9 100644 > --- a/libavcodec/x86/hpeldsp.h > +++ b/libavcodec/x86/hpeldsp.h > @@ -22,6 +22,8 @@ > #include <stddef.h> > #include <stdint.h> > > +#include "libavcodec/hpeldsp.h" > + > void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h); > > @@ -35,4 +37,6 @@ void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t > *pixels, > void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h); > > +void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags); > + > #endif /* AVCODEC_X86_HPELDSP_H */ > diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c > index 59cb5e1..9ca2505 100644 > --- a/libavcodec/x86/hpeldsp_init.c > +++ b/libavcodec/x86/hpeldsp_init.c > @@ -44,12 +44,6 @@ void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const > uint8_t *pixels, > ptrdiff_t line_size, int h); > void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h); > -void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, > - const uint8_t *pixels, > - ptrdiff_t line_size, int h); > -void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, > - const uint8_t *pixels, > - ptrdiff_t line_size, int h); > void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h); > void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, > @@ -58,12 +52,6 @@ void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const > uint8_t *pixels, > ptrdiff_t line_size, int h); > void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h); > -void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, > - const uint8_t *pixels, > - ptrdiff_t line_size, int h); > -void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, > - const uint8_t *pixels, > - ptrdiff_t line_size, int h); > void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, > 
ptrdiff_t line_size, int h); > void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, > @@ -210,11 +198,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int > flags, int cpu_flags) > c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; > c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; > } > - > - if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { > - c->put_no_rnd_pixels_tab[1][1] = > ff_put_no_rnd_pixels8_x2_exact_mmxext; > - c->put_no_rnd_pixels_tab[1][2] = > ff_put_no_rnd_pixels8_y2_exact_mmxext; > - } > #endif /* HAVE_MMXEXT_EXTERNAL */ > } > > @@ -244,11 +227,6 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int > flags, int cpu_flags) > c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; > c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; > } > - > - if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { > - c->put_no_rnd_pixels_tab[1][1] = > ff_put_no_rnd_pixels8_x2_exact_3dnow; > - c->put_no_rnd_pixels_tab[1][2] = > ff_put_no_rnd_pixels8_y2_exact_3dnow; > - } > #endif /* HAVE_AMD3DNOW_EXTERNAL */ > } > > @@ -279,4 +257,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int > flags) > > if (EXTERNAL_SSE2(cpu_flags)) > hpeldsp_init_sse2(c, flags, cpu_flags); > + > + if (CONFIG_VP3_DECODER) > + ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags); > } > diff --git a/libavcodec/x86/hpeldsp_vp3.asm b/libavcodec/x86/hpeldsp_vp3.asm > new file mode 100644 > index 0000000..513f14e > --- /dev/null > +++ b/libavcodec/x86/hpeldsp_vp3.asm > @@ -0,0 +1,111 @@ > +;****************************************************************************** > +;* SIMD-optimized halfpel functions for VP3 > +;* > +;* This file is part of Libav. > +;* > +;* Libav is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. 
> +;* > +;* Libav is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with Libav; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION .text > + > +; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h) > +%macro PUT_NO_RND_PIXELS8_X2_EXACT 0 > +cglobal put_no_rnd_pixels8_x2_exact, 4,5 > + lea r4, [r2*3] > + pcmpeqb m6, m6 > +.loop: > + mova m0, [r1] > + mova m2, [r1+r2] > + mova m1, [r1+1] > + mova m3, [r1+r2+1] > + pxor m0, m6 > + pxor m2, m6 > + pxor m1, m6 > + pxor m3, m6 > + PAVGB m0, m1 > + PAVGB m2, m3 > + pxor m0, m6 > + pxor m2, m6 > + mova [r0], m0 > + mova [r0+r2], m2 > + mova m0, [r1+r2*2] > + mova m1, [r1+r2*2+1] > + mova m2, [r1+r4] > + mova m3, [r1+r4+1] > + pxor m0, m6 > + pxor m1, m6 > + pxor m2, m6 > + pxor m3, m6 > + PAVGB m0, m1 > + PAVGB m2, m3 > + pxor m0, m6 > + pxor m2, m6 > + mova [r0+r2*2], m0 > + mova [r0+r4], m2 > + lea r1, [r1+r2*4] > + lea r0, [r0+r2*4] > + sub r3d, 4 > + jg .loop > + REP_RET > +%endmacro > + > +INIT_MMX mmxext > +PUT_NO_RND_PIXELS8_X2_EXACT > +INIT_MMX 3dnow > +PUT_NO_RND_PIXELS8_X2_EXACT > + > + > +; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int h) > +%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0 > +cglobal put_no_rnd_pixels8_y2_exact, 4,5 > + lea r4, [r2*3] > + mova m0, [r1] > + pcmpeqb m6, m6 > + add r1, r2 > + pxor m0, m6 > +.loop: > + mova m1, [r1] > + mova m2, [r1+r2] > + pxor m1, m6 > + pxor m2, m6 > + PAVGB m0, m1 > + PAVGB 
m1, m2 > + pxor m0, m6 > + pxor m1, m6 > + mova [r0], m0 > + mova [r0+r2], m1 > + mova m1, [r1+r2*2] > + mova m0, [r1+r4] > + pxor m1, m6 > + pxor m0, m6 > + PAVGB m2, m1 > + PAVGB m1, m0 > + pxor m2, m6 > + pxor m1, m6 > + mova [r0+r2*2], m2 > + mova [r0+r4], m1 > + lea r1, [r1+r2*4] > + lea r0, [r0+r2*4] > + sub r3d, 4 > + jg .loop > + REP_RET > +%endmacro > + > +INIT_MMX mmxext > +PUT_NO_RND_PIXELS8_Y2_EXACT > +INIT_MMX 3dnow > +PUT_NO_RND_PIXELS8_Y2_EXACT > diff --git a/libavcodec/x86/hpeldsp_vp3_init.c > b/libavcodec/x86/hpeldsp_vp3_init.c > new file mode 100644 > index 0000000..06a9d67 > --- /dev/null > +++ b/libavcodec/x86/hpeldsp_vp3_init.c > @@ -0,0 +1,54 @@ > +/* > + * This file is part of Libav. > + * > + * Libav is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * Libav is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with Libav; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavcodec/avcodec.h" > +#include "libavcodec/hpeldsp.h" > +#include "hpeldsp.h" > + > +void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, > + const uint8_t *pixels, > + ptrdiff_t line_size, int h); > +void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, > + const uint8_t *pixels, > + ptrdiff_t line_size, int h); > +void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, > + const uint8_t *pixels, > + ptrdiff_t line_size, int h); > +void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, > + const uint8_t *pixels, > + ptrdiff_t line_size, int h); > + > +av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int > flags) > +{ > + if (EXTERNAL_AMD3DNOW(cpu_flags)) { > + if (flags & AV_CODEC_FLAG_BITEXACT) { > + c->put_no_rnd_pixels_tab[1][1] = > ff_put_no_rnd_pixels8_x2_exact_3dnow; > + c->put_no_rnd_pixels_tab[1][2] = > ff_put_no_rnd_pixels8_y2_exact_3dnow; > + } > + } > + > + if (EXTERNAL_MMXEXT(cpu_flags)) { > + if (flags & AV_CODEC_FLAG_BITEXACT) { > + c->put_no_rnd_pixels_tab[1][1] = > ff_put_no_rnd_pixels8_x2_exact_mmxext; > + c->put_no_rnd_pixels_tab[1][2] = > ff_put_no_rnd_pixels8_y2_exact_mmxext; > + } > + } > +} > --
IMHO, either split these VP3-specific functions off entirely into their own DSP context, or keep them grouped with the other hpeldsp functions. - Hendrik _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
