Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Checkasm result (osx) for your last patch : hflip_byte_c: 28.5 hflip_byte_ssse3: 29.0 hflip_short_c: 277.7 hflip_short_ssse3: 65.0 if you add a "cmp xq, wq" after the simd loop you can be faster than c (clang), if width is multiple of mmsize*2 hflip_byte_c: 28.5 hflip_byte_ssse3: 27.5 see below otherwise looks ok (i will send later a much cleaner patch for the checkasm, and a patch to use one macro for both func) + > +pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 > +pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 > + > +SECTION .text > + > +INIT_XMM ssse3 > +cglobal hflip_byte, 3, 6, 3, src, dst, w, x, v, r > +movam0, [pb_flip_byte] > +mov xq, 0 > +mov wd, dword wm > +mov rq, wq > +and rq, 2 * mmsize - 1 > +cmp wq, 2 * mmsize > +jl .loop1 > +sub wq, rq > + > +.loop0: > +neg xq > +movum1, [srcq + xq - mmsize + 1] > +movum2, [srcq + xq - 2 * mmsize + 1] > +pshufb m1, m0 > +pshufb m2, m0 > +neg xq > +movu[dstq + xq ], m1 > +movu[dstq + xq + mmsize], m2 > +add xq, mmsize * 2 > +cmp xq, wq > +jl .loop0 > cmp xq, wq je .end > + > +addwq, rq > + > +.loop1: > +negxq > +movvb, [srcq + xq] > +negxq > +mov[dstq + xq], vb > +addxq, 1 > +cmpxq, wq > +jl .loop1 > .end: > +RET > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/2017 5:50 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol> --- > libavfilter/hflip.h | 38 > libavfilter/vf_hflip.c | 133 > ++-- > libavfilter/x86/Makefile| 2 + > libavfilter/x86/vf_hflip.asm| 102 ++ > libavfilter/x86/vf_hflip_init.c | 41 + > 5 files changed, 269 insertions(+), 47 deletions(-) > create mode 100644 libavfilter/hflip.h > create mode 100644 libavfilter/x86/vf_hflip.asm > create mode 100644 libavfilter/x86/vf_hflip_init.c [...] > @@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink) > s->planeheight[0] = s->planeheight[3] = inlink->h; > s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); > > +nb_planes = av_pix_fmt_count_planes(inlink->format); > + > +for (i = 0; i < nb_planes; i++) { > +switch (s->max_step[i]) { > +case 1: s->flip_line[i] = hflip_byte_c; break; > +case 2: s->flip_line[i] = hflip_short_c; break; > +case 3: s->flip_line[i] = hflip_b24_c; break; > +case 4: s->flip_line[i] = hflip_dword_c; break; > +case 6: s->flip_line[i] = hflip_b48_c; break; > +case 8: s->flip_line[i] = hflip_qword_c; break; > +default: > +return AVERROR_BUG; > +} > +} > + > +if (ARCH_X86) > +ff_hflip_init_x86(s, s->max_step); Pass nb_planes here and use it instead of the hardcoded 4. Should be good aside from that. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Signed-off-by: Paul B Mahol--- libavfilter/hflip.h | 38 libavfilter/vf_hflip.c | 133 ++-- libavfilter/x86/Makefile| 2 + libavfilter/x86/vf_hflip.asm| 102 ++ libavfilter/x86/vf_hflip_init.c | 41 + 5 files changed, 269 insertions(+), 47 deletions(-) create mode 100644 libavfilter/hflip.h create mode 100644 libavfilter/x86/vf_hflip.asm create mode 100644 libavfilter/x86/vf_hflip_init.c diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h new file mode 100644 index 00..138380427c --- /dev/null +++ b/libavfilter/hflip.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2007 Benoit Fouet + * Copyright (c) 2010 Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_HFLIP_H +#define AVFILTER_HFLIP_H + +#include "avfilter.h" + +typedef struct FlipContext { +const AVClass *class; +int max_step[4];///< max pixel step for each plane, expressed as a number of bytes +int planewidth[4]; ///< width of each plane +int planeheight[4]; ///< height of each plane + +void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); +} FlipContext; + +void ff_hflip_init_x86(FlipContext *s, int step[4]); + +#endif /* AVFILTER_HFLIP_H */ diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c index cf20c193f7..030015df0a 100644 --- a/libavfilter/vf_hflip.c +++ b/libavfilter/vf_hflip.c @@ -29,6 +29,7 @@ #include "libavutil/opt.h" #include "avfilter.h" #include "formats.h" +#include "hflip.h" #include "internal.h" #include "video.h" #include "libavutil/pixdesc.h" @@ -36,13 +37,6 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" -typedef struct FlipContext { -const AVClass *class; -int max_step[4];///< max pixel step for each plane, expressed as a number of bytes -int planewidth[4]; ///< width of each plane -int planeheight[4]; ///< height of each plane -} FlipContext; - static const AVOption hflip_options[] = { { NULL } }; @@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, pix_fmts); } +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w) +{ +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint16_t *src = (const uint16_t *)ssrc; +uint16_t *dst = (uint16_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint32_t *src = (const uint32_t 
*)ssrc; +uint32_t *dst = (uint32_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w) +{ +const uint8_t *in = src; +uint8_t *out = dst; +int j; + +for (j = 0; j < w; j++, out += 3, in -= 3) { +int32_t v = AV_RB24(in); + +AV_WB24(out, v); +} +} + +static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w) +{ +const uint8_t *in = src; +uint8_t *out = dst; +int j; + +for (j = 0; j < w; j++, out += 6, in -= 6) { +int64_t v = AV_RB48(in); + +AV_WB48(out, v); +} +} + +static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint64_t *src = (const uint64_t *)ssrc; +uint64_t *dst = (uint64_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + static int config_props(AVFilterLink *inlink) { FlipContext *s = inlink->dst->priv; const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); const int hsub = pix_desc->log2_chroma_w; const int vsub = pix_desc->log2_chroma_h; +int nb_planes, i; av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc); s->planewidth[0] = s->planewidth[3] = inlink->w; @@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink) s->planeheight[0] = s->planeheight[3] = inlink->h; s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); +nb_planes =
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
I modify the checkasm test, to test various width if (check_func(s.flip_line[0], "hflip_%s", report_name)) { for (i = 1; i < w; i++) { call_ref(src, dst_ref, i); call_new(src, dst_new, i); if (memcmp(dst_ref, dst_new, WIDTH)) { printf("FAIL : W = %d\n", i); fail(); } } bench_new(src, dst_new, WIDTH); } This asm seems to be ok (same idea for the hflip_short version) hflip_byte_c: 28.4 hflip_byte_ssse3: 23.7 hflip_short_c: 275.9 hflip_short_ssse3: 65.2 INIT_XMM ssse3 cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v movam0, [pb_flip_byte] mov xq, 0 mov wd, dword wm sub wq, 2 * mmsize ;cmp wq, mmsize ; < Doesn't seems to be need jl .skip .loop0: neg xq movum1, [srcq + xq - mmsize + 1] movum2, [srcq + xq - 2 * mmsize + 1] pshufb m1, m0 pshufb m2, m0 neg xq movu[dstq + xq ], m1 movu[dstq + xq + mmsize], m2 add xq, mmsize * 2 cmp xq, wq jl .loop0 cmp xq, wq ;< je .end ;< sub xq, mmsize *2 ;< jmp .loop1 ;< .skip: add wq, 2 * mmsize .loop1: negxq movvb, [srcq + xq] negxq mov[dstq + xq], vb addxq, 1 cmpxq, wq jl .loop1 .end: RET ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Paul B Maholwrote: > On 12/3/17, Martin Vignali wrote: >> Maybe the problem come from the skip part : >> >> +INIT_XMM ssse3 >>> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v >>> +movam0, [pb_flip_byte] >>> +mov xq, 0 >>> +mov wd, dword wm >>> +sub wq, 2 * mmsize >>> +cmp wq, mmsize >>> +jl .skip >>> + >>> +.loop0: >>> +neg xq >>> +movum1, [srcq + xq - mmsize + 1] >>> +movum2, [srcq + xq - 2 * mmsize + 1] >>> +pshufb m1, m0 >>> +pshufb m2, m0 >>> +neg xq >>> +movu[dstq + xq ], m1 >>> +movu[dstq + xq + mmsize], m2 >>> +add xq, mmsize * 2 >>> +cmp xq, wq >>> +jl .loop0 >>> + >>> +.skip: >>> +add wq, 2 * mmsize >>> >> >> ==> use xq instead of wq ? > > Nope. > >> >> >>> +.loop1: >>> +negxq >>> +movvb, [srcq + xq] >>> +negxq >>> +mov[dstq + xq], vb >>> +addxq, 1 >>> +cmpxq, wq >>> +jl .loop1 >>> +RET >>> + >>> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v >>> +movam0, [pb_flip_short] >>> +mov xq, 0 >>> +mov wd, dword wm >>> +add wq, wq >>> +sub wq, 2 * mmsize >>> +cmp wq, mmsize >>> +jl .skip >>> + >>> +.loop0: >>> +neg xq >>> +movum1, [srcq + xq - mmsize + 2] >>> +movum2, [srcq + xq - 2 * mmsize + 2] >>> +pshufb m1, m0 >>> +pshufb m2, m0 >>> +neg xq >>> +movu[dstq + xq ], m1 >>> +movu[dstq + xq + mmsize], m2 >>> +add xq, mmsize >>> +cmp xq, wq >>> +jl .loop0 >>> + >>> +.skip: >>> +add wq, 2 * mmsize >>> >> >> >> ==> same here ? > > Nope, This is for case when width is not multiple of mmsize. > Can I get final verdict? I would like to move to other things. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Martin Vignaliwrote: > Maybe the problem come from the skip part : > > +INIT_XMM ssse3 >> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v >> +movam0, [pb_flip_byte] >> +mov xq, 0 >> +mov wd, dword wm >> +sub wq, 2 * mmsize >> +cmp wq, mmsize >> +jl .skip >> + >> +.loop0: >> +neg xq >> +movum1, [srcq + xq - mmsize + 1] >> +movum2, [srcq + xq - 2 * mmsize + 1] >> +pshufb m1, m0 >> +pshufb m2, m0 >> +neg xq >> +movu[dstq + xq ], m1 >> +movu[dstq + xq + mmsize], m2 >> +add xq, mmsize * 2 >> +cmp xq, wq >> +jl .loop0 >> + >> +.skip: >> +add wq, 2 * mmsize >> > > ==> use xq instead of wq ? Nope. > > >> +.loop1: >> +negxq >> +movvb, [srcq + xq] >> +negxq >> +mov[dstq + xq], vb >> +addxq, 1 >> +cmpxq, wq >> +jl .loop1 >> +RET >> + >> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v >> +movam0, [pb_flip_short] >> +mov xq, 0 >> +mov wd, dword wm >> +add wq, wq >> +sub wq, 2 * mmsize >> +cmp wq, mmsize >> +jl .skip >> + >> +.loop0: >> +neg xq >> +movum1, [srcq + xq - mmsize + 2] >> +movum2, [srcq + xq - 2 * mmsize + 2] >> +pshufb m1, m0 >> +pshufb m2, m0 >> +neg xq >> +movu[dstq + xq ], m1 >> +movu[dstq + xq + mmsize], m2 >> +add xq, mmsize >> +cmp xq, wq >> +jl .loop0 >> + >> +.skip: >> +add wq, 2 * mmsize >> > > > ==> same here ? Nope, This is for case when width is not multiple of mmsize. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Martin Vignaliwrote: > 2017-12-03 20:36 GMT+01:00 Paul B Mahol : > >> On 12/3/17, Martin Vignali wrote: >> >> >> >> In any case, if clang or gcc can generate better code, then the hand >> >> written version needs to be optimized to be as fast or faster. >> >> >> >> >> >> >> > Quick test : pass checkasm (but probably only because width = 256) >> > hflip_byte_c: 26.4 >> > hflip_byte_ssse3: 20.4 >> > >> > >> > INIT_XMM ssse3 >> > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 >> > movam0, [pb_flip_byte] >> > xor xq, xq ; <== >> > mov wd, dword wm >> > sub wq, mmsize * 2 >> > ;remove the cmp here <== >> > jl .skip >> > >> > .loop0: ; process two xmm in the loop >> > neg xq >> > movum1, [srcq + xq - mmsize + 1] >> > movum2, [srcq + xq - mmsize * 2 + 1] <== >> > pshufb m1, m0 >> > pshufb m2, m0 <== >> > neg xq >> > movu[dstq + xq], m1 >> > movu[dstq + xq + mmsize], m2 <== >> > add xq, mmsize * 2 <== >> > cmp xq, wq >> > jl .loop0 >> > RET ; add RET here >> > >> > ; MISSING one xmm process if need >> > >> > .skip: >> > add wq, mmsize >> > .loop1: >> > negxq >> > movvb, [srcq + xq] >> > negxq >> > mov[dstq + xq], vb >> > addxq, 1 >> > cmpxq, wq >> > jl .loop1 >> > RET >> >> So what is wrong now? >> > > Doesn't see your email, when i send mine. > > Check asm result with your last patch (and modify for the short version > "add xq, mmsize" to "add xq, mmsize * 2") > hflip_byte_c: 28.0 > hflip_byte_ssse3: 127.5 > hflip_short_c: 276.5 > hflip_short_ssse3: 100.2 > Ops, fixed. > > Do you think if you add RET after the end of loop0 , it can work in all > cases ? No, it would try to read before src, and crash. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Maybe the problem come from the skip part : +INIT_XMM ssse3 > +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v > +movam0, [pb_flip_byte] > +mov xq, 0 > +mov wd, dword wm > +sub wq, 2 * mmsize > +cmp wq, mmsize > +jl .skip > + > +.loop0: > +neg xq > +movum1, [srcq + xq - mmsize + 1] > +movum2, [srcq + xq - 2 * mmsize + 1] > +pshufb m1, m0 > +pshufb m2, m0 > +neg xq > +movu[dstq + xq ], m1 > +movu[dstq + xq + mmsize], m2 > +add xq, mmsize * 2 > +cmp xq, wq > +jl .loop0 > + > +.skip: > +add wq, 2 * mmsize > ==> use xq instead of wq ? > +.loop1: > +negxq > +movvb, [srcq + xq] > +negxq > +mov[dstq + xq], vb > +addxq, 1 > +cmpxq, wq > +jl .loop1 > +RET > + > +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v > +movam0, [pb_flip_short] > +mov xq, 0 > +mov wd, dword wm > +add wq, wq > +sub wq, 2 * mmsize > +cmp wq, mmsize > +jl .skip > + > +.loop0: > +neg xq > +movum1, [srcq + xq - mmsize + 2] > +movum2, [srcq + xq - 2 * mmsize + 2] > +pshufb m1, m0 > +pshufb m2, m0 > +neg xq > +movu[dstq + xq ], m1 > +movu[dstq + xq + mmsize], m2 > +add xq, mmsize > +cmp xq, wq > +jl .loop0 > + > +.skip: > +add wq, 2 * mmsize > ==> same here ? +.loop1: > +negxq > +movvw, [srcq + xq] > +negxq > +mov[dstq + xq], vw > +addxq, 2 > +cmpxq, wq > +jl .loop1 > +RET > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
2017-12-03 20:36 GMT+01:00 Paul B Mahol: > On 12/3/17, Martin Vignali wrote: > >> > >> In any case, if clang or gcc can generate better code, then the hand > >> written version needs to be optimized to be as fast or faster. > >> > >> > >> > > Quick test : pass checkasm (but probably only because width = 256) > > hflip_byte_c: 26.4 > > hflip_byte_ssse3: 20.4 > > > > > > INIT_XMM ssse3 > > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 > > movam0, [pb_flip_byte] > > xor xq, xq ; <== > > mov wd, dword wm > > sub wq, mmsize * 2 > > ;remove the cmp here <== > > jl .skip > > > > .loop0: ; process two xmm in the loop > > neg xq > > movum1, [srcq + xq - mmsize + 1] > > movum2, [srcq + xq - mmsize * 2 + 1] <== > > pshufb m1, m0 > > pshufb m2, m0 <== > > neg xq > > movu[dstq + xq], m1 > > movu[dstq + xq + mmsize], m2 <== > > add xq, mmsize * 2 <== > > cmp xq, wq > > jl .loop0 > > RET ; add RET here > > > > ; MISSING one xmm process if need > > > > .skip: > > add wq, mmsize > > .loop1: > > negxq > > movvb, [srcq + xq] > > negxq > > mov[dstq + xq], vb > > addxq, 1 > > cmpxq, wq > > jl .loop1 > > RET > > So what is wrong now? > Doesn't see your email, when i send mine. Check asm result with your last patch (and modify for the short version "add xq, mmsize" to "add xq, mmsize * 2") hflip_byte_c: 28.0 hflip_byte_ssse3: 127.5 hflip_short_c: 276.5 hflip_short_ssse3: 100.2 Do you think if you add RET after the end of loop0 , it can work in all cases ? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Martin Vignaliwrote: >> >> In any case, if clang or gcc can generate better code, then the hand >> written version needs to be optimized to be as fast or faster. >> >> >> > Quick test : pass checkasm (but probably only because width = 256) > hflip_byte_c: 26.4 > hflip_byte_ssse3: 20.4 > > > INIT_XMM ssse3 > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 > movam0, [pb_flip_byte] > xor xq, xq ; <== > mov wd, dword wm > sub wq, mmsize * 2 > ;remove the cmp here <== > jl .skip > > .loop0: ; process two xmm in the loop > neg xq > movum1, [srcq + xq - mmsize + 1] > movum2, [srcq + xq - mmsize * 2 + 1] <== > pshufb m1, m0 > pshufb m2, m0 <== > neg xq > movu[dstq + xq], m1 > movu[dstq + xq + mmsize], m2 <== > add xq, mmsize * 2 <== > cmp xq, wq > jl .loop0 > RET ; add RET here > > ; MISSING one xmm process if need > > .skip: > add wq, mmsize > .loop1: > negxq > movvb, [srcq + xq] > negxq > mov[dstq + xq], vb > addxq, 1 > cmpxq, wq > jl .loop1 > RET So what is wrong now? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
> > In any case, if clang or gcc can generate better code, then the hand > written version needs to be optimized to be as fast or faster. > > > Quick test : pass checkasm (but probably only because width = 256) hflip_byte_c: 26.4 hflip_byte_ssse3: 20.4 INIT_XMM ssse3 cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2 movam0, [pb_flip_byte] xor xq, xq ; <== mov wd, dword wm sub wq, mmsize * 2 ;remove the cmp here <== jl .skip .loop0: ; process two xmm in the loop neg xq movum1, [srcq + xq - mmsize + 1] movum2, [srcq + xq - mmsize * 2 + 1] <== pshufb m1, m0 pshufb m2, m0 <== neg xq movu[dstq + xq], m1 movu[dstq + xq + mmsize], m2 <== add xq, mmsize * 2 <== cmp xq, wq jl .loop0 RET ; add RET here ; MISSING one xmm process if need .skip: add wq, mmsize .loop1: negxq movvb, [srcq + xq] negxq mov[dstq + xq], vb addxq, 1 cmpxq, wq jl .loop1 RET Martin ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Signed-off-by: Paul B Mahol--- libavfilter/hflip.h | 38 libavfilter/vf_hflip.c | 133 ++-- libavfilter/x86/Makefile| 2 + libavfilter/x86/vf_hflip.asm| 98 + libavfilter/x86/vf_hflip_init.c | 41 + 5 files changed, 265 insertions(+), 47 deletions(-) create mode 100644 libavfilter/hflip.h create mode 100644 libavfilter/x86/vf_hflip.asm create mode 100644 libavfilter/x86/vf_hflip_init.c diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h new file mode 100644 index 00..138380427c --- /dev/null +++ b/libavfilter/hflip.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2007 Benoit Fouet + * Copyright (c) 2010 Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_HFLIP_H +#define AVFILTER_HFLIP_H + +#include "avfilter.h" + +typedef struct FlipContext { +const AVClass *class; +int max_step[4];///< max pixel step for each plane, expressed as a number of bytes +int planewidth[4]; ///< width of each plane +int planeheight[4]; ///< height of each plane + +void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); +} FlipContext; + +void ff_hflip_init_x86(FlipContext *s, int step[4]); + +#endif /* AVFILTER_HFLIP_H */ diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c index cf20c193f7..030015df0a 100644 --- a/libavfilter/vf_hflip.c +++ b/libavfilter/vf_hflip.c @@ -29,6 +29,7 @@ #include "libavutil/opt.h" #include "avfilter.h" #include "formats.h" +#include "hflip.h" #include "internal.h" #include "video.h" #include "libavutil/pixdesc.h" @@ -36,13 +37,6 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" -typedef struct FlipContext { -const AVClass *class; -int max_step[4];///< max pixel step for each plane, expressed as a number of bytes -int planewidth[4]; ///< width of each plane -int planeheight[4]; ///< height of each plane -} FlipContext; - static const AVOption hflip_options[] = { { NULL } }; @@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, pix_fmts); } +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w) +{ +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint16_t *src = (const uint16_t *)ssrc; +uint16_t *dst = (uint16_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint32_t *src = (const uint32_t 
*)ssrc; +uint32_t *dst = (uint32_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w) +{ +const uint8_t *in = src; +uint8_t *out = dst; +int j; + +for (j = 0; j < w; j++, out += 3, in -= 3) { +int32_t v = AV_RB24(in); + +AV_WB24(out, v); +} +} + +static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w) +{ +const uint8_t *in = src; +uint8_t *out = dst; +int j; + +for (j = 0; j < w; j++, out += 6, in -= 6) { +int64_t v = AV_RB48(in); + +AV_WB48(out, v); +} +} + +static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint64_t *src = (const uint64_t *)ssrc; +uint64_t *dst = (uint64_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + static int config_props(AVFilterLink *inlink) { FlipContext *s = inlink->dst->priv; const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); const int hsub = pix_desc->log2_chroma_w; const int vsub = pix_desc->log2_chroma_h; +int nb_planes, i; av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc); s->planewidth[0] = s->planewidth[3] = inlink->w; @@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink) s->planeheight[0] = s->planeheight[3] = inlink->h; s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); +nb_planes =
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Paul B Mahol wrote: > On 12/3/17, Paul B Mahol wrote: >> Signed-off-by: Paul B Mahol >> --- >> libavfilter/hflip.h | 38 >> libavfilter/vf_hflip.c | 133 >> ++-- >> libavfilter/x86/Makefile| 2 + >> libavfilter/x86/vf_hflip.asm| 98 + >> libavfilter/x86/vf_hflip_init.c | 41 + >> 5 files changed, 265 insertions(+), 47 deletions(-) >> create mode 100644 libavfilter/hflip.h >> create mode 100644 libavfilter/x86/vf_hflip.asm >> create mode 100644 libavfilter/x86/vf_hflip_init.c >> > > This is overall ~50% faster than pure C that gcc 6.3.0 gives with > vanilla options. > By overall I mean this simple bench test: ffmpeg -f lavfi -i smptehdbars=hd1080 -vf hflip=threads=1 -f null - ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol > --- > libavfilter/hflip.h | 38 > libavfilter/vf_hflip.c | 133 > ++-- > libavfilter/x86/Makefile| 2 + > libavfilter/x86/vf_hflip.asm| 98 + > libavfilter/x86/vf_hflip_init.c | 41 + > 5 files changed, 265 insertions(+), 47 deletions(-) > create mode 100644 libavfilter/hflip.h > create mode 100644 libavfilter/x86/vf_hflip.asm > create mode 100644 libavfilter/x86/vf_hflip_init.c > This is overall ~50% faster than pure C that gcc 6.3.0 gives with vanilla options. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/2017 3:55 PM, Martin Vignali wrote: > in O2 or O3 : clang -S -O3 test_asm_gen.c > > If i correctly understand, same idea than paul's patch > but processing two xmm in the main loop > > .section__TEXT,__text,regular,pure_instructions > .macosx_version_min 10, 12 > .section__TEXT,__literal16,16byte_literals > .p2align4 > LCPI0_0: > .byte15 ## 0xf > .byte14 ## 0xe > .byte13 ## 0xd > .byte12 ## 0xc > .byte11 ## 0xb > .byte10 ## 0xa > .byte9 ## 0x9 > .byte8 ## 0x8 > .byte7 ## 0x7 > .byte6 ## 0x6 > .byte5 ## 0x5 > .byte4 ## 0x4 > .byte3 ## 0x3 > .byte2 ## 0x2 > .byte1 ## 0x1 > .byte0 ## 0x0 > .section__TEXT,__text,regular,pure_instructions > .globl_hflip_byte_c > .p2align4, 0x90 > _hflip_byte_c: ## @hflip_byte_c > .cfi_startproc > ## BB#0: > pushq%rbp > Ltmp0: > .cfi_def_cfa_offset 16 > Ltmp1: > .cfi_offset %rbp, -16 > movq%rsp, %rbp > Ltmp2: > .cfi_def_cfa_register %rbp > ## kill: %EDX %EDX > %RDX > testl%edx, %edx > jleLBB0_17 > ## BB#1: > movl%edx, %r8d > cmpl$32, %edx > jaeLBB0_3 > ## BB#2: > xorl%r11d, %r11d > jmpLBB0_11 > LBB0_3: > andl$31, %edx > movq%r8, %r11 > subq%rdx, %r11 > jeLBB0_7 > ## BB#4: > leaq1(%rdi), %rax > cmpq%rsi, %rax > jbeLBB0_8 > ## BB#5: > leaq(%rsi,%r8), %r9 > movl$1, %eax > subq%r8, %rax > addq%rdi, %rax > cmpq%r9, %rax > jaeLBB0_8 > LBB0_7: > xorl%r11d, %r11d > jmpLBB0_11 > LBB0_8: > leaq-15(%rdi), %r9 > leaq16(%rsi), %rax > movdqaLCPI0_0(%rip), %xmm0## xmm0 = > [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] > movq%r11, %r10 > .p2align4, 0x90 > LBB0_9: ## =>This Inner Loop Header: Depth=1 > movdqu-16(%r9), %xmm1 > movdqu(%r9), %xmm2 > pshufb%xmm0, %xmm2 > pshufb%xmm0, %xmm1 > movdqu%xmm2, -16(%rax) > movdqu%xmm1, (%rax) > addq$-32, %r9 > addq$32, %rax > addq$-32, %r10 > jneLBB0_9 Huh, so we're not disabling tree vectorization with clang, only with GCC. Guess it hasn't generated broken code before to justify disabling it. 
In any case, if clang or gcc can generate better code, then the hand written version needs to be optimized to be as fast or faster. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
> Can you post a disassembly of hflip_byte_c? > > > in O1 : clang -S -O1 test_asm_gen.c .section__TEXT,__text,regular,pure_instructions .macosx_version_min 10, 12 .globl_hflip_byte_c .p2align4, 0x90 _hflip_byte_c: ## @hflip_byte_c .cfi_startproc ## BB#0: pushq%rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq%rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp testl%edx, %edx jleLBB0_3 ## BB#1: movl%edx, %eax .p2align4, 0x90 LBB0_2: ## =>This Inner Loop Header: Depth=1 movzbl(%rdi), %ecx movb%cl, (%rsi) decq%rdi incq%rsi decq%rax jneLBB0_2 LBB0_3: popq%rbp retq .cfi_endproc .subsections_via_symbols in O2 or O3 : clang -S -O3 test_asm_gen.c If i correctly understand, same idea than paul's patch but processing two xmm in the main loop .section__TEXT,__text,regular,pure_instructions .macosx_version_min 10, 12 .section__TEXT,__literal16,16byte_literals .p2align4 LCPI0_0: .byte15 ## 0xf .byte14 ## 0xe .byte13 ## 0xd .byte12 ## 0xc .byte11 ## 0xb .byte10 ## 0xa .byte9 ## 0x9 .byte8 ## 0x8 .byte7 ## 0x7 .byte6 ## 0x6 .byte5 ## 0x5 .byte4 ## 0x4 .byte3 ## 0x3 .byte2 ## 0x2 .byte1 ## 0x1 .byte0 ## 0x0 .section__TEXT,__text,regular,pure_instructions .globl_hflip_byte_c .p2align4, 0x90 _hflip_byte_c: ## @hflip_byte_c .cfi_startproc ## BB#0: pushq%rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq%rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp ## kill: %EDX %EDX %RDX testl%edx, %edx jleLBB0_17 ## BB#1: movl%edx, %r8d cmpl$32, %edx jaeLBB0_3 ## BB#2: xorl%r11d, %r11d jmpLBB0_11 LBB0_3: andl$31, %edx movq%r8, %r11 subq%rdx, %r11 jeLBB0_7 ## BB#4: leaq1(%rdi), %rax cmpq%rsi, %rax jbeLBB0_8 ## BB#5: leaq(%rsi,%r8), %r9 movl$1, %eax subq%r8, %rax addq%rdi, %rax cmpq%r9, %rax jaeLBB0_8 LBB0_7: xorl%r11d, %r11d jmpLBB0_11 LBB0_8: leaq-15(%rdi), %r9 leaq16(%rsi), %rax movdqaLCPI0_0(%rip), %xmm0## xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] movq%r11, %r10 .p2align4, 0x90 LBB0_9: ## =>This Inner Loop Header: Depth=1 movdqu-16(%r9), %xmm1 movdqu(%r9), %xmm2 
pshufb%xmm0, %xmm2 pshufb%xmm0, %xmm1 movdqu%xmm2, -16(%rax) movdqu%xmm1, (%rax) addq$-32, %r9 addq$32, %rax addq$-32, %r10 jneLBB0_9 ## BB#10: testl%edx, %edx jeLBB0_17 LBB0_11: movl%r8d, %eax subl%r11d, %eax leaq-1(%r8), %r9 subq%r11, %r9 andq$3, %rax jeLBB0_14 ## BB#12: movq%rdi, %rdx subq%r11, %rdx negq%rax .p2align4, 0x90 LBB0_13:## =>This Inner Loop Header: Depth=1 movzbl(%rdx), %ecx movb%cl, (%rsi,%r11) incq%r11 decq%rdx incq%rax jneLBB0_13 LBB0_14: cmpq$3, %r9 jbLBB0_17 ## BB#15: subq%r11, %r8 subq%r11, %rdi leaq3(%rsi,%r11), %rax .p2align4, 0x90 LBB0_16:## =>This Inner Loop Header: Depth=1 movzbl(%rdi), %ecx movb%cl, -3(%rax) movzbl-1(%rdi), %ecx movb%cl, -2(%rax) movzbl-2(%rdi), %ecx movb%cl, -1(%rax) movzbl-3(%rdi), %ecx movb%cl, (%rax) addq$-4, %rdi addq$4, %rax addq$-4, %r8 jneLBB0_16 LBB0_17: popq%rbp retq .cfi_endproc .subsections_via_symbols ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/2017 3:09 PM, Martin Vignali wrote: >> 2017-12-03 17:46 GMT+01:00 Paul B Mahol: >> >>> On 12/3/17, Martin Vignali wrote: Hello, Maybe you can use a macro for byte and short version, only few lines are different in each version >>> >>> Sure, feel free to send patches. >>> >>> I'm not very macro proficient. >>> >> >> Ok, i will take a look. >> >> Martin >> > > I write a basic checkasm test. Seems like the byte version is slower than c > > hflip_byte_c: 31.8 > hflip_byte_ssse3: 108.1 > hflip_short_c: 300.1 > hflip_short_ssse3: 139.8 > > (checkasm patch in attach if you want to test) > > Martin $ tests/checkasm/checkasm.exe --test=vf_hflip --bench benchmarking with native FFmpeg timers nop: 32.0 hflip_byte_c: 362.0 hflip_byte_ssse3: 96.0 hflip_short_c: 374.0 hflip_short_ssse3: 121.0 Guess your compiler is really good at optimizing this code, or something funny is going on. Can you post a disassembly of hflip_byte_c? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
> 2017-12-03 17:46 GMT+01:00 Paul B Mahol: > >> On 12/3/17, Martin Vignali wrote: >> > Hello, >> > >> > Maybe you can use a macro for byte and short version, >> > only few lines are different in each version >> >> Sure, feel free to send patches. >> >> I'm not very macro proficient. >> > > Ok, i will take a look. > > Martin > I wrote a basic checkasm test. It seems the byte version is slower than C: hflip_byte_c: 31.8 hflip_byte_ssse3: 108.1 hflip_short_c: 300.1 hflip_short_ssse3: 139.8 (checkasm patch attached if you want to test) Martin 0002-checkasm-vf_hflip-add-test-for-hflip-SIMD.patch Description: Binary data ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
2017-12-03 17:46 GMT+01:00 Paul B Mahol: > On 12/3/17, Martin Vignali wrote: > > Hello, > > > > Maybe you can use a macro for byte and short version, > > only few lines are different in each version > > Sure, feel free to send patches. > > I'm not very macro proficient. > Ok, i will take a look. Martin ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/3/17, Martin Vignali wrote: > Hello, > > Maybe you can use a macro for byte and short version, > only few lines are different in each version Sure, feel free to send patches. I'm not very macro proficient. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Hello, Maybe you can use a macro for byte and short version, only few lines are different in each version Martin ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/2/17, Martin Vignali wrote: >> + >> +%include "libavutil/x86/x86util.asm" >> + >> +SECTION_RODATA >> + >> +pb_flip_byte: times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 >> +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 >> + >> > > times 16 ? Removed. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
> + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +pb_flip_byte: times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 > +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 > + > times 16 ? Martin ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Signed-off-by: Paul B Mahol--- libavfilter/hflip.h | 38 libavfilter/vf_hflip.c | 131 ++-- libavfilter/x86/Makefile| 2 + libavfilter/x86/vf_hflip.asm| 92 libavfilter/x86/vf_hflip_init.c | 41 + 5 files changed, 257 insertions(+), 47 deletions(-) create mode 100644 libavfilter/hflip.h create mode 100644 libavfilter/x86/vf_hflip.asm create mode 100644 libavfilter/x86/vf_hflip_init.c diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h new file mode 100644 index 00..138380427c --- /dev/null +++ b/libavfilter/hflip.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2007 Benoit Fouet + * Copyright (c) 2010 Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_HFLIP_H +#define AVFILTER_HFLIP_H + +#include "avfilter.h" + +typedef struct FlipContext { +const AVClass *class; +int max_step[4];///< max pixel step for each plane, expressed as a number of bytes +int planewidth[4]; ///< width of each plane +int planeheight[4]; ///< height of each plane + +void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); +} FlipContext; + +void ff_hflip_init_x86(FlipContext *s, int step[4]); + +#endif /* AVFILTER_HFLIP_H */ diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c index cf20c193f7..303cc8af60 100644 --- a/libavfilter/vf_hflip.c +++ b/libavfilter/vf_hflip.c @@ -29,6 +29,7 @@ #include "libavutil/opt.h" #include "avfilter.h" #include "formats.h" +#include "hflip.h" #include "internal.h" #include "video.h" #include "libavutil/pixdesc.h" @@ -36,13 +37,6 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" -typedef struct FlipContext { -const AVClass *class; -int max_step[4];///< max pixel step for each plane, expressed as a number of bytes -int planewidth[4]; ///< width of each plane -int planeheight[4]; ///< height of each plane -} FlipContext; - static const AVOption hflip_options[] = { { NULL } }; @@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, pix_fmts); } +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w) +{ +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint16_t *src = (const uint16_t *)ssrc; +uint16_t *dst = (uint16_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint32_t *src = (const uint32_t 
*)ssrc; +uint32_t *dst = (uint32_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + +static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w) +{ +const uint8_t *in = src; +uint8_t *out = dst; +int j; + +for (j = 0; j < w; j++, out += 3, in -= 3) { +int32_t v = AV_RB24(in); + +AV_WB24(out, v); +} +} + +static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w) +{ +const uint8_t *in = src; +uint8_t *out = dst; +int j; + +for (j = 0; j < w; j++, out += 6, in -= 6) { +int64_t v = AV_RB48(in); + +AV_WB48(out, v); +} +} + +static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w) +{ +const uint64_t *src = (const uint64_t *)ssrc; +uint64_t *dst = (uint64_t *)ddst; +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + static int config_props(AVFilterLink *inlink) { FlipContext *s = inlink->dst->priv; const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); const int hsub = pix_desc->log2_chroma_w; const int vsub = pix_desc->log2_chroma_h; +int i; av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc); s->planewidth[0] = s->planewidth[3] = inlink->w; @@ -80,6 +139,22 @@ static int config_props(AVFilterLink *inlink) s->planeheight[0] = s->planeheight[3] = inlink->h; s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); +for (i = 0; i < 4; i++) { +switch
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/1/2017 7:02 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol> --- > libavfilter/hflip.h | 38 + > libavfilter/vf_hflip.c | 30 ++-- > libavfilter/x86/Makefile| 2 ++ > libavfilter/x86/vf_hflip.asm| 61 > + > libavfilter/x86/vf_hflip_init.c | 38 + > 5 files changed, 160 insertions(+), 9 deletions(-) > create mode 100644 libavfilter/hflip.h > create mode 100644 libavfilter/x86/vf_hflip.asm > create mode 100644 libavfilter/x86/vf_hflip_init.c > > diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h > new file mode 100644 > index 00..138380427c > --- /dev/null > +++ b/libavfilter/hflip.h > @@ -0,0 +1,38 @@ > +/* > + * Copyright (c) 2007 Benoit Fouet > + * Copyright (c) 2010 Stefano Sabatini > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVFILTER_HFLIP_H > +#define AVFILTER_HFLIP_H > + > +#include "avfilter.h" > + > +typedef struct FlipContext { > +const AVClass *class; > +int max_step[4];///< max pixel step for each plane, expressed as a > number of bytes > +int planewidth[4]; ///< width of each plane > +int planeheight[4]; ///< height of each plane > + > +void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); > +} FlipContext; > + > +void ff_hflip_init_x86(FlipContext *s, int step[4]); > + > +#endif /* AVFILTER_HFLIP_H */ > diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c > index cf20c193f7..65cf7c5cd1 100644 > --- a/libavfilter/vf_hflip.c > +++ b/libavfilter/vf_hflip.c > @@ -29,6 +29,7 @@ > #include "libavutil/opt.h" > #include "avfilter.h" > #include "formats.h" > +#include "hflip.h" > #include "internal.h" > #include "video.h" > #include "libavutil/pixdesc.h" > @@ -36,13 +37,6 @@ > #include "libavutil/intreadwrite.h" > #include "libavutil/imgutils.h" > > -typedef struct FlipContext { > -const AVClass *class; > -int max_step[4];///< max pixel step for each plane, expressed as a > number of bytes > -int planewidth[4]; ///< width of each plane > -int planeheight[4]; ///< height of each plane > -} FlipContext; > - > static const AVOption hflip_options[] = { > { NULL } > }; > @@ -67,12 +61,21 @@ static int query_formats(AVFilterContext *ctx) > return ff_set_common_formats(ctx, pix_fmts); > } > > +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w) > +{ > +int j; > + > +for (j = 0; j < w; j++) > +dst[j] = src[-j]; > +} > + > static int config_props(AVFilterLink *inlink) > { > FlipContext *s = inlink->dst->priv; > const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); > const int hsub = 
pix_desc->log2_chroma_w; > const int vsub = pix_desc->log2_chroma_h; > +int i; > > av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc); > s->planewidth[0] = s->planewidth[3] = inlink->w; > @@ -80,6 +83,16 @@ static int config_props(AVFilterLink *inlink) > s->planeheight[0] = s->planeheight[3] = inlink->h; > s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); > > +for (i = 0; i < 4; i++) { > +switch (s->max_step[i]) { > +case 1: > +s->flip_line[i] = hflip_byte_c; > +} > +} > + > +if (ARCH_X86) > +ff_hflip_init_x86(s, s->max_step); > + > return 0; > } > > @@ -109,8 +122,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, > int job, int nb_jobs) > for (i = start; i < end; i++) { > switch (step) { > case 1: > -for (j = 0; j < width; j++) > -outrow[j] = inrow[-j]; > +s->flip_line[plane](inrow, outrow, width); > break; > > case 2: > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index 3431625883..1420954f62 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -5,6 +5,7 @@ OBJS-$(CONFIG_COLORSPACE_FILTER) += > x86/colorspacedsp_init.o > OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o > OBJS-$(CONFIG_FSPP_FILTER)
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On 12/1/2017 11:13 PM, Michael Niedermayer wrote: > On Fri, Dec 01, 2017 at 11:02:43PM +0100, Paul B Mahol wrote: >> Signed-off-by: Paul B Mahol>> --- >> libavfilter/hflip.h | 38 + >> libavfilter/vf_hflip.c | 30 ++-- >> libavfilter/x86/Makefile| 2 ++ >> libavfilter/x86/vf_hflip.asm| 61 >> + >> libavfilter/x86/vf_hflip_init.c | 38 + >> 5 files changed, 160 insertions(+), 9 deletions(-) >> create mode 100644 libavfilter/hflip.h >> create mode 100644 libavfilter/x86/vf_hflip.asm >> create mode 100644 libavfilter/x86/vf_hflip_init.c > > fails to build on x86-32 linux > > libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86': > src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to > `ff_hflip_byte_ssse3' > collect2: error: ld returned 1 exit status > make: *** [ffmpeg_g] Error 1 > make: *** Waiting for unfinished jobs > libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86': > src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to > `ff_hflip_byte_ssse3' > collect2: error: ld returned 1 exit status > make: *** [ffprobe_g] Error 1 For some reason the whole asm function is wrapped in a x86_64 check even though it's not needed. Guess it was a copy paste mistake. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
On Fri, Dec 01, 2017 at 11:02:43PM +0100, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol> --- > libavfilter/hflip.h | 38 + > libavfilter/vf_hflip.c | 30 ++-- > libavfilter/x86/Makefile| 2 ++ > libavfilter/x86/vf_hflip.asm| 61 > + > libavfilter/x86/vf_hflip_init.c | 38 + > 5 files changed, 160 insertions(+), 9 deletions(-) > create mode 100644 libavfilter/hflip.h > create mode 100644 libavfilter/x86/vf_hflip.asm > create mode 100644 libavfilter/x86/vf_hflip_init.c fails to build on x86-32 linux libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86': src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to `ff_hflip_byte_ssse3' collect2: error: ld returned 1 exit status make: *** [ffmpeg_g] Error 1 make: *** Waiting for unfinished jobs libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86': src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to `ff_hflip_byte_ssse3' collect2: error: ld returned 1 exit status make: *** [ffprobe_g] Error 1 [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB No human being will ever know the Truth, for even if they happen to say it by chance, they would not even known they had done so. -- Xenophanes signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Signed-off-by: Paul B Mahol--- libavfilter/hflip.h | 38 + libavfilter/vf_hflip.c | 30 ++-- libavfilter/x86/Makefile| 2 ++ libavfilter/x86/vf_hflip.asm| 61 + libavfilter/x86/vf_hflip_init.c | 38 + 5 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 libavfilter/hflip.h create mode 100644 libavfilter/x86/vf_hflip.asm create mode 100644 libavfilter/x86/vf_hflip_init.c diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h new file mode 100644 index 00..138380427c --- /dev/null +++ b/libavfilter/hflip.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2007 Benoit Fouet + * Copyright (c) 2010 Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_HFLIP_H +#define AVFILTER_HFLIP_H + +#include "avfilter.h" + +typedef struct FlipContext { +const AVClass *class; +int max_step[4];///< max pixel step for each plane, expressed as a number of bytes +int planewidth[4]; ///< width of each plane +int planeheight[4]; ///< height of each plane + +void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w); +} FlipContext; + +void ff_hflip_init_x86(FlipContext *s, int step[4]); + +#endif /* AVFILTER_HFLIP_H */ diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c index cf20c193f7..65cf7c5cd1 100644 --- a/libavfilter/vf_hflip.c +++ b/libavfilter/vf_hflip.c @@ -29,6 +29,7 @@ #include "libavutil/opt.h" #include "avfilter.h" #include "formats.h" +#include "hflip.h" #include "internal.h" #include "video.h" #include "libavutil/pixdesc.h" @@ -36,13 +37,6 @@ #include "libavutil/intreadwrite.h" #include "libavutil/imgutils.h" -typedef struct FlipContext { -const AVClass *class; -int max_step[4];///< max pixel step for each plane, expressed as a number of bytes -int planewidth[4]; ///< width of each plane -int planeheight[4]; ///< height of each plane -} FlipContext; - static const AVOption hflip_options[] = { { NULL } }; @@ -67,12 +61,21 @@ static int query_formats(AVFilterContext *ctx) return ff_set_common_formats(ctx, pix_fmts); } +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w) +{ +int j; + +for (j = 0; j < w; j++) +dst[j] = src[-j]; +} + static int config_props(AVFilterLink *inlink) { FlipContext *s = inlink->dst->priv; const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); const int hsub = pix_desc->log2_chroma_w; const int vsub = pix_desc->log2_chroma_h; +int i; av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc); s->planewidth[0] = 
s->planewidth[3] = inlink->w; @@ -80,6 +83,16 @@ static int config_props(AVFilterLink *inlink) s->planeheight[0] = s->planeheight[3] = inlink->h; s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub); +for (i = 0; i < 4; i++) { +switch (s->max_step[i]) { +case 1: +s->flip_line[i] = hflip_byte_c; +} +} + +if (ARCH_X86) +ff_hflip_init_x86(s, s->max_step); + return 0; } @@ -109,8 +122,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs) for (i = start; i < end; i++) { switch (step) { case 1: -for (j = 0; j < width; j++) -outrow[j] = inrow[-j]; +s->flip_line[plane](inrow, outrow, width); break; case 2: diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 3431625883..1420954f62 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -5,6 +5,7 @@ OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun_init.o +OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o