Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
Thanks for the comments and testing. Pushed. Martin ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
On Thu, Dec 07, 2017 at 10:08:56PM +0100, Martin Vignali wrote: > > > > You should also change the cglobal line for x86_32, right below this else > > > > > new patch in attach Tested; works on arm/mips/x86-32/64 Linux and x86-32/64 MinGW. [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Democracy is the form of government in which you can choose your dictator signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
> > You should also change the cglobal line for x86_32, right below this else > > New patch attached. 0001-avfilter-x86-vf_threshold-add-threshold16-SIMD-SSE4.patch Description: Binary data 0002-checkasm-vf_threshold-add-test-for-threshold16.patch Description: Binary data ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
On 12/7/2017 5:10 PM, Martin Vignali wrote: > 2017-12-03 21:28 GMT+01:00 Martin Vignali : > >> >> >> 2017-12-03 21:15 GMT+01:00 James Darnley : >> >>> On 2017-12-03 19:30, Martin Vignali wrote: libavfilter/x86/vf_threshold.asm| 19 ++- libavfilter/x86/vf_threshold_init.c | 34 >>> -- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/libavfilter/x86/vf_threshold.asm >>> b/libavfilter/x86/vf_threshold.asm index fb008c376a..7b929c6bd2 100644 --- a/libavfilter/x86/vf_threshold.asm +++ b/libavfilter/x86/vf_threshold.asm @@ -27,14 +27,21 @@ SECTION_RODATA pb_128: times 16 db 128 +pb_128_0 : times 16 dw 32768 >>> >>> No. Please use db and the values you want. >>> >>> I assume this is supposed to be "times 8 db 0, 128". >> >> >> > Hello, > > new patch in attach (you're right, it's "times 8 db 0, 128") > > > Martin > From ac91cb26724b6e8fe294e0bf9ad2dd17fe0eada9 Mon Sep 17 00:00:00 2001 > From: Martin Vignali > Date: Thu, 7 Dec 2017 21:06:43 +0100 > Subject: [PATCH 1/2] avfilter/x86/vf_threshold : add threshold16 SIMD (SSE4 > and AVX2) > > --- > libavfilter/x86/vf_threshold.asm| 19 +-- > libavfilter/x86/vf_threshold_init.c | 34 -- > 2 files changed, 33 insertions(+), 20 deletions(-) > > diff --git a/libavfilter/x86/vf_threshold.asm > b/libavfilter/x86/vf_threshold.asm > index 56a6c242d8..dc42cd4971 100644 > --- a/libavfilter/x86/vf_threshold.asm > +++ b/libavfilter/x86/vf_threshold.asm > @@ -25,12 +25,14 @@ > SECTION_RODATA > > pb_128: times 16 db 128 > +pb_128_0 : times 8 db 0, 128 > > SECTION .text > > -%macro THRESHOLD_8 0 > +;%1 depth (8 or 16) ; %2 b or w ; %3 constant > +%macro THRESHOLD 3 > %if ARCH_X86_64 > -cglobal threshold8, 10, 13, 5, in, threshold, min, max, out, ilinesize, > tlinesize, flinesize, slinesize, olinesize, w, h, x > +cglobal threshold%1, 10, 13, 5, in, threshold, min, max, out, ilinesize, > tlinesize, flinesize, slinesize, olinesize, w, h, x > mov wd, dword wm > mov hd, dword hm > %else You should also change the cglobal line for 
x86_32, right below this else > @@ -43,7 +45,10 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, > w, x > %define olinesizeq r9mp > %define hd r11mp > %endif > -VBROADCASTI128 m4, [pb_128] > +VBROADCASTI128 m4, [%3] > +%if %1 == 16 > +add wq, wq ; w *= 2 (16 bits instead of 8) > +%endif > addinq, wq > add thresholdq, wq > add minq, wq > @@ -60,7 +65,7 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, > w, x > movum3, [maxq + xq] > pxorm0, m4 > pxorm1, m4 > -pcmpgtb m0, m1 > +pcmpgt%2m0, m1 > PBLENDVBm3, m2, m0 > movu [outq + xq], m3 > add xq, mmsize > @@ -77,9 +82,11 @@ RET > %endmacro > > INIT_XMM sse4 > -THRESHOLD_8 > +THRESHOLD 8, b, pb_128 > +THRESHOLD 16, w, pb_128_0 > > %if HAVE_AVX2_EXTERNAL > INIT_YMM avx2 > -THRESHOLD_8 > +THRESHOLD 8, b, pb_128 > +THRESHOLD 16, w, pb_128_0 > %endif > diff --git a/libavfilter/x86/vf_threshold_init.c > b/libavfilter/x86/vf_threshold_init.c > index db0559533d..8e42296791 100644 > --- a/libavfilter/x86/vf_threshold_init.c > +++ b/libavfilter/x86/vf_threshold_init.c > @@ -23,20 +23,19 @@ > #include "libavutil/x86/cpu.h" > #include "libavfilter/threshold.h" > > -void ff_threshold8_sse4(const uint8_t *in, const uint8_t *threshold, > -const uint8_t *min, const uint8_t *max, > -uint8_t *out, > -ptrdiff_t ilinesize, ptrdiff_t tlinesize, > -ptrdiff_t flinesize, ptrdiff_t slinesize, > -ptrdiff_t olinesize, > -int w, int h); > -void ff_threshold8_avx2(const uint8_t *in, const uint8_t *threshold, > -const uint8_t *min, const uint8_t *max, > -uint8_t *out, > -ptrdiff_t ilinesize, ptrdiff_t tlinesize, > -ptrdiff_t flinesize, ptrdiff_t slinesize, > -ptrdiff_t olinesize, > -int w, int h); > +#define THRESHOLD_FUNC(depth, opt) \ > +void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t > *threshold,\ > +const uint8_t *min, const uint8_t *max, \ > +uint8_t *out, \ > +ptrdiff_t ilinesize, ptrdiff_t tlinesize, \ > +ptrdiff_t flinesize, ptrdiff_t slinesize, \ > +ptrdiff_t olinesize,\ > +i
Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
2017-12-03 21:28 GMT+01:00 Martin Vignali : > > > 2017-12-03 21:15 GMT+01:00 James Darnley : > >> On 2017-12-03 19:30, Martin Vignali wrote: >> > libavfilter/x86/vf_threshold.asm| 19 ++- >> > libavfilter/x86/vf_threshold_init.c | 34 >> -- >> > 2 files changed, 34 insertions(+), 19 deletions(-) >> > >> > diff --git a/libavfilter/x86/vf_threshold.asm >> b/libavfilter/x86/vf_threshold.asm >> > index fb008c376a..7b929c6bd2 100644 >> > --- a/libavfilter/x86/vf_threshold.asm >> > +++ b/libavfilter/x86/vf_threshold.asm >> > @@ -27,14 +27,21 @@ >> > SECTION_RODATA >> > >> > pb_128: times 16 db 128 >> > +pb_128_0 : times 16 dw 32768 >> >> No. Please use db and the values you want. >> >> I assume this is supposed to be "times 8 db 0, 128". > > > Hello, new patch in attach (you're right, it's "times 8 db 0, 128") Martin 0001-avfilter-x86-vf_threshold-add-threshold16-SIMD-SSE4.patch Description: Binary data 0002-checkasm-vf_threshold-add-test-for-threshold16.patch Description: Binary data ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
2017-12-03 21:15 GMT+01:00 James Darnley : > On 2017-12-03 19:30, Martin Vignali wrote: > > libavfilter/x86/vf_threshold.asm| 19 ++- > > libavfilter/x86/vf_threshold_init.c | 34 -- > > > 2 files changed, 34 insertions(+), 19 deletions(-) > > > > diff --git a/libavfilter/x86/vf_threshold.asm b/libavfilter/x86/vf_threshold.asm > > index fb008c376a..7b929c6bd2 100644 > > --- a/libavfilter/x86/vf_threshold.asm > > +++ b/libavfilter/x86/vf_threshold.asm > > @@ -27,14 +27,21 @@ > > SECTION_RODATA > > > > pb_128: times 16 db 128 > > +pb_128_0 : times 16 dw 32768 > > No. Please use db and the values you want. > > I assume this is supposed to be "times 8 db 0, 128". If these are > supposed to be word values then the constant should be named "packed > word". If one were to reuse an existing constant for different word > sizes then it would be acceptable. > > > Thanks for pointing this out, I will replace it with pb_128_0 : times 8 db 128, 0 [note: the byte order is reversed here; the little-endian word 32768 (0x8000) is "db 0, 128", which is what the follow-up patch uses] ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16
On 2017-12-03 19:30, Martin Vignali wrote: > libavfilter/x86/vf_threshold.asm| 19 ++- > libavfilter/x86/vf_threshold_init.c | 34 -- > 2 files changed, 34 insertions(+), 19 deletions(-) > > diff --git a/libavfilter/x86/vf_threshold.asm > b/libavfilter/x86/vf_threshold.asm > index fb008c376a..7b929c6bd2 100644 > --- a/libavfilter/x86/vf_threshold.asm > +++ b/libavfilter/x86/vf_threshold.asm > @@ -27,14 +27,21 @@ > SECTION_RODATA > > pb_128: times 16 db 128 > +pb_128_0 : times 16 dw 32768 No. Please use db and the values you want. I assume this is supposed to be "times 8 db 0, 128". If these are supposed to be word values then the constant should be named "packed word". If one were to reuse an existing constant for different word sizes then it would be acceptable. signature.asc Description: OpenPGP digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel