Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
Checkasm result (osx) for your last patch :
hflip_byte_c: 28.5
hflip_byte_ssse3: 29.0
hflip_short_c: 277.7
hflip_short_ssse3: 65.0

If you add a "cmp xq, wq" after the SIMD loop,
you can be faster than C (clang) when the width is a multiple of mmsize*2:

hflip_byte_c: 28.5
hflip_byte_ssse3: 27.5

see below


Otherwise it looks OK (I will send a much cleaner patch for the checkasm test
later, and a patch to use one macro for both functions).

+
> +pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
> +pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +
> +SECTION .text
> +
> +INIT_XMM ssse3
> +cglobal hflip_byte, 3, 6, 3, src, dst, w, x, v, r
>
+movam0, [pb_flip_byte]
> +mov xq, 0
> +mov wd, dword wm
> +mov rq, wq
> +and rq, 2 * mmsize - 1
> +cmp wq, 2 * mmsize
> +jl .loop1
> +sub wq, rq
> +
> +.loop0:
> +neg xq
> +movum1, [srcq + xq - mmsize + 1]
> +movum2, [srcq + xq - 2 * mmsize + 1]
> +pshufb  m1, m0
> +pshufb  m2, m0
> +neg xq
> +movu[dstq + xq ], m1
> +movu[dstq + xq + mmsize], m2
> +add xq, mmsize * 2
> +cmp xq, wq
> +jl .loop0
>

cmp xq, wq
je .end


> +
> +addwq, rq
> +
> +.loop1:
> +negxq
> +movvb, [srcq + xq]
> +negxq
> +mov[dstq + xq], vb
> +addxq, 1
> +cmpxq, wq
> +jl .loop1
>

.end:


> +RET
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread James Almer
On 12/3/2017 5:50 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol 
> ---
>  libavfilter/hflip.h |  38 
>  libavfilter/vf_hflip.c  | 133 
> ++--
>  libavfilter/x86/Makefile|   2 +
>  libavfilter/x86/vf_hflip.asm| 102 ++
>  libavfilter/x86/vf_hflip_init.c |  41 +
>  5 files changed, 269 insertions(+), 47 deletions(-)
>  create mode 100644 libavfilter/hflip.h
>  create mode 100644 libavfilter/x86/vf_hflip.asm
>  create mode 100644 libavfilter/x86/vf_hflip_init.c

[...]

> @@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink)
>  s->planeheight[0] = s->planeheight[3] = inlink->h;
>  s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
>  
> +nb_planes = av_pix_fmt_count_planes(inlink->format);
> +
> +for (i = 0; i < nb_planes; i++) {
> +switch (s->max_step[i]) {
> +case 1: s->flip_line[i] = hflip_byte_c;  break;
> +case 2: s->flip_line[i] = hflip_short_c; break;
> +case 3: s->flip_line[i] = hflip_b24_c;   break;
> +case 4: s->flip_line[i] = hflip_dword_c; break;
> +case 6: s->flip_line[i] = hflip_b48_c;   break;
> +case 8: s->flip_line[i] = hflip_qword_c; break;
> +default:
> +return AVERROR_BUG;
> +}
> +}
> +
> +if (ARCH_X86)
> +ff_hflip_init_x86(s, s->max_step);

Pass nb_planes here and use it instead of the hardcoded 4.

Should be good aside from that.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
Signed-off-by: Paul B Mahol 
---
 libavfilter/hflip.h |  38 
 libavfilter/vf_hflip.c  | 133 ++--
 libavfilter/x86/Makefile|   2 +
 libavfilter/x86/vf_hflip.asm| 102 ++
 libavfilter/x86/vf_hflip_init.c |  41 +
 5 files changed, 269 insertions(+), 47 deletions(-)
 create mode 100644 libavfilter/hflip.h
 create mode 100644 libavfilter/x86/vf_hflip.asm
 create mode 100644 libavfilter/x86/vf_hflip_init.c

diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
new file mode 100644
index 00..138380427c
--- /dev/null
+++ b/libavfilter/hflip.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2007 Benoit Fouet
+ * Copyright (c) 2010 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_HFLIP_H
+#define AVFILTER_HFLIP_H
+
+#include "avfilter.h"
+
+typedef struct FlipContext {
+const AVClass *class;
+int max_step[4];///< max pixel step for each plane, expressed as a 
number of bytes
+int planewidth[4];  ///< width of each plane
+int planeheight[4]; ///< height of each plane
+
+void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
+} FlipContext;
+
+void ff_hflip_init_x86(FlipContext *s, int step[4]);
+
+#endif /* AVFILTER_HFLIP_H */
diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
index cf20c193f7..030015df0a 100644
--- a/libavfilter/vf_hflip.c
+++ b/libavfilter/vf_hflip.c
@@ -29,6 +29,7 @@
 #include "libavutil/opt.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "hflip.h"
 #include "internal.h"
 #include "video.h"
 #include "libavutil/pixdesc.h"
@@ -36,13 +37,6 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 
-typedef struct FlipContext {
-const AVClass *class;
-int max_step[4];///< max pixel step for each plane, expressed as a 
number of bytes
-int planewidth[4];  ///< width of each plane
-int planeheight[4]; ///< height of each plane
-} FlipContext;
-
 static const AVOption hflip_options[] = {
 { NULL }
 };
@@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx)
 return ff_set_common_formats(ctx, pix_fmts);
 }
 
+static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
+{
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint16_t *src = (const uint16_t *)ssrc;
+uint16_t *dst = (uint16_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint32_t *src = (const uint32_t *)ssrc;
+uint32_t *dst = (uint32_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w)
+{
+const uint8_t *in  = src;
+uint8_t *out = dst;
+int j;
+
+for (j = 0; j < w; j++, out += 3, in -= 3) {
+int32_t v = AV_RB24(in);
+
+AV_WB24(out, v);
+}
+}
+
+static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w)
+{
+const uint8_t *in  = src;
+uint8_t *out = dst;
+int j;
+
+for (j = 0; j < w; j++, out += 6, in -= 6) {
+int64_t v = AV_RB48(in);
+
+AV_WB48(out, v);
+}
+}
+
+static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint64_t *src = (const uint64_t *)ssrc;
+uint64_t *dst = (uint64_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
 static int config_props(AVFilterLink *inlink)
 {
 FlipContext *s = inlink->dst->priv;
 const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
 const int hsub = pix_desc->log2_chroma_w;
 const int vsub = pix_desc->log2_chroma_h;
+int nb_planes, i;
 
 av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
 s->planewidth[0]  = s->planewidth[3]  = inlink->w;
@@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink)
 s->planeheight[0] = s->planeheight[3] = inlink->h;
 s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
 
+nb_planes = 

Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
I modified the checkasm test to test various widths:

if (check_func(s.flip_line[0], "hflip_%s", report_name)) {
for (i = 1; i < w; i++) {
call_ref(src, dst_ref, i);
call_new(src, dst_new, i);
if (memcmp(dst_ref, dst_new, WIDTH)) {
printf("FAIL : W = %d\n", i);
fail();
}
}
bench_new(src, dst_new, WIDTH);
}


This asm seems to be ok (same idea for the hflip_short version)
hflip_byte_c: 28.4
hflip_byte_ssse3: 23.7
hflip_short_c: 275.9
hflip_short_ssse3: 65.2


INIT_XMM ssse3
cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
movam0, [pb_flip_byte]
mov xq, 0
mov wd, dword wm
sub wq, 2 * mmsize
;cmp wq, mmsize ; < Doesn't seems to be need
jl .skip

.loop0:
neg xq
movum1, [srcq + xq - mmsize + 1]
movum2, [srcq + xq - 2 * mmsize + 1]
pshufb  m1, m0
pshufb  m2, m0
neg xq
movu[dstq + xq ], m1
movu[dstq + xq + mmsize], m2
add xq, mmsize * 2
cmp xq, wq
jl .loop0

cmp xq, wq ;<
je .end ;<


   sub xq, mmsize *2 ;<
   jmp .loop1 ;<




.skip:
add wq, 2 * mmsize
.loop1:
negxq
movvb, [srcq + xq]
negxq
mov[dstq + xq], vb
addxq, 1
cmpxq, wq
jl .loop1
.end:
RET
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Paul B Mahol  wrote:
> On 12/3/17, Martin Vignali  wrote:
>> Maybe the problem come from the skip part :
>>
>> +INIT_XMM ssse3
>>> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
>>> +movam0, [pb_flip_byte]
>>> +mov xq, 0
>>> +mov wd, dword wm
>>> +sub wq, 2 * mmsize
>>> +cmp wq, mmsize
>>> +jl .skip
>>> +
>>> +.loop0:
>>> +neg xq
>>> +movum1, [srcq + xq - mmsize + 1]
>>> +movum2, [srcq + xq - 2 * mmsize + 1]
>>> +pshufb  m1, m0
>>> +pshufb  m2, m0
>>> +neg xq
>>> +movu[dstq + xq ], m1
>>> +movu[dstq + xq + mmsize], m2
>>> +add xq, mmsize * 2
>>> +cmp xq, wq
>>> +jl .loop0
>>> +
>>> +.skip:
>>> +add wq, 2 * mmsize
>>>
>>
>> ==> use xq instead of wq ?
>
> Nope.
>
>>
>>
>>> +.loop1:
>>> +negxq
>>> +movvb, [srcq + xq]
>>> +negxq
>>> +mov[dstq + xq], vb
>>> +addxq, 1
>>> +cmpxq, wq
>>> +jl .loop1
>>> +RET
>>> +
>>> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v
>>> +movam0, [pb_flip_short]
>>> +mov xq, 0
>>> +mov wd, dword wm
>>> +add wq, wq
>>> +sub wq, 2 * mmsize
>>> +cmp wq, mmsize
>>> +jl .skip
>>> +
>>> +.loop0:
>>> +neg xq
>>> +movum1, [srcq + xq - mmsize + 2]
>>> +movum2, [srcq + xq - 2 * mmsize + 2]
>>> +pshufb  m1, m0
>>> +pshufb  m2, m0
>>> +neg xq
>>> +movu[dstq + xq ], m1
>>> +movu[dstq + xq + mmsize], m2
>>> +add xq, mmsize
>>> +cmp xq, wq
>>> +jl .loop0
>>> +
>>> +.skip:
>>> +add wq, 2 * mmsize
>>>
>>
>>
>> ==> same here ?
>
> Nope, This is for case when width is not multiple of mmsize.
>

Can I get a final verdict? I would like to move on to other things.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Martin Vignali  wrote:
> Maybe the problem come from the skip part :
>
> +INIT_XMM ssse3
>> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
>> +movam0, [pb_flip_byte]
>> +mov xq, 0
>> +mov wd, dword wm
>> +sub wq, 2 * mmsize
>> +cmp wq, mmsize
>> +jl .skip
>> +
>> +.loop0:
>> +neg xq
>> +movum1, [srcq + xq - mmsize + 1]
>> +movum2, [srcq + xq - 2 * mmsize + 1]
>> +pshufb  m1, m0
>> +pshufb  m2, m0
>> +neg xq
>> +movu[dstq + xq ], m1
>> +movu[dstq + xq + mmsize], m2
>> +add xq, mmsize * 2
>> +cmp xq, wq
>> +jl .loop0
>> +
>> +.skip:
>> +add wq, 2 * mmsize
>>
>
> ==> use xq instead of wq ?

Nope.

>
>
>> +.loop1:
>> +negxq
>> +movvb, [srcq + xq]
>> +negxq
>> +mov[dstq + xq], vb
>> +addxq, 1
>> +cmpxq, wq
>> +jl .loop1
>> +RET
>> +
>> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v
>> +movam0, [pb_flip_short]
>> +mov xq, 0
>> +mov wd, dword wm
>> +add wq, wq
>> +sub wq, 2 * mmsize
>> +cmp wq, mmsize
>> +jl .skip
>> +
>> +.loop0:
>> +neg xq
>> +movum1, [srcq + xq - mmsize + 2]
>> +movum2, [srcq + xq - 2 * mmsize + 2]
>> +pshufb  m1, m0
>> +pshufb  m2, m0
>> +neg xq
>> +movu[dstq + xq ], m1
>> +movu[dstq + xq + mmsize], m2
>> +add xq, mmsize
>> +cmp xq, wq
>> +jl .loop0
>> +
>> +.skip:
>> +add wq, 2 * mmsize
>>
>
>
> ==> same here ?

Nope, this is for the case when the width is not a multiple of mmsize.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Martin Vignali  wrote:
> 2017-12-03 20:36 GMT+01:00 Paul B Mahol :
>
>> On 12/3/17, Martin Vignali  wrote:
>> >>
>> >> In any case, if clang or gcc can generate better code, then the hand
>> >> written version needs to be optimized to be as fast or faster.
>> >>
>> >>
>> >>
>> > Quick test : pass checkasm (but probably only because width = 256)
>> > hflip_byte_c: 26.4
>> > hflip_byte_ssse3: 20.4
>> >
>> >
>> > INIT_XMM ssse3
>> > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
>> > movam0, [pb_flip_byte]
>> > xor xq, xq ; <==
>> > mov wd, dword wm
>> > sub wq, mmsize * 2
>> > ;remove the cmp here <==
>> > jl .skip
>> >
>> > .loop0: ; process two xmm in the loop
>> > neg xq
>> > movum1, [srcq + xq - mmsize + 1]
>> > movum2, [srcq + xq - mmsize * 2 + 1] <==
>> > pshufb  m1, m0
>> > pshufb  m2, m0 <==
>> > neg xq
>> > movu[dstq + xq], m1
>> > movu[dstq + xq + mmsize], m2 <==
>> > add xq, mmsize * 2 <==
>> > cmp xq, wq
>> > jl .loop0
>> >  RET ; add RET here
>> >
>> > ; MISSING one xmm process if need
>> >
>> > .skip:
>> > add wq, mmsize
>> > .loop1:
>> > negxq
>> > movvb, [srcq + xq]
>> > negxq
>> > mov[dstq + xq], vb
>> > addxq, 1
>> > cmpxq, wq
>> > jl .loop1
>> > RET
>>
>> So what is wrong now?
>>
>
> Doesn't see your email, when i send mine.
>
> Check asm result with your last patch (and modify for the short version
> "add xq, mmsize" to "add xq, mmsize * 2")
> hflip_byte_c: 28.0
> hflip_byte_ssse3: 127.5
> hflip_short_c: 276.5
> hflip_short_ssse3: 100.2
>

Oops, fixed.

>
> Do you think if you add RET after the end of loop0 , it can work in all
> cases ?

No, it would try to read before src, and crash.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
Maybe the problem comes from the skip part:

+INIT_XMM ssse3
> +cglobal hflip_byte, 3, 5, 3, src, dst, w, x, v
> +movam0, [pb_flip_byte]
> +mov xq, 0
> +mov wd, dword wm
> +sub wq, 2 * mmsize
> +cmp wq, mmsize
> +jl .skip
> +
> +.loop0:
> +neg xq
> +movum1, [srcq + xq - mmsize + 1]
> +movum2, [srcq + xq - 2 * mmsize + 1]
> +pshufb  m1, m0
> +pshufb  m2, m0
> +neg xq
> +movu[dstq + xq ], m1
> +movu[dstq + xq + mmsize], m2
> +add xq, mmsize * 2
> +cmp xq, wq
> +jl .loop0
> +
> +.skip:
> +add wq, 2 * mmsize
>

==> use xq instead of wq ?


> +.loop1:
> +negxq
> +movvb, [srcq + xq]
> +negxq
> +mov[dstq + xq], vb
> +addxq, 1
> +cmpxq, wq
> +jl .loop1
> +RET
> +
> +cglobal hflip_short, 3, 5, 3, src, dst, w, x, v
> +movam0, [pb_flip_short]
> +mov xq, 0
> +mov wd, dword wm
> +add wq, wq
> +sub wq, 2 * mmsize
> +cmp wq, mmsize
> +jl .skip
> +
> +.loop0:
> +neg xq
> +movum1, [srcq + xq - mmsize + 2]
> +movum2, [srcq + xq - 2 * mmsize + 2]
> +pshufb  m1, m0
> +pshufb  m2, m0
> +neg xq
> +movu[dstq + xq ], m1
> +movu[dstq + xq + mmsize], m2
> +add xq, mmsize
> +cmp xq, wq
> +jl .loop0
> +
> +.skip:
> +add wq, 2 * mmsize
>


==> same here ?


+.loop1:
> +negxq
> +movvw, [srcq + xq]
> +negxq
> +mov[dstq + xq], vw
> +addxq, 2
> +cmpxq, wq
> +jl .loop1
> +RET
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
2017-12-03 20:36 GMT+01:00 Paul B Mahol :

> On 12/3/17, Martin Vignali  wrote:
> >>
> >> In any case, if clang or gcc can generate better code, then the hand
> >> written version needs to be optimized to be as fast or faster.
> >>
> >>
> >>
> > Quick test : pass checkasm (but probably only because width = 256)
> > hflip_byte_c: 26.4
> > hflip_byte_ssse3: 20.4
> >
> >
> > INIT_XMM ssse3
> > cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
> > movam0, [pb_flip_byte]
> > xor xq, xq ; <==
> > mov wd, dword wm
> > sub wq, mmsize * 2
> > ;remove the cmp here <==
> > jl .skip
> >
> > .loop0: ; process two xmm in the loop
> > neg xq
> > movum1, [srcq + xq - mmsize + 1]
> > movum2, [srcq + xq - mmsize * 2 + 1] <==
> > pshufb  m1, m0
> > pshufb  m2, m0 <==
> > neg xq
> > movu[dstq + xq], m1
> > movu[dstq + xq + mmsize], m2 <==
> > add xq, mmsize * 2 <==
> > cmp xq, wq
> > jl .loop0
> >  RET ; add RET here
> >
> > ; MISSING one xmm process if need
> >
> > .skip:
> > add wq, mmsize
> > .loop1:
> > negxq
> > movvb, [srcq + xq]
> > negxq
> > mov[dstq + xq], vb
> > addxq, 1
> > cmpxq, wq
> > jl .loop1
> > RET
>
> So what is wrong now?
>

I didn't see your email when I sent mine.

Checkasm result with your last patch (with the short version modified from
"add xq, mmsize" to "add xq, mmsize * 2"):
hflip_byte_c: 28.0
hflip_byte_ssse3: 127.5
hflip_short_c: 276.5
hflip_short_ssse3: 100.2


Do you think that if you add RET after the end of loop0, it can work in all
cases?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Martin Vignali  wrote:
>>
>> In any case, if clang or gcc can generate better code, then the hand
>> written version needs to be optimized to be as fast or faster.
>>
>>
>>
> Quick test : pass checkasm (but probably only because width = 256)
> hflip_byte_c: 26.4
> hflip_byte_ssse3: 20.4
>
>
> INIT_XMM ssse3
> cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
> movam0, [pb_flip_byte]
> xor xq, xq ; <==
> mov wd, dword wm
> sub wq, mmsize * 2
> ;remove the cmp here <==
> jl .skip
>
> .loop0: ; process two xmm in the loop
> neg xq
> movum1, [srcq + xq - mmsize + 1]
> movum2, [srcq + xq - mmsize * 2 + 1] <==
> pshufb  m1, m0
> pshufb  m2, m0 <==
> neg xq
> movu[dstq + xq], m1
> movu[dstq + xq + mmsize], m2 <==
> add xq, mmsize * 2 <==
> cmp xq, wq
> jl .loop0
>  RET ; add RET here
>
> ; MISSING one xmm process if need
>
> .skip:
> add wq, mmsize
> .loop1:
> negxq
> movvb, [srcq + xq]
> negxq
> mov[dstq + xq], vb
> addxq, 1
> cmpxq, wq
> jl .loop1
> RET

So what is wrong now?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
>
> In any case, if clang or gcc can generate better code, then the hand
> written version needs to be optimized to be as fast or faster.
>
>
>
Quick test: passes checkasm (but probably only because width = 256)
hflip_byte_c: 26.4
hflip_byte_ssse3: 20.4


INIT_XMM ssse3
cglobal hflip_byte, 3, 5, 2, src, dst, w, x, v, src2
movam0, [pb_flip_byte]
xor xq, xq ; <==
mov wd, dword wm
sub wq, mmsize * 2
;remove the cmp here <==
jl .skip

.loop0: ; process two xmm in the loop
neg xq
movum1, [srcq + xq - mmsize + 1]
movum2, [srcq + xq - mmsize * 2 + 1] <==
pshufb  m1, m0
pshufb  m2, m0 <==
neg xq
movu[dstq + xq], m1
movu[dstq + xq + mmsize], m2 <==
add xq, mmsize * 2 <==
cmp xq, wq
jl .loop0
 RET ; add RET here

; MISSING one xmm process if need

.skip:
add wq, mmsize
.loop1:
negxq
movvb, [srcq + xq]
negxq
mov[dstq + xq], vb
addxq, 1
cmpxq, wq
jl .loop1
RET


Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
Signed-off-by: Paul B Mahol 
---
 libavfilter/hflip.h |  38 
 libavfilter/vf_hflip.c  | 133 ++--
 libavfilter/x86/Makefile|   2 +
 libavfilter/x86/vf_hflip.asm|  98 +
 libavfilter/x86/vf_hflip_init.c |  41 +
 5 files changed, 265 insertions(+), 47 deletions(-)
 create mode 100644 libavfilter/hflip.h
 create mode 100644 libavfilter/x86/vf_hflip.asm
 create mode 100644 libavfilter/x86/vf_hflip_init.c

diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
new file mode 100644
index 00..138380427c
--- /dev/null
+++ b/libavfilter/hflip.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2007 Benoit Fouet
+ * Copyright (c) 2010 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_HFLIP_H
+#define AVFILTER_HFLIP_H
+
+#include "avfilter.h"
+
+typedef struct FlipContext {
+const AVClass *class;
+int max_step[4];///< max pixel step for each plane, expressed as a 
number of bytes
+int planewidth[4];  ///< width of each plane
+int planeheight[4]; ///< height of each plane
+
+void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
+} FlipContext;
+
+void ff_hflip_init_x86(FlipContext *s, int step[4]);
+
+#endif /* AVFILTER_HFLIP_H */
diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
index cf20c193f7..030015df0a 100644
--- a/libavfilter/vf_hflip.c
+++ b/libavfilter/vf_hflip.c
@@ -29,6 +29,7 @@
 #include "libavutil/opt.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "hflip.h"
 #include "internal.h"
 #include "video.h"
 #include "libavutil/pixdesc.h"
@@ -36,13 +37,6 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 
-typedef struct FlipContext {
-const AVClass *class;
-int max_step[4];///< max pixel step for each plane, expressed as a 
number of bytes
-int planewidth[4];  ///< width of each plane
-int planeheight[4]; ///< height of each plane
-} FlipContext;
-
 static const AVOption hflip_options[] = {
 { NULL }
 };
@@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx)
 return ff_set_common_formats(ctx, pix_fmts);
 }
 
+static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
+{
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint16_t *src = (const uint16_t *)ssrc;
+uint16_t *dst = (uint16_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint32_t *src = (const uint32_t *)ssrc;
+uint32_t *dst = (uint32_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w)
+{
+const uint8_t *in  = src;
+uint8_t *out = dst;
+int j;
+
+for (j = 0; j < w; j++, out += 3, in -= 3) {
+int32_t v = AV_RB24(in);
+
+AV_WB24(out, v);
+}
+}
+
+static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w)
+{
+const uint8_t *in  = src;
+uint8_t *out = dst;
+int j;
+
+for (j = 0; j < w; j++, out += 6, in -= 6) {
+int64_t v = AV_RB48(in);
+
+AV_WB48(out, v);
+}
+}
+
+static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint64_t *src = (const uint64_t *)ssrc;
+uint64_t *dst = (uint64_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
 static int config_props(AVFilterLink *inlink)
 {
 FlipContext *s = inlink->dst->priv;
 const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
 const int hsub = pix_desc->log2_chroma_w;
 const int vsub = pix_desc->log2_chroma_h;
+int nb_planes, i;
 
 av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
 s->planewidth[0]  = s->planewidth[3]  = inlink->w;
@@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink)
 s->planeheight[0] = s->planeheight[3] = inlink->h;
 s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
 
+nb_planes = 

Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Paul B Mahol  wrote:
> On 12/3/17, Paul B Mahol  wrote:
>> Signed-off-by: Paul B Mahol 
>> ---
>>  libavfilter/hflip.h |  38 
>>  libavfilter/vf_hflip.c  | 133
>> ++--
>>  libavfilter/x86/Makefile|   2 +
>>  libavfilter/x86/vf_hflip.asm|  98 +
>>  libavfilter/x86/vf_hflip_init.c |  41 +
>>  5 files changed, 265 insertions(+), 47 deletions(-)
>>  create mode 100644 libavfilter/hflip.h
>>  create mode 100644 libavfilter/x86/vf_hflip.asm
>>  create mode 100644 libavfilter/x86/vf_hflip_init.c
>>
>
> This is overall ~50% faster than pure C that gcc 6.3.0 gives with
> vanilla options.
>

By overall I mean this simple bench test:

ffmpeg -f lavfi -i smptehdbars=hd1080 -vf hflip=threads=1 -f null -
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Paul B Mahol  wrote:
> Signed-off-by: Paul B Mahol 
> ---
>  libavfilter/hflip.h |  38 
>  libavfilter/vf_hflip.c  | 133
> ++--
>  libavfilter/x86/Makefile|   2 +
>  libavfilter/x86/vf_hflip.asm|  98 +
>  libavfilter/x86/vf_hflip_init.c |  41 +
>  5 files changed, 265 insertions(+), 47 deletions(-)
>  create mode 100644 libavfilter/hflip.h
>  create mode 100644 libavfilter/x86/vf_hflip.asm
>  create mode 100644 libavfilter/x86/vf_hflip_init.c
>

This is overall ~50% faster than pure C that gcc 6.3.0 gives with
vanilla options.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread James Almer
On 12/3/2017 3:55 PM, Martin Vignali wrote:
> in O2 or O3 : clang -S -O3 test_asm_gen.c
> 
> If i correctly understand, same idea than paul's patch
> but processing two xmm in the main loop
> 
> .section__TEXT,__text,regular,pure_instructions
> .macosx_version_min 10, 12
> .section__TEXT,__literal16,16byte_literals
> .p2align4
> LCPI0_0:
> .byte15  ## 0xf
> .byte14  ## 0xe
> .byte13  ## 0xd
> .byte12  ## 0xc
> .byte11  ## 0xb
> .byte10  ## 0xa
> .byte9   ## 0x9
> .byte8   ## 0x8
> .byte7   ## 0x7
> .byte6   ## 0x6
> .byte5   ## 0x5
> .byte4   ## 0x4
> .byte3   ## 0x3
> .byte2   ## 0x2
> .byte1   ## 0x1
> .byte0   ## 0x0
> .section__TEXT,__text,regular,pure_instructions
> .globl_hflip_byte_c
> .p2align4, 0x90
> _hflip_byte_c:  ## @hflip_byte_c
> .cfi_startproc
> ## BB#0:
> pushq%rbp
> Ltmp0:
> .cfi_def_cfa_offset 16
> Ltmp1:
> .cfi_offset %rbp, -16
> movq%rsp, %rbp
> Ltmp2:
> .cfi_def_cfa_register %rbp
> ## kill: %EDX %EDX
> %RDX
> testl%edx, %edx
> jleLBB0_17
> ## BB#1:
> movl%edx, %r8d
> cmpl$32, %edx
> jaeLBB0_3
> ## BB#2:
> xorl%r11d, %r11d
> jmpLBB0_11
> LBB0_3:
> andl$31, %edx
> movq%r8, %r11
> subq%rdx, %r11
> jeLBB0_7
> ## BB#4:
> leaq1(%rdi), %rax
> cmpq%rsi, %rax
> jbeLBB0_8
> ## BB#5:
> leaq(%rsi,%r8), %r9
> movl$1, %eax
> subq%r8, %rax
> addq%rdi, %rax
> cmpq%r9, %rax
> jaeLBB0_8
> LBB0_7:
> xorl%r11d, %r11d
> jmpLBB0_11
> LBB0_8:
> leaq-15(%rdi), %r9
> leaq16(%rsi), %rax
> movdqaLCPI0_0(%rip), %xmm0## xmm0 =
> [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
> movq%r11, %r10
> .p2align4, 0x90
> LBB0_9: ## =>This Inner Loop Header: Depth=1
> movdqu-16(%r9), %xmm1
> movdqu(%r9), %xmm2
> pshufb%xmm0, %xmm2
> pshufb%xmm0, %xmm1
> movdqu%xmm2, -16(%rax)
> movdqu%xmm1, (%rax)
> addq$-32, %r9
> addq$32, %rax
> addq$-32, %r10
> jneLBB0_9

Huh, so we're not disabling tree vectorization with clang, only with
GCC. Guess it hasn't generated broken code before to justify disabling it.

In any case, if clang or gcc can generate better code, then the hand
written version needs to be optimized to be as fast or faster.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
> Can you post a disassembly of hflip_byte_c?
>
>
> in O1 : clang -S -O1 test_asm_gen.c

.section__TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.globl_hflip_byte_c
.p2align4, 0x90
_hflip_byte_c:  ## @hflip_byte_c
.cfi_startproc
## BB#0:
pushq%rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq%rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
testl%edx, %edx
jleLBB0_3
## BB#1:
movl%edx, %eax
.p2align4, 0x90
LBB0_2: ## =>This Inner Loop Header: Depth=1
movzbl(%rdi), %ecx
movb%cl, (%rsi)
decq%rdi
incq%rsi
decq%rax
jneLBB0_2
LBB0_3:
popq%rbp
retq
.cfi_endproc


.subsections_via_symbols






in O2 or O3 : clang -S -O3 test_asm_gen.c

If I understand correctly, this is the same idea as Paul's patch,
but processing two xmm registers in the main loop.

.section__TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.section__TEXT,__literal16,16byte_literals
.p2align4
LCPI0_0:
.byte15  ## 0xf
.byte14  ## 0xe
.byte13  ## 0xd
.byte12  ## 0xc
.byte11  ## 0xb
.byte10  ## 0xa
.byte9   ## 0x9
.byte8   ## 0x8
.byte7   ## 0x7
.byte6   ## 0x6
.byte5   ## 0x5
.byte4   ## 0x4
.byte3   ## 0x3
.byte2   ## 0x2
.byte1   ## 0x1
.byte0   ## 0x0
.section__TEXT,__text,regular,pure_instructions
.globl_hflip_byte_c
.p2align4, 0x90
_hflip_byte_c:  ## @hflip_byte_c
.cfi_startproc
## BB#0:
pushq%rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq%rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
## kill: %EDX %EDX
%RDX
testl%edx, %edx
jleLBB0_17
## BB#1:
movl%edx, %r8d
cmpl$32, %edx
jaeLBB0_3
## BB#2:
xorl%r11d, %r11d
jmpLBB0_11
LBB0_3:
andl$31, %edx
movq%r8, %r11
subq%rdx, %r11
jeLBB0_7
## BB#4:
leaq1(%rdi), %rax
cmpq%rsi, %rax
jbeLBB0_8
## BB#5:
leaq(%rsi,%r8), %r9
movl$1, %eax
subq%r8, %rax
addq%rdi, %rax
cmpq%r9, %rax
jaeLBB0_8
LBB0_7:
xorl%r11d, %r11d
jmpLBB0_11
LBB0_8:
leaq-15(%rdi), %r9
leaq16(%rsi), %rax
movdqaLCPI0_0(%rip), %xmm0## xmm0 =
[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
movq%r11, %r10
.p2align4, 0x90
LBB0_9: ## =>This Inner Loop Header: Depth=1
movdqu-16(%r9), %xmm1
movdqu(%r9), %xmm2
pshufb%xmm0, %xmm2
pshufb%xmm0, %xmm1
movdqu%xmm2, -16(%rax)
movdqu%xmm1, (%rax)
addq$-32, %r9
addq$32, %rax
addq$-32, %r10
jneLBB0_9
## BB#10:
testl%edx, %edx
jeLBB0_17
LBB0_11:
movl%r8d, %eax
subl%r11d, %eax
leaq-1(%r8), %r9
subq%r11, %r9
andq$3, %rax
jeLBB0_14
## BB#12:
movq%rdi, %rdx
subq%r11, %rdx
negq%rax
.p2align4, 0x90
LBB0_13:## =>This Inner Loop Header: Depth=1
movzbl(%rdx), %ecx
movb%cl, (%rsi,%r11)
incq%r11
decq%rdx
incq%rax
jneLBB0_13
LBB0_14:
cmpq$3, %r9
jbLBB0_17
## BB#15:
subq%r11, %r8
subq%r11, %rdi
leaq3(%rsi,%r11), %rax
.p2align4, 0x90
LBB0_16:## =>This Inner Loop Header: Depth=1
movzbl(%rdi), %ecx
movb%cl, -3(%rax)
movzbl-1(%rdi), %ecx
movb%cl, -2(%rax)
movzbl-2(%rdi), %ecx
movb%cl, -1(%rax)
movzbl-3(%rdi), %ecx
movb%cl, (%rax)
addq$-4, %rdi
addq$4, %rax
addq$-4, %r8
jneLBB0_16
LBB0_17:
popq%rbp
retq
.cfi_endproc


.subsections_via_symbols
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread James Almer
On 12/3/2017 3:09 PM, Martin Vignali wrote:
>> 2017-12-03 17:46 GMT+01:00 Paul B Mahol :
>>
>>> On 12/3/17, Martin Vignali  wrote:
>>>> Hello,
>>>>
>>>> Maybe you can use a macro for byte and short version,
>>>> only few lines are different in each version
>>>
>>> Sure, feel free to send patches.
>>>
>>> I'm not very macro proficient.
>>>
>>
>> Ok, i will take a look.
>>
>> Martin
>>
> 
> I write a basic checkasm test. Seems like the byte version is slower than c
> 
> hflip_byte_c: 31.8
> hflip_byte_ssse3: 108.1
> hflip_short_c: 300.1
> hflip_short_ssse3: 139.8
> 
> (checkasm patch in attach if you want to test)
> 
> Martin

$ tests/checkasm/checkasm.exe --test=vf_hflip --bench
benchmarking with native FFmpeg timers
nop: 32.0
hflip_byte_c: 362.0
hflip_byte_ssse3: 96.0
hflip_short_c: 374.0
hflip_short_ssse3: 121.0

Guess your compiler is really good at optimizing this code, or something
funny is going on.
Can you post a disassembly of hflip_byte_c?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
> 2017-12-03 17:46 GMT+01:00 Paul B Mahol :
>
>> On 12/3/17, Martin Vignali  wrote:
>> > Hello,
>> >
>> > Maybe you can use a macro for byte and short version,
>> > only few lines are different in each version
>>
>> Sure, feel free to send patches.
>>
>> I'm not very macro proficient.
>>
>
> Ok, i will take a look.
>
> Martin
>

I wrote a basic checkasm test. It seems like the byte version is slower than C:

hflip_byte_c: 31.8
hflip_byte_ssse3: 108.1
hflip_short_c: 300.1
hflip_short_ssse3: 139.8

(checkasm patch attached if you want to test)

Martin


0002-checkasm-vf_hflip-add-test-for-hflip-SIMD.patch
Description: Binary data
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
2017-12-03 17:46 GMT+01:00 Paul B Mahol :

> On 12/3/17, Martin Vignali  wrote:
> > Hello,
> >
> > Maybe you can use a macro for byte and short version,
> > only few lines are different in each version
>
> Sure, feel free to send patches.
>
> I'm not very macro proficient.
>

Ok, i will take a look.

Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Paul B Mahol
On 12/3/17, Martin Vignali  wrote:
> Hello,
>
> Maybe you can use a macro for byte and short version,
> only few lines are different in each version

Sure, feel free to send patches.

I'm not very macro proficient.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-03 Thread Martin Vignali
Hello,

Maybe you can use a macro for byte and short version,
only few lines are different in each version

Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-02 Thread Paul B Mahol
On 12/2/17, Martin Vignali  wrote:
>> +
>> +%include "libavutil/x86/x86util.asm"
>> +
>> +SECTION_RODATA
>> +
>> +pb_flip_byte:  times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
>> +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
>> +
>>
>
> times 16 ?

Removed.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-02 Thread Martin Vignali
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +pb_flip_byte:  times 16 db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
> +pb_flip_short: times 16 db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
> +
>

times 16 ?

Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-02 Thread Paul B Mahol
Signed-off-by: Paul B Mahol 
---
 libavfilter/hflip.h |  38 
 libavfilter/vf_hflip.c  | 131 ++--
 libavfilter/x86/Makefile|   2 +
 libavfilter/x86/vf_hflip.asm|  92 
 libavfilter/x86/vf_hflip_init.c |  41 +
 5 files changed, 257 insertions(+), 47 deletions(-)
 create mode 100644 libavfilter/hflip.h
 create mode 100644 libavfilter/x86/vf_hflip.asm
 create mode 100644 libavfilter/x86/vf_hflip_init.c

diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
new file mode 100644
index 00..138380427c
--- /dev/null
+++ b/libavfilter/hflip.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2007 Benoit Fouet
+ * Copyright (c) 2010 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_HFLIP_H
+#define AVFILTER_HFLIP_H
+
+#include "avfilter.h"
+
+typedef struct FlipContext {
+const AVClass *class;
+int max_step[4];///< max pixel step for each plane, expressed as a number of bytes
+int planewidth[4];  ///< width of each plane
+int planeheight[4]; ///< height of each plane
+
+void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
+} FlipContext;
+
+void ff_hflip_init_x86(FlipContext *s, int step[4]);
+
+#endif /* AVFILTER_HFLIP_H */
diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
index cf20c193f7..303cc8af60 100644
--- a/libavfilter/vf_hflip.c
+++ b/libavfilter/vf_hflip.c
@@ -29,6 +29,7 @@
 #include "libavutil/opt.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "hflip.h"
 #include "internal.h"
 #include "video.h"
 #include "libavutil/pixdesc.h"
@@ -36,13 +37,6 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 
-typedef struct FlipContext {
-const AVClass *class;
-int max_step[4];///< max pixel step for each plane, expressed as a number of bytes
-int planewidth[4];  ///< width of each plane
-int planeheight[4]; ///< height of each plane
-} FlipContext;
-
 static const AVOption hflip_options[] = {
 { NULL }
 };
@@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx)
 return ff_set_common_formats(ctx, pix_fmts);
 }
 
+static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
+{
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint16_t *src = (const uint16_t *)ssrc;
+uint16_t *dst = (uint16_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint32_t *src = (const uint32_t *)ssrc;
+uint32_t *dst = (uint32_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
+static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w)
+{
+const uint8_t *in  = src;
+uint8_t *out = dst;
+int j;
+
+for (j = 0; j < w; j++, out += 3, in -= 3) {
+int32_t v = AV_RB24(in);
+
+AV_WB24(out, v);
+}
+}
+
+static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w)
+{
+const uint8_t *in  = src;
+uint8_t *out = dst;
+int j;
+
+for (j = 0; j < w; j++, out += 6, in -= 6) {
+int64_t v = AV_RB48(in);
+
+AV_WB48(out, v);
+}
+}
+
+static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
+{
+const uint64_t *src = (const uint64_t *)ssrc;
+uint64_t *dst = (uint64_t *)ddst;
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
 static int config_props(AVFilterLink *inlink)
 {
 FlipContext *s = inlink->dst->priv;
 const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
 const int hsub = pix_desc->log2_chroma_w;
 const int vsub = pix_desc->log2_chroma_h;
+int i;
 
 av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
 s->planewidth[0]  = s->planewidth[3]  = inlink->w;
@@ -80,6 +139,22 @@ static int config_props(AVFilterLink *inlink)
 s->planeheight[0] = s->planeheight[3] = inlink->h;
 s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
 
+for (i = 0; i < 4; i++) {
+switch 

Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-01 Thread James Almer
On 12/1/2017 7:02 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol 
> ---
>  libavfilter/hflip.h | 38 +
>  libavfilter/vf_hflip.c  | 30 ++--
>  libavfilter/x86/Makefile|  2 ++
>  libavfilter/x86/vf_hflip.asm| 61 
> +
>  libavfilter/x86/vf_hflip_init.c | 38 +
>  5 files changed, 160 insertions(+), 9 deletions(-)
>  create mode 100644 libavfilter/hflip.h
>  create mode 100644 libavfilter/x86/vf_hflip.asm
>  create mode 100644 libavfilter/x86/vf_hflip_init.c
> 
> diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
> new file mode 100644
> index 00..138380427c
> --- /dev/null
> +++ b/libavfilter/hflip.h
> @@ -0,0 +1,38 @@
> +/*
> + * Copyright (c) 2007 Benoit Fouet
> + * Copyright (c) 2010 Stefano Sabatini
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
> + */
> +
> +#ifndef AVFILTER_HFLIP_H
> +#define AVFILTER_HFLIP_H
> +
> +#include "avfilter.h"
> +
> +typedef struct FlipContext {
> +const AVClass *class;
> +int max_step[4];///< max pixel step for each plane, expressed as a 
> number of bytes
> +int planewidth[4];  ///< width of each plane
> +int planeheight[4]; ///< height of each plane
> +
> +void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
> +} FlipContext;
> +
> +void ff_hflip_init_x86(FlipContext *s, int step[4]);
> +
> +#endif /* AVFILTER_HFLIP_H */
> diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
> index cf20c193f7..65cf7c5cd1 100644
> --- a/libavfilter/vf_hflip.c
> +++ b/libavfilter/vf_hflip.c
> @@ -29,6 +29,7 @@
>  #include "libavutil/opt.h"
>  #include "avfilter.h"
>  #include "formats.h"
> +#include "hflip.h"
>  #include "internal.h"
>  #include "video.h"
>  #include "libavutil/pixdesc.h"
> @@ -36,13 +37,6 @@
>  #include "libavutil/intreadwrite.h"
>  #include "libavutil/imgutils.h"
>  
> -typedef struct FlipContext {
> -const AVClass *class;
> -int max_step[4];///< max pixel step for each plane, expressed as a 
> number of bytes
> -int planewidth[4];  ///< width of each plane
> -int planeheight[4]; ///< height of each plane
> -} FlipContext;
> -
>  static const AVOption hflip_options[] = {
>  { NULL }
>  };
> @@ -67,12 +61,21 @@ static int query_formats(AVFilterContext *ctx)
>  return ff_set_common_formats(ctx, pix_fmts);
>  }
>  
> +static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
> +{
> +int j;
> +
> +for (j = 0; j < w; j++)
> +dst[j] = src[-j];
> +}
> +
>  static int config_props(AVFilterLink *inlink)
>  {
>  FlipContext *s = inlink->dst->priv;
>  const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
>  const int hsub = pix_desc->log2_chroma_w;
>  const int vsub = pix_desc->log2_chroma_h;
> +int i;
>  
>  av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
>  s->planewidth[0]  = s->planewidth[3]  = inlink->w;
> @@ -80,6 +83,16 @@ static int config_props(AVFilterLink *inlink)
>  s->planeheight[0] = s->planeheight[3] = inlink->h;
>  s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
>  
> +for (i = 0; i < 4; i++) {
> +switch (s->max_step[i]) {
> +case 1:
> +s->flip_line[i] = hflip_byte_c;
> +}
> +}
> +
> +if (ARCH_X86)
> +ff_hflip_init_x86(s, s->max_step);
> +
>  return 0;
>  }
>  
> @@ -109,8 +122,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, 
> int job, int nb_jobs)
>  for (i = start; i < end; i++) {
>  switch (step) {
>  case 1:
> -for (j = 0; j < width; j++)
> -outrow[j] = inrow[-j];
> +s->flip_line[plane](inrow, outrow, width);
>  break;
>  
>  case 2:
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 3431625883..1420954f62 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -5,6 +5,7 @@ OBJS-$(CONFIG_COLORSPACE_FILTER) += 
> x86/colorspacedsp_init.o
>  OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
>  OBJS-$(CONFIG_FSPP_FILTER)  

Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-01 Thread James Almer
On 12/1/2017 11:13 PM, Michael Niedermayer wrote:
> On Fri, Dec 01, 2017 at 11:02:43PM +0100, Paul B Mahol wrote:
>> Signed-off-by: Paul B Mahol 
>> ---
>>  libavfilter/hflip.h | 38 +
>>  libavfilter/vf_hflip.c  | 30 ++--
>>  libavfilter/x86/Makefile|  2 ++
>>  libavfilter/x86/vf_hflip.asm| 61 
>> +
>>  libavfilter/x86/vf_hflip_init.c | 38 +
>>  5 files changed, 160 insertions(+), 9 deletions(-)
>>  create mode 100644 libavfilter/hflip.h
>>  create mode 100644 libavfilter/x86/vf_hflip.asm
>>  create mode 100644 libavfilter/x86/vf_hflip_init.c
> 
> fails to build on x86-32 linux
> 
> libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86':
> src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to 
> `ff_hflip_byte_ssse3'
> collect2: error: ld returned 1 exit status
> make: *** [ffmpeg_g] Error 1
> make: *** Waiting for unfinished jobs
> libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86':
> src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to 
> `ff_hflip_byte_ssse3'
> collect2: error: ld returned 1 exit status
> make: *** [ffprobe_g] Error 1

For some reason the whole asm function is wrapped in an x86_64 check even
though it's not needed.
Guess it was a copy-paste mistake.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-01 Thread Michael Niedermayer
On Fri, Dec 01, 2017 at 11:02:43PM +0100, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol 
> ---
>  libavfilter/hflip.h | 38 +
>  libavfilter/vf_hflip.c  | 30 ++--
>  libavfilter/x86/Makefile|  2 ++
>  libavfilter/x86/vf_hflip.asm| 61 
> +
>  libavfilter/x86/vf_hflip_init.c | 38 +
>  5 files changed, 160 insertions(+), 9 deletions(-)
>  create mode 100644 libavfilter/hflip.h
>  create mode 100644 libavfilter/x86/vf_hflip.asm
>  create mode 100644 libavfilter/x86/vf_hflip_init.c

fails to build on x86-32 linux

libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86':
src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to 
`ff_hflip_byte_ssse3'
collect2: error: ld returned 1 exit status
make: *** [ffmpeg_g] Error 1
make: *** Waiting for unfinished jobs
libavfilter/libavfilter.a(vf_hflip_init.o): In function `ff_hflip_init_x86':
src/libavfilter/x86/vf_hflip_init.c:35: undefined reference to 
`ff_hflip_byte_ssse3'
collect2: error: ld returned 1 exit status
make: *** [ffprobe_g] Error 1


[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No human being will ever know the Truth, for even if they happen to say it
by chance, they would not even know they had done so. -- Xenophanes


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

2017-12-01 Thread Paul B Mahol
Signed-off-by: Paul B Mahol 
---
 libavfilter/hflip.h | 38 +
 libavfilter/vf_hflip.c  | 30 ++--
 libavfilter/x86/Makefile|  2 ++
 libavfilter/x86/vf_hflip.asm| 61 +
 libavfilter/x86/vf_hflip_init.c | 38 +
 5 files changed, 160 insertions(+), 9 deletions(-)
 create mode 100644 libavfilter/hflip.h
 create mode 100644 libavfilter/x86/vf_hflip.asm
 create mode 100644 libavfilter/x86/vf_hflip_init.c

diff --git a/libavfilter/hflip.h b/libavfilter/hflip.h
new file mode 100644
index 00..138380427c
--- /dev/null
+++ b/libavfilter/hflip.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2007 Benoit Fouet
+ * Copyright (c) 2010 Stefano Sabatini
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_HFLIP_H
+#define AVFILTER_HFLIP_H
+
+#include "avfilter.h"
+
+typedef struct FlipContext {
+const AVClass *class;
+int max_step[4];///< max pixel step for each plane, expressed as a number of bytes
+int planewidth[4];  ///< width of each plane
+int planeheight[4]; ///< height of each plane
+
+void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
+} FlipContext;
+
+void ff_hflip_init_x86(FlipContext *s, int step[4]);
+
+#endif /* AVFILTER_HFLIP_H */
diff --git a/libavfilter/vf_hflip.c b/libavfilter/vf_hflip.c
index cf20c193f7..65cf7c5cd1 100644
--- a/libavfilter/vf_hflip.c
+++ b/libavfilter/vf_hflip.c
@@ -29,6 +29,7 @@
 #include "libavutil/opt.h"
 #include "avfilter.h"
 #include "formats.h"
+#include "hflip.h"
 #include "internal.h"
 #include "video.h"
 #include "libavutil/pixdesc.h"
@@ -36,13 +37,6 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/imgutils.h"
 
-typedef struct FlipContext {
-const AVClass *class;
-int max_step[4];///< max pixel step for each plane, expressed as a number of bytes
-int planewidth[4];  ///< width of each plane
-int planeheight[4]; ///< height of each plane
-} FlipContext;
-
 static const AVOption hflip_options[] = {
 { NULL }
 };
@@ -67,12 +61,21 @@ static int query_formats(AVFilterContext *ctx)
 return ff_set_common_formats(ctx, pix_fmts);
 }
 
+static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
+{
+int j;
+
+for (j = 0; j < w; j++)
+dst[j] = src[-j];
+}
+
 static int config_props(AVFilterLink *inlink)
 {
 FlipContext *s = inlink->dst->priv;
 const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
 const int hsub = pix_desc->log2_chroma_w;
 const int vsub = pix_desc->log2_chroma_h;
+int i;
 
 av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
 s->planewidth[0]  = s->planewidth[3]  = inlink->w;
@@ -80,6 +83,16 @@ static int config_props(AVFilterLink *inlink)
 s->planeheight[0] = s->planeheight[3] = inlink->h;
 s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
 
+for (i = 0; i < 4; i++) {
+switch (s->max_step[i]) {
+case 1:
+s->flip_line[i] = hflip_byte_c;
+}
+}
+
+if (ARCH_X86)
+ff_hflip_init_x86(s, s->max_step);
+
 return 0;
 }
 
@@ -109,8 +122,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
 for (i = start; i < end; i++) {
 switch (step) {
 case 1:
-for (j = 0; j < width; j++)
-outrow[j] = inrow[-j];
+s->flip_line[plane](inrow, outrow, width);
 break;
 
 case 2:
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 3431625883..1420954f62 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -5,6 +5,7 @@ OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
 OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
 OBJS-$(CONFIG_FSPP_FILTER)   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)+= x86/vf_gradfun_init.o
+OBJS-$(CONFIG_HFLIP_FILTER)  += x86/vf_hflip_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)   += x86/vf_idet_init.o