Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2016-01-06 Thread Hendrik Leppkes
On Thu, Jan 7, 2016 at 3:13 AM, Hendrik Leppkes  wrote:
> On Mon, Oct 12, 2015 at 1:21 AM, James Almer  wrote:
>> On 10/11/2015 3:11 PM, Ronald S. Bultje wrote:
>>> Hi,
>>>
>>> On Sun, Oct 11, 2015 at 1:17 PM, James Almer  wrote:
>>>
 On 10/11/2015 4:31 AM, Paul B Mahol wrote:
> On 10/11/15, James Almer  wrote:
>> Signed-off-by: James Almer 
>> ---
>>  libavfilter/x86/vf_w3fdif.asm | 16 +++-
>>  1 file changed, 7 insertions(+), 9 deletions(-)
>>
>> diff --git a/libavfilter/x86/vf_w3fdif.asm
 b/libavfilter/x86/vf_w3fdif.asm
>> index f02319b..f2001a4 100644
>> --- a/libavfilter/x86/vf_w3fdif.asm
>> +++ b/libavfilter/x86/vf_w3fdif.asm
>> @@ -103,13 +103,11 @@ REP_RET
>>
>>  %if ARCH_X86_64
>>
>> -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>> +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>>  movq  m2, [coefq]
>>  DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0,
 in_lines_cur1,
>> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
>> -SPLATWm0, m2, 0
>> -SPLATWm1, m2, 1
>> +pshufdm0, m2, q
>>  SPLATWm2, m2, 2
>> -SBUTTERFLYwd, 0, 1, 7
>>  pxor  m7, m7
>>  mov  offsetq, 0
>>  mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
>> @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line,
>> in_lines_cur0, in_lines_adj0,
>>  movh   m4, [in_lines_cur1q+offsetq]
>>  punpcklbw  m3, m7
>>  punpcklbw  m4, m7
>> -SBUTTERFLY wd, 3, 4, 8
>> +SBUTTERFLY wd, 3, 4, 1
>>  pmaddwdm3, m0
>> -pmaddwdm4, m1
>> +pmaddwdm4, m0
>>  movh   m5, [in_lines_adj0q+offsetq]
>>  movh   m6, [in_lines_adj1q+offsetq]
>>  punpcklbw  m5, m7
>>  punpcklbw  m6, m7
>> -SBUTTERFLY wd, 5, 6, 8
>> +SBUTTERFLY wd, 5, 6, 1
>>  pmaddwdm5, m0
>> -pmaddwdm6, m1
>> +pmaddwdm6, m0
>>  paddd  m3, m5
>>  paddd  m4, m6
>>  movh   m5, [in_lines_cur2q+offsetq]
>>  movh   m6, [in_lines_adj2q+offsetq]
>>  punpcklbw  m5, m7
>>  punpcklbw  m6, m7
>> -SBUTTERFLY wd, 5, 6, 8
>> +SBUTTERFLY wd, 5, 6, 1
>>  pmaddwdm5, m2
>>  pmaddwdm6, m2
>>  paddd  m3, m5
>> --
>> 2.6.0
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>
> Can't this now be used on x32?

>>>
>>> Add to the data pointers directly (in_lines_cur0q and work_lineq). Then sub
>>> all other curXq/adjXq from cur0q (on 32bit only) before the loop and you
>>> have two adds (on 32bit) instead of one (on 64bit), but one reg less
>>> (offset), making it 7, which means it works.
>>>
>>> Ronald
>>
>> Ah, like it's being done in PACK_6CH from swr's audio_convert.asm
>> For complex_high some stack ab/use will be needed (see PACK_8CH), but it should
>> be doable.
>> This way w3fdif will be able to fully dethrone yadif :P
>
> Are you still working on w3fdif_simple_high for 32bit?
> I would be interested in that. Otherwise I might try to do it myself,
> although it sounds like a lot of #if'ery

I was bored and it was easy, so patch coming up anyway!
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2016-01-06 Thread Hendrik Leppkes
On Mon, Oct 12, 2015 at 1:21 AM, James Almer  wrote:
> On 10/11/2015 3:11 PM, Ronald S. Bultje wrote:
>> Hi,
>>
>> On Sun, Oct 11, 2015 at 1:17 PM, James Almer  wrote:
>>
>>> On 10/11/2015 4:31 AM, Paul B Mahol wrote:
 On 10/11/15, James Almer  wrote:
> Signed-off-by: James Almer 
> ---
>  libavfilter/x86/vf_w3fdif.asm | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/libavfilter/x86/vf_w3fdif.asm
>>> b/libavfilter/x86/vf_w3fdif.asm
> index f02319b..f2001a4 100644
> --- a/libavfilter/x86/vf_w3fdif.asm
> +++ b/libavfilter/x86/vf_w3fdif.asm
> @@ -103,13 +103,11 @@ REP_RET
>
>  %if ARCH_X86_64
>
> -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0,
> in_lines_adj0, coef, linesize
> +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0,
> in_lines_adj0, coef, linesize
>  movq  m2, [coefq]
>  DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0,
>>> in_lines_cur1,
> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
> -SPLATWm0, m2, 0
> -SPLATWm1, m2, 1
> +pshufdm0, m2, q
>  SPLATWm2, m2, 2
> -SBUTTERFLYwd, 0, 1, 7
>  pxor  m7, m7
>  mov  offsetq, 0
>  mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
> @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line,
> in_lines_cur0, in_lines_adj0,
>  movh   m4, [in_lines_cur1q+offsetq]
>  punpcklbw  m3, m7
>  punpcklbw  m4, m7
> -SBUTTERFLY wd, 3, 4, 8
> +SBUTTERFLY wd, 3, 4, 1
>  pmaddwdm3, m0
> -pmaddwdm4, m1
> +pmaddwdm4, m0
>  movh   m5, [in_lines_adj0q+offsetq]
>  movh   m6, [in_lines_adj1q+offsetq]
>  punpcklbw  m5, m7
>  punpcklbw  m6, m7
> -SBUTTERFLY wd, 5, 6, 8
> +SBUTTERFLY wd, 5, 6, 1
>  pmaddwdm5, m0
> -pmaddwdm6, m1
> +pmaddwdm6, m0
>  paddd  m3, m5
>  paddd  m4, m6
>  movh   m5, [in_lines_cur2q+offsetq]
>  movh   m6, [in_lines_adj2q+offsetq]
>  punpcklbw  m5, m7
>  punpcklbw  m6, m7
> -SBUTTERFLY wd, 5, 6, 8
> +SBUTTERFLY wd, 5, 6, 1
>  pmaddwdm5, m2
>  pmaddwdm6, m2
>  paddd  m3, m5
> --
> 2.6.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

 Can't this now be used on x32?
>>>
>>
>> Add to the data pointers directly (in_lines_cur0q and work_lineq). Then sub
>> all other curXq/adjXq from cur0q (on 32bit only) before the loop and you
>> have two adds (on 32bit) instead of one (on 64bit), but one reg less
>> (offset), making it 7, which means it works.
>>
>> Ronald
>
> Ah, like it's being done in PACK_6CH from swr's audio_convert.asm
> For complex_high some stack ab/use will be needed (see PACK_8CH), but it should
> be doable.
> This way w3fdif will be able to fully dethrone yadif :P

Are you still working on w3fdif_simple_high for 32bit?
I would be interested in that. Otherwise I might try to do it myself,
although it sounds like a lot of #if'ery
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2015-10-12 Thread Robert Krüger
>
>
> Ah, like it's being done in PACK_6CH from swr's audio_convert.asm
> For complex_high some stack ab/use will be needed (see PACK_8CH), but it should
> be doable.
> This way w3fdif will be able to fully dethrone yadif :P


Once it supports +8bit, that is ;-)
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2015-10-11 Thread James Almer
On 10/11/2015 3:11 PM, Ronald S. Bultje wrote:
> Hi,
> 
> On Sun, Oct 11, 2015 at 1:17 PM, James Almer  wrote:
> 
>> On 10/11/2015 4:31 AM, Paul B Mahol wrote:
>>> On 10/11/15, James Almer  wrote:
 Signed-off-by: James Almer 
 ---
  libavfilter/x86/vf_w3fdif.asm | 16 +++-
  1 file changed, 7 insertions(+), 9 deletions(-)

 diff --git a/libavfilter/x86/vf_w3fdif.asm
>> b/libavfilter/x86/vf_w3fdif.asm
 index f02319b..f2001a4 100644
 --- a/libavfilter/x86/vf_w3fdif.asm
 +++ b/libavfilter/x86/vf_w3fdif.asm
 @@ -103,13 +103,11 @@ REP_RET

  %if ARCH_X86_64

 -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0,
 in_lines_adj0, coef, linesize
 +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0,
 in_lines_adj0, coef, linesize
  movq  m2, [coefq]
  DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0,
>> in_lines_cur1,
 linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
 -SPLATWm0, m2, 0
 -SPLATWm1, m2, 1
 +pshufdm0, m2, q
  SPLATWm2, m2, 2
 -SBUTTERFLYwd, 0, 1, 7
  pxor  m7, m7
  mov  offsetq, 0
  mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
 @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line,
 in_lines_cur0, in_lines_adj0,
  movh   m4, [in_lines_cur1q+offsetq]
  punpcklbw  m3, m7
  punpcklbw  m4, m7
 -SBUTTERFLY wd, 3, 4, 8
 +SBUTTERFLY wd, 3, 4, 1
  pmaddwdm3, m0
 -pmaddwdm4, m1
 +pmaddwdm4, m0
  movh   m5, [in_lines_adj0q+offsetq]
  movh   m6, [in_lines_adj1q+offsetq]
  punpcklbw  m5, m7
  punpcklbw  m6, m7
 -SBUTTERFLY wd, 5, 6, 8
 +SBUTTERFLY wd, 5, 6, 1
  pmaddwdm5, m0
 -pmaddwdm6, m1
 +pmaddwdm6, m0
  paddd  m3, m5
  paddd  m4, m6
  movh   m5, [in_lines_cur2q+offsetq]
  movh   m6, [in_lines_adj2q+offsetq]
  punpcklbw  m5, m7
  punpcklbw  m6, m7
 -SBUTTERFLY wd, 5, 6, 8
 +SBUTTERFLY wd, 5, 6, 1
  pmaddwdm5, m2
  pmaddwdm6, m2
  paddd  m3, m5
 --
 2.6.0

 ___
 ffmpeg-devel mailing list
 ffmpeg-devel@ffmpeg.org
 http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

>>>
>>> Can't this now be used on x32?
>>
> 
> Add to the data pointers directly (in_lines_cur0q and work_lineq). Then sub
> all other curXq/adjXq from cur0q (on 32bit only) before the loop and you
> have two adds (on 32bit) instead of one (on 64bit), but one reg less
> (offset), making it 7, which means it works.
> 
> Ronald

Ah, like it's being done in PACK_6CH from swr's audio_convert.asm
For complex_high some stack ab/use will be needed (see PACK_8CH), but it should
be doable.
This way w3fdif will be able to fully dethrone yadif :P
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2015-10-11 Thread Ronald S. Bultje
Hi,

On Sun, Oct 11, 2015 at 1:17 PM, James Almer  wrote:

> On 10/11/2015 4:31 AM, Paul B Mahol wrote:
> > On 10/11/15, James Almer  wrote:
> >> Signed-off-by: James Almer 
> >> ---
> >>  libavfilter/x86/vf_w3fdif.asm | 16 +++-
> >>  1 file changed, 7 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/libavfilter/x86/vf_w3fdif.asm
> b/libavfilter/x86/vf_w3fdif.asm
> >> index f02319b..f2001a4 100644
> >> --- a/libavfilter/x86/vf_w3fdif.asm
> >> +++ b/libavfilter/x86/vf_w3fdif.asm
> >> @@ -103,13 +103,11 @@ REP_RET
> >>
> >>  %if ARCH_X86_64
> >>
> >> -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0,
> >> in_lines_adj0, coef, linesize
> >> +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0,
> >> in_lines_adj0, coef, linesize
> >>  movq  m2, [coefq]
> >>  DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0,
> in_lines_cur1,
> >> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
> >> -SPLATWm0, m2, 0
> >> -SPLATWm1, m2, 1
> >> +pshufdm0, m2, q
> >>  SPLATWm2, m2, 2
> >> -SBUTTERFLYwd, 0, 1, 7
> >>  pxor  m7, m7
> >>  mov  offsetq, 0
> >>  mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
> >> @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line,
> >> in_lines_cur0, in_lines_adj0,
> >>  movh   m4, [in_lines_cur1q+offsetq]
> >>  punpcklbw  m3, m7
> >>  punpcklbw  m4, m7
> >> -SBUTTERFLY wd, 3, 4, 8
> >> +SBUTTERFLY wd, 3, 4, 1
> >>  pmaddwdm3, m0
> >> -pmaddwdm4, m1
> >> +pmaddwdm4, m0
> >>  movh   m5, [in_lines_adj0q+offsetq]
> >>  movh   m6, [in_lines_adj1q+offsetq]
> >>  punpcklbw  m5, m7
> >>  punpcklbw  m6, m7
> >> -SBUTTERFLY wd, 5, 6, 8
> >> +SBUTTERFLY wd, 5, 6, 1
> >>  pmaddwdm5, m0
> >> -pmaddwdm6, m1
> >> +pmaddwdm6, m0
> >>  paddd  m3, m5
> >>  paddd  m4, m6
> >>  movh   m5, [in_lines_cur2q+offsetq]
> >>  movh   m6, [in_lines_adj2q+offsetq]
> >>  punpcklbw  m5, m7
> >>  punpcklbw  m6, m7
> >> -SBUTTERFLY wd, 5, 6, 8
> >> +SBUTTERFLY wd, 5, 6, 1
> >>  pmaddwdm5, m2
> >>  pmaddwdm6, m2
> >>  paddd  m3, m5
> >> --
> >> 2.6.0
> >>
> >> ___
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >
> > Can't this now be used on x32?
>

Add to the data pointers directly (in_lines_cur0q and work_lineq). Then sub
all other curXq/adjXq from cur0q (on 32bit only) before the loop and you
have two adds (on 32bit) instead of one (on 64bit), but one reg less
(offset), making it 7, which means it works.

Ronald
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2015-10-11 Thread James Almer
On 10/11/2015 4:31 AM, Paul B Mahol wrote:
> On 10/11/15, James Almer  wrote:
>> Signed-off-by: James Almer 
>> ---
>>  libavfilter/x86/vf_w3fdif.asm | 16 +++-
>>  1 file changed, 7 insertions(+), 9 deletions(-)
>>
>> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
>> index f02319b..f2001a4 100644
>> --- a/libavfilter/x86/vf_w3fdif.asm
>> +++ b/libavfilter/x86/vf_w3fdif.asm
>> @@ -103,13 +103,11 @@ REP_RET
>>
>>  %if ARCH_X86_64
>>
>> -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>> +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0,
>> in_lines_adj0, coef, linesize
>>  movq  m2, [coefq]
>>  DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0, in_lines_cur1,
>> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
>> -SPLATWm0, m2, 0
>> -SPLATWm1, m2, 1
>> +pshufdm0, m2, q
>>  SPLATWm2, m2, 2
>> -SBUTTERFLYwd, 0, 1, 7
>>  pxor  m7, m7
>>  mov  offsetq, 0
>>  mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
>> @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line,
>> in_lines_cur0, in_lines_adj0,
>>  movh   m4, [in_lines_cur1q+offsetq]
>>  punpcklbw  m3, m7
>>  punpcklbw  m4, m7
>> -SBUTTERFLY wd, 3, 4, 8
>> +SBUTTERFLY wd, 3, 4, 1
>>  pmaddwdm3, m0
>> -pmaddwdm4, m1
>> +pmaddwdm4, m0
>>  movh   m5, [in_lines_adj0q+offsetq]
>>  movh   m6, [in_lines_adj1q+offsetq]
>>  punpcklbw  m5, m7
>>  punpcklbw  m6, m7
>> -SBUTTERFLY wd, 5, 6, 8
>> +SBUTTERFLY wd, 5, 6, 1
>>  pmaddwdm5, m0
>> -pmaddwdm6, m1
>> +pmaddwdm6, m0
>>  paddd  m3, m5
>>  paddd  m4, m6
>>  movh   m5, [in_lines_cur2q+offsetq]
>>  movh   m6, [in_lines_adj2q+offsetq]
>>  punpcklbw  m5, m7
>>  punpcklbw  m6, m7
>> -SBUTTERFLY wd, 5, 6, 8
>> +SBUTTERFLY wd, 5, 6, 1
>>  pmaddwdm5, m2
>>  pmaddwdm6, m2
>>  paddd  m3, m5
>> --
>> 2.6.0
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
> 
> Can't this now be used on x32?

Even though I got it down to eight xmm regs, we still have only seven GPRs
to work with on x86_32.
The function has seven pointers plus the offset variable used as part of
effective addresses, which means they can't be accessed directly from
the stack for this purpose, something that can be done with linesize.

So it will need some changes, like constant movs of GPRs to and from the stack
to get it working.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2015-10-11 Thread Paul B Mahol
On 10/11/15, James Almer  wrote:
> Signed-off-by: James Almer 
> ---
>  libavfilter/x86/vf_w3fdif.asm | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
> index f02319b..f2001a4 100644
> --- a/libavfilter/x86/vf_w3fdif.asm
> +++ b/libavfilter/x86/vf_w3fdif.asm
> @@ -103,13 +103,11 @@ REP_RET
>
>  %if ARCH_X86_64
>
> -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0,
> in_lines_adj0, coef, linesize
> +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0,
> in_lines_adj0, coef, linesize
>  movq  m2, [coefq]
>  DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0, in_lines_cur1,
> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
> -SPLATWm0, m2, 0
> -SPLATWm1, m2, 1
> +pshufdm0, m2, q
>  SPLATWm2, m2, 2
> -SBUTTERFLYwd, 0, 1, 7
>  pxor  m7, m7
>  mov  offsetq, 0
>  mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
> @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line,
> in_lines_cur0, in_lines_adj0,
>  movh   m4, [in_lines_cur1q+offsetq]
>  punpcklbw  m3, m7
>  punpcklbw  m4, m7
> -SBUTTERFLY wd, 3, 4, 8
> +SBUTTERFLY wd, 3, 4, 1
>  pmaddwdm3, m0
> -pmaddwdm4, m1
> +pmaddwdm4, m0
>  movh   m5, [in_lines_adj0q+offsetq]
>  movh   m6, [in_lines_adj1q+offsetq]
>  punpcklbw  m5, m7
>  punpcklbw  m6, m7
> -SBUTTERFLY wd, 5, 6, 8
> +SBUTTERFLY wd, 5, 6, 1
>  pmaddwdm5, m0
> -pmaddwdm6, m1
> +pmaddwdm6, m0
>  paddd  m3, m5
>  paddd  m4, m6
>  movh   m5, [in_lines_cur2q+offsetq]
>  movh   m6, [in_lines_adj2q+offsetq]
>  punpcklbw  m5, m7
>  punpcklbw  m6, m7
> -SBUTTERFLY wd, 5, 6, 8
> +SBUTTERFLY wd, 5, 6, 1
>  pmaddwdm5, m2
>  pmaddwdm6, m2
>  paddd  m3, m5
> --
> 2.6.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

Can't this now be used on x32?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/2] x86/vf_w3fdif: simplify w3fdif_simple_high

2015-10-10 Thread James Almer
Signed-off-by: James Almer 
---
 libavfilter/x86/vf_w3fdif.asm | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
index f02319b..f2001a4 100644
--- a/libavfilter/x86/vf_w3fdif.asm
+++ b/libavfilter/x86/vf_w3fdif.asm
@@ -103,13 +103,11 @@ REP_RET
 
 %if ARCH_X86_64
 
-cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0, 
in_lines_adj0, coef, linesize
+cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, 
in_lines_adj0, coef, linesize
 movq  m2, [coefq]
 DEFINE_ARGSwork_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, 
linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
-SPLATWm0, m2, 0
-SPLATWm1, m2, 1
+pshufd m0, m2, q0000
 SPLATWm2, m2, 2
-SBUTTERFLYwd, 0, 1, 7
 pxor  m7, m7
 mov  offsetq, 0
 mov   in_lines_cur2q, [in_lines_cur0q+gprsize*2]
@@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, 
in_lines_cur0, in_lines_adj0,
 movh   m4, [in_lines_cur1q+offsetq]
 punpcklbw  m3, m7
 punpcklbw  m4, m7
-SBUTTERFLY wd, 3, 4, 8
+SBUTTERFLY wd, 3, 4, 1
 pmaddwdm3, m0
-pmaddwdm4, m1
+pmaddwdm4, m0
 movh   m5, [in_lines_adj0q+offsetq]
 movh   m6, [in_lines_adj1q+offsetq]
 punpcklbw  m5, m7
 punpcklbw  m6, m7
-SBUTTERFLY wd, 5, 6, 8
+SBUTTERFLY wd, 5, 6, 1
 pmaddwdm5, m0
-pmaddwdm6, m1
+pmaddwdm6, m0
 paddd  m3, m5
 paddd  m4, m6
 movh   m5, [in_lines_cur2q+offsetq]
 movh   m6, [in_lines_adj2q+offsetq]
 punpcklbw  m5, m7
 punpcklbw  m6, m7
-SBUTTERFLY wd, 5, 6, 8
+SBUTTERFLY wd, 5, 6, 1
 pmaddwdm5, m2
 pmaddwdm6, m2
 paddd  m3, m5
-- 
2.6.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel