Re: [FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang

2018-07-28 Thread Jan Ekström
On Fri, Jul 27, 2018 at 11:14 AM, Clément Bœsch  wrote:
> On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote:
>> Clang is more strict about some things.
>> ---
>>  libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
>> b/libavfilter/aarch64/vf_nlmeans_neon.S
>> index 6308a428db..ac16157bbd 100644
>> --- a/libavfilter/aarch64/vf_nlmeans_neon.S
>> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S
>> @@ -22,7 +22,7 @@
>>
>>  // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
>>  .macro acc_sum_store x, xb
>> -dup v24.4S, v24.4S[3]   // 
>> ...X -> 
>> +dup v24.4s, v24.s[3]// 
>> ...X -> 
>
> can you keep the capitalized form?
>
>>  ext v25.16B, v26.16B, \xb, #12  // 
>> ext(,ABCD,12)=0ABC
>>  add v24.4S, v24.4S, \x  // 
>> +ABCD={X+A,X+B,X+C,X+D}
>>  add v24.4S, v24.4S, v25.4S  // 
>> {X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
>> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
>>  moviv26.4S, #0  // 
>> used as zero for the "rotations" in acc_sum_store
>>  sub x3, x3, w6, UXTW// 
>> s1 padding (s1_linesize - w)
>>  sub x5, x5, w6, UXTW// 
>> s2 padding (s2_linesize - w)
>> -sub x9, x0, x1, UXTW #2 // 
>> dst_top
>> +sub x9, x0, w1, UXTW #2 // 
>> dst_top
>>  sub x1, x1, w6, UXTW// 
>> dst padding (dst_linesize_32 - w)
>>  lsl x1, x1, #2  // 
>> dst padding expressed in bytes
>>  1:  mov w10, w6 // 
>> width copy for each line
>
> LGTM otherwise, thx
>

Fixed the capitalization with the s suffix and used "fix" instead of
"fixup" in the commit message due to IRC review.

Pushed, and default compilation configuration should now be fixed
again with aarch64+clang.

Jan
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang

2018-07-27 Thread Clément Bœsch
On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote:
> Clang is more strict about some things.
> ---
>  libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
> b/libavfilter/aarch64/vf_nlmeans_neon.S
> index 6308a428db..ac16157bbd 100644
> --- a/libavfilter/aarch64/vf_nlmeans_neon.S
> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S
> @@ -22,7 +22,7 @@
>  
>  // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
>  .macro acc_sum_store x, xb
> -dup v24.4S, v24.4S[3]   // 
> ...X -> 
> +dup v24.4s, v24.s[3]// 
> ...X -> 

can you keep the capitalized form?

>  ext v25.16B, v26.16B, \xb, #12  // 
> ext(,ABCD,12)=0ABC
>  add v24.4S, v24.4S, \x  // 
> +ABCD={X+A,X+B,X+C,X+D}
>  add v24.4S, v24.4S, v25.4S  // 
> {X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
>  moviv26.4S, #0  // 
> used as zero for the "rotations" in acc_sum_store
>  sub x3, x3, w6, UXTW// 
> s1 padding (s1_linesize - w)
>  sub x5, x5, w6, UXTW// 
> s2 padding (s2_linesize - w)
> -sub x9, x0, x1, UXTW #2 // 
> dst_top
> +sub x9, x0, w1, UXTW #2 // 
> dst_top
>  sub x1, x1, w6, UXTW// 
> dst padding (dst_linesize_32 - w)
>  lsl x1, x1, #2  // 
> dst padding expressed in bytes
>  1:  mov w10, w6 // 
> width copy for each line

LGTM otherwise, thx

-- 
Clément B.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang

2018-07-26 Thread Jan Ekström
Clang is more strict about some things.
---
 libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S 
b/libavfilter/aarch64/vf_nlmeans_neon.S
index 6308a428db..ac16157bbd 100644
--- a/libavfilter/aarch64/vf_nlmeans_neon.S
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -22,7 +22,7 @@
 
 // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
 .macro acc_sum_store x, xb
-dup v24.4S, v24.4S[3]   // 
...X -> 
+dup v24.4s, v24.s[3]// 
...X -> 
 ext v25.16B, v26.16B, \xb, #12  // 
ext(,ABCD,12)=0ABC
 add v24.4S, v24.4S, \x  // 
+ABCD={X+A,X+B,X+C,X+D}
 add v24.4S, v24.4S, v25.4S  // 
{X+A,X+B+A,X+C+B,X+D+C}   (+0ABC)
@@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
 moviv26.4S, #0  // 
used as zero for the "rotations" in acc_sum_store
 sub x3, x3, w6, UXTW// s1 
padding (s1_linesize - w)
 sub x5, x5, w6, UXTW// s2 
padding (s2_linesize - w)
-sub x9, x0, x1, UXTW #2 // 
dst_top
+sub x9, x0, w1, UXTW #2 // 
dst_top
 sub x1, x1, w6, UXTW// dst 
padding (dst_linesize_32 - w)
 lsl x1, x1, #2  // dst 
padding expressed in bytes
 1:  mov w10, w6 // 
width copy for each line
-- 
2.17.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel