Re: [FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang
On Fri, Jul 27, 2018 at 11:14 AM, Clément Bœsch wrote: > On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote: >> Clang is more strict about some things. >> --- >> libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S >> b/libavfilter/aarch64/vf_nlmeans_neon.S >> index 6308a428db..ac16157bbd 100644 >> --- a/libavfilter/aarch64/vf_nlmeans_neon.S >> +++ b/libavfilter/aarch64/vf_nlmeans_neon.S >> @@ -22,7 +22,7 @@ >> >> // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D} >> .macro acc_sum_store x, xb >> -dup v24.4S, v24.4S[3] // >> ...X -> >> +dup v24.4s, v24.s[3]// >> ...X -> > > can you keep the capitalized form? > >> ext v25.16B, v26.16B, \xb, #12 // >> ext(,ABCD,12)=0ABC >> add v24.4S, v24.4S, \x // >> +ABCD={X+A,X+B,X+C,X+D} >> add v24.4S, v24.4S, v25.4S // >> {X+A,X+B+A,X+C+B,X+D+C} (+0ABC) >> @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1 >> moviv26.4S, #0 // >> used as zero for the "rotations" in acc_sum_store >> sub x3, x3, w6, UXTW// >> s1 padding (s1_linesize - w) >> sub x5, x5, w6, UXTW// >> s2 padding (s2_linesize - w) >> -sub x9, x0, x1, UXTW #2 // >> dst_top >> +sub x9, x0, w1, UXTW #2 // >> dst_top >> sub x1, x1, w6, UXTW// >> dst padding (dst_linesize_32 - w) >> lsl x1, x1, #2 // >> dst padding expressed in bytes >> 1: mov w10, w6 // >> width copy for each line > > LGTM otherwise, thx > Fixed the capitalization with the s suffix and used "fix" instead of "fixup" in the commit message due to IRC review. Pushed, and default compilation configuration should now be fixed again with aarch64+clang. Jan ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang
On Fri, Jul 27, 2018 at 12:03:46AM +0300, Jan Ekström wrote: > Clang is more strict about some things. > --- > libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S > b/libavfilter/aarch64/vf_nlmeans_neon.S > index 6308a428db..ac16157bbd 100644 > --- a/libavfilter/aarch64/vf_nlmeans_neon.S > +++ b/libavfilter/aarch64/vf_nlmeans_neon.S > @@ -22,7 +22,7 @@ > > // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D} > .macro acc_sum_store x, xb > -dup v24.4S, v24.4S[3] // > ...X -> > +dup v24.4s, v24.s[3]// > ...X -> can you keep the capitalized form? > ext v25.16B, v26.16B, \xb, #12 // > ext(,ABCD,12)=0ABC > add v24.4S, v24.4S, \x // > +ABCD={X+A,X+B,X+C,X+D} > add v24.4S, v24.4S, v25.4S // > {X+A,X+B+A,X+C+B,X+D+C} (+0ABC) > @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1 > moviv26.4S, #0 // > used as zero for the "rotations" in acc_sum_store > sub x3, x3, w6, UXTW// > s1 padding (s1_linesize - w) > sub x5, x5, w6, UXTW// > s2 padding (s2_linesize - w) > -sub x9, x0, x1, UXTW #2 // > dst_top > +sub x9, x0, w1, UXTW #2 // > dst_top > sub x1, x1, w6, UXTW// > dst padding (dst_linesize_32 - w) > lsl x1, x1, #2 // > dst padding expressed in bytes > 1: mov w10, w6 // > width copy for each line LGTM otherwise, thx -- Clément B. signature.asc Description: PGP signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] lavfi/nlmeans: fixup aarch64 assembly with clang
Clang is more strict about some things. --- libavfilter/aarch64/vf_nlmeans_neon.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S b/libavfilter/aarch64/vf_nlmeans_neon.S index 6308a428db..ac16157bbd 100644 --- a/libavfilter/aarch64/vf_nlmeans_neon.S +++ b/libavfilter/aarch64/vf_nlmeans_neon.S @@ -22,7 +22,7 @@ // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D} .macro acc_sum_store x, xb -dup v24.4S, v24.4S[3] // ...X -> +dup v24.4s, v24.s[3]// ...X -> ext v25.16B, v26.16B, \xb, #12 // ext(,ABCD,12)=0ABC add v24.4S, v24.4S, \x // +ABCD={X+A,X+B,X+C,X+D} add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC) @@ -37,7 +37,7 @@ function ff_compute_safe_ssd_integral_image_neon, export=1 moviv26.4S, #0 // used as zero for the "rotations" in acc_sum_store sub x3, x3, w6, UXTW// s1 padding (s1_linesize - w) sub x5, x5, w6, UXTW// s2 padding (s2_linesize - w) -sub x9, x0, x1, UXTW #2 // dst_top +sub x9, x0, w1, UXTW #2 // dst_top sub x1, x1, w6, UXTW// dst padding (dst_linesize_32 - w) lsl x1, x1, #2 // dst padding expressed in bytes 1: mov w10, w6 // width copy for each line -- 2.17.1 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel