Re: [PATCH 03/10] [i386] libgcc: Enable hfmode soft-sf/df/xf/tf extensions and truncations.

2021-07-26 Thread Hongtao Liu via Gcc-patches
On Thu, Jul 22, 2021 at 8:14 PM Richard Biener
 wrote:
>
> On Wed, Jul 21, 2021 at 9:43 AM liuhongt  wrote:
> >
> > gcc/ChangeLog:
> >
> > * optabs-query.c (get_best_extraction_insn): Use word_mode for
> > HF field.
> >
> > libgcc/ChangeLog:
> >
> > * config/i386/32/sfp-machine.h (_FP_NANFRAC_H): New macro.
> > * config/i386/64/sfp-machine.h (_FP_NANFRAC_H): Ditto.
> > * config/i386/sfp-machine.h (_FP_NANSIGN_H): Ditto.
> > * config/i386/t-softfp: Add hf soft-fp.
> > * config.host: Add i386/64/t-softfp.
> > * config/i386/64/t-softfp: New file.
> > ---
> >  gcc/optabs-query.c  | 10 +-
> >  libgcc/config.host  |  5 +
> >  libgcc/config/i386/32/sfp-machine.h |  1 +
> >  libgcc/config/i386/64/sfp-machine.h |  1 +
> >  libgcc/config/i386/64/t-softfp  |  1 +
> >  libgcc/config/i386/sfp-machine.h|  1 +
> >  libgcc/config/i386/t-softfp |  5 +
> >  7 files changed, 19 insertions(+), 5 deletions(-)
> >  create mode 100644 libgcc/config/i386/64/t-softfp
> >
> > diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
> > index 05ee5f517da..0438e451474 100644
> > --- a/gcc/optabs-query.c
> > +++ b/gcc/optabs-query.c
> > @@ -205,7 +205,15 @@ get_best_extraction_insn (extraction_insn *insn,
> >   machine_mode field_mode)
> >  {
> >opt_scalar_int_mode mode_iter;
> > -  FOR_EACH_MODE_FROM (mode_iter, smallest_int_mode_for_size (struct_bits))
> > +  scalar_int_mode smallest_int_mode;
> > +  /* FIXME: validate_subreg only allows (subreg:WORD_MODE (reg:HF) 0). */
>
> I think that needs "fixing" then, or alternatively the caller should care.
>
How about this:

modified   gcc/emit-rtl.c
@@ -928,6 +928,10 @@ validate_subreg (machine_mode omode, machine_mode imode,
  fix them all.  */
   if (omode == word_mode)
 ;
+  /* ??? Similarly to (subreg:DI (reg:SF)), also allow (subreg:SI (reg:HF))
+     here.  Though extract_bit_field is the culprit here, not the backends.  */
+  else if (imode == HFmode && omode == SImode)
+    ;
   /* ??? Similarly, e.g. with (subreg:DF (reg:TI)).  Though store_bit_field
  is the culprit here, and not the backends.  */
   else if (known_ge (osize, regsize) && known_ge (isize, osize))
new file   gcc/testsuite/gcc.target/i386/float16-5.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+_Float16
+foo (int a)
+{
+  union {
+    int a;
+    _Float16 b;
+  } c;
+  c.a = a;
+  return c.b;
+}

If it's ok, I'll merge the above change into the former commit:
"[PATCH 02/10] [i386] Enable _Float16 type for TARGET_SSE2 and above."

> > +  if (FLOAT_MODE_P (field_mode)
> > +      && known_eq (GET_MODE_SIZE (field_mode), 2))
> > +    smallest_int_mode = word_mode;
> > +  else
> > +    smallest_int_mode = smallest_int_mode_for_size (struct_bits);
> > +
> > +  FOR_EACH_MODE_FROM (mode_iter, smallest_int_mode)
> >  {
> >scalar_int_mode mode = mode_iter.require ();
> >if (get_extraction_insn (insn, pattern, type, mode))
> > diff --git a/libgcc/config.host b/libgcc/config.host
> > index 50f00062232..96da9ef1cce 100644
> > --- a/libgcc/config.host
> > +++ b/libgcc/config.host
> > @@ -1540,10 +1540,7 @@ i[34567]86-*-elfiamcu | i[34567]86-*-rtems*)
> > ;;
> >  i[34567]86-*-* | x86_64-*-*)
> > tmake_file="${tmake_file} t-softfp-tf"
> > -   if test "${host_address}" = 32; then
> > -   tmake_file="${tmake_file} i386/${host_address}/t-softfp"
> > -   fi
> > -   tmake_file="${tmake_file} i386/t-softfp t-softfp"
> > +   tmake_file="${tmake_file} i386/${host_address}/t-softfp i386/t-softfp t-softfp"
> > ;;
> >  esac
> >
> > diff --git a/libgcc/config/i386/32/sfp-machine.h 
> > b/libgcc/config/i386/32/sfp-machine.h
> > index 1fa282d7afe..e24cbc8d180 100644
> > --- a/libgcc/config/i386/32/sfp-machine.h
> > +++ b/libgcc/config/i386/32/sfp-machine.h
> > @@ -86,6 +86,7 @@
> >  #define _FP_DIV_MEAT_D(R,X,Y)   _FP_DIV_MEAT_2_udiv(D,R,X,Y)
> >  #define _FP_DIV_MEAT_Q(R,X,Y)   _FP_DIV_MEAT_4_udiv(Q,R,X,Y)
> >
> > +#define _FP_NANFRAC_H  _FP_QNANBIT_H
> >  #define _FP_NANFRAC_S  _FP_QNANBIT_S
> >  #define _FP_NANFRAC_D  _FP_QNANBIT_D, 0
> >  /* Even if XFmode is 12byte,  we have to pad it to
> > diff --git a/libgcc/config/i386/64/sfp-machine.h 
> > b/libgcc/config/i386/64/sfp-machine.h
> > index 1ff94c23ea4..e1c616699bb 100644
> > --- a/libgcc/config/i386/64/sfp-machine.h
> > +++ b/libgcc/config/i386/64/sfp-machine.h
> > @@ -13,6 +13,7 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
> >
> >  #define _FP_DIV_MEAT_Q(R,X,Y)   _FP_DIV_MEAT_2_udiv(Q,R,X,Y)
> >
> > +#define _FP_NANFRAC_H  _FP_QNANBIT_H
> >  #define _FP_NANFRAC_S  _FP_QNANBIT_S
> >  #define _FP_NANFRAC_D  _FP_QNANBIT_D
> >  #define _FP_NANFRAC_E  _FP_QNANBIT_E, 0
> > diff --git a/libgcc/config/i386/64/t-softfp b/libgcc/config/i386/64/t-softfp
> > new 

Re: [PATCH 23/55] rs6000: Incorporate new builtins code into the build machinery

2021-07-26 Thread Bill Schmidt via Gcc-patches

Hi Segher,

On 7/21/21 1:58 PM, Segher Boessenkool wrote:

On Thu, Jun 17, 2021 at 10:19:07AM -0500, Bill Schmidt wrote:

2021-06-07  Bill Schmidt  

gcc/
* config.gcc (extra_objs): Include rs6000-builtins.o and
rs6000-c.o.

The rs6000-c.o part needs an explanation, and probably should be a
separate (bugfix) patch (and it needs backports?)
OK.  Yeah, I'm a little surprised looking at this diff now...I wonder if 
it got removed by accident by some previous patch.  I'll investigate.


The changelog entry should read
* config.gcc (powerpc*-*-*): Include [...] in extra_objs.
or similar.

OK, thanks!  Will fix.



* config/rs6000/t-rs6000 (rs6000-gen-builtins.o): New target.
(rbtree.o): Likewise.
(rs6000-gen-builtins): Likewise.
(rs6000-builtins.c): Likewise.
(rs6000-builtins.h): Likewise.
(rs6000.o): Add dependency.
(EXTRA_HEADERS): Add rs6000-vecdefines.h.
(rs6000-vecdefines.h): New target.
(rs6000-builtins.o): Likewise.
(rs6000-call.o): Add rs6000-builtins.h as a dependency.
(rs6000-c.o): Likewise.
+rs6000-gen-builtins.o: $(srcdir)/config/rs6000/rs6000-gen-builtins.c
+   $(COMPILE) $(CXXFLAGS) $<
+   $(POSTCOMPILE)
+
+rbtree.o: $(srcdir)/config/rs6000/rbtree.c
+   $(COMPILE) $<
+   $(POSTCOMPILE)

Why does one need CXXFLAGS and the other does not?
Well, I'm as puzzled as you are.  I'll remove the $(CXXFLAGS) and see if 
anything goes wrong to remind me...but I think it's just unnecessary.



+# TODO: Whenever GNU make 4.3 is the minimum required, we should use
+# grouped targets on this:

That may be quite a while still.  GNU make is the foundation of
everything, so we cannot require very new versions of it ever.

In the meantime, you can make all these targets depend on an
intermediate target (that you mark with .INTERMEDIATE), and have that
intermediate target have the dependencies.  This is from version 3.74.3
and we require 3.80 already, so this is fine.
I think this is too artificial.  Right now I just make the two generated 
.h files depend on the generated .c file, which works since they are all 
generated together or none of them are generated.  That seems simple 
enough and more self-documenting to me.



+EXTRA_HEADERS += rs6000-vecdefines.h
+rs6000-vecdefines.h : rs6000-builtins.c

No space before the colon please.


Oopsie.

Thanks!
Bill




Segher


[PATCH] vect: Fix wrong check in vect_recog_mulhs_pattern [PR101596]

2021-07-26 Thread Kewen.Lin via Gcc-patches
Hi,

As PR101596 showed, vect_recog_mulhs_pattern uses target_precision to
check whether the scale_term is the expected one.  This can be wrong when
the precision of the actually used new_type is larger than
target_precision, as shown by the example: with 32-bit inputs widened to
64 bits, new_type has 32-bit precision, so a shift count of 32 matches
the pattern while 48 does not, regardless of the 16-bit target_precision
suggested by the short output type.

This patch uses the precision of new_type instead of target_precision
for the scale_term matching check.

Bootstrapped & regtested on powerpc64le-linux-gnu P10,
powerpc64-linux-gnu P8, x86_64-redhat-linux and aarch64-linux-gnu.

Is it ok for trunk?

BR,
Kewen
-
gcc/ChangeLog:

PR tree-optimization/101596
* tree-vect-patterns.c (vect_recog_mulhs_pattern): Fix wrong check
by using new_type's precision instead.

gcc/testsuite/ChangeLog:

PR tree-optimization/101596
* gcc.target/powerpc/pr101596-1.c: New test.
* gcc.target/powerpc/pr101596-2.c: Likewise.
* gcc.target/powerpc/pr101596-3.c: Likewise.
---
 gcc/testsuite/gcc.target/powerpc/pr101596-1.c | 30 +
 gcc/testsuite/gcc.target/powerpc/pr101596-2.c | 30 +
 gcc/testsuite/gcc.target/powerpc/pr101596-3.c | 58 +
 gcc/tree-vect-patterns.c  | 63 +++
 4 files changed, 154 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr101596-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr101596-2.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr101596-3.c

diff --git a/gcc/testsuite/gcc.target/powerpc/pr101596-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr101596-1.c
new file mode 100644
index 000..ed09e5f96ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr101596-1.c
@@ -0,0 +1,30 @@
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fdump-tree-vect-details" } */
+
+/* Check vect_recog_mulhs_pattern can't be detected with shift count 48.  */
+
+#define N 128
+
+typedef signed long long sLL;
+typedef unsigned long long uLL;
+
+signed int si_a[N], si_b[N];
+unsigned int ui_a[N], ui_b[N];
+signed short sh_c[N];
+unsigned short uh_c[N];
+
+void
+test1 ()
+{
+  for (int i = 0; i < N; i++)
+    sh_c[i] = ((sLL) si_a[i] * (sLL) si_b[i]) >> 48;
+}
+
+void
+test2 ()
+{
+  for (int i = 0; i < N; i++)
+    uh_c[i] = ((uLL) ui_a[i] * (uLL) ui_b[i]) >> 48;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_mulhs_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr101596-2.c 
b/gcc/testsuite/gcc.target/powerpc/pr101596-2.c
new file mode 100644
index 000..5caa7ce4189
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr101596-2.c
@@ -0,0 +1,30 @@
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model -fdump-tree-vect-details" } */
+
+/* Check vect_recog_mulhs_pattern can be detected with shift count 32.  */
+
+#define N 128
+
+typedef signed long long sLL;
+typedef unsigned long long uLL;
+
+signed int si_a[N], si_b[N];
+unsigned int ui_a[N], ui_b[N];
+signed short sh_c[N];
+unsigned short uh_c[N];
+
+void
+test1 ()
+{
+  for (int i = 0; i < N; i++)
+    sh_c[i] = ((sLL) si_a[i] * (sLL) si_b[i]) >> 32;
+}
+
+void
+test2 ()
+{
+  for (int i = 0; i < N; i++)
+    uh_c[i] = ((uLL) ui_a[i] * (uLL) ui_b[i]) >> 32;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_mulhs_pattern: detected" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr101596-3.c 
b/gcc/testsuite/gcc.target/powerpc/pr101596-3.c
new file mode 100644
index 000..4b92661412d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr101596-3.c
@@ -0,0 +1,58 @@
+/* { dg-do run } */
+/* { dg-require-effective-target power10_hw } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* Verify the execution goes well with shift count either 32 or 48.  */
+
+#define N 128
+
+typedef signed int si;
+typedef signed short sh;
+typedef signed long long sll;
+typedef unsigned int ui;
+typedef unsigned short uh;
+typedef unsigned long long ull;
+
+si si_a[N], si_b[N];
+ui ui_a[N], ui_b[N];
+sh sh_c[N];
+uh uh_c[N];
+
+#define TEST(NTYPE, TYPE, WTYPE, CNT)                                         \
+  void __attribute__ ((noipa)) test_##TYPE##CNT ()                            \
+  {                                                                           \
+    for (int i = 0; i < N; i++)                                               \
+      NTYPE##_c[i] = ((WTYPE) TYPE##_a[i] * (WTYPE) TYPE##_b[i]) >> CNT;      \
+  }                                                                           \
+                                                                              \
+  void __attribute__ ((noipa, optimize ("O1"))) check_##TYPE##CNT ()          \
+  {                                                                           \
+    test_##TYPE##CNT ();                                                      \
+    for (int i = 0; i <

Re: [PATCH V3] Use preferred mode for doloop IV [PR61837]

2021-07-26 Thread Jiufu Guo via Gcc-patches

Jeff Law  writes:


On 7/15/2021 4:08 AM, Jiufu Guo via Gcc-patches wrote:

Refined code for V2 according to review comments:
* Use an if check instead of an assert, and refine the assert.
* Use better RE checks for the test case, e.g. (?n)/(?p).
* Use better wording in target.def.

Currently, the doloop.xx variable uses the same type as niter, which may
be narrower than the word size.  For some targets, it would be better to
use a word-sized type.  For example, on a 64-bit system, accessing a
32-bit value may require a subreg, so a 64-bit type may be better for
niter if the value can be represented in both 32 and 64 bits.

This patch adds a target hook to query the preferred mode for the doloop
IV, and updates the mode accordingly.

Bootstrap and regtest pass on powerpc64le; is this ok for trunk?


BR.
Jiufu

gcc/ChangeLog:

2021-07-15  Jiufu Guo  

PR target/61837
* config/rs6000/rs6000.c (TARGET_PREFERRED_DOLOOP_MODE): New hook.
(rs6000_preferred_doloop_mode): New hook.
* doc/tm.texi: Regenerate.
* doc/tm.texi.in: Add hook preferred_doloop_mode.
* target.def (preferred_doloop_mode): New hook.
* targhooks.c (default_preferred_doloop_mode): New hook.
* targhooks.h (default_preferred_doloop_mode): New hook.
* tree-ssa-loop-ivopts.c (compute_doloop_base_on_mode): New function.
(add_iv_candidate_for_doloop): Call targetm.preferred_doloop_mode
and compute_doloop_base_on_mode.

gcc/testsuite/ChangeLog:

2021-07-15  Jiufu Guo  

PR target/61837
* gcc.target/powerpc/pr61837.c: New test.

Hi Jeff,

My first reaction was that whatever type corresponds to the 
target's word_mode would be the right choice.  But then I 
remembered things like dbCC on m68k which had a more limited 
range.  While I don't think m68k uses the doloop bits, it's a 
clear example that the most desirable type may not correspond to 
the word type for the target.
I checked the implementations of the doloop_end insn again.  m68k does
not seem to implement this insn.  On most targets that do implement
doloop_end, word_mode meets the insn's requirements.  s390/tilegx and
pru are a little different: s390 checks SI/DI and the arch, tilegx uses
an if-then-else jump, and pru looks weird in that UNITS_PER_WORD is
defined as 1 (so word_mode is 8 bits) while doloop_end rejects QImode.
So word_mode would be suitable for most targets, though it may not be
the best choice for some, as you said.  Hence this patch introduces a
hook for targets to tune.




So my concern with this patch is its introducing more target 
dependencies into the gimple pipeline which is generally 
considered undesirable from a design standpoint.  Is there any 
way to lower from whatever type is chosen by ivopts to the 
target's desired type at the gimple->rtl border rather than 
doing it in ivopts?
Currently, once gimple chooses the type of the doloop iv, we keep it
until the RTL doloop pass, and any suboptimal code (like subreg
accesses) survives into the asm.
To change the type when lowering the doloop iv (at the expand pass?),
we would also need a hook.
Choosing a suitable type for the doloop iv early, at the gimple level,
avoids such sub-optimalities (e.g. type conversions) in all following
passes.
So this patch selects the preferred type for the doloop iv when it is
generated.
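For reference, a minimal sketch of the default hook as I read the
proposal (the committed form may differ):

/* targhooks.c: by default simply keep whatever mode ivopts chose;
   only targets such as rs6000 override this to widen to word_mode.  */
machine_mode
default_preferred_doloop_mode (machine_mode mode)
{
  return mode;
}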


Thanks for your review and further comments!

BR,
Jiufu



jeff


Re: [PATCH] Make loops_list support an optional loop_p root

2021-07-26 Thread Kewen.Lin via Gcc-patches
on 2021/7/24 12:26 AM, Martin Sebor wrote:
> On 7/23/21 2:41 AM, Kewen.Lin wrote:
>> on 2021/7/22 8:56 PM, Richard Biener wrote:
>>> On Tue, Jul 20, 2021 at 4:37 PM Kewen.Lin  wrote:

 Hi,

 This v2 has addressed some review comments/suggestions:

   - Use "!=" instead of "<" in function operator!= (const Iter &)
    - Add new CTOR loops_list (struct loops *loops, unsigned flags)
  to support loop hierarchy tree rather than just a function,
  and adjust to use loops* accordingly.
>>>
>>> I actually meant struct loop *, not struct loops * ;)  At the point
>>> we pondered to make loop invariant motion work on single
>>> loop nests we gave up not only but also because it iterates
>>> over the loop nest but all the iterators only ever can process
>>> all loops, not say, all loops inside a specific 'loop' (and
>>> including that 'loop' if LI_INCLUDE_ROOT).  So the
>>> CTOR would take the 'root' of the loop tree as argument.
>>>
>>> I see that doesn't trivially fit how loops_list works, at least
>>> not for LI_ONLY_INNERMOST.  But I guess FROM_INNERMOST
>>> could be adjusted to do ONLY_INNERMOST as well?
>>>
>>
>>
>> Thanks for the clarification!  I just realized that the previous
>> version with struct loops* is problematic: all traversal is still
>> bounded by outer_loop == NULL.  I think what you expect is to respect
>> the given loop_p root boundary.  Since we just record the loops'
>> nums, I think we still need the function* fn?  So I added one
>> optional argument loop_p root and updated the visiting code
>> accordingly.  Before this change, the visiting used outer_loop ==
>> NULL as the termination condition, which naturally includes the root
>> itself; but with a given root, we have to use the root itself as the
>> termination condition to avoid iterating onto its possibly existing
>> next sibling.
>>
>> For LI_ONLY_INNERMOST, I was thinking whether we can use the
>> code like:
>>
>>  vec<loop_p, va_gc> *fn_loops = loops_for_fn (fn)->larray;
>>  for (i = 0; vec_safe_iterate (fn_loops, i, &aloop); i++)
>>    if (aloop != NULL
>>        && aloop->inner == NULL
>>        && flow_loop_nested_p (tree_root, aloop))
>>      this->to_visit.quick_push (aloop->num);
>>
>> it has a stable bound, but if the given root has only a few child
>> loops, it can be much worse when there are many loops in fn.
>> It seems impossible to predict the size of the given root's loop
>> hierarchy; maybe we can still use the original linear search for the
>> case loops_for_fn (fn) == root?  But since this visiting does not
>> seem performance critical, I chose to share the code originally used
>> for FROM_INNERMOST, hoping it has better readability and
>> maintainability.
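For reference, a sketch of the intended use of the optional root
argument; the parameter order and the helper name are my assumptions,
not the exact patch:

/* Count the innermost loops nested inside OUTER.  */
static unsigned
count_innermost_in (function *fn, loop_p outer)
{
  unsigned n = 0;
  for (auto loop : loops_list (fn, LI_ONLY_INNERMOST, outer))
    {
      /* LI_ONLY_INNERMOST guarantees LOOP has no children.  */
      gcc_checking_assert (loop->inner == NULL);
      n++;
    }
  return n;
}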
> 
> I might be mixing up the two patches (they both seem to touch
> the same functions), but in this one the loops_list ctor looks
> like a sizeable function with at least one loop.  Since the ctor
> is used in the initialization of each of the many range-for loops,
> that could result in inlining of a lot of these calls and so quite
> a bit code bloat.  Unless this is necessary for efficiency  (not
> my area) I would recommend to consider defining the loops_list
> ctor out-of-line in some .c or .cc file.
> 

Yeah, they touch the same functions.  Good point on the code bloat,
I'm not sure of the historical reason here; it needs Richi's input.  :)

> (Also, if you agree with the rationale, I'd replace loop_p with
> loop * in the new code.)
> 

Oh, thanks for the reminder, will update it.  

BR,
Kewen

> Thanks
> Martin
> 
>>
>> Bootstrapped and regtested on powerpc64le-linux-gnu P9,
>> x86_64-redhat-linux and aarch64-linux-gnu, also
>> bootstrapped on ppc64le P9 with bootstrap-O3 config.
>>
>> Does the attached patch meet what you expect?
>>
>> BR,
>> Kewen
>> -
>> gcc/ChangeLog:
>>
>> * cfgloop.h (loops_list::loops_list): Add one optional argument root
>> and adjust accordingly.
>>
> 


[PATCH v4] Use range-based for loops for traversing loops

2021-07-26 Thread Kewen.Lin via Gcc-patches
on 2021/7/24 12:10 AM, Martin Sebor wrote:
> On 7/23/21 2:35 AM, Kewen.Lin wrote:
>> Hi,
>>
>> Compared to v2, this v3 removed the new CTOR with struct loops *loops
>> as Richi clarified.  I'd like to support it in a separated follow up
>> patch by extending the existing CTOR with an optional argument loop_p
>> root.
> 
> Looks very nice (and quite a bit of work)!  Thanks again!
> 
> Not to make even more work for you, but it occurred to me that
> the declaration of the loop control variable could be simplified
> by the use of auto like so:
> 
>  for (auto loop: loops_list (cfun, ...))
> 

Thanks for the suggestion!  Updated in v4 accordingly.

I was under the impression that using C++11 auto is arguable since it sometimes
may make things less clear.  But I think you are right: using auto here won't
make it harder to read, just more concise.  Thanks again.

> I spotted what looks to me like a few minor typos in the docs
> diff:
> 
> diff --git a/gcc/doc/loop.texi b/gcc/doc/loop.texi
> index a135656ed01..27697b08728 100644
> --- a/gcc/doc/loop.texi
> +++ b/gcc/doc/loop.texi
> @@ -79,14 +79,14 @@ and its subloops in the numbering.  The index of a loop 
> never changes.
> 
> The entries of the @code{larray} field should not be accessed directly.
> The function @code{get_loop} returns the loop description for a loop with
> -the given index.  @code{number_of_loops} function returns number of
> -loops in the function.  To traverse all loops, use @code{FOR_EACH_LOOP}
> -macro.  The @code{flags} argument of the macro is used to determine
> -the direction of traversal and the set of loops visited.  Each loop is
> -guaranteed to be visited exactly once, regardless of the changes to the
> -loop tree, and the loops may be removed during the traversal.  The newly
> -created loops are never traversed, if they need to be visited, this
> -must be done separately after their creation.
> +the given index.  @code{number_of_loops} function returns number of loops
> +in the function.  To traverse all loops, use range-based for loop with
> 
> Missing article:
> 
>   use a range-based for loop
> 
> +class @code{loop_list} instance. The @code{flags} argument of the macro
> 
> Is that loop_list or loops_list?
> 
> IIUC, it's also not a macro anymore, right?  The flags argument
> is passed to the loop_list ctor, no?
> 

Oops, thanks for catching all above ones!  Fixed in v4.

Bootstrapped and regtested again on powerpc64le-linux-gnu P9,
x86_64-redhat-linux and aarch64-linux-gnu, also
bootstrapped again on ppc64le P9 with bootstrap-O3 config.

Is it ok for trunk?

BR,
Kewen
-
gcc/ChangeLog:

* cfgloop.h (as_const): New function.
(class loop_iterator): Rename to ...
(class loops_list): ... this.
(loop_iterator::next): Rename to ...
(loops_list::Iter::fill_curr_loop): ... this and adjust.
(loop_iterator::loop_iterator): Rename to ...
(loops_list::loops_list): ... this and adjust.
(loops_list::Iter): New class.
(loops_list::iterator): New type.
(loops_list::const_iterator): New type.
(loops_list::begin): New function.
(loops_list::end): Likewise.
(loops_list::begin const): Likewise.
(loops_list::end const): Likewise.
(FOR_EACH_LOOP): Remove.
(FOR_EACH_LOOP_FN): Remove.
* cfgloop.c (flow_loops_dump): Adjust FOR_EACH_LOOP* with range-based
for loop with loops_list instance.
(sort_sibling_loops): Likewise.
(disambiguate_loops_with_multiple_latches): Likewise.
(verify_loop_structure): Likewise.
* cfgloopmanip.c (create_preheaders): Likewise.
(force_single_succ_latches): Likewise.
* config/aarch64/falkor-tag-collision-avoidance.c
(execute_tag_collision_avoidance): Likewise.
* config/mn10300/mn10300.c (mn10300_scan_for_setlb_lcc): Likewise.
* config/s390/s390.c (s390_adjust_loops): Likewise.
* doc/loop.texi: Likewise.
* gimple-loop-interchange.cc (pass_linterchange::execute): Likewise.
* gimple-loop-jam.c (tree_loop_unroll_and_jam): Likewise.
* gimple-loop-versioning.cc (loop_versioning::analyze_blocks): Likewise.
(loop_versioning::make_versioning_decisions): Likewise.
* gimple-ssa-split-paths.c (split_paths): Likewise.
* graphite-isl-ast-to-gimple.c (graphite_regenerate_ast_isl): Likewise.
* graphite.c (canonicalize_loop_form): Likewise.
(graphite_transform_loops): Likewise.
* ipa-fnsummary.c (analyze_function_body): Likewise.
* ipa-pure-const.c (analyze_function): Likewise.
* loop-doloop.c (doloop_optimize_loops): Likewise.
* loop-init.c (loop_optimizer_finalize): Likewise.
(fix_loop_structure): Likewise.
* loop-invariant.c (calculate_loop_reg_pressure): Likewise.
(move_loop_invariants): Likewise.
* loop-unroll.c (decide_unrolling): Likewise.
(unroll_loops): Likewise.
* 

Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.

2021-07-26 Thread Hongtao Liu via Gcc-patches
On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu  wrote:
>
> Correct mail list, please reply under this email.
>
> On Mon, Jul 26, 2021 at 4:47 PM liuhongt  wrote:
> >
> > Hi:
> >   As described in the PR, the pinsr instruction has poor throughput on SKX
> > and CLX, which leads to worse vectorization performance in some cases.
> > This patch adds a cost member named integer_to_sse to model pinsr/movd,
> > which is used by vector construction; the cost is the same as sse_op on
> > other targets, but twice as much as sse_op on CLX/SKX.
> >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> >   Ok for trunk?
> >
I'm going to check in this patch if there's no objection.
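For reference, a user-level illustration (mine, not from the patch) of
the kind of code the new cost affects: constructing a vector from
scalar integers, which expands to movd/pinsrd.

typedef int v4si __attribute__ ((vector_size (16)));

/* vec_construct of four ints: with the patch, its cost on SKX/CLX is
   modeled as 4 * integer_to_sse (twice 4 * sse_op), discouraging such
   vectorization where pinsr throughput is poor.  */
v4si
build (int a, int b, int c, int d)
{
  return (v4si) { a, b, c, d };
}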
> > gcc/ChangeLog:
> >
> > PR target/99881
> > * config/i386/i386.h (processor_costs): Add new member
> > integer_to_sse.
> > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > generic_cost, core_cost): Initialize integer_to_sse same value
> > as sse_op.
> > (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > Use integer_to_sse instead of sse_op to calculate the cost of
> > vec_construct.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/99881
> > * gcc.target/i386/pr99881.c: New test.
> > ---
> >  gcc/config/i386/i386.c  |  6 ++-
> >  gcc/config/i386/i386.h  |  1 +
> >  gcc/config/i386/x86-tune-costs.h| 26 +
> >  gcc/testsuite/gcc.target/i386/pr99881.c | 49 +
> >  4 files changed, 81 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index ff96134fb37..fbebd2d8f9a 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum 
> > vect_cost_for_stmt type_of_cost,
> >case vec_construct:
> > {
> >   /* N element inserts into SSE vectors.  */
> > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > + int cost
> > +   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ? ix86_cost->sse_op
> > +                                           : ix86_cost->integer_to_sse);
> > +
> >   /* One vinserti128 for combining two SSE vectors for AVX256.  */
> >   if (GET_MODE_BITSIZE (mode) == 256)
> > cost += ix86_vec_cost (mode, ix86_cost->addss);
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 0c2c93daf32..d1e1c225990 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -165,6 +165,7 @@ struct processor_costs {
> >const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
> > zmm_move;
> >const int sse_to_integer;/* cost of moving SSE register to integer.  */
> > +  const int integer_to_sse;/* cost of moving integer to SSE register.  */
> >const int gather_static, gather_per_elt; /* Cost of gather load is computed
> >as static + per_item * nelts. */
> >const int scatter_static, scatter_per_elt; /* Cost of gather store is
> > diff --git a/gcc/config/i386/x86-tune-costs.h 
> > b/gcc/config/i386/x86-tune-costs.h
> > index ffe810f2bcb..67cfa006196 100644
> > --- a/gcc/config/i386/x86-tune-costs.h
> > +++ b/gcc/config/i386/x86-tune-costs.h
> > @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
> >in 128bit, 256bit and 512bit */
> >3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
> >3,   /* cost of moving SSE register to integer.  */
> > +  COSTS_N_BYTES (2),   /* cost of moving integer to sse register.  */
> >5, 0,/* Gather load static, per_elt.  */
> >5, 0,/* Gather store static, per_elt.  */
> >0,   /* size of l1 cache  */
> > @@ -211,6 +212,7 @@ struct processor_costs i386_cost = {/* 386 specific costs */
> >{4, 8, 16, 32, 64},  /* cost of unaligned stores.  */
> >2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
> >3,   /* cost of moving SSE register to integer.

Re: [PATCH] RISC-V: Enable overlap-by-pieces in case of fast unaligned access

2021-07-26 Thread Palmer Dabbelt

On Mon, 26 Jul 2021 03:05:21 PDT (-0700), Andrew Waterman wrote:

On Thu, Jul 22, 2021 at 10:27 AM Palmer Dabbelt  wrote:


On Thu, 22 Jul 2021 06:29:46 PDT (-0700), gcc-patches@gcc.gnu.org wrote:
> Could you add a testcase? Otherwise LGTM.
>
> Option: -O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64
> void foo(char *dst){
>    __builtin_memset(dst, 0, 15);
> }

I'd like to see:

* Test results.  This is only on for one target right now, so relying on
  it to just work on others isn't a good idea.
* Something to demonstrate this doesn't break -mstrict-align.
* Some sort of performance analysis.  Most machines that support
  unaligned access do so with some performance degradation,


Also, some machines that gracefully support misaligned accesses under
most circumstances nevertheless experience a perf degradation when the
load depends on two stores that overlap partially but not fully.  This
transformation will obviously trigger such behavior from time to time.


Ya, I thought I wrote a response to this but I guess it's just in a 
buffer somewhere.  The code sequences this is generating are really the 
worst case for unaligned stores: one of them is always guaranteed to be 
misaligned, and it partially overlaps with a store one cycle away.


We're really only saving a handful of instructions at best here, so 
there's not much room for error when it comes to these sorts of things.  
Even if this difficult case is handled fast we're only talking about 
saving 2 cycles, which is pretty borderline as the single-issue in-order 
machines I've worked with that do support misaligned accesses tend to 
take at least a few cycles of performance hit on misaligned accesses.  
Even if misaligned accesses are single cycle, some back of the envelope 
calculations says that a pipeline flush when crossing a cache line would 
definitely push this negative and one per page crossing would be in the 
margins (depending on how we assume the original accesses are aligned).


This is way too subtle of a thing to analyze without at least some 
knowledge of how this pipeline works, whether that's from a benchmark or 
just a pipeline description.



Note, I'm not objecting to this patch; I'm only responding to Palmer's
comment.  It makes sense to enable this kind of optimization for
-mtune=native etc., just not for standard software distributions.


IMO there are really two cases here: -mtune=c906 and -Os (-mtune=native 
is sort of a red herring, it's just syntax).  For -mtune=c906 I'm happy 
enabling this as long as it's actually correct and improves performance, 
but that'll need at least some hardware-oriented analysis -- whether 
it's a benchmark or just a confirmation that these sequences are 
actually expected to run fast.


-Os is a different case, though.  Last time this came up we decided that 
-Os shouldn't purposefully misalign accesses, even when that saves code 
size, as that's too likely to hit pathological performance cases.  I 
don't know if the weights have changed here: the C906 is currently by 
far the cheapest chip (which likely means it's going to be the most 
popular), but it's unclear as to whether it's even RISC-V compliant and 
I don't know of many people who've actually gotten one.  IMO it's too 
early to flip -Os over to this behavior, we at least need to know that 
this chip is going to be sufficiently RISC-V-ey that standard software 
will even run on it much less be popular.






  it's unclear
  that this conversion will actually manifest a performance improvement.
  I don't have a C906 and don't know what workloads people care about
  running on one, but I'm certainly not convinced this is a win --
  what's listed here looks to be the best case, and that's only saving
  two instructions to generate a pretty pathological sequence
  (misaligned access that conflicts with a prior store).


Ah, I guess there it was ;)



Jojo: do you have any description of the C906 pipeline?  Specifically in
this case it'd be good to know how it handles unaligned accesses.

>
> On Thu, Jul 22, 2021 at 8:53 PM Christoph Muellner via Gcc-patches
>  wrote:
>>
>> This patch enables the overlap-by-pieces feature of the by-pieces
>> infrastructure for inlining builtins in case the target has set
>> riscv_slow_unaligned_access_p to false.
>>
>> To demonstrate the effect for targets with fast unaligned access,
>> the following code sequences are generated for a 15-byte memset-zero.
>>
>> Without overlap_op_by_pieces we get:
>>   8e:   00053023    sd    zero,0(a0)
>>   92:   00052423    sw    zero,8(a0)
>>   96:   00051623    sh    zero,12(a0)
>>   9a:   00050723    sb    zero,14(a0)
>>
>> With overlap_op_by_pieces we get:
>>   7e:   00053023    sd    zero,0(a0)
>>   82:   000533a3    sd    zero,7(a0)
>>
>> gcc/ChangeLog:
>>
>> * config/riscv/riscv.c (riscv_overlap_op_by_pieces): New function.
>> (TARGET_OVERLAP_OP_BY_PIECES_P): 
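For orientation, a minimal sketch of the riscv.c side as the description
suggests -- the hook keys off the existing slow-unaligned-access flag;
the actual patch (cut off above) may differ:

/* Implement TARGET_OVERLAP_OP_BY_PIECES_P.  */
static bool
riscv_overlap_op_by_pieces (void)
{
  return !riscv_slow_unaligned_access_p;
}

#undef TARGET_OVERLAP_OP_BY_PIECES_P
#define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces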

[PATCH] Propagate get_nonzero_bits information in division [PR77980]

2021-07-26 Thread Victor Tong via Gcc-patches
This change enables the "t1 != 0" check to be optimized away in this code:

int x1 = 0;
unsigned int x2 = 1;

int main ()
{
int t1 = x1*(1/(x2+x2));
if (t1 != 0) __builtin_abort();
return 0;
}

The change utilizes the VRP framework to propagate the get_nonzero_bits
information from the "x2+x2" expression to the "1/(x2+x2)" division
expression.  Specifically, the framework knows that the least significant
bit of the "x2+x2" expression must be zero: the sum is always even, so in
any well-defined execution it is at least 2, which forces "1/(x2+x2)" --
and therefore t1 -- to be zero.

The get_nonzero_bits information of the left-hand side and right-hand
side of expressions needed to be passed down to operator_div::wi_fold()
in the VRP framework.  The majority of this change involves adding two
additional parameters to propagate this information.  There are future
opportunities to use the nonzero-bits information to perform better
optimizations in other types of expressions.
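A standalone check of the propagated fact (my illustration, not part of
the patch): x2 + x2 always has its low bit clear, so whenever the
division is well defined the divisor is at least 2 and the quotient is 0.

#include <assert.h>

int
main (void)
{
  for (unsigned x = 1; x < 1000; x++)
    assert (1u / (x + x) == 0);  /* divisor is even and >= 2 */
  return 0;
}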

The changes were tested against x86_64-pc-linux-gnu and all tests in "make -k 
check" passed.

The original approach was to implement a match.pd pattern to support this but 
the pattern wasn't being triggered. More context is available in: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77980

Thanks,
Victor

Attachment: pr77980.patch


Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Andrew MacLeod via Gcc-patches

On 7/26/21 5:01 PM, Jakub Jelinek wrote:

On Mon, Jul 26, 2021 at 04:57:14PM -0400, Andrew MacLeod wrote:

 Handle ASCII and EBCDIC in toupper and tolower ranges.
 
 gcc/

 PR tree-optimization/78888
 * gimple-range-fold.cc (get_letter_range): New.
 (fold_using_range::range_of_builtin_call): Call get_letter_range.
 
 gcc/testsuite/

 * gcc.dg/pr78888.c: Add extra non-standard verifications.

diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc
index 8465b4a82f6..a952f693c77 100644
--- a/gcc/gimple-range-fold.cc
+++ b/gcc/gimple-range-fold.cc
@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "cfgloop.h"
  #include "tree-ssa-loop.h"
  #include "tree-scalar-evolution.h"
+#include "langhooks.h"
  #include "vr-values.h"
  #include "range.h"
  #include "value-query.h"
@@ -835,6 +836,30 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
  r.set_varying (type);
  }
  
+// Return TRUE if we recognize the target character set and return the
+// range for lower case and upper case letters.
+
+static bool
+get_letter_range (tree type, irange &lowers, irange &uppers)
+{
+  // ASCII
+  int a = lang_hooks.to_target_charset ('a');
+  int z = lang_hooks.to_target_charset ('z');
+  int A = lang_hooks.to_target_charset ('A');
+  int Z = lang_hooks.to_target_charset ('Z');
+
+  if ((z - a == 25) && (Z - A == 25))

I would leave the extraneous ()s around both comparisons.


+{
+  lowers = int_range<2> (build_int_cst (type, 'a'),
+                         build_int_cst (type, 'z'));
+  uppers = int_range<2> (build_int_cst (type, 'A'),
+                         build_int_cst (type, 'Z'));

Here you should use a, z, A and Z instead of 'a', etc.
It is the target values, not the host ones you care about.
So e.g. in the unlikely case host is EBCDIC and target ASCII,
the above would still create weirdo ranges...

Otherwise LGTM.

Jakub


Final version, tested and pushed.

commit d5a8c1382718ae084d46ff9b8a26d6b1d0cb684c
Author: Andrew MacLeod 
Date:   Mon Jul 26 17:25:06 2021 -0400

Confirm and Handle only ASCII in toupper and tolower ranges.

PR tree-optimization/78888
* gimple-range-fold.cc (get_letter_range): New.
(fold_using_range::range_of_builtin_call): Call get_letter_range.

diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc
index 8465b4a82f6..410bc4ddca4 100644
--- a/gcc/gimple-range-fold.cc
+++ b/gcc/gimple-range-fold.cc
@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-ssa-loop.h"
 #include "tree-scalar-evolution.h"
+#include "langhooks.h"
 #include "vr-values.h"
 #include "range.h"
 #include "value-query.h"
@@ -835,6 +836,28 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
 r.set_varying (type);
 }
 
+// Return TRUE if we recognize the target character set and return the
+// range for lower case and upper case letters.
+
+static bool
+get_letter_range (tree type, irange &lowers, irange &uppers)
+{
+  // ASCII
+  int a = lang_hooks.to_target_charset ('a');
+  int z = lang_hooks.to_target_charset ('z');
+  int A = lang_hooks.to_target_charset ('A');
+  int Z = lang_hooks.to_target_charset ('Z');
+
+  if ((z - a == 25) && (Z - A == 25))
+{
+  lowers = int_range<2> (build_int_cst (type, a), build_int_cst (type, z));
+  uppers = int_range<2> (build_int_cst (type, A), build_int_cst (type, Z));
+  return true;
+}
+  // Unknown character set.
+  return false;
+}
+
 // For a builtin in CALL, return a range in R if known and return
 // TRUE.  Otherwise return FALSE.
 
@@ -873,13 +896,16 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call,
 	arg = gimple_call_arg (call, 0);
 	if (!src.get_operand (r, arg))
 	  return false;
+
+	int_range<3> lowers;
+	int_range<3> uppers;
+	if (!get_letter_range (type, lowers, uppers))
+	  return false;
+
 	// Return the range passed in without any lower case characters,
 	// but including all the upper case ones.
-	int_range<2> exclude (build_int_cst (type, 'a'),
-			  build_int_cst (type, 'z'), VR_ANTI_RANGE);
-	r.intersect (exclude);
-	int_range<2> uppers (build_int_cst (type, 'A'),
-			  build_int_cst (type, 'Z'));
+	lowers.invert ();
+	r.intersect (lowers);
 	r.union_ (uppers);
 	return true;
   }
@@ -889,13 +915,16 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call,
 	arg = gimple_call_arg (call, 0);
 	if (!src.get_operand (r, arg))
 	  return false;
+
+	int_range<3> lowers;
+	int_range<3> uppers;
+	if (!get_letter_range (type, lowers, uppers))
+	  return false;
+
 	// Return the range passed in without any upper case characters,
 	// but including all the lower case ones.
-	int_range<2> exclude (build_int_cst (type, 'A'),
-			  build_int_cst (type, 'Z'), VR_ANTI_RANGE);
-	r.intersect (exclude);
-	int_range<2> lowers (build_int_cst (type, 
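The commit diff is truncated above.  As a user-level illustration (my
example, not from the commit): on an ASCII target the recorded range
means __builtin_toupper can never yield a lower-case letter, so a
comparison like the following should now be foldable to false.

int
never_lower (int c)
{
  return __builtin_toupper (c) == 'a';
}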

Re: [PATCH v4] Add QI vector mode support to by-pieces for memset

2021-07-26 Thread H.J. Lu via Gcc-patches
On Mon, Jul 26, 2021 at 3:56 PM H.J. Lu  wrote:
>
> On Mon, Jul 26, 2021 at 2:53 PM Richard Sandiford
>  wrote:
> >
> > "H.J. Lu via Gcc-patches"  writes:
> > > On Mon, Jul 26, 2021 at 11:42 AM Richard Sandiford
> > >  wrote:
> > >>
> > >> "H.J. Lu via Gcc-patches"  writes:
> > >> > +to avoid stack realignment when expanding memset.  The default is
> > >> > +@code{gen_reg_rtx}.
> > >> > +@end deftypefn
> > >> > +
> > >> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
> > >> > @var{nunroll}, class loop *@var{loop})
> > >> >  This target hook returns a new value for the number of times 
> > >> > @var{loop}
> > >> >  should be unrolled. The parameter @var{nunroll} is the number of times
> > >> > […]
> > >> > @@ -1446,7 +1511,10 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> > >> >max_size = STORE_MAX_PIECES + 1;
> > >> >while (max_size > 1 && l > 0)
> > >> >   {
> > >> > -   scalar_int_mode mode = widest_int_mode_for_size (max_size);
> > >> > +   /* Since this can be called before virtual registers are ready
> > >> > +  to use, avoid QI vector mode here.  */
> > >> > +   fixed_size_mode mode
> > >> > + = widest_fixed_size_mode_for_size (max_size, false);
> > >>
> > >> I think I might have asked this before, sorry, but: when is that true
> > >> and why does it matter?
> > >
> > > can_store_by_pieces may be called:
> > >
> > > value-prof.c:  if (!can_store_by_pieces (val, builtin_memset_read_str,
> > > value-prof.c:  if (!can_store_by_pieces (val, builtin_memset_read_str,
> > >
> > > before virtual registers can be used.   When true is passed to
> > > widest_fixed_size_mode_for_size,  virtual registers may be used
> > > to expand memset to broadcast, which leads to ICE.   Since for the
> > > purpose of can_store_by_pieces, we don't need to expand memset
> > > to broadcast and pass false here can avoid ICE.
> >
> > Ah, I see, thanks.
> >
> > That sounds like a problem in the way that the memset const function is
> > written though.  can_store_by_pieces is just a query function, so I don't
> > think it should be trying to create new registers for can_store_by_pieces,
> > even if it could.  At the same time, can_store_by_pieces should make the
> > same choices as the real expander would.
> >
> > I think this means that:
> >
> > - gen_memset_broadcast should be inlined into its callers, with the
> >   builtin_memset_read_str getting the CONST_INT_P case and
> >   builtin_memset_gen_str getting the variable case.
> >
> > - builtin_memset_read_str should then stop at and return the
> >   gen_const_vec_duplicate when the prev argument is null.

This doesn't work since can_store_by_pieces has

 cst = (*constfun) (constfundata, nullptr, offset, mode);
  if (!targetm.legitimate_constant_p (mode, cst))

ix86_legitimate_constant_p only allows 0 or -1 for CONST_VECTOR.
can_store_by_pieces doesn't work well for vector modes.

> >   Only when prev is nonnull should it go on to call the hook
> >   and copy the constant to the register that the hook returns.
>
> How about keeping gen_memset_broadcast and passing PREV to it:
>
>   rtx target;
>   if (CONST_INT_P (data))
> {
>   rtx const_vec = gen_const_vec_duplicate (mode, data);
>   if (prev == NULL)
> /* Return CONST_VECTOR when called by a query function.  */
> target = const_vec;
>   else
> {
>   /* Use the move expander with CONST_VECTOR.  */
>   target = targetm.gen_memset_scratch_rtx (mode);
>   emit_move_insn (target, const_vec);
> }
> }
>   else
> {
>   target = targetm.gen_memset_scratch_rtx (mode);
>   class expand_operand ops[2];
>   create_output_operand ([0], target, mode);
>   create_input_operand ([1], data, QImode);
>   expand_insn (icode, 2, ops);
>   if (!rtx_equal_p (target, ops[0].value))
> emit_move_insn (target, ops[0].value);
> }
>
> > I admit that's uglier than the current version, but it looks like that's
> > what the current interface expects.
> >
> > Thanks,
> > Richard
>
>
>
> --
> H.J.



-- 
H.J.


Re: [PATCH v4] Add QI vector mode support to by-pieces for memset

2021-07-26 Thread H.J. Lu via Gcc-patches
On Mon, Jul 26, 2021 at 2:53 PM Richard Sandiford
 wrote:
>
> "H.J. Lu via Gcc-patches"  writes:
> > On Mon, Jul 26, 2021 at 11:42 AM Richard Sandiford
> >  wrote:
> >>
> >> "H.J. Lu via Gcc-patches"  writes:
> >> > +to avoid stack realignment when expanding memset.  The default is
> >> > +@code{gen_reg_rtx}.
> >> > +@end deftypefn
> >> > +
> >> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
> >> > @var{nunroll}, class loop *@var{loop})
> >> >  This target hook returns a new value for the number of times @var{loop}
> >> >  should be unrolled. The parameter @var{nunroll} is the number of times
> >> > […]
> >> > @@ -1446,7 +1511,10 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> >> >max_size = STORE_MAX_PIECES + 1;
> >> >while (max_size > 1 && l > 0)
> >> >   {
> >> > -   scalar_int_mode mode = widest_int_mode_for_size (max_size);
> >> > +   /* Since this can be called before virtual registers are ready
> >> > +  to use, avoid QI vector mode here.  */
> >> > +   fixed_size_mode mode
> >> > + = widest_fixed_size_mode_for_size (max_size, false);
> >>
> >> I think I might have asked this before, sorry, but: when is that true
> >> and why does it matter?
> >
> > can_store_by_pieces may be called:
> >
> > value-prof.c:  if (!can_store_by_pieces (val, builtin_memset_read_str,
> > value-prof.c:  if (!can_store_by_pieces (val, builtin_memset_read_str,
> >
> > before virtual registers can be used.   When true is passed to
> > widest_fixed_size_mode_for_size,  virtual registers may be used
> > to expand memset to broadcast, which leads to ICE.   Since for the
> > purpose of can_store_by_pieces, we don't need to expand memset
> > to broadcast and pass false here can avoid ICE.
>
> Ah, I see, thanks.
>
> That sounds like a problem in the way that the memset const function is
> written though.  can_store_by_pieces is just a query function, so I don't
> think it should be trying to create new registers for can_store_by_pieces,
> even if it could.  At the same time, can_store_by_pieces should make the
> same choices as the real expander would.
>
> I think this means that:
>
> - gen_memset_broadcast should be inlined into its callers, with the
>   builtin_memset_read_str getting the CONST_INT_P case and
>   builtin_memset_gen_str getting the variable case.
>
> - builtin_memset_read_str should then stop at and return the
>   gen_const_vec_duplicate when the prev argument is null.
>   Only when prev is nonnull should it go on to call the hook
>   and copy the constant to the register that the hook returns.

How about keeping gen_memset_broadcast and passing PREV to it:

  rtx target;
  if (CONST_INT_P (data))
    {
      rtx const_vec = gen_const_vec_duplicate (mode, data);
      if (prev == NULL)
        /* Return CONST_VECTOR when called by a query function.  */
        target = const_vec;
      else
        {
          /* Use the move expander with CONST_VECTOR.  */
          target = targetm.gen_memset_scratch_rtx (mode);
          emit_move_insn (target, const_vec);
        }
    }
  else
    {
      target = targetm.gen_memset_scratch_rtx (mode);
      class expand_operand ops[2];
      create_output_operand (&ops[0], target, mode);
      create_input_operand (&ops[1], data, QImode);
      expand_insn (icode, 2, ops);
      if (!rtx_equal_p (target, ops[0].value))
        emit_move_insn (target, ops[0].value);
    }

> I admit that's uglier than the current version, but it looks like that's
> what the current interface expects.
>
> Thanks,
> Richard



-- 
H.J.


Re: 0001-Don-t-skip-prologue-instructions-as-it-could-affect-.patch

2021-07-26 Thread Segher Boessenkool
On Fri, Jul 23, 2021 at 10:27:37AM -0600, Jeff Law wrote:
> On 7/22/2021 7:04 AM, Richard Biener via Gcc-patches wrote:
> >On Thu, Jul 22, 2021 at 9:02 AM Bin.Cheng via Gcc-patches
> > wrote:
> >>Gentle ping.  Any suggestions would be appreciated.
> >So just to say something - does the existing code mean that any use of
> >the alias info on prologue/epilogue insns is wrong?  We have
> >
> >   /* The prologue/epilogue insns are not threaded onto the
> >  insn chain until after reload has completed.  Thus,
> >  there is no sense wasting time checking if INSN is in
> >  the prologue/epilogue until after reload has completed.  */
> >   bool could_be_prologue_epilogue = ((targetm.have_prologue ()
> >   || targetm.have_epilogue ())
> >  && reload_completed);
> >
> >so when !could_be_prologue_epilogue then passes shouldn't run into
> >them if the comment is correct.  But else even epilogue stmts could appear
> >anywhere (like scheduled around)?  So why's skipping those OK?
> These insns don't exist until after reload has completed.  I think this 
> code is just trying to be more compile-time efficient and not look for 
> them when they're known to not exist.

Yeah.  Unfortunately it isn't trivial to see if this is a premature
optimisation, or if this is needed for correctness instead.  But it is
stage 1 still :-)

> >Are passes supposed to check whether they are dealing with pro/epilogue
> >insns and not touch them?  CCing people that might know.
> Generally most passes can treat them as any other RTL.

Yup, if the *logues are just RTL, that RTL follows just the normal
semantics of RTL.  This means that dead code can be deleted, etc.  Well
that is about the most interesting transform that can be done so late
in the compilation pipeline :-)


Segher


Re: [PATCH] PR fortran/101564 - ICE in resolve_allocate_deallocate, at fortran/resolve.c:8169

2021-07-26 Thread Harald Anlauf via Gcc-patches
Hi Tobias,

> > This works as expected with Intel and AOCC, but gives a
> > syntax error with every gfortran tested because of match.c:
> >
> > alloc_opt_list:
>    m = gfc_match (" stat = %v", &tmp);
> 
> I think we can simply change that one to %e; the definable
> check should ensure that any non variable (in the Fortran sense)
> is rejected.
> 
> And we should update the comment for %v / match_variable to state
> that it does not include function references.

I've updated this for ALLOCATE/DEALLOCATE and STAT/ERRMSG, see
attached patch.  This required updating the error messages of
two existing files in the testsuite.

> Also affected: Some I/O items, a bunch of other stat=%v and
> errmsg=%v.

We should rather open a separate PR on auditing the related uses
of gfc_match.

> Talking about errmsg: In the same function, the same check is
> done for errmsg as for stat – hence, the patch should update
> also errmsg.

Done.

> >> Additionally, I have to admit that I do not understand the
> >> following existing condition, which you did not touch:
> >>
> >> if ((stat->ts.type != BT_INTEGER
> >>  && !(stat->ref && (stat->ref->type == REF_ARRAY
> >> || stat->ref->type == REF_COMPONENT)))
> >> || stat->rank > 0)
> >>   gfc_error ("Stat-variable at %L must be a scalar INTEGER "
> >>  "variable", &stat->where);
> >>
> >> I mean the ts.type != BT_INTEGER and stat->rank != 0 is clear,
> >> but what's the reason for the refs?
> > Well, that needs to be answered by Steve (see commit 3759634).
> 
> (https://gcc.gnu.org/g:3759634f3208cbc1226bec19d22cbff989a287c3 (svn
> r145331))
> 
> The reason for the ref checks is unclear and seems to be wrong.  The added
> testcases also only use 'x' (real) and n or i (integer) as input, i.e.
> they do not exercise this.  I did not look up the patch email for the reasoning.

Well, there is some text in the standard that I added in front of
the for loops, and this code is now exercised in the new testcase.

Regtested on x86_64-pc-linux-gnu.  OK?

Thanks,
Harald

Fortran: ICE in resolve_allocate_deallocate for invalid STAT argument

gcc/fortran/ChangeLog:

PR fortran/101564
* match.c (gfc_match): Fix comment for %v code.
(gfc_match_allocate, gfc_match_deallocate): Replace use of %v code
by %e in gfc_match to allow for function references as STAT and
ERRMSG arguments.
* resolve.c (resolve_allocate_deallocate): Avoid NULL pointer
dereferences and shortcut for bad STAT and ERRMSG argument to
(DE)ALLOCATE.

gcc/testsuite/ChangeLog:

PR fortran/101564
* gfortran.dg/allocate_stat_3.f90: New test.
* gfortran.dg/allocate_stat.f90: Adjust error messages.
* gfortran.dg/implicit_11.f90: Adjust error messages.

diff --git a/gcc/fortran/match.c b/gcc/fortran/match.c
index d148de3e3b5..b1105481099 100644
--- a/gcc/fortran/match.c
+++ b/gcc/fortran/match.c
@@ -1109,7 +1109,8 @@ gfc_match_char (char c)
%t  Matches end of statement.
%o  Matches an intrinsic operator, returned as an INTRINSIC enum.
%l  Matches a statement label
-   %v  Matches a variable expression (an lvalue)
+   %v  Matches a variable expression (an lvalue, except function references
+       having a data pointer result)
%   Matches a required space (in free form) and optional spaces.  */

 match
@@ -4405,7 +4406,7 @@ gfc_match_allocate (void)

 alloc_opt_list:

-  m = gfc_match (" stat = %v", &tmp);
+  m = gfc_match (" stat = %e", &tmp);
   if (m == MATCH_ERROR)
 	goto cleanup;
   if (m == MATCH_YES)
@@ -4434,7 +4435,7 @@ alloc_opt_list:
 	goto alloc_opt_list;
 	}

-  m = gfc_match (" errmsg = %v", &tmp);
+  m = gfc_match (" errmsg = %e", &tmp);
   if (m == MATCH_ERROR)
 	goto cleanup;
   if (m == MATCH_YES)
@@ -4777,7 +4778,7 @@ gfc_match_deallocate (void)

 dealloc_opt_list:

-  m = gfc_match (" stat = %v", &tmp);
+  m = gfc_match (" stat = %e", &tmp);
   if (m == MATCH_ERROR)
 	goto cleanup;
   if (m == MATCH_YES)
@@ -4799,7 +4800,7 @@ dealloc_opt_list:
 	goto dealloc_opt_list;
 	}

-  m = gfc_match (" errmsg = %v", &tmp);
+  m = gfc_match (" errmsg = %e", &tmp);
   if (m == MATCH_ERROR)
 	goto cleanup;
   if (m == MATCH_YES)
diff --git a/gcc/fortran/resolve.c b/gcc/fortran/resolve.c
index 45c3ad387ac..809a4ad86d1 100644
--- a/gcc/fortran/resolve.c
+++ b/gcc/fortran/resolve.c
@@ -8165,6 +8165,12 @@ resolve_allocate_deallocate (gfc_code *code, const char *fcn)
 	gfc_error ("Stat-variable at %L must be a scalar INTEGER "
 		   "variable", >where);

+  if (stat->expr_type == EXPR_CONSTANT || stat->symtree == NULL)
+	goto done_stat;
+
+  /* F2018:9.7.4: The stat-variable shall not be allocated or deallocated
+   * within the ALLOCATE or DEALLOCATE statement in which it appears ...
+   */
   for (p = code->ext.alloc.list; p; p = p->next)
 	if (p->expr->symtree->n.sym->name == 

Re: [PATCH v4] Add QI vector mode support to by-pieces for memset

2021-07-26 Thread Richard Sandiford via Gcc-patches
"H.J. Lu via Gcc-patches"  writes:
> On Mon, Jul 26, 2021 at 11:42 AM Richard Sandiford
>  wrote:
>>
>> "H.J. Lu via Gcc-patches"  writes:
>> > +to avoid stack realignment when expanding memset.  The default is
>> > +@code{gen_reg_rtx}.
>> > +@end deftypefn
>> > +
>> >  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
>> > @var{nunroll}, class loop *@var{loop})
>> >  This target hook returns a new value for the number of times @var{loop}
>> >  should be unrolled. The parameter @var{nunroll} is the number of times
>> > […]
>> > @@ -1446,7 +1511,10 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
>> >max_size = STORE_MAX_PIECES + 1;
>> >while (max_size > 1 && l > 0)
>> >   {
>> > -   scalar_int_mode mode = widest_int_mode_for_size (max_size);
>> > +   /* Since this can be called before virtual registers are ready
>> > +  to use, avoid QI vector mode here.  */
>> > +   fixed_size_mode mode
>> > + = widest_fixed_size_mode_for_size (max_size, false);
>>
>> I think I might have asked this before, sorry, but: when is that true
>> and why does it matter?
>
> can_store_by_pieces may be called:
>
> value-prof.c:  if (!can_store_by_pieces (val, builtin_memset_read_str,
> value-prof.c:  if (!can_store_by_pieces (val, builtin_memset_read_str,
>
> before virtual registers can be used.   When true is passed to
> widest_fixed_size_mode_for_size,  virtual registers may be used
> to expand memset to broadcast, which leads to ICE.   Since for the
> purpose of can_store_by_pieces, we don't need to expand memset
> to broadcast and pass false here can avoid ICE.

Ah, I see, thanks.

That sounds like a problem in the way that the memset const function is
written though.  can_store_by_pieces is just a query function, so I don't
think it should be trying to create new registers for can_store_by_pieces,
even if it could.  At the same time, can_store_by_pieces should make the
same choices as the real expander would.

I think this means that:

- gen_memset_broadcast should be inlined into its callers, with the
  builtin_memset_read_str getting the CONST_INT_P case and
  builtin_memset_gen_str getting the variable case.

- builtin_memset_read_str should then stop at and return the
  gen_const_vec_duplicate when the prev argument is null.
  Only when prev is nonnull should it go on to call the hook
  and copy the constant to the register that the hook returns.

I admit that's uglier than the current version, but it looks like that's
what the current interface expects.

Thanks,
Richard


Re: [PATCH] [DWARF] Fix hierarchy of debug information for offload kernels.

2021-07-26 Thread Hafiz Abid Qadeer
On 22/07/2021 12:52, Richard Biener wrote:
> On Thu, Jul 22, 2021 at 1:48 PM Jakub Jelinek  wrote:
>>
>> On Thu, Jul 22, 2021 at 01:43:49PM +0200, Richard Biener wrote:
>>> So I think we need to get to an agreement between the debug info
>>> producer and consumer here.
>>> Usually the DWARF spec is not of much help here.
>>
>> It is something that needs to be discussed for DWARF 6, currently indeed can
>> be solved only with some DWARF extensions we'd need to invent.
> 
> I mean, the question is what should the concrete instance inherit from
> the abstract instance - IMHO parent-child relationship is one thing, no?

I guess the problem is that the pointer is one-sided, from concrete to
abstract.  With this change, one can go from the concrete child function to
the abstract child (and the abstract parent).  But it is not easy for the
consumer to find the concrete parent, as there is no link from abstract to
concrete.


Thanks,
-- 
Hafiz Abid Qadeer
Mentor, a Siemens Business


Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Kwok Cheung Yeung

On 26/07/2021 10:23 pm, Jakub Jelinek wrote:
> On Mon, Jul 26, 2021 at 10:19:35PM +0100, Kwok Cheung Yeung wrote:
>> In that comment, Deepak says:
>>
>> So, we decided to keep the target trait static, requiring that the declare
>> target directive must be explicit and that the function version must be
>> different from the version of the function that may be called outside of a
>> target region (with the additional clarification that whether it differs or
>> not will be implementation defined).
>>
>> "the function version must be different from the version of the function
>> that may be called outside of a target region": This is what we do not have
>> in GCC at the moment - the function versions called within and outside
>> target regions are the same on the host.
>>
>> "whether it differs or not will be implementation defined": So whether a
>> function with 'declare target' and a metadirective involving a 'target'
>> construct behaves the same or not when called from both inside and outside
>> of a target region is implementation defined?
>>
>> I will leave the treatment of target constructs in the selector as it is
>> then, with both calls going to the same function with the metadirective
>> resolving to the 'target' variant. I will try to address your other concerns
>> later.
>
> I think you're right, it should differ in the host vs. target version iff
> it is in explicit declare target block, my memory is weak, but let's implement
> the 5.0 wording for now (and ignore the 5.1 wording later on) and only when
> we'll be doing 5.2 change this (and change for both metadirective and
> declare variant at that point).
> Ok?

Okay, the rest of the metadirective spec is quite enough to be getting on
with for now. :-)


Thanks

Kwok


[ARM] PR66791: Replace builtins in vld1

2021-07-26 Thread Prathamesh Kulkarni via Gcc-patches
Hi,
Similar to aarch64, this patch replaces the call to the builtin by
dereferencing __a in vld1_p64, vld1_s64 and vld1_u64.

The patch changes code-gen for the intrinsic as follows:
Before patch:
vld1.64 {d16}, [r0:64]
vmov    r0, r1, d16 @ int
bx  lr

After patch:
ldrd    r0, [r0]
bx  lr

I assume the code-gen after the patch is correct, since it loads two
consecutive words from [r0] into r0 and r1?
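
For reference, a minimal hypothetical caller that produces code-gen of
this shape:

#include <arm_neon.h>

int64x1_t
foo (const int64_t *p)
{
  return vld1_s64 (p);
}

Under the 32-bit AAPCS a 64-bit result is returned in the register pair
r0/r1, so a single ldrd from [r0] does fill both halves of the result.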

Bootstrapped+tested on arm-linux-gnueabihf.
OK to commit?

Thanks,
Prathamesh
2021-07-27  Prathamesh Kulkarni  

PR target/66791
* config/arm/arm_neon.h (vld1_p64): Replace call to builtin by
explicitly dereferencing __a.
(vld1_s64): Likewise.
(vld1_u64): Likewise.

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 41b596b5fc6..5a91d15bf75 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -10301,7 +10301,7 @@ __extension__ extern __inline poly64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_p64 (const poly64_t * __a)
 {
-  return (poly64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+  return (poly64x1_t) { *__a };
 }
 
 #pragma GCC pop_options
@@ -10330,7 +10330,7 @@ __extension__ extern __inline int64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_s64 (const int64_t * __a)
 {
-  return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+  return (int64x1_t) { *__a };
 }
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -10374,7 +10374,7 @@ __extension__ extern __inline uint64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vld1_u64 (const uint64_t * __a)
 {
-  return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+  return (uint64x1_t) { *__a };
 }
 
 __extension__ extern __inline poly8x8_t


Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Jakub Jelinek via Gcc-patches
On Mon, Jul 26, 2021 at 10:19:35PM +0100, Kwok Cheung Yeung wrote:
> > Yes, that is a target variant, but I'm pretty sure we've decided that
> > the target construct added for declare target is actually not a dynamic
> > property.  So basically mostly return to the 5.0 wording with clarifications
> > for Fortran.  See
> > https://github.com/OpenMP/spec/issues/2612#issuecomment-849742988
> > for details.
> > Making the target in construct dynamic would pretty much force all the
> > scoring to be dynamic as well.
> 
> In that comment, Deepak says:
> 
> So, we decided to keep the target trait static, requiring that the declare
> target directive must be explicit and that the function version must be
> different from the version of the function that may be called outside of a
> target region (with the additional clarification that whether it differs or
> not will be implementation defined).
> 
> "the function version must be different from the version of the function
> that may be called outside of a target region": This is what we do not have
> in GCC at the moment - the function versions called within and outside
> target regions are the same on the host.
> 
> "whether it differs or not will be implementation defined": So whether a
> function with 'declare target' and a metadirective involving a 'target'
> construct behaves the same or not when called from both inside and outside
> of a target region is implementation defined?
> 
> I will leave the treatment of target constructs in the selector as it is
> then, with both calls going to the same function with the metadirective
> resolving to the 'target' variant. I will try to address your other concerns
> later.

I think you're right, it should differ in the host vs. target version iff
it is in explicit declare target block, my memory is weak, but let's implement
the 5.0 wording for now (and ignore the 5.1 wording later on) and only when
we'll be doing 5.2 change this (and change for both metadirective and
declare variant at that point).
Ok?

Jakub



Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Kwok Cheung Yeung

Hello

On 26/07/2021 8:56 pm, Jakub Jelinek wrote:
> On Mon, Jul 26, 2021 at 08:28:16PM +0100, Kwok Cheung Yeung wrote:
>> In Section 1.2.2 of the OpenMP TR10 spec, 'target variant' is defined as:
>>
>> A version of a device routine that can only be executed as part of a
>> target region.
>
> Yes, that is a target variant, but I'm pretty sure we've decided that
> the target construct added for declare target is actually not a dynamic
> property.  So basically mostly return to the 5.0 wording with clarifications
> for Fortran.  See
> https://github.com/OpenMP/spec/issues/2612#issuecomment-849742988
> for details.
> Making the target in construct dynamic would pretty much force all the
> scoring to be dynamic as well.

In that comment, Deepak says:

So, we decided to keep the target trait static, requiring that the declare
target directive must be explicit and that the function version must be
different from the version of the function that may be called outside of a
target region (with the additional clarification that whether it differs or
not will be implementation defined).

"the function version must be different from the version of the function
that may be called outside of a target region": This is what we do not have
in GCC at the moment - the function versions called within and outside
target regions are the same on the host.

"whether it differs or not will be implementation defined": So whether a
function with 'declare target' and a metadirective involving a 'target'
construct behaves the same or not when called from both inside and outside
of a target region is implementation defined?

I will leave the treatment of target constructs in the selector as it is
then, with both calls going to the same function with the metadirective
resolving to the 'target' variant.  I will try to address your other
concerns later.


Thanks

Kwok


Re: [PATCH v4] Add QI vector mode support to by-pieces for memset

2021-07-26 Thread H.J. Lu via Gcc-patches
On Mon, Jul 26, 2021 at 11:42 AM Richard Sandiford
 wrote:
>
> "H.J. Lu via Gcc-patches"  writes:
> > +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > +   bytes from constant string DATA + OFFSET and return it as target
> > +   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +   previous iteration.  */
> > +
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > +  HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > +  fixed_size_mode mode)
> > +{
> >const char *c = (const char *) data;
> > -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > +  unsigned int size = GET_MODE_SIZE (mode);
> >
> > -  memset (p, *c, GET_MODE_SIZE (mode));
> > +  rtx target = gen_memset_value_from_prev ((by_pieces_prev *) prev,
> > +mode);
> > +  if (target != nullptr)
> > +return target;
> > +  rtx src = gen_int_mode (*c, QImode);
> > +  target = gen_memset_broadcast (src, mode);
> > +  if (target != nullptr)
> > +return target;
> >
> > -  return c_readstr (p, mode);
> > +  char *p = XALLOCAVEC (char, size);
> > +
> > +  memset (p, *c, size);
> > +
> > +  /* Vector mode should be handled by gen_memset_broadcast above.  */
>
> Nit: s/Vector mode should/Vector modes should/

Fixed.

> > +  return c_readstr (p, as_a <scalar_int_mode> (mode));
> >  }
> >
> >  /* Callback routine for store_by_pieces.  Return the RTL of a register
> > @@ -6788,33 +6910,29 @@ builtin_memset_read_str (void *data, void *prevp,
> > nullptr, it has the RTL info from the previous iteration.  */
> >
> >  static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> >   HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > - scalar_int_mode mode)
> > + fixed_size_mode mode)
> >  {
> >rtx target, coeff;
> >size_t size;
> >char *p;
> >
> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > -  if (prev != nullptr && prev->data != nullptr)
> > -{
> > -  /* Use the previous data in the same mode.  */
> > -  if (prev->mode == mode)
> > - return prev->data;
> > -
> > -  target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > -  if (target != nullptr)
> > - return target;
> > -}
> > -
> >size = GET_MODE_SIZE (mode);
> >if (size == 1)
> >  return (rtx) data;
> >
> > +  target = gen_memset_value_from_prev ((by_pieces_prev *) prev, mode);
> > +  if (target != nullptr)
> > +return target;
> > +
> > +  target = gen_memset_broadcast ((rtx) data, mode);
> > +  if (target != nullptr)
> > +return target;
> > +
> >p = XALLOCAVEC (char, size);
> >memset (p, 1, size);
> > -  coeff = c_readstr (p, mode);
> > +  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
>
> The same comment would be useful here.

I added the same comment here.

> >
> >target = convert_to_mode (mode, (rtx) data, 1);
> >target = expand_mult (mode, target, coeff, NULL_RTX, 1);
> > @@ -6918,7 +7036,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, 
> > unsigned int ctz_len,
> >   , align, true))
> >  return false;
> >
> > -  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
> > +  by_pieces_constfn constfun;
> >void *constfundata;
> >if (val)
> >  {
> > diff --git a/gcc/builtins.h b/gcc/builtins.h
> > index a64ece3f1cd..4b2ad766c61 100644
> > --- a/gcc/builtins.h
> > +++ b/gcc/builtins.h
> > @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum 
> > built_in_function fn);
> >  extern tree mathfn_built_in (tree, combined_fn);
> >  extern tree mathfn_built_in_type (combined_fn);
> >  extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
> > -  scalar_int_mode);
> > +  fixed_size_mode);
> >  extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
> > - scalar_int_mode);
> > + fixed_size_mode);
> >  extern rtx expand_builtin_saveregs (void);
> >  extern tree std_build_builtin_va_list (void);
> >  extern tree std_fn_abi_va_list (tree);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index c8f4abe3e41..14d387750a8 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12149,6 +12149,13 @@ This function prepares to emit a conditional 
> > comparison within a sequence
> >   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the 
> > compares.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode 
> > @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
>
> s/for scratch register/for a scratch register/

Fixed.

> > +be used by memset broadcast.  The backend can use a hard scratch register
>
> How about: s/be used by memset broadcast/be used when expanding memset calls/
> 

Re: Repost #2: [PATCH] PR 100170: Fix eq/ne tests on power10.

2021-07-26 Thread Segher Boessenkool
On Mon, Jul 26, 2021 at 04:46:46PM -0400, Michael Meissner wrote:
> On Wed, Jul 14, 2021 at 04:22:06PM -0500, Segher Boessenkool wrote:
> > > --- a/gcc/testsuite/gcc.target/powerpc/ppc-ne0-1.c
> > > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-ne0-1.c
> > 
> > > -/* { dg-final { scan-assembler-times "addic" 4 } } */
> > > -/* { dg-final { scan-assembler-times "subfe" 1 } } */
> > > -/* { dg-final { scan-assembler-times "addze" 3 } } */
> > > +/* { dg-final { scan-assembler-times {\maddic\M}  4 { target { ! 
> > > has_arch_pwr10 } } } } */
> > > +/* { dg-final { scan-assembler-times {\msubfe\M}  1 { target { ! 
> > > has_arch_pwr10 } } } } */
> > > +/* { dg-final { scan-assembler-times {\maddic\M}  3 { target {   
> > > has_arch_pwr10 } } } } */
> > > +/* { dg-final { scan-assembler-not   {\msubfe\M}{ target {   
> > > has_arch_pwr10 } } } } */
> > > +/* { dg-final { scan-assembler-times {\msetbcr\M} 1 { target {   
> > > has_arch_pwr10 } } } } */
> > > +/* { dg-final { scan-assembler-times {\maddze\M}  3 } } */
> > 
> > It may be easier to split the patch into two, where one part can get the
> > setbcr (the first, simplest function), and the rest stays the same.
> 
> I really don't understand this comment.  I don't see how you could split the
> patch in two, as the function that generates the setbcr (ne0) for power10 
> would
> generate addic/subfe instead of the setbcr on earlier power systems.  Those
> instruction counts have to be changed for the other functions.  So it doesn't
> make sense to split the patch to me.

I'm sorry.  I meant split the *testcase* into two :-)  One with the
first test, the other with the rest.


Segher


Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Jakub Jelinek via Gcc-patches
On Mon, Jul 26, 2021 at 04:57:14PM -0400, Andrew MacLeod wrote:
> Handle ASCII and EBCDIC in toupper and tolower ranges.
> 
> gcc/
> PR tree-optimization/78888
> * gimple-range-fold.cc (get_letter_range): New.
> (fold_using_range::range_of_builtin_call): Call get_letter_range.
> 
> gcc/testsuite/
> * gcc.dg/pr78888.c: Add extra non-standard verifications.
> 
> diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc
> index 8465b4a82f6..a952f693c77 100644
> --- a/gcc/gimple-range-fold.cc
> +++ b/gcc/gimple-range-fold.cc
> @@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-ssa-loop.h"
>  #include "tree-scalar-evolution.h"
> +#include "langhooks.h"
>  #include "vr-values.h"
>  #include "range.h"
>  #include "value-query.h"
> @@ -835,6 +836,30 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
>  r.set_varying (type);
>  }
>  
> +// Return TRUE if we recognize the target character set and return the
> +// range for lower case and upper case letters.
> +
> +static bool
> +get_letter_range (tree type, irange &lowers, irange &uppers)
> +{
> +  // ASCII
> +  int a = lang_hooks.to_target_charset ('a');
> +  int z = lang_hooks.to_target_charset ('z');
> +  int A = lang_hooks.to_target_charset ('A');
> +  int Z = lang_hooks.to_target_charset ('Z');
> +
> +  if ((z - a == 25) && (Z - A == 25))

I would leave the extraneous ()s around both comparisons.

> +{
> +  lowers = int_range<2> (build_int_cst (type, 'a'),
> +   build_int_cst (type, 'z'));
> +  uppers = int_range<2> (build_int_cst (type, 'A'),
> +  build_int_cst (type, 'Z'));

Here you should use a, z, A and Z instead of 'a', etc.
It is the target values, not the host ones you care about.
So e.g. in the unlikely case host is EBCDIC and target ASCII,
the above would still create weirdo ranges...

Otherwise LGTM.

Jakub



Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Andrew MacLeod via Gcc-patches

On 7/26/21 4:36 PM, Jakub Jelinek wrote:
> On Mon, Jul 26, 2021 at 10:33:51PM +0200, Jakub Jelinek via Gcc-patches wrote:
>>> @@ -835,6 +836,43 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
>>>   r.set_varying (type);
>>>   }
>>>
>>> +// Return TRUE if we recognize the target character set and return the
>>> +// range for lower case and upper case letters.
>>> +
>>> +static bool
>>> +get_letter_range (tree type, irange &lowers, irange &uppers)
>>> +{
>>> +  // ASCII
>>> +  if (lang_hooks.to_target_charset (' ') == 0x20)
>>> +{
>>> +  lowers = int_range<2> (build_int_cst (type, 'a'),
>>> + build_int_cst (type, 'z'));
>>> +  uppers = int_range<2> (build_int_cst (type, 'A'),
>>> +build_int_cst (type, 'Z'));
>>
>> Wouldn't it be safer to lang_hooks.to_target_charset ('a')
>> (and 'z') and just verify that their difference is that of 'z' - 'a'
>
> David is right, that their difference is 25.  I think we don't support
> EBCDIC on the host, but what if.
>
> Jakub

Fine by me.  This is running thru testing now :-)

This seems good yes?

Andrew



commit f13a661874b772e473ee78fe8133899db51ea642
Author: Andrew MacLeod 
Date:   Mon Jul 26 15:38:42 2021 -0400

Handle ASCII and EBCDIC in toupper and tolower ranges.

gcc/
PR tree-optimization/78888
* gimple-range-fold.cc (get_letter_range): New.
(fold_using_range::range_of_builtin_call): Call get_letter_range.

gcc/testsuite/
* gcc.dg/pr78888.c: Add extra non-standard verifications.

diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc
index 8465b4a82f6..a952f693c77 100644
--- a/gcc/gimple-range-fold.cc
+++ b/gcc/gimple-range-fold.cc
@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-ssa-loop.h"
 #include "tree-scalar-evolution.h"
+#include "langhooks.h"
 #include "vr-values.h"
 #include "range.h"
 #include "value-query.h"
@@ -835,6 +836,30 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
 r.set_varying (type);
 }
 
+// Return TRUE if we recognize the target character set and return the
+// range for lower case and upper case letters.
+
+static bool
+get_letter_range (tree type, irange &lowers, irange &uppers)
+{
+  // ASCII
+  int a = lang_hooks.to_target_charset ('a');
+  int z = lang_hooks.to_target_charset ('z');
+  int A = lang_hooks.to_target_charset ('A');
+  int Z = lang_hooks.to_target_charset ('Z');
+
+  if ((z - a == 25) && (Z - A == 25))
+{
+  lowers = int_range<2> (build_int_cst (type, 'a'),
+			  build_int_cst (type, 'z'));
+  uppers = int_range<2> (build_int_cst (type, 'A'),
+			 build_int_cst (type, 'Z'));
+  return true;
+}
+  // Unknown character set.
+  return false;
+}
+
 // For a builtin in CALL, return a range in R if known and return
 // TRUE.  Otherwise return FALSE.
 
@@ -873,13 +898,16 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call,
 	arg = gimple_call_arg (call, 0);
 	if (!src.get_operand (r, arg))
 	  return false;
+
+	int_range<3> lowers;
+	int_range<3> uppers;
+	if (!get_letter_range (type, lowers, uppers))
+	  return false;
+
 	// Return the range passed in without any lower case characters,
 	// but including all the upper case ones.
-	int_range<2> exclude (build_int_cst (type, 'a'),
-			  build_int_cst (type, 'z'), VR_ANTI_RANGE);
-	r.intersect (exclude);
-	int_range<2> uppers (build_int_cst (type, 'A'),
-			  build_int_cst (type, 'Z'));
+	lowers.invert ();
+	r.intersect (lowers);
 	r.union_ (uppers);
 	return true;
   }
@@ -889,13 +917,16 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call,
 	arg = gimple_call_arg (call, 0);
 	if (!src.get_operand (r, arg))
 	  return false;
+
+	int_range<3> lowers;
+	int_range<3> uppers;
+	if (!get_letter_range (type, lowers, uppers))
+	  return false;
+
 	// Return the range passed in without any upper case characters,
 	// but including all the lower case ones.
-	int_range<2> exclude (build_int_cst (type, 'A'),
-			  build_int_cst (type, 'Z'), VR_ANTI_RANGE);
-	r.intersect (exclude);
-	int_range<2> lowers (build_int_cst (type, 'a'),
-			  build_int_cst (type, 'z'));
+	uppers.invert ();
+	r.intersect (uppers);
 	r.union_ (lowers);
 	return true;
   }


Re: Repost #2: [PATCH] PR 100170: Fix eq/ne tests on power10.

2021-07-26 Thread Michael Meissner via Gcc-patches
On Wed, Jul 14, 2021 at 04:22:06PM -0500, Segher Boessenkool wrote:
> > --- a/gcc/testsuite/gcc.target/powerpc/ppc-ne0-1.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/ppc-ne0-1.c
> 
> > -/* { dg-final { scan-assembler-times "addic" 4 } } */
> > -/* { dg-final { scan-assembler-times "subfe" 1 } } */
> > -/* { dg-final { scan-assembler-times "addze" 3 } } */
> > +/* { dg-final { scan-assembler-times {\maddic\M}  4 { target { ! 
> > has_arch_pwr10 } } } } */
> > +/* { dg-final { scan-assembler-times {\msubfe\M}  1 { target { ! 
> > has_arch_pwr10 } } } } */
> > +/* { dg-final { scan-assembler-times {\maddic\M}  3 { target {   
> > has_arch_pwr10 } } } } */
> > +/* { dg-final { scan-assembler-not   {\msubfe\M}{ target {   
> > has_arch_pwr10 } } } } */
> > +/* { dg-final { scan-assembler-times {\msetbcr\M} 1 { target {   
> > has_arch_pwr10 } } } } */
> > +/* { dg-final { scan-assembler-times {\maddze\M}  3 } } */
> 
> It may be easier to split the patch into two, where one part can get the
> setbcr (the first, simplest function), and the rest stays the same.

I really don't understand this comment.  I don't see how you could split the
patch in two, as the function that generates the setbcr (ne0) for power10 would
generate addic/subfe instead of the setbcr on earlier power systems.  Those
instruction counts have to be changed for the other functions.  So it doesn't
make sense to split the patch to me.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.ibm.com, phone: +1 (978) 899-4797


Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Jakub Jelinek via Gcc-patches
On Mon, Jul 26, 2021 at 10:33:51PM +0200, Jakub Jelinek via Gcc-patches wrote:
> > @@ -835,6 +836,43 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
> >  r.set_varying (type);
> >  }
> >  
> > +// Return TRUE if we recognize the target character set and return the
> > +// range for lower case and upper case letters.
> > +
> > +static bool
> > +get_letter_range (tree type, irange &lowers, irange &uppers)
> > +{
> > +  // ASCII
> > +  if (lang_hooks.to_target_charset (' ') == 0x20)
> > +{
> > +  lowers = int_range<2> (build_int_cst (type, 'a'),
> > + build_int_cst (type, 'z'));
> > +  uppers = int_range<2> (build_int_cst (type, 'A'),
> > +build_int_cst (type, 'Z'));
> 
> Wouldn't it be safer to lang_hooks.to_target_charset ('a')
> (and 'z') and just verify that their difference is that of 'z' - 'a'

David is right, that their difference is 25.  I think we don't support
EBCDIC on the host, but what if.

Jakub



Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Jakub Jelinek via Gcc-patches
On Mon, Jul 26, 2021 at 03:44:28PM -0400, Andrew MacLeod via Gcc-patches wrote:
> I'm testing this...   Think this'll work better?  We don't support any
> character sets where a space is 32 but it isn't ASCII, and 64 isn't EBCDIC?
> 
> This should handle those 2 sets, and if it's something else, it bails.
> 
> This seems OK?  I'm testing it now on x86-64... but don't have an EBCDIC
> target...

> --- a/gcc/gimple-range-fold.cc
> +++ b/gcc/gimple-range-fold.cc
> @@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-ssa-loop.h"
>  #include "tree-scalar-evolution.h"
> +#include "langhooks.h"
>  #include "vr-values.h"
>  #include "range.h"
>  #include "value-query.h"
> @@ -835,6 +836,43 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
>  r.set_varying (type);
>  }
>  
> +// Return TRUE if we recognize the target character set and return the
> +// range for lower case and upper case letters.
> +
> +static bool
> +get_letter_range (tree type, irange &lowers, irange &uppers)
> +{
> +  // ASCII
> +  if (lang_hooks.to_target_charset (' ') == 0x20)
> +{
> +  lowers = int_range<2> (build_int_cst (type, 'a'),
> +   build_int_cst (type, 'z'));
> +  uppers = int_range<2> (build_int_cst (type, 'A'),
> +  build_int_cst (type, 'Z'));

Wouldn't it be safer to lang_hooks.to_target_charset ('a')
(and 'z') and just verify that their difference is that of 'z' - 'a'
and use that range and IMHO just don't bother with EBCDIC or other charsets?

Jakub



Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread David Malcolm via Gcc-patches
On Mon, 2021-07-26 at 15:44 -0400, Andrew MacLeod wrote:
> I'm testing this...   Think this'll work better?  We don't support any
> character sets where a space is 32 but it isn't ASCII, and 64 isn't
> EBCDIC?

Not sure; libcpp has some code for EBCDIC support, but it seems to be
for the host, not the target.

Maybe test for the target's ('z' - 'a') and ('Z' - 'A') being 25, which
would imply that there are no gaps; don't optimize if that's not the
case???

Sorry for opening this can of worms (I think I'm still in recovery from
Python 2's "interesting" handling of string encoding).

Dave

> 
> This should handle those 2 sets, and if it's something else, it bails.
> 
> This seems OK?  I'm testing it now on x86-64... but don't have an EBCDIC
> target...
> 
> Andrew
> 
> 
> On 7/26/21 2:39 PM, David Malcolm wrote:
> > On Mon, 2021-07-26 at 14:21 -0400, Andrew MacLeod via Gcc-patches
> > wrote:
> > > Remove lower case characters from the range of to_upper and
> > > likewise, upper case characters from to_lower.
> > > 
> > > I looked at also only adding the upper case characters for which
> > > there is a lower_case character in the range, but it seemed of
> > > limited use given some odd usage patterns we emit.  Instead, in
> > > fold_using_range::range_of_builtin_call I simply took the incoming
> > > range, removed the "from" character set, and added the "to"
> > > character set.  This'll preserve any odd things that are passed
> > > into it while providing the basic functionality.
> > > 
> > > Easy enough for someone to enhance if they feel so inclined.
> > > 
> > > Bootstrapped on  x86_64-pc-linux-gnu  with no regressions.
> > > Pushed.
> > Awkward question: does this work with character sets where there
> > are
> > non-letter characters between 'a' and 'z' and between 'A' and 'Z'
> > (e.g.
> > EBCDIC [1])?
> > 
> > For example toupper('~') should return '~', but '~' is between 'a'
> > and
> > 'z' in EBCDIC; likewise tolower('}') should return '}', but '}' is
> > between 'A' and 'Z' in EBCDIC.
> > 
> > Dave
> > 
> > [1] https://en.wikipedia.org/wiki/EBCDIC
> > 
> 




Re: [PATCH 3/3] [PR libfortran/101305] Fix ISO_Fortran_binding.h paths in gfortran testsuite

2021-07-26 Thread Sandra Loosemore

On 7/26/21 3:45 AM, Tobias Burnus wrote:


[snip]

> I did say that it mostly works because of:
>
> $ find x86_64-pc-linux-gnu/ -name ISO_Fortran_binding.h
> x86_64-pc-linux-gnu/libgfortran/ISO_Fortran_binding.h
> x86_64-pc-linux-gnu/32/libgfortran/ISO_Fortran_binding.h
>
> And when looking at the -B lines, I see for the '' alias '-m64' run:
> -B.../build/gcc/testsuite/gfortran/../../
> -B.../build/x86_64-pc-linux-gnu/./libgfortran/
> -B.../build/x86_64-pc-linux-gnu/./libgfortran/.libs
> -B.../build/x86_64-pc-linux-gnu/./libquadmath/.libs
>
> which is fine (second line ensures the ISO*.h file is found.)
>
> But for -m32, I see:
>
> -B.../build/gcc/testsuite/gfortran/../../
> -B.../build/x86_64-pc-linux-gnu/./libgfortran/
> -B.../build/x86_64-pc-linux-gnu/32/libgfortran/.libs
> -B.../build/x86_64-pc-linux-gnu/32/libquadmath/.libs
>
> That also works, but it again uses the same directory for ISO*.h, so
> that the -m64 header file is used instead of the -m32 one.

I did some more experiments and I see that too.  :-S  It's finding a .h
file, but not the right one.  :-(

> PS: Still, it would be nice if the proper multi-lib ISO*.h could be found;
> while it usually does not matter, it could do so in some cases.

I think I ought to fix this now instead of just sweeping it under the
rug.  The suggestion you made previously to add

# Flags for finding libgfortran ISO*.h files.
if [info exists TOOL_OPTIONS] {
   set specpath [get_multilibs ${TOOL_OPTIONS}]
} else {
   set specpath [get_multilibs]
}
set options "-I $specpath/libgfortran/"

to the .exp files looks consistent with what I see elsewhere for adding
things to the include path, so I will give it a try and see how it works.


-Sandra


Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Jakub Jelinek via Gcc-patches
On Mon, Jul 26, 2021 at 08:28:16PM +0100, Kwok Cheung Yeung wrote:
> In Section 1.2.2 of the OpenMP TR10 spec, 'target variant' is defined as:
> 
> A version of a device routine that can only be executed as part of a target 
> region.

Yes, that is a target variant, but I'm pretty sure we've decided that
the target construct added for declare target is actually not a dynamic
property.  So basically mostly return to the 5.0 wording with clarifications
for Fortran.  See
https://github.com/OpenMP/spec/issues/2612#issuecomment-849742988
for details.
Making the target in construct dynamic would pretty much force all the
scoring to be dynamic as well.

Jakub



Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Andrew MacLeod via Gcc-patches
I'm testing this...   Think this'll work better?  We don't support any
character sets where a space is 32 but it isn't ASCII, and 64 isn't EBCDIC?

This should handle those 2 sets, and if it's something else, it bails.

This seems OK?  I'm testing it now on x86-64... but don't have an EBCDIC
target...


Andrew


On 7/26/21 2:39 PM, David Malcolm wrote:

> On Mon, 2021-07-26 at 14:21 -0400, Andrew MacLeod via Gcc-patches
> wrote:
>> Remove lower case characters from the range of to_upper and likewise,
>> upper case characters from to_lower.
>>
>> I looked at also only adding the upper case characters for which there
>> is a lower_case character in the range, but it seemed of limited use
>> given some odd usage patterns we emit.  Instead, in
>> fold_using_range::range_of_builtin_call I simply took the incoming
>> range, removed the "from" character set, and added the "to" character
>> set.  This'll preserve any odd things that are passed into it while
>> providing the basic functionality.
>>
>> Easy enough for someone to enhance if they feel so inclined.
>>
>> Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed.
>
> Awkward question: does this work with character sets where there are
> non-letter characters between 'a' and 'z' and between 'A' and 'Z' (e.g.
> EBCDIC [1])?
>
> For example toupper('~') should return '~', but '~' is between 'a' and
> 'z' in EBCDIC; likewise tolower('}') should return '}', but '}' is
> between 'A' and 'Z' in EBCDIC.
>
> Dave
>
> [1] https://en.wikipedia.org/wiki/EBCDIC



commit bd885b3c27fae450e1b5a880ccd5dd5bd89722c1
Author: Andrew MacLeod 
Date:   Mon Jul 26 15:38:42 2021 -0400

Handle ASCII and EBCDIC in toupper and tolower ranges.

PR tree-optimization/78888
* gimple-range-fold.cc (get_letter_range): New.
(fold_using_range::range_of_builtin_call): Call get_letter_range.

diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc
index 8465b4a82f6..fa7adce250d 100644
--- a/gcc/gimple-range-fold.cc
+++ b/gcc/gimple-range-fold.cc
@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-ssa-loop.h"
 #include "tree-scalar-evolution.h"
+#include "langhooks.h"
 #include "vr-values.h"
 #include "range.h"
 #include "value-query.h"
@@ -835,6 +836,43 @@ fold_using_range::range_of_builtin_ubsan_call (irange &r, gcall *call,
 r.set_varying (type);
 }
 
+// Return TRUE if we recognize the target character set and return the
+// range for lower case and upper case letters.
+
+static bool
+get_letter_range (tree type, irange &lowers, irange &uppers)
+{
+  // ASCII
+  if (lang_hooks.to_target_charset (' ') == 0x20)
+{
+  lowers = int_range<2> (build_int_cst (type, 'a'),
+			  build_int_cst (type, 'z'));
+  uppers = int_range<2> (build_int_cst (type, 'A'),
+			 build_int_cst (type, 'Z'));
+  return true;
+}
+  // EBCDIC
+  else if (lang_hooks.to_target_charset (' ') == 0x40)
+{
+  lowers = int_range<2> (build_int_cst (type, 'a'),
+			 build_int_cst (type, 'i'));
+  lowers.union_ (int_range<2> (build_int_cst (type, 'j'),
+   build_int_cst (type, 'r')));
+  lowers.union_ (int_range<2> (build_int_cst (type, 's'),
+   build_int_cst (type, 'z')));
+  uppers = int_range<2> (build_int_cst (type, 'A'),
+			 build_int_cst (type, 'I'));
+  uppers.union_ (int_range<2> (build_int_cst (type, 'J'),
+   build_int_cst (type, 'R')));
+  uppers.union_ (int_range<2> (build_int_cst (type, 'S'),
+   build_int_cst (type, 'Z')));
+  return true;
+}
+  // Unknown character set.
+  return false;
+}
+
+
 // For a builtin in CALL, return a range in R if known and return
 // TRUE.  Otherwise return FALSE.
 
@@ -873,13 +911,16 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call,
 	arg = gimple_call_arg (call, 0);
 	if (!src.get_operand (r, arg))
 	  return false;
+
+	int_range<5> lowers;
+	int_range<5> uppers;
+	if (!get_letter_range (type, lowers, uppers))
+	  return false;
+
 	// Return the range passed in without any lower case characters,
 	// but including all the upper case ones.
-	int_range<2> exclude (build_int_cst (type, 'a'),
-			  build_int_cst (type, 'z'), VR_ANTI_RANGE);
-	r.intersect (exclude);
-	int_range<2> uppers (build_int_cst (type, 'A'),
-			  build_int_cst (type, 'Z'));
+	lowers.invert ();
+	r.intersect (lowers);
 	r.union_ (uppers);
 	return true;
   }
@@ -889,13 +930,16 @@ fold_using_range::range_of_builtin_call (irange &r, gcall *call,
 	arg = gimple_call_arg (call, 0);
 	if (!src.get_operand (r, arg))
 	  return false;
+
+	int_range<5> lowers;
+	int_range<5> uppers;
+	if (!get_letter_range (type, lowers, uppers))
+	  return false;
+
 	// Return the range passed in without any upper case characters,
 	// but including all the lower case ones.
-	int_range<2> exclude (build_int_cst (type, 'A'),
-			  build_int_cst (type, 'Z'), VR_ANTI_RANGE);
-	r.intersect (exclude);
-	int_range<2> lowers (build_int_cst (type, 'a'),
-			  build_int_cst (type, 

Re: [PATCH 4/7] ifcvt/optabs: Allow using a CC comparison for emit_conditional_move.

2021-07-26 Thread Richard Sandiford via Gcc-patches


> Before, we had a loop with two iterations that tried "emit_cmov (cond,
> op2, op3)", where op2 and op3 can be (or even were) swapped.  If this did
> not succeed we would reverse the condition as well as swap op2 and op3
> in-place and try again.  I found that a bit cumbersome and intended to
> make this explicit, but it's still kind of involved, particularly since
> cond may come in reversed and we additionally swap op2 and op3 and go all
> the way back again.
>
> I remember from the first iteration of these patches that this (double 
> revert) was needed for exactly one test case in the test suite on x86. 
> When re-running it today I could not reproduce it anymore, though.

Hmm, OK.  Doesn't expanding both versions up-front create the same kind of
problem that the patch is fixing, in that we expand (and therefore cost)
both the reversed and unreversed comparison?  Also…

Robin Dapp  writes:
> @@ -4782,17 +4784,30 @@ emit_conditional_move (rtx target, enum rtx_code 
> code, rtx op0, rtx op1,
>if (cmode == VOIDmode)
>  cmode = GET_MODE (op0);
>  
> -  enum rtx_code orig_code = code;
> +  /* If the first condition operand is constant and the second is not, swap
> + them.  In that case we also need to reverse the comparison.
> + If both are non-constant, it is possible that the conditional move
> + will not expand with operands in this order, e.g. when we are about to
> + create a min/max expression and the operands do not match the condition.
> + In that case we also need the reversed condition code and comparison.  
> */

…for min/max, I would have expected this swap to create the canonical
operand order for the min and max too.  What causes it to be rejected?

> +
> +  rtx rev_comparison = NULL_RTX;
>bool swapped = false;
> -  if (swap_commutative_operands_p (op2, op3)
> -  && ((reversed = reversed_comparison_code_parts (code, op0, op1, NULL))
> -  != UNKNOWN))
> +
> +  code = unsignedp ? unsigned_condition (code) : code;
> +  comparison = simplify_gen_relational (code, VOIDmode, cmode, op0, op1);
> +
> +  if ((reversed = reversed_comparison_code_parts (code, op0, op1, NULL))
> +  != UNKNOWN)
>  {
> -  std::swap (op2, op3);
> -  code = reversed;
> -  swapped = true;
> +  reversed = unsignedp ? unsigned_condition (reversed) : reversed;

When is this needed?  I'd have expected the reversed form of an unsigned
code to be naturally unsigned.

Thanks,
Richard

> +  rev_comparison = simplify_gen_relational (reversed, VOIDmode, cmode,
> + op0, op1);
>  }
>  
> +  if (swap_commutative_operands_p (op2, op3) && reversed != UNKNOWN)
> +swapped = true;
> +
>if (mode == VOIDmode)
>  mode = GET_MODE (op2);
>  
> @@ -4804,58 +4819,99 @@ emit_conditional_move (rtx target, enum rtx_code 
> code, rtx op0, rtx op1,
>if (!target)
>  target = gen_reg_rtx (mode);
>  
> -  for (int pass = 0; ; pass++)
> +  if (comparison && COMPARISON_P (comparison))
> +prepare_cmp_insn (XEXP (comparison, 0), XEXP (comparison, 1),
> +   GET_CODE (comparison), NULL_RTX,
> +   unsignedp, OPTAB_WIDEN, &comparison, &cmode);
> +  else
> +return NULL_RTX;
> +
> +  if (rev_comparison && COMPARISON_P (rev_comparison))
> +prepare_cmp_insn (XEXP (rev_comparison, 0), XEXP (rev_comparison, 1),
> +   GET_CODE (rev_comparison), NULL_RTX,
> +   unsignedp, OPTAB_WIDEN, &rev_comparison, &cmode);
> +
> +  if (!swapped)
> +return emit_conditional_move (target, comparison, rev_comparison,
> +   op2, op3, mode);
> +  else
> +return emit_conditional_move (target, rev_comparison, comparison,
> +   op3, op2, mode);
> +}
> +
> +/* Helper function for emitting a conditional move.  Given a COMPARISON
> +   and a reversed REV_COMPARISON it will try to expand a conditional move
> +   with COMPARISON first and try with REV_COMPARISON if that fails.  */
> +
> +rtx
> +emit_conditional_move (rtx target, rtx comparison, rtx rev_comparison,
> +rtx op2, rtx op3, machine_mode mode)
> +{
> +
> +  rtx res = emit_conditional_move (target, comparison, op2, op3, mode);
> +
> +  if (res != NULL_RTX)
> +return res;
> +
> +  return emit_conditional_move (target, rev_comparison, op3, op2, mode);
> +}
> +
> +/* Helper for emitting a conditional move.  */
> +
> +static rtx
> +emit_conditional_move (rtx target, rtx comparison,
> +rtx op2, rtx op3, machine_mode mode)
> +{
> +  rtx_insn *last;
> +  enum insn_code icode;
> +
> +  if (comparison == NULL_RTX || !COMPARISON_P (comparison))
> +return NULL_RTX;
> +
> +  /* If the two source operands are identical, that's just a move.  */
> +  if (rtx_equal_p (op2, op3))
>  {
> -  code = unsignedp ? unsigned_condition (code) : code;
> -  comparison = simplify_gen_relational (code, VOIDmode, cmode, op0, op1);
> +  if (!target)
> + target = gen_reg_rtx (mode);
> 

Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Kwok Cheung Yeung

Hello

Thanks for your reply.

On 26/07/2021 3:29 pm, Jakub Jelinek wrote:

> On Fri, Jul 09, 2021 at 12:16:15PM +0100, Kwok Cheung Yeung wrote:
>> 3) In the OpenMP examples (version 5.0.1), section 9.7, the example
>> metadirective.3.c does not work as expected.
>>
>> #pragma omp declare target
>> void exp_pi_diff(double *d, double my_pi){
>> #pragma omp metadirective \
>> when( construct={target}: distribute parallel for ) \
>> default( parallel for simd)
>> ...
>> int main()
>> {
>> ...
>> #pragma omp target teams map(tofrom: d[0:N])
>> exp_pi_diff(d,my_pi);
>> ...
>> exp_pi_diff(d,my_pi);
>
> The spec says in this case that the target construct is added to the
> construct set because of the function appearing in between omp declare target
> and omp end declare target, so the above is something that resolves
> statically to distribute parallel for.
> It is true that in OpenMP 5.1 the earlier
> For functions within a declare target block, the target trait is added to
> the beginning of the set as c_1 for any versions of the function that are
> generated for target regions so the total size of the set is increased by 1.
> has been mistakenly replaced with:
> For device routines, the target trait is added to the beginning of the set
> as c_1 for any versions of the procedure that are generated for target
> regions so the total size of the set is increased by 1.
> but that has been corrected in 5.2:
> C/C++:
> For functions that are declared in a code region that is delimited by a
> declare target directive and its paired end directive, the target trait is
> added to the beginning of the set as c_1 for any target variants that
> result from the directive so the total size of the set is increased by one.
> Fortran:
> If a declare target directive appears in the specification part of a
> procedure or in the specification part of a procedure interface body, the
> target trait is added to the beginning of the set as c_1 for any target
> variants that result from the directive so the total size of the set is
> increased by one.
>
> So, it is really a static decision that can be decided already during
> parsing.


In Section 1.2.2 of the OpenMP TR10 spec, 'target variant' is defined as:

A version of a device routine that can only be executed as part of a target 
region.

So isn't this really saying the same thing as the previous versions of the spec? 
The target trait is added to the beginning of the construct set _for any target 
variants_ that result from the directive (implying that it shouldn't be added 
for non-target variants). In this example, the same function exp_pi_diff is 
being used in both a target and non-target context, so shouldn't the 
metadirective resolve differently in the two contexts, independently of the 
function being declared in a 'declare target' block? If not, there does not seem 
to be much point in that example (in section 9.7 of the OpenMP Examples v5.0.1).


From reading the spec, I infer that they expect the device and non-device 
versions of a function with 'declare target' to be separate, but that is not 
currently the case for GCC - on the host compiler, the same version of the 
function gets called in both target and non-target regions (though in the target 
region case, it gets called indirectly via a compiler-generated function with a 
name like main._omp_fn.0). The offload compiler gets its own streamed version, 
so there is no conflict there - by definition, its version must be in a target 
context.


Thanks,

Kwok


[committed] analyzer: fix uninit false +ve when returning structs

2021-07-26 Thread David Malcolm via Gcc-patches
This patch fixes some false positives from
 -Wanalyzer-use-of-uninitialized-value
when returning structs from functions (seen on the Linux kernel).

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as 3a1d168e9e0e3e38adedf5df393e9f8c075fc755.

gcc/analyzer/ChangeLog:
* region-model.cc (region_model::on_call_pre): Always set conjured
LHS, not just for SSA names.

gcc/testsuite/ChangeLog:
* gcc.dg/analyzer/sock-1.c: New test.
* gcc.dg/analyzer/sock-2.c: New test.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/region-model.cc   |  13 ++-
 gcc/testsuite/gcc.dg/analyzer/sock-1.c | 112 +
 gcc/testsuite/gcc.dg/analyzer/sock-2.c |  20 +
 3 files changed, 137 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/sock-1.c
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/sock-2.c

diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index c029759cb9b..9d84b8c28a1 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -1066,14 +1066,11 @@ region_model::on_call_pre (const gcall *call, 
region_model_context *ctxt,
   if (tree lhs = gimple_call_lhs (call))
 {
   const region *lhs_region = get_lvalue (lhs, ctxt);
-  if (TREE_CODE (lhs) == SSA_NAME)
-   {
- const svalue *sval
-   = m_mgr->get_or_create_conjured_svalue (TREE_TYPE (lhs), call,
-   lhs_region);
- purge_state_involving (sval, ctxt);
- set_value (lhs_region, sval, ctxt);
-   }
+  const svalue *sval
+   = m_mgr->get_or_create_conjured_svalue (TREE_TYPE (lhs), call,
+   lhs_region);
+  purge_state_involving (sval, ctxt);
+  set_value (lhs_region, sval, ctxt);
 }
 
   if (gimple_call_internal_p (call))
diff --git a/gcc/testsuite/gcc.dg/analyzer/sock-1.c 
b/gcc/testsuite/gcc.dg/analyzer/sock-1.c
new file mode 100644
index 000..0f3e822492f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/analyzer/sock-1.c
@@ -0,0 +1,112 @@
+typedef unsigned int __u32;
+__extension__ typedef __signed__ long long __s64;
+__extension__ typedef unsigned long long __u64;
+typedef __u32 u32;
+typedef __s64 s64;
+typedef __u64 u64;
+typedef long long __kernel_time64_t;
+typedef _Bool bool;
+typedef __s64 time64_t;
+struct __kernel_timespec {
+ __kernel_time64_t tv_sec;
+ long long tv_nsec;
+};
+struct timespec64 {
+ time64_t tv_sec;
+ long tv_nsec;
+};
+
+extern struct timespec64 ns_to_timespec64(const s64 nsec);
+int put_timespec64(const struct timespec64 *ts,
+  struct __kernel_timespec *uts);
+
+/* [...snip...] */
+
+extern int put_old_timespec32(const struct timespec64 *, void *);
+
+/* [...snip...] */
+
+/* [...snip...] */
+
+typedef s64 ktime_t;
+
+/* [...snip...] */
+
+extern void ktime_get_real_ts64(struct timespec64 *tv);
+
+/* [...snip...] */
+
+enum tk_offsets {
+ TK_OFFS_REAL,
+ TK_OFFS_BOOT,
+ TK_OFFS_TAI,
+ TK_OFFS_MAX,
+};
+
+extern ktime_t ktime_get(void);
+extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
+extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
+extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
+extern ktime_t ktime_get_raw(void);
+extern u32 ktime_get_resolution_ns(void);
+
+
+static ktime_t ktime_get_real(void)
+{
+ return ktime_get_with_offset(TK_OFFS_REAL);
+}
+
+/* [...snip...] */
+
+struct socket {
+ /* [...snip...] */
+ struct sock *sk;
+ /* [...snip...] */
+};
+
+/* [...snip...] */
+
+struct sock {
+ /* [...snip...] */
+ ktime_t sk_stamp;
+ /* [...snip...] */
+};
+
+/* [...snip...] */
+
+static ktime_t sock_read_timestamp(struct sock *sk)
+{
+  return *(const volatile typeof(sk->sk_stamp) *)&(sk->sk_stamp);
+}
+
+static void sock_write_timestamp(struct sock *sk, ktime_t kt)
+{
+  *(volatile typeof(sk->sk_stamp) *)&(sk->sk_stamp) = kt;
+}
+
+/* [...snip...] */
+
+int sock_gettstamp(struct socket *sock, void *userstamp,
+ bool timeval, bool time32)
+{
+ struct sock *sk = sock->sk;
+ struct timespec64 ts;
+
+ /* [...snip...] */
+ ts = ns_to_timespec64((sock_read_timestamp(sk)));
+ if (ts.tv_sec == -1)
+  return -2;
+ if (ts.tv_sec == 0) {
+  ktime_t kt = ktime_get_real();
+  sock_write_timestamp(sk, kt);
+  ts = ns_to_timespec64((kt));
+ }
+
+ if (timeval)
+  ts.tv_nsec /= 1000;
+
+
+ if (time32)
+  return put_old_timespec32(&ts, userstamp);
+ return put_timespec64(&ts, userstamp);
+}
diff --git a/gcc/testsuite/gcc.dg/analyzer/sock-2.c 
b/gcc/testsuite/gcc.dg/analyzer/sock-2.c
new file mode 100644
index 000..237e0cb6f0b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/analyzer/sock-2.c
@@ -0,0 +1,20 @@
+__extension__ typedef __signed__ long long __s64;
+typedef __s64 time64_t;
+struct timespec64 {
+ time64_t tv_sec;
+ long tv_nsec;
+};
+
+extern struct timespec64 ns_to_timespec64(void);
+
+int sock_gettstamp()
+{
+ struct timespec64 ts;
+
+ /* [...snip...] */
+ ts = 

Re: [PATCH 1/2] Implement basic block path solver.

2021-07-26 Thread Jeff Law via Gcc-patches




On 7/2/2021 2:13 AM, Aldy Hernandez wrote:
> On 7/2/21 12:20 AM, Jeff Law wrote:
>> On 6/28/2021 10:21 AM, Aldy Hernandez wrote:
>>> +// Internal construct to help facilitate debugging of solver.
>>> +#define DEBUG_SOLVER getenv("DEBUG")
>> Shouldn't this really be a property of what pass is using the solver
>> and whether or not the appropriate dump flag is on for that pass?
>
> Whoops.  This was a private construct used for debugging the solver.
> I've changed it to:
>
> +#define DEBUG_SOLVER (0 && dump_file)
I would probably argue that the #define should disappear and the code
should be checking the current dump state for the current pass.   If you
don't want to keep the debugging output, then remove it  :-)  I think
that can be handled in a follow-up patch.

>>> +
>>> +// Return the range of the result of PHI in R.
>>> +
>>> +void
>>> +path_solver::ssa_range_in_phi (irange &r, gphi *phi)
>>> +{
>>> +  tree name = gimple_phi_result (phi);
>>> +  basic_block bb = gimple_bb (phi);
>>> +
>>> +  // We experimented with querying ranger's range_on_entry here, but
>>> +  // the performance penalty was too high, for hardly any improvements.
>>> +  if (at_entry ())
>>> +    {
>>> +  r.set_varying (TREE_TYPE (name));
>>> +  return;
>>> +    }
>>> +
>>> +  basic_block prev = prev_bb ();
>>> +  edge e_in = find_edge (prev, bb);
>>> +  for (size_t i = 0; i < gimple_phi_num_args (phi); ++i)
>> It's probably not important in practice, but you're going to end up
>> calling gimple_phi_num_args every iteration of this loop.  Its value
>> isn't generally subject to LICM.
>
> I was just following standard practice:
Yea.  I doubt we're at all consistent with that.  In fact, I'd bet I'm a
serial offender.  We should probably try to do better though, since
we're going to get a function call every loop iteration when the value
is invariant.  Fixing other instances should be considered
pre-approved.  ISTM a separate patch to fix that up would be fine.

Oh, and showing the # of instances in FOR statements is useful, but you
didn't show the other cases (i.e., where we shove it into a variable and
use that as a loop bound).  I'd estimate there's probably 40-50 of
those.  So it's really a mixed bag.
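
As a minimal sketch of the invariant-hoisting shape being discussed
(a hypothetical rewrite, with process_arg standing in for the loop body):

  /* Before: gimple_phi_num_args is re-evaluated on every iteration.  */
  for (size_t i = 0; i < gimple_phi_num_args (phi); ++i)
    process_arg (gimple_phi_arg_def (phi, i));

  /* After: the invariant bound is computed once, outside the loop.  */
  for (size_t i = 0, n = gimple_phi_num_args (phi); i < n; ++i)
    process_arg (gimple_phi_arg_def (phi, i));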


OK.

Jeff


Re: [PATCH 3/7] ifcvt: Improve costs handling for noce_convert_multiple.

2021-07-26 Thread Richard Sandiford via Gcc-patches
Robin Dapp  writes:
>> It looks like this is an existing (potential) problem,
>> but default_noce_conversion_profitable_p uses seq_cost, which in turn
>> uses insn_cost.  And insn_cost has an optional target hook behind it,
>> which allows for costing based on insn attributes etc.  For a true
>> apples-with-apples comparison we should use insn_cost here too.
>
> Good point, fixed that.
>
>> I think the detail that COSTS_N_INSNS (2) is the default is useful here.
>> (In other words, I'd forgotten by the time I'd poked around other bits of
>> ifcvt and was about to ask why we didn't cost the condition “properly”.)
>> So how about something like:
>> 
>> The original costs already include a base cost of COSTS_N_INSNS (2):
>> one instruction for the compare (which we will be needing either way)
>> and one instruction for the branch.
>
> Yes, this is much clearer. I went with that wording in the attached v2.
>
> Regards
>   Robin
>
> From 54508fa00fbee082fa62f4e1a8167f477938e781 Mon Sep 17 00:00:00 2001
> From: Robin Dapp 
> Date: Wed, 27 Nov 2019 13:46:17 +0100
> Subject: [PATCH v2 3/7] ifcvt: Improve costs handling for
>  noce_convert_multiple.
>
> When noce_convert_multiple is called the original costs are not yet
> initialized.  Therefore, up to now, costs were only ever unfairly
> compared against COSTS_N_INSNS (2).  This would lead to
> default_noce_conversion_profitable_p () rejecting all but the most
> contrived of sequences.
>
> This patch temporarily initializes the original costs by adding up
> the costs of all sets inside the then_bb.

OK, thanks.

Richard

> ---
>  gcc/ifcvt.c | 31 +++
>  1 file changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
> index f9d4478ec18..91b54dbea9c 100644
> --- a/gcc/ifcvt.c
> +++ b/gcc/ifcvt.c
> @@ -3404,14 +3404,17 @@ noce_convert_multiple_sets (struct noce_if_info 
> *if_info)
> (SET (REG) (REG)) insns suitable for conversion to a series
> of conditional moves.  Also check that we have more than one set
> (other routines can handle a single set better than we would), and
> -   fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets.  */
> +   fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets.  While going
> +   through the insns store the sum of their potential costs in COST.  */
>  
>  static bool
> -bb_ok_for_noce_convert_multiple_sets (basic_block test_bb)
> +bb_ok_for_noce_convert_multiple_sets (basic_block test_bb, unsigned *cost)
>  {
>rtx_insn *insn;
>unsigned count = 0;
>unsigned param = param_max_rtl_if_conversion_insns;
> +  bool speed_p = optimize_bb_for_speed_p (test_bb);
> +  unsigned potential_cost = 0;
>  
>FOR_BB_INSNS (test_bb, insn)
>  {
> @@ -3447,9 +3450,13 @@ bb_ok_for_noce_convert_multiple_sets (basic_block 
> test_bb)
>if (!can_conditionally_move_p (GET_MODE (dest)))
>   return false;
>  
> +  potential_cost += insn_cost (insn, speed_p);
> +
>count++;
>  }
>  
> +  *cost += potential_cost;
> +
>/* If we would only put out one conditional move, the other strategies
>   this pass tries are better optimized and will be more appropriate.
>   Some targets want to strictly limit the number of conditional moves
> @@ -3497,11 +3504,24 @@ noce_process_if_block (struct noce_if_info *if_info)
>   to calculate a value for x.
>   ??? For future expansion, further expand the "multiple X" rules.  */
>  
> -  /* First look for multiple SETS.  */
> +  /* First look for multiple SETS.  The original costs already include
> + a base cost of COSTS_N_INSNS (2): one instruction for the compare
> + (which we will be needing either way) and one instruction for the
> + branch.  When comparing costs we want to use the branch instruction
> + cost and the sets vs. the cmovs generated here.  Therefore subtract
> + the costs of the compare before checking.
> + ??? Actually, instead of the branch instruction costs we might want
> + to use COSTS_N_INSNS (BRANCH_COST ()) as in other places.  */
> +
> +  unsigned potential_cost = if_info->original_cost - COSTS_N_INSNS (1);
> +  unsigned old_cost = if_info->original_cost;
>if (!else_bb
>&& HAVE_conditional_move
> -  && bb_ok_for_noce_convert_multiple_sets (then_bb))
> +  && bb_ok_for_noce_convert_multiple_sets (then_bb, &potential_cost))
>  {
> +  /* Temporarily set the original costs to what we estimated so
> +  we can determine if the transformation is worth it.  */
> +  if_info->original_cost = potential_cost;
>if (noce_convert_multiple_sets (if_info))
>   {
> if (dump_file && if_info->transform_name)
> @@ -3509,6 +3529,9 @@ noce_process_if_block (struct noce_if_info *if_info)
>if_info->transform_name);
> return TRUE;
>   }
> +
> +  /* Restore the original costs.  */
> +  if_info->original_cost = old_cost;
>  }
>  
>bool speed_p = optimize_bb_for_speed_p 

Re: [PATCH 1/7] ifcvt: Check if cmovs are needed.

2021-07-26 Thread Richard Sandiford via Gcc-patches
Robin Dapp  writes:
> Hi Richard,
>
> thanks for going through the patch set.
>
>> A hash_set might be simpler, given that the code only enters insns
>> for which the bool is false.  “rtx_insn *” would be better than rtx.
>
> Right, changed that.
>
>> Do you mean the sets might be removed or that the checks might be
>> removed?
>
> It's actually the checks that are removed.  I clarified and amended the 
> comments.
>
>> The patch is quite hard to review on its own, since nothing actually
>> uses this variable.  It's also not obvious how the
>> reg_overlap_mentioned_p code works if the old target is referenced
>> later.
>> 
>> Could you refactor the series a bit so that each patch is
>> self-contained? It's OK if that means fewer patches.
> The attached v2 makes use of need_cmov now, I hope this makes it a bit 
> clearer.  Regarding reg_overlap_mentioned_p: it is used to detect
> the swap idioms, as well as cases where a cmov destination is also used
> in the condition.  If needed this could be two separate patches (most 
> of the second patch would be comments, though).

Thanks for the update.  No need for further splitting, this looks like a
nice self-contained patch as it is.

> diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
> index 017944f4f79..a5e55456d88 100644
> --- a/gcc/ifcvt.c
> +++ b/gcc/ifcvt.c
> @@ -98,6 +98,7 @@ static int dead_or_predicable (basic_block, basic_block, 
> basic_block,
>  edge, int);
>  static void noce_emit_move_insn (rtx, rtx);
>  static rtx_insn *block_has_only_trap (basic_block);
> +static void check_need_cmovs (basic_block, hash_set<rtx_insn *> *);
>
>  /* Count the number of non-jump active insns in BB.  */
>  
> @@ -3203,6 +3204,10 @@ noce_convert_multiple_sets (struct noce_if_info 
> *if_info)
>auto_vec unmodified_insns;
>int count = 0;
>  
> +  hash_set<rtx_insn *> need_no_cmov;
> +
> +  check_need_cmovs (then_bb, &need_no_cmov);
> +
>FOR_BB_INSNS (then_bb, insn)
>  {
>/* Skip over non-insns.  */
> @@ -3213,26 +3218,47 @@ noce_convert_multiple_sets (struct noce_if_info 
> *if_info)
>gcc_checking_assert (set);
>  
>rtx target = SET_DEST (set);
> -  rtx temp = gen_reg_rtx (GET_MODE (target));
> +  rtx temp;
>rtx new_val = SET_SRC (set);
>rtx old_val = target;
>  
> -  /* If we were supposed to read from an earlier write in this block,
> -  we've changed the register allocation.  Rewire the read.  While
> -  we are looking, also try to catch a swap idiom.  */
> -  for (int i = count - 1; i >= 0; --i)
> - if (reg_overlap_mentioned_p (new_val, targets[i]))
> -   {
> - /* Catch a "swap" style idiom.  */
> - if (find_reg_note (insn, REG_DEAD, new_val) != NULL_RTX)
> -   /* The write to targets[i] is only live until the read
> -  here.  As the condition codes match, we can propagate
> -  the set to here.  */
> -   new_val = SET_SRC (single_set (unmodified_insns[i]));
> - else
> -   new_val = temporaries[i];
> - break;
> -   }

Don't we still need this code (without the REG_DEAD handling) for the
case in which…

> +  /* As we are transforming
> +  if (x > y)
> +{
> +  a = b;
> +  c = d;
> +}
> +  into
> +a = (x > y) ...
> +c = (x > y) ...
> +
> +  we potentially check x > y before every set.
> +  Even though the check might be removed by subsequent passes, this means
> +  that we cannot transform
> +if (x > y)
> +  {
> +x = y;
> +...
> +  }
> +  into
> +x = (x > y) ...
> +...
> +  since this would invalidate x and the following to-be-removed checks.
> +  Therefore we introduce a temporary every time we are about to
> +  overwrite a variable used in the check.  Costing of a sequence with
> +  these is going to be inaccurate so only use temporaries when
> +  needed.  */
> +  if (reg_overlap_mentioned_p (target, cond))
> + temp = gen_reg_rtx (GET_MODE (target));

…this code triggers?  I don't see otherwise how later uses of x would
pick up “temp” instead of the original target.  E.g. suppose we had:

if (x > y)
  {
x = …;
z = x; // x does not die here
  }

Without the loop, it looks like z would pick up the old value of x
(used in the comparison) instead of the new one.

> +  else
> + temp = target;
> +
> +  /* We have identified swap-style idioms in check_need_cmovs.  A normal
> +  set will need to be a cmov while the first instruction of a swap-style
> +  idiom can be a regular move.  This helps with costing.  */
> +  bool need_cmov = true;
> +  if (need_no_cmov.contains (insn))
> + need_cmov = false;

Would be simpler as:

  bool need_cmov = !need_no_cmov.contains (insn);
>  
>/* If we had a non-canonical conditional jump (i.e. one where
>the fallthrough is to the "else" 
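
For context, a hedged illustration of the swap idiom under discussion
(hypothetical C, not from the patch):

    if (x > y)
      {
        tmp = a;   /* tmp is dead once read below */
        a = b;
        b = tmp;
      }

The set "tmp = a" need not become a cmov: tmp is dead on the fall-through
path, so executing the move unconditionally is safe.  That is what lets the
sequence be costed as one plain move plus two cmovs instead of three cmovs.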

Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Andrew MacLeod via Gcc-patches

On 7/26/21 2:39 PM, David Malcolm wrote:

On Mon, 2021-07-26 at 14:21 -0400, Andrew MacLeod via Gcc-patches
wrote:

Remove lower case characters from the range of to_upper and likewise,
upper case characters from to_lower.

I looked at also only adding the upper case characters for which there
is a lower-case character in the range, but it seemed of limited use
given some odd usage patterns we emit.  Instead, I simply took the
incoming range, removed the "from" character set, and added the "to"
character set.  This'll preserve any odd things that are passed into it
while providing the basic functionality.

Easy enough for someone to enhance if they feel so inclined.

Bootstrapped on  x86_64-pc-linux-gnu  with no regressions. Pushed.

Awkward question: does this work with character sets where there are
non-letter characters between 'a' and 'z' and between 'A' and 'Z' (e.g.
EBCDIC [1])?

For example toupper('~') should return '~', but '~' is between 'a' and
'z' in EBCDIC; likewise tolower('}') should return '}', but '}' is
between 'A' and 'Z' in EBCDIC.

Dave

[1] https://en.wikipedia.org/wiki/EBCDIC

do we support non-ANSI / EBCDIC?  I thought I saw other places where we
use 'a'-'z' and stuff like that.


If we do support it, then how does one determine which set is being used
so we would only do it for ANSI?


Andrew




Re: [PATCH v4] Add QI vector mode support to by-pieces for memset

2021-07-26 Thread Richard Sandiford via Gcc-patches
"H.J. Lu via Gcc-patches"  writes:
> +/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> +   bytes from constant string DATA + OFFSET and return it as target
> +   constant.  If PREV isn't nullptr, it has the RTL info from the
> +   previous iteration.  */
> +
> +rtx
> +builtin_memset_read_str (void *data, void *prev,
> +  HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> +  fixed_size_mode mode)
> +{
>const char *c = (const char *) data;
> -  char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> +  unsigned int size = GET_MODE_SIZE (mode);
>  
> -  memset (p, *c, GET_MODE_SIZE (mode));
> +  rtx target = gen_memset_value_from_prev ((by_pieces_prev *) prev,
> +mode);
> +  if (target != nullptr)
> +return target;
> +  rtx src = gen_int_mode (*c, QImode);
> +  target = gen_memset_broadcast (src, mode);
> +  if (target != nullptr)
> +return target;
>  
> -  return c_readstr (p, mode);
> +  char *p = XALLOCAVEC (char, size);
> +
> +  memset (p, *c, size);
> +
> +  /* Vector mode should be handled by gen_memset_broadcast above.  */

Nit: s/Vector mode should/Vector modes should/

> +  return c_readstr (p, as_a <scalar_int_mode> (mode));
>  }
>  
>  /* Callback routine for store_by_pieces.  Return the RTL of a register
> @@ -6788,33 +6910,29 @@ builtin_memset_read_str (void *data, void *prevp,
> nullptr, it has the RTL info from the previous iteration.  */
>  
>  static rtx
> -builtin_memset_gen_str (void *data, void *prevp,
> +builtin_memset_gen_str (void *data, void *prev,
>   HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> - scalar_int_mode mode)
> + fixed_size_mode mode)
>  {
>rtx target, coeff;
>size_t size;
>char *p;
>  
> -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> -  if (prev != nullptr && prev->data != nullptr)
> -{
> -  /* Use the previous data in the same mode.  */
> -  if (prev->mode == mode)
> - return prev->data;
> -
> -  target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> -  if (target != nullptr)
> - return target;
> -}
> -
>size = GET_MODE_SIZE (mode);
>if (size == 1)
>  return (rtx) data;
>  
> +  target = gen_memset_value_from_prev ((by_pieces_prev *) prev, mode);
> +  if (target != nullptr)
> +return target;
> +
> +  target = gen_memset_broadcast ((rtx) data, mode);
> +  if (target != nullptr)
> +return target;
> +
>p = XALLOCAVEC (char, size);
>memset (p, 1, size);
> -  coeff = c_readstr (p, mode);
> +  coeff = c_readstr (p, as_a <scalar_int_mode> (mode));

The same comment would be useful here.

>  
>target = convert_to_mode (mode, (rtx) data, 1);
>target = expand_mult (mode, target, coeff, NULL_RTX, 1);
> @@ -6918,7 +7036,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned 
> int ctz_len,
>   &valc, align, true))
>  return false;
>  
> -  rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
> +  by_pieces_constfn constfun;
>void *constfundata;
>if (val)
>  {
> diff --git a/gcc/builtins.h b/gcc/builtins.h
> index a64ece3f1cd..4b2ad766c61 100644
> --- a/gcc/builtins.h
> +++ b/gcc/builtins.h
> @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function 
> fn);
>  extern tree mathfn_built_in (tree, combined_fn);
>  extern tree mathfn_built_in_type (combined_fn);
>  extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
> -  scalar_int_mode);
> +  fixed_size_mode);
>  extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
> - scalar_int_mode);
> + fixed_size_mode);
>  extern rtx expand_builtin_saveregs (void);
>  extern tree std_build_builtin_va_list (void);
>  extern tree std_fn_abi_va_list (tree);
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index c8f4abe3e41..14d387750a8 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -12149,6 +12149,13 @@ This function prepares to emit a conditional 
> comparison within a sequence
>   @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode 
> @var{mode})
> +This hook should return an rtx for scratch register in @var{mode} to

s/for scratch register/for a scratch register/

> +be used by memset broadcast.  The backend can use a hard scratch register

How about: s/be used by memset broadcast/be used when expanding memset calls/
since the hook is (IMO rightly) not restricted to vector modes.

> +to avoid stack realignment when expanding memset.  The default is
> +@code{gen_reg_rtx}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned 
> @var{nunroll}, class loop *@var{loop})
>  This target hook returns a new value for the number of 
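
As context for the memset (p, 1, size) / c_readstr / expand_mult sequence
reviewed above, here is a hedged C sketch of the scalar byte-broadcast idiom
it implements (illustrative only, not GCC code):

    /* Replicate one byte across a word by multiplying it with a constant
       whose bytes are all 0x01 -- the coefficient that c_readstr builds
       from the memset buffer.  */
    unsigned long
    broadcast_byte (unsigned char c)
    {
      unsigned long coeff = ~0UL / 0xff;   /* 0x0101...01 */
      return (unsigned long) c * coeff;
    }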

Re: [COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread David Malcolm via Gcc-patches
On Mon, 2021-07-26 at 14:21 -0400, Andrew MacLeod via Gcc-patches
wrote:
> Remove lower case characters from the range of to_upper and likewise,
> upper case characters from to_lower.
> 
> I looked at also only adding the upper case characters for which there 
> is a lower-case character in the range, but it seemed of limited use
> given some odd usage patterns we emit.  Instead, I simply took the 
> incoming range, removed the "from" character set, and added the "to" 
> character set.  This'll preserve any odd things that are passed into it
> while providing the basic functionality.
> 
> Easy enough for someone to enhance if they feel so inclined.
> 
> Bootstrapped on  x86_64-pc-linux-gnu  with no regressions. Pushed.

Awkward question: does this work with character sets where there are
non-letter characters between 'a' and 'z' and between 'A' and 'Z' (e.g.
EBCDIC [1])?

For example toupper('~') should return '~', but '~' is between 'a' and
'z' in EBCDIC; likewise tolower('}') should return '}', but '}' is
between 'A' and 'Z' in EBCDIC.

Dave

[1] https://en.wikipedia.org/wiki/EBCDIC
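
For concreteness, these are the EBCDIC code points behind the examples above
(taken from the code page referenced in [1]):

    'a'..'i' = 0x81..0x89, 'j'..'r' = 0x91..0x99, 's'..'z' = 0xA2..0xA9
    'A'..'I' = 0xC1..0xC9, 'J'..'R' = 0xD1..0xD9, 'S'..'Z' = 0xE2..0xE9
    '~' = 0xA1, inside 'a'..'z'; '}' = 0xD0, inside 'A'..'Z'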



Re: [PATCH] incorrect arguments designated in -Wnonnull for arrays

2021-07-26 Thread Jeff Law via Gcc-patches




On 7/25/2021 10:23 AM, Uecker, Martin wrote:

Two arguments are switched for -Wnonnull when
warning about array parameters with bounds > 0
that are NULL.

This patch corrects the mistake.

Martin


2021-07-25  Martin Uecker  

gcc/
  * calls.c (maybe_warn_rdwr_sizes): Correct argument
  numbers in warning that were switched.

gcc/testsuite/
  * gcc.dg/Wnonnull-4.c: Correct argument numbers in warnings.

I'll defer to Martin Sebor on this.

Martin S., can you cover the review of this patch from Martin U?

Thanks,
Jeff




[COMMITTED] tree-optimization/78888 - Adjust ranges for to_upper and to_lower.

2021-07-26 Thread Andrew MacLeod via Gcc-patches
Remove lower case characters from the range of to_upper and likewise, 
upper case characters from to_lower.


I looked at also only adding the upper case characters for which there 
is a lower-case character in the range, but it seemed of limited use
given some odd usage patterns we emit.  Instead, I simply took the 
incoming range, removed the "from" character set, and added the "to" 
character set.  This'll preserve any odd things that are passed into it 
while providing the basic functionality.


Easy enough for someone to enhance if they feel so inclined.

Bootstrapped on  x86_64-pc-linux-gnu  with no regressions. Pushed.
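
As a worked sketch of the resulting range computation for to_upper (assuming
ASCII values, as the committed code below does):

    incoming range of (unsigned char) x    : [0, 255]
    intersect ~['a', 'z'], i.e. ~[97, 122] : [0, 96] U [123, 255]
    union ['A', 'Z'], i.e. [65, 90]        : [0, 96] U [123, 255]

so the result of __builtin_toupper can never compare equal to 'a'..'z',
which is what proves the kill () calls in the testcase below dead.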

commit 1ce0b26e6e1e6c348b1d54f1f462a44df6fe47f5
Author: Andrew MacLeod 
Date:   Mon Jul 26 09:40:32 2021 -0400

Adjust ranges for to_upper and to_lower.

Exclude lower case chars from to_upper and upper case chars from to_lower.

gcc/
PR tree-optimization/78888
* gimple-range-fold.cc (fold_using_range::range_of_builtin_call): Add cases
for CFN_BUILT_IN_TOUPPER and CFN_BUILT_IN_TOLOWER.

gcc/testsuite/
* gcc.dg/pr78888.c: New.

diff --git a/gcc/gimple-range-fold.cc b/gcc/gimple-range-fold.cc
index f95af3d5866..8465b4a82f6 100644
--- a/gcc/gimple-range-fold.cc
+++ b/gcc/gimple-range-fold.cc
@@ -868,6 +868,38 @@ fold_using_range::range_of_builtin_call (irange , gcall *call,
 	}
   break;
 
+case CFN_BUILT_IN_TOUPPER:
+  {
+	arg = gimple_call_arg (call, 0);
+	if (!src.get_operand (r, arg))
+	  return false;
+	// Return the range passed in without any lower case characters,
+	// but including all the upper case ones.
+	int_range<2> exclude (build_int_cst (type, 'a'),
+			  build_int_cst (type, 'z'), VR_ANTI_RANGE);
+	r.intersect (exclude);
+	int_range<2> uppers (build_int_cst (type, 'A'),
+			  build_int_cst (type, 'Z'));
+	r.union_ (uppers);
+	return true;
+  }
+
+ case CFN_BUILT_IN_TOLOWER:
+  {
+	arg = gimple_call_arg (call, 0);
+	if (!src.get_operand (r, arg))
+	  return false;
+	// Return the range passed in without any upper case characters,
+	// but including all the lower case ones.
+	int_range<2> exclude (build_int_cst (type, 'A'),
+			  build_int_cst (type, 'Z'), VR_ANTI_RANGE);
+	r.intersect (exclude);
+	int_range<2> lowers (build_int_cst (type, 'a'),
+			  build_int_cst (type, 'z'));
+	r.union_ (lowers);
+	return true;
+  }
+
 CASE_CFN_FFS:
 CASE_CFN_POPCOUNT:
   // __builtin_ffs* and __builtin_popcount* return [0, prec].
diff --git a/gcc/testsuite/gcc.dg/pr78888.c b/gcc/testsuite/gcc.dg/pr78888.c
new file mode 100644
index 000..77a130cf11c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr7.c
@@ -0,0 +1,29 @@
+/* PR tree-optimization/78888 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-evrp" } */
+
+void kill (void);
+void keep (void);
+
+void g (int x)
+{
+  if (__builtin_toupper ((unsigned char)x) == 'a')
+kill ();
+  if (__builtin_toupper ((unsigned char)x) == 'z')
+kill ();
+  if (__builtin_tolower ((unsigned char)x) == 'A')
+kill ();
+  if (__builtin_tolower ((unsigned char)x) == 'Z')
+kill ();
+
+  if (__builtin_toupper ((unsigned char)x) == 'A')
+keep ();
+  if (__builtin_toupper ((unsigned char)x) == 'Z')
+keep ();
+  if (__builtin_tolower ((unsigned char)x) == 'a')
+keep ();
+  if (__builtin_tolower ((unsigned char)x) == 'z')
+keep ();
+}
+/* { dg-final { scan-tree-dump-not "kill" "evrp" } }  */
+/* { dg-final { scan-tree-dump-times "keep" 4 "evrp"} } */


Re: [PATCH V2] simplify-rtx: Push sign/zero-extension inside vec_duplicate

2021-07-26 Thread Richard Sandiford via Gcc-patches
Jonathan Wright  writes:
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 74890989cb3045798bf8d0241467eaaf72238297..c4558da7c0f9228ccc5fa2b9c87e8b4690860b47
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2092,14 +2092,14 @@
>  
>  (define_insn "aarch64_mlal_hi_n_insn"
>[(set (match_operand: 0 "register_operand" "=w")
> -(plus:
> -  (mult:
> -  (ANY_EXTEND: (vec_select:
> - (match_operand:VQ_HSI 2 "register_operand" "w")
> - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> -  (ANY_EXTEND: (vec_duplicate:
> -(match_operand: 4 "register_operand" ""
> -  (match_operand: 1 "register_operand" "0")))]
> + (plus:
> +   (mult:
> +   (ANY_EXTEND: (vec_select:
> +  (match_operand:VQ_HSI 2 "register_operand" "w")
> +  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> +  (vec_duplicate: (ANY_EXTEND:
> +  (match_operand: 4 "register_operand" ""
> +   (match_operand: 1 "register_operand" "0")))]
>"TARGET_SIMD"
>"mlal2\t%0., %2., %4.[0]"
>[(set_attr "type" "neon_mla__long")]
> @@ -2167,14 +2167,14 @@
>  
>  (define_insn "aarch64_mlsl_hi_n_insn"
>[(set (match_operand: 0 "register_operand" "=w")
> -(minus:
> -  (match_operand: 1 "register_operand" "0")
> -  (mult:
> -(ANY_EXTEND: (vec_select:
> -  (match_operand:VQ_HSI 2 "register_operand" "w")
> -  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> -(ANY_EXTEND: (vec_duplicate:
> - (match_operand: 4 "register_operand" ""))]
> + (minus:
> +   (match_operand: 1 "register_operand" "0")
> +   (mult:
> + (ANY_EXTEND: (vec_select:
> +   (match_operand:VQ_HSI 2 "register_operand" "w")
> +   (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> + (vec_duplicate: (ANY_EXTEND:
> +   (match_operand: 4 "register_operand" ""))]
>"TARGET_SIMD"
>"mlsl2\t%0., %2., %4.[0]"
>[(set_attr "type" "neon_mla__long")]

Would be good for these patterns to use the one-operator-per-line style too.

> @@ -2560,7 +2568,7 @@
>   (ANY_EXTEND: (vec_select:
> (match_operand:VQ_HSI 2 "register_operand" "w")
> (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> - (ANY_EXTEND: (vec_duplicate:
> + (vec_duplicate: (ANY_EXTEND:
> (vec_select:
>   (match_operand: 4 "register_operand" "")
>   (parallel [(match_operand:SI 5 "immediate_operand" "i")]

Same here.

> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index 
> 2d169d3f9f70c85d396adaed124b6c52aca98f07..fc3701698a616727b6ef22f648b283e7d3821b24
>  100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -1731,8 +1731,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code 
> code, machine_mode mode,
>  
>The first might be done entirely in vector registers while the
>second might need a move between register files.  */
> -  temp = simplify_unary_operation (code, GET_MODE_INNER (mode),
> -elt, GET_MODE_INNER (GET_MODE (op)));
> +  if (code == SIGN_EXTEND || code == ZERO_EXTEND)
> + temp = simplify_gen_unary (code, GET_MODE_INNER (mode),
> +elt, GET_MODE_INNER (GET_MODE (op)));
> +  else
> + temp = simplify_unary_operation (code, GET_MODE_INNER (mode),
> +  elt, GET_MODE_INNER (GET_MODE (op)));
>if (temp)
>   return gen_vec_duplicate (mode, temp);
>  }

I guess the comment:

  /* Try applying the operator to ELT and see if that simplifies.
 We can duplicate the result if so.

 The reason we don't use simplify_gen_unary is that it isn't
 necessarily a win to convert things like:

   (neg:V (vec_duplicate:V (reg:S R)))

 to:

   (vec_duplicate:V (neg:S (reg:S R)))

 The first might be done entirely in vector registers while the
 second might need a move between register files.  */

is a bit out-of-date.  How about adding this comment above the call to
simplify_gen_unary.

/* Enforce a canonical order of VEC_DUPLICATE wrt other unary
   operations by promoting VEC_DUPLICATE to the root of the expression
   (as far as possible).  */

and using this modified form of the original comment above the call to
simplify_unary_operation:

/* Try applying the operator to ELT and see if that simplifies.
   We can duplicate the result if so.

   The reason we traditionally haven't used simplify_gen_unary
   for these codes is that it didn't necessarily seem to be a
   win to convert things like:


Re: [PATCH] Replace evrp use in loop versioning with ranger.

2021-07-26 Thread Richard Sandiford via Gcc-patches
Aldy Hernandez  writes:
> On Mon, Jul 26, 2021 at 4:18 PM Richard Sandiford
>  wrote:
>>
>> Aldy Hernandez  writes:
>> > This patch replaces the evrp_range_analyzer in the loop versioning code
>> > with an on-demand ranger.
>> >
>> > Everything was pretty straightforward, except that range_of_expr requires
>> > a gimple statement as context to provide context-aware ranges.  I didn't
>> > see a convenient place where the statement was saved, so I made a vector indexed
>> > by SSA names.  As an alternative, I tried to use the loop's first 
>> > statement,
>> > but that proved to be insufficient.
>>
>> The mapping is one-to-many though: there can be multiple statements
>> for each SSA name.  Maybe that doesn't matter in this context and
>> any of the statements can act as a representative.
>>
>> I'm surprised that the loop's first statement didn't work though,
>> since the SSA name is supposedly known to be loop-invariant.  What went
>> wrong when you tried that?
>
> I was looking at the first statement of loop_info->block_list and one
> of the dg.exp=loop-versioning* tests failed.  Perhaps I should have
> used the loop itself, as in the attached patch.  With this patch all
> of the loop-versioning tests pass.
>
>>
>> > I am not familiar with loop versioning, but if the DOM walk was only
>> > necessary for the calls to record_ranges_from_stmt, this too could be
>> > removed as the ranger will work without it.
>>
>> Yeah, that was the only reason.  If the information is available at
>> version_for_unity (I guess it is) then we should just avoid recording
>> the versioning there if so.
>>
>> How expensive is the check?  If the result is worth caching, perhaps
>> we should have two bitmaps: the existing one, and one that records
>> whether we've checked a particular SSA name.
>>
>> If the check is relatively cheap then that won't be worth it though.
>
> If you're asking about the range_of_expr check, that's all cached, so
> it should be pretty cheap.  Besides, we're no longer calculating
> ranges for each statement in the IL, as we were doing in lv_dom_walker
> with evrp's record_ranges_from_stmt.  Only statements of interest are
> queried.

Sounds good.  If the results are already cached then another level
of caching (via the second bitmap I mentioned above) would obviously
be a waste of time.

> How about this patch, pending tests?

OK, thanks, as a strict improvement over the status quo.  But it'd be
even better without the dom walk :-)

Richard

>
> Aldy
>
> From 8a350003d950869499d729921008abdb491d3a5e Mon Sep 17 00:00:00 2001
> From: Aldy Hernandez 
> Date: Sat, 24 Jul 2021 12:29:28 +0200
> Subject: [PATCH] Replace evrp use in loop versioning with ranger.
>
> This patch replaces the evrp_range_analyzer in the loop versioning code
> with an on-demand ranger.
>
> Tested on x86-64 Linux.
>
> gcc/ChangeLog:
>
>   * gimple-loop-versioning.cc (lv_dom_walker::lv_dom_walker): Remove
>   use of m_range_analyzer.
>   (loop_versioning::lv_dom_walker::before_dom_children): Same.
>   (loop_versioning::lv_dom_walker::after_dom_children): Remove.
>   (loop_versioning::prune_loop_conditions): Replace vr_values use
>   with range_query interface.
>   (pass_loop_versioning::execute): Use ranger.
> ---
>  gcc/gimple-loop-versioning.cc | 44 ---
>  1 file changed, 15 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
> index 4b70c5a4aab..46c3a508c8d 100644
> --- a/gcc/gimple-loop-versioning.cc
> +++ b/gcc/gimple-loop-versioning.cc
> @@ -30,19 +30,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-ssa-loop.h"
>  #include "ssa.h"
>  #include "tree-scalar-evolution.h"
> -#include "tree-chrec.h"
>  #include "tree-ssa-loop-ivopts.h"
>  #include "fold-const.h"
>  #include "tree-ssa-propagate.h"
>  #include "tree-inline.h"
>  #include "domwalk.h"
> -#include "alloc-pool.h"
> -#include "vr-values.h"
> -#include "gimple-ssa-evrp-analyze.h"
>  #include "tree-vectorizer.h"
>  #include "omp-general.h"
>  #include "predict.h"
>  #include "tree-into-ssa.h"
> +#include "gimple-range.h"
> +#include "tree-cfg.h"
>  
>  namespace {
>  
> @@ -261,14 +259,10 @@ private:
>  lv_dom_walker (loop_versioning &);
>  
>  edge before_dom_children (basic_block) FINAL OVERRIDE;
> -void after_dom_children (basic_block) FINAL OVERRIDE;
>  
>private:
>  /* The parent pass.  */
>  loop_versioning &m_lv;
> -
> -/* Used to build context-dependent range information.  */
> -evrp_range_analyzer m_range_analyzer;
>};
>  
>/* Used to simplify statements based on conditions that are established
> @@ -308,7 +302,7 @@ private:
>bool analyze_block (basic_block);
>bool analyze_blocks ();
>  
> -  void prune_loop_conditions (class loop *, vr_values *);
> +  void prune_loop_conditions (class loop *);
>bool prune_conditions ();
>  
>void merge_loop_info (class loop *, class 
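
For reference, a hedged sketch of the range_query usage pattern the patch
moves to (the names here are illustrative; the real call sits in the
prune_loop_conditions change):

    // The context statement is what makes the answer flow-sensitive.
    int_range_max r;
    if (query->range_of_expr (r, name, stmt)
        && r.singleton_p ())
      {
        // NAME evaluates to a single constant at STMT.
      }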

[r12-2511 Regression] FAIL: gfortran.dg/PR93963.f90 -Os execution test on Linux/x86_64

2021-07-26 Thread sunil.k.pandey via Gcc-patches
On Linux/x86_64,

0cbf03689e3e7d9d6002b8e5d159ef3716d0404c is the first bad commit
commit 0cbf03689e3e7d9d6002b8e5d159ef3716d0404c
Author: Tobias Burnus 
Date:   Mon Jul 26 14:20:46 2021 +0200

PR fortran/93308/93963/94327/94331/97046 problems raised by descriptor 
handling

caused

FAIL: gfortran.dg/PR93963.f90   -O2  execution test
FAIL: gfortran.dg/PR93963.f90   -O3 -fomit-frame-pointer -funroll-loops 
-fpeel-loops -ftracer -finline-functions  execution test
FAIL: gfortran.dg/PR93963.f90   -O3 -g  execution test
FAIL: gfortran.dg/PR93963.f90   -Os  execution test

with GCC configured with

../../gcc/configure 
--prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-master/master/r12-2511/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check RUNTESTFLAGS="dg.exp=gfortran.dg/PR93963.f90 
--target_board='unix{-m32}'"
$ cd {build_dir}/gcc && make check RUNTESTFLAGS="dg.exp=gfortran.dg/PR93963.f90 
--target_board='unix{-m32\ -march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at skpgkp2 at gmail dot com)


Re: [PATCH] Do not use tuple-like interface for pair in unordered containers

2021-07-26 Thread Jonathan Wakely via Gcc-patches

On 23/07/21 19:21 +0100, Jonathan Wakely wrote:

I've been experimenting with this patch, which removes the need to use
std::tuple_element and std::get to access the members of a std::pair
in unordered_{map,multimap}.

I'm in the process of refactoring the  header to reduce
header dependencies throughout the library, and this is the only use
of the tuple-like interface for std::pair in the library.

Using tuple_element and std::get resolved PR 53339 by allowing the
std::pair type to be incomplete; however, that is no longer supported
anyway (the 23_containers/unordered_map/requirements/53339.cc test
case is XFAILed). That means we could just define _Select1st as:

 struct _Select1st
 {
   template<typename _Tp>
 auto
 operator()(_Tp&& __x) const noexcept
 -> decltype(std::forward<_Tp>(__x).first)
 { return std::forward<_Tp>(__x).first; }
 };

But the approach in the patch seems OK too.


Actually I have a fix for PR 53339 so that we can support incomplete
types again. So we don't want to access the .first member in the
return type, as that requires a complete type.
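
A rough illustration of the incomplete-type usage behind PR 53339 (close to
the shape of the PR's testcase, reproduced here from memory):

    #include <unordered_map>

    struct X
    {
      std::unordered_map<int, X> m;   // X is still incomplete here
    };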



[PATCH 2/2] Ada: Remove debug line number for DECL_IGNORED_P functions

2021-07-26 Thread Bernd Edlinger
It was pointed out in PR101598 that it is inappropriate for
ignored Ada decls to receive the source line number which was
recorded in the function decl's DECL_SOURCE_LOCATION.
Therefore set the DECL_SOURCE_LOCATION of all front-end-generated
Ada decls with DECL_IGNORED_P to UNKNOWN_LOCATION.

2021-07-24  Bernd Edlinger  

PR debug/101598
* gcc-interface/trans.c (Subprogram_Body_to_gnu): Set the
DECL_SOURCE_LOCATION of DECL_IGNORED_P gnu_subprog_decl to
UNKNOWN_LOCATION.
---
 gcc/ada/gcc-interface/trans.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/ada/gcc-interface/trans.c b/gcc/ada/gcc-interface/trans.c
index f61183d..3df56aa 100644
--- a/gcc/ada/gcc-interface/trans.c
+++ b/gcc/ada/gcc-interface/trans.c
@@ -3885,7 +3885,9 @@ Subprogram_Body_to_gnu (Node_Id gnat_node)
 }
 
   /* Set the line number in the decl to correspond to that of the body.  */
-  if (!Sloc_to_locus (Sloc (gnat_node), &locus, false, gnu_subprog_decl))
+  if (DECL_IGNORED_P (gnu_subprog_decl))
+locus = UNKNOWN_LOCATION;
+  else if (!Sloc_to_locus (Sloc (gnat_node), &locus, false, gnu_subprog_decl))
 locus = input_location;
   DECL_SOURCE_LOCATION (gnu_subprog_decl) = locus;
 
-- 
1.9.1


[PATCH 1/2] Fix debug info for ignored decls at start of assembly

2021-07-26 Thread Bernd Edlinger
Ignored function decls that are compiled at the start of
the assembly have bogus line numbers until the first .file
directive, as reported in PR101575.

The workaround for this issue is to emit a dummy .file
directive when the first function is DECL_IGNORED_P and
that is not already done, mostly for -gdwarf-4.

2021-07-24  Bernd Edlinger  

PR ada/101575
* dwarf2out.c (dwarf2out_begin_prologue): Move init
of fde->ignored_debug to dwarf2out_set_ignored_loc.
(dwarf2out_set_ignored_loc): This is now also called
when no .loc statement is to be generated, in that case
we emit a dummy .file statement when needed.
* final.c (final_start_function_1,
final_scan_insn_1): Call debug_hooks->set_ignored_loc
for all DECL_IGNORED_P functions.
---
 gcc/dwarf2out.c | 29 +
 gcc/final.c |  5 ++---
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c
index 884f1e1..8de0d6f 100644
--- a/gcc/dwarf2out.c
+++ b/gcc/dwarf2out.c
@@ -1115,7 +1115,6 @@ dwarf2out_begin_prologue (unsigned int line 
ATTRIBUTE_UNUSED,
   fde->dw_fde_current_label = dup_label;
   fde->in_std_section = (fnsec == text_section
 || (cold_text_section && fnsec == cold_text_section));
-  fde->ignored_debug = DECL_IGNORED_P (current_function_decl);
   in_text_section_p = fnsec == text_section;
 
   /* We only want to output line number information for the genuine dwarf2
@@ -28546,10 +28545,32 @@ dwarf2out_set_ignored_loc (unsigned int line, 
unsigned int column,
 {
   dw_fde_ref fde = cfun->fde;
 
-  fde->ignored_debug = false;
-  set_cur_line_info_table (function_section (fde->decl));
+  if (filename)
+{
+  set_cur_line_info_table (function_section (fde->decl));
+
+  dwarf2out_source_line (line, column, filename, 0, true);
+}
+  else
+{
+  fde->ignored_debug = true;
+
+  /* Work around for PR101575: output a dummy .file directive.  */
+  if (in_first_function_p
+ && debug_info_level >= DINFO_LEVEL_TERSE
+ && dwarf_debuginfo_p ()
+#if defined(HAVE_AS_GDWARF_5_DEBUG_FLAG) && 
defined(HAVE_AS_WORKING_DWARF_N_FLAG)
+ && dwarf_version <= 4
+#endif
+ && output_asm_line_debug_info ())
+   {
+ const char *filename0 = get_AT_string (comp_unit_die (), DW_AT_name);
 
-  dwarf2out_source_line (line, column, filename, 0, true);
+ if (filename0 == NULL)
+   filename0 = "";
+ maybe_emit_file (lookup_filename (filename0));
+   }
+}
 }
 
 /* Record the beginning of a new source file.  */
diff --git a/gcc/final.c b/gcc/final.c
index ac6892d..82a5767 100644
--- a/gcc/final.c
+++ b/gcc/final.c
@@ -1725,7 +1725,7 @@ final_start_function_1 (rtx_insn **firstp, FILE *file, 
int *seen,
   if (!dwarf2_debug_info_emitted_p (current_function_decl))
 dwarf2out_begin_prologue (0, 0, NULL);
 
-  if (DECL_IGNORED_P (current_function_decl) && last_linenum && last_filename)
+  if (DECL_IGNORED_P (current_function_decl))
 debug_hooks->set_ignored_loc (last_linenum, last_columnnum, last_filename);
 
 #ifdef LEAF_REG_REMAP
@@ -2205,8 +2205,7 @@ final_scan_insn_1 (rtx_insn *insn, FILE *file, int 
optimize_p ATTRIBUTE_UNUSED,
}
  else if (!DECL_IGNORED_P (current_function_decl))
debug_hooks->switch_text_section ();
- if (DECL_IGNORED_P (current_function_decl) && last_linenum
- && last_filename)
+ if (DECL_IGNORED_P (current_function_decl))
debug_hooks->set_ignored_loc (last_linenum, last_columnnum,
  last_filename);
 
-- 
1.9.1
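
A hedged sketch of the intended effect (hypothetical assembler output; the
real file name is taken from the CU's DW_AT_name):

    # Without the workaround, a DECL_IGNORED_P function emitted before any
    # .file directive gets its code attributed to the assembly input itself,
    # producing bogus DWARF-4 line entries.  The dummy directive anchors
    # the line table instead:
            .file 1 "prog.adb"      # dummy .file, no .loc follows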


[PATCH 0/2] Fix bogus line info in DECL_IGNORED_P functions

2021-07-26 Thread Bernd Edlinger
This fixes a regression which was introduced accidentally with
commit e69ac020372 ("Add line debug info for virtual thunks").

That is, certain Ada functions now have a .loc directive, which is
probably not how the debug experience of those functions was intended.

Currently those functions stop in the debugger when stepped into, and
show the source location where the data type was declared.  It was
pointed out in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101575#c7
that the previous behaviour is what Ada really wants here.

The previous debug experience can be achieved by setting
DECL_SOURCE_LOCATION of such functions in the Ada front end
to UNKNOWN_LOCATION.  This is done by (2/2).

However, it turns out that the test case, which is from the
gdb test suite, has the special feature that the ignored function
is compiled before the first normal function, and that triggers
a misbehaviour of the DWARF-4 assembler output.  That shows a corner
case where the current handling of ignored functions with
no line info at all fails to suppress the bogus line numbers.
That is fixed by (1/2), by emitting a dummy .file directive
when there was no preceding .file directive.


This was boot-strapped and reg-tested on x86_64-pc-linux-gnu.
Is it OK for trunk?

Thanks,

Bernd Edlinger (2):
  Fix debug info for ignored decls at start of assembly
  Ada: Remove debug line number for DECL_IGNORED_P functions

 gcc/ada/gcc-interface/trans.c |  4 +++-
 gcc/dwarf2out.c   | 29 +
 gcc/final.c   |  5 ++---
 3 files changed, 30 insertions(+), 8 deletions(-)

-- 
1.9.1


Re: [patch][version5]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-07-26 Thread Qing Zhao via Gcc-patches


> On Jul 26, 2021, at 11:09 AM, Martin Jambor  wrote:
> 
> Hi,
> 
>>> So you either need to change build_access_from_expr like I described in
>>> my email
>> 
>> Is the following the change you suggested previously:
>> 
>> [opc@qinzhao-ol8u3-x86 gcc]$ git diff tree-sra.c
>> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
>> index d1280e5f8848..c2597b705169 100644
>> --- a/gcc/tree-sra.c
>> +++ b/gcc/tree-sra.c
>> @@ -1195,8 +1195,10 @@ build_access_from_expr (tree expr, gimple *stmt, bool 
>> write)
>> {
>>   /* This means the aggregate is accesses as a whole in a way other than 
>> an
>> assign statement and thus cannot be removed even if we had a scalar
>> -replacement for everything.  */
>> -  if (cannot_scalarize_away_bitmap)
>> +replacement for everything. However, when the STMT is a call to
>> +DEFERRED_INIT, we should not set this bit.  */
>> +  if (cannot_scalarize_away_bitmap 
>> + && !gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
>>bitmap_set_bit (cannot_scalarize_away_bitmap, DECL_UID 
>> (access->base));
>>   return true;
>> }
>> 
> 
> Yes, although I think that the one I wrote today is more efficient as it
> tests for IFN_DEFERRED_INIT only if we already know stmt is a call and
> also philosophically more correct as the test is performed only for the
> LHS of the statement (I don't think either reason matters much in
> practice, though).

Okay, I will use the patch you wrote today instead.
> 
>> 
>>> or add the following to your patch, which is probably slightly
>>> mor efficient (but it has been only very mildly tested).
>>> 
>>> 
>>> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
>>> index d1280e5f884..602b0fb3a2d 100644
>>> --- a/gcc/tree-sra.c
>>> +++ b/gcc/tree-sra.c
>>> @@ -1395,7 +1395,12 @@ scan_function (void)
>>> 
>>> t = gimple_call_lhs (stmt);
>>> if (t && !disqualify_if_bad_bb_terminating_stmt (stmt, t, NULL))
>>> -   ret |= build_access_from_expr (t, stmt, true);
>>> +   {
>>> + if (gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
>>> +   ret |= !!build_access_from_expr_1 (t, stmt, true);
>>> + else
>>> +   ret |= build_access_from_expr (t, stmt, true);
>>> +   }
>>> break;
>> 
>> Thanks for the patch, but I don’t quite understand the above change:
>> 
>> When the call is IFN_DEFERRED_INIT, you used build_access_from_expr_1 
>> instead of build_access_from_expr to avoid setting 
>> “cannot_scalarize_away_bitmap” bit.
>>But why adding “!” To this call?
> 
> Note that it is a double !! which is basically a fancy way to convert a
> test for non-NULL to a bool.  It is equivalent to:
> 
> + if (gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
> +   ret |= (build_access_from_expr_1 (t, stmt, true) != NULL);
> + else
> +   ret |= build_access_from_expr (t, stmt, true);
> 
> use whichever variant you like better.  But the !! trick is used
> throughout the gcc source already.

Okay, I see. thanks.

Qing
> 
> Martin



Re: [PATCH] Replace evrp use in loop versioning with ranger.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
> > How expensive is the check?  If the result is worth caching, perhaps
> > we should have two bitmaps: the existing one, and one that records
> > whether we've checked a particular SSA name.
> >
> > If the check is relatively cheap then that won't be worth it though.
>
> If you're asking about the range_of_expr check, that's all cached, so
> it should be pretty cheap.  Besides, we're no longer calculating
> ranges for each statement in the IL, as we were doing in lv_dom_walker
> with evrp's record_ranges_from_stmt.  Only statements of interest are
> queried.

You know, we already have infrastructure to test performance of small
changes with callgrind.  If you want I can compare numbers for:

1. mainline right now
2. my proposed patch
3. my proposed patch without a dom walk

Aldy



Re: [patch][version5]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-07-26 Thread Martin Jambor
Hi,

On Mon, Jul 26 2021, Qing Zhao wrote:
> HI, Martin,
>
> Thank you for the comments and suggestions on tree-sra.c changes.
>
>>> **Compare with the 4th version, the following are the major changes:
>>> 
>>> 1. delete the code for handling "grp_to_be_debug_replaced" since they are 
>>> not needed per Martin Jambor's suggestion.
>> 
>> sorry if I did not make myself clear in my last email, but the deferred
>> init calls should not result in setting any bits in
>> cannot_scalarize_away_bitmap in the SRA pass, otherwise you'll get
>> different optimization with and without -ftrivial-auto-var-init.
>
> It’s my bad that I missed this part of your comments…
>
>> 
>> So you either need to change build_access_from_expr like I described in
>> my email
>
> Is the following the change you suggested previously:
>
> [opc@qinzhao-ol8u3-x86 gcc]$ git diff tree-sra.c
> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
> index d1280e5f8848..c2597b705169 100644
> --- a/gcc/tree-sra.c
> +++ b/gcc/tree-sra.c
> @@ -1195,8 +1195,10 @@ build_access_from_expr (tree expr, gimple *stmt, bool 
> write)
>  {
>/* This means the aggregate is accesses as a whole in a way other than 
> an
>  assign statement and thus cannot be removed even if we had a scalar
> -replacement for everything.  */
> -  if (cannot_scalarize_away_bitmap)
> +replacement for everything. However, when the STMT is a call to
> +DEFERRED_INIT, we should not set this bit.  */
> +  if (cannot_scalarize_away_bitmap 
> + && !gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
> bitmap_set_bit (cannot_scalarize_away_bitmap, DECL_UID 
> (access->base));
>return true;
>  }
>

Yes, although I think that the one I wrote today is more efficient as it
tests for IFN_DEFERRED_INIT only if we already know stmt is a call and
also philosophically more correct as the test is performed only for the
LHS of the statement (I don't think either reason matters much in
practice, though).

>
>> or add the following to your patch, which is probably slightly
>> mor efficient (but it has been only very mildly tested).
>> 
>> 
>> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
>> index d1280e5f884..602b0fb3a2d 100644
>> --- a/gcc/tree-sra.c
>> +++ b/gcc/tree-sra.c
>> @@ -1395,7 +1395,12 @@ scan_function (void)
>> 
>>  t = gimple_call_lhs (stmt);
>>  if (t && !disqualify_if_bad_bb_terminating_stmt (stmt, t, NULL))
>> -   ret |= build_access_from_expr (t, stmt, true);
>> +   {
>> + if (gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
>> +   ret |= !!build_access_from_expr_1 (t, stmt, true);
>> + else
>> +   ret |= build_access_from_expr (t, stmt, true);
>> +   }
>>  break;
>
> Thanks for the patch, but I don’t quite understand the above change:
>
> When the call is IFN_DEFERRED_INIT, you used build_access_from_expr_1 instead 
> of build_access_from_expr to avoid setting “cannot_scalarize_away_bitmap” bit.
> But why adding “!” To this call?

Note that it is a double !! which is basically a fancy way to convert a
test for non-NULL to a bool.  It is equivalent to:

+ if (gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
+   ret |= (build_access_from_expr_1 (t, stmt, true) != NULL);
+ else
+   ret |= build_access_from_expr (t, stmt, true);

use whichever variant you like better.  But the !! trick is used
throughout the gcc source already.

Martin


Re: [patch][version5]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-07-26 Thread Qing Zhao via Gcc-patches
Martin, 

The following is the patch to fix the issues you raised in the previous email, 
let me know if I still miss anything:

Thanks a lot.

Qing

=

From 14524a228b4b41b4eaaa2497455725e075126c2c Mon Sep 17 00:00:00 2001
From: Qing Zhao 
Date: Mon, 26 Jul 2021 15:46:59 +
Subject: [PATCH] Fix tree-sra.c issue raised by Martin Jambor

---
 gcc/testsuite/gcc.dg/auto-init-sra-1.c | 24 
 gcc/testsuite/gcc.dg/auto-init-sra-2.c | 24 
 gcc/tree-sra.c |  8 ++--
 3 files changed, 53 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/auto-init-sra-1.c
 create mode 100644 gcc/testsuite/gcc.dg/auto-init-sra-2.c

diff --git a/gcc/testsuite/gcc.dg/auto-init-sra-1.c 
b/gcc/testsuite/gcc.dg/auto-init-sra-1.c
new file mode 100644
index ..88fd66678f29
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/auto-init-sra-1.c
@@ -0,0 +1,24 @@
+/* Verify that SRA total scalarization will not be confused by padding
+   and also not confused by auto initialization.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 --param sra-max-scalarization-size-Ospeed=16 
-fdump-tree-release_ssa -ftrivial-auto-var-init=zero" } */
+
+struct S
+{
+  int i;
+  unsigned short f1;
+  char f2;
+  unsigned short f3, f4;
+};
+
+
+int foo (struct S *p)
+{
+  struct S l;
+
+  l = *p;
+  l.i++;
+  *p = l;
+}
+
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" } } */
diff --git a/gcc/testsuite/gcc.dg/auto-init-sra-2.c 
b/gcc/testsuite/gcc.dg/auto-init-sra-2.c
new file mode 100644
index ..d260f5ae934e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/auto-init-sra-2.c
@@ -0,0 +1,24 @@
+/* Verify that SRA total scalarization will not be confused by padding
+   and also not confused by auto initialization.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 --param sra-max-scalarization-size-Ospeed=16 
-fdump-tree-release_ssa -ftrivial-auto-var-init=pattern" } */
+
+struct S
+{
+  int i;
+  unsigned short f1;
+  char f2;
+  unsigned short f3, f4;
+};
+
+
+int foo (struct S *p)
+{
+  struct S l;
+
+  l = *p;
+  l.i++;
+  *p = l;
+}
+
+/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" } } */
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index d1280e5f8848..bebba4deaf94 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -1195,8 +1195,10 @@ build_access_from_expr (tree expr, gimple *stmt, bool 
write)
 {
   /* This means the aggregate is accesses as a whole in a way other than an
 assign statement and thus cannot be removed even if we had a scalar
-replacement for everything.  */
-  if (cannot_scalarize_away_bitmap)
+replacement for everything.  However, when the STMT is a call to
+DEFERRED_INIT, we should not set this bit.  */
+  if (cannot_scalarize_away_bitmap
+ && !gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
bitmap_set_bit (cannot_scalarize_away_bitmap, DECL_UID (access->base));
   return true;
 }
@@ -4167,6 +4169,8 @@ sra_modify_deferred_init (gimple *stmt, 
gimple_stmt_iterator *gsi)
   tree arg0_repl = TYPE_SIZE_UNIT (TREE_TYPE (lhs_repl));
   gimple_call_set_arg (stmt, 0, arg0_repl);
   sra_stats.deferred_init++;
+  gcc_assert (!lhs_access->first_child);
+  return SRA_AM_MODIFIED;
 }
 
   if (lhs_access->first_child)
-- 
2.27.0





> On Jul 26, 2021, at 10:25 AM, Qing Zhao via Gcc-patches 
>  wrote:
> 
> HI, Martin,
> 
> Thank you for the comments and suggestions on tree-sra.c changes.
> 
>>> **Compare with the 4th version, the following are the major changes:
>>> 
>>> 1. delete the code for handling "grp_to_be_debug_replaced" since they are 
>>> not needed per Martin Jambor's suggestion.
>> 
>> sorry if I did not make myself clear in my last email, but the deferred
>> init calls should not result into setting any bits in
>> cannot_scalarize_away_bitmap in the SRA pass, otherwise you'll get
>> different optimization with and without -ftrivial-auto-var-init.
> 
> It’s my bad that I missed this part of your comments…
> 
>> 
>> So you either need to change build_access_from_expr like I described in
>> my email
> 
> Is the following the change you suggested previously:
> 
> [opc@qinzhao-ol8u3-x86 gcc]$ git diff tree-sra.c
> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
> index d1280e5f8848..c2597b705169 100644
> --- a/gcc/tree-sra.c
> +++ b/gcc/tree-sra.c
> @@ -1195,8 +1195,10 @@ build_access_from_expr (tree expr, gimple *stmt, bool 
> write)
> {
>   /* This means the aggregate is accesses as a whole in a way other than 
> an
> assign statement and thus cannot be removed even if we had a scalar
> -replacement for everything.  */
> -  if (cannot_scalarize_away_bitmap)
> +replacement for everything. However, when the STMT is a call to
> +DEFERRED_INIT, we should not set this bit.  */
> +  if (cannot_scalarize_away_bitmap 
> + && !gimple_call_internal_p (stmt, 

Re: [PATCH] Fold (X<<C1)^(X<<C2) to a multiplication when possible

2021-07-26 Thread Marc Glisse

On Mon, 26 Jul 2021, Roger Sayle wrote:

The one aspect that's a little odd is that each transform is paired with 
a convert@1 variant, using the efficient match machinery to expose any 
zero extension to fold-const.c's tree_nonzero_bits functionality.


Copying the first transform for context

+(for op (bit_ior bit_xor)
+ (simplify
+  (op (mult:s@0 @1 INTEGER_CST@2)
+  (mult:s@3 @1 INTEGER_CST@4))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+   (mult @1
+{ wide_int_to_tree (type, wi::to_wide (@2) + wi::to_wide (@4)); })))
+ (simplify
+  (op (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+  (mult:s@4 (convert@1 @2) INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (mult @1
+{ wide_int_to_tree (type, wi::to_wide (@3) + wi::to_wide (@5)); })))

Could you explain how the convert helps exactly?

--
Marc Glisse
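
A hedged example of the case the convert@1 variants are there to catch
(illustrative C, not from the patch): the zero extension is what lets
tree_nonzero_bits prove the two products have disjoint nonzero bits:

    unsigned int
    f (unsigned char c)
    {
      unsigned int x = c;            /* zero-extended: nonzero bits 0..7 */
      return (x << 8) ^ (x << 16);   /* bits 8..15 vs. 16..23 are disjoint,
                                        so XOR acts like PLUS and the whole
                                        expression folds to x * 0x10100 */
    }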


Re: [PATCH] Analyzer: Refactor callstring to work with pairs of supernodes.

2021-07-26 Thread David Malcolm via Gcc-patches
On Mon, 2021-07-26 at 21:04 +0530, Ankur Saini wrote:
> 
> Here is the latest patch after fixing all the nitpicks and issues
> pointed out by everyone. 
> 

Looks good to me.

Thanks
Dave



Re: [PATCH] Analyzer: Refactor callstring to work with pairs of supernodes.

2021-07-26 Thread Ankur Saini via Gcc-patches

Here is the latest patch after fixing all the nitpicks and issues pointed out 
by everyone. 



call_string.patch
Description: Binary data


Thank you
- Ankur

Re: [patch][version5]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-07-26 Thread Qing Zhao via Gcc-patches
HI, Martin,

Thank you for the comments and suggestions on tree-sra.c changes.

>> **Compare with the 4th version, the following are the major changes:
>> 
>> 1. delete the code for handling "grp_to_be_debug_replaced" since they are 
>> not needed per Martin Jambor's suggestion.
> 
> sorry if I did not make myself clear in my last email, but the deferred
> init calls should not result in setting any bits in
> cannot_scalarize_away_bitmap in the SRA pass, otherwise you'll get
> different optimization with and without -ftrivial-auto-var-init.

It’s my bad that I missed this part of your comments…

> 
> So you either need to change build_access_from_expr like I described in
> my email

Is the following the change you suggested previously:

[opc@qinzhao-ol8u3-x86 gcc]$ git diff tree-sra.c
diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index d1280e5f8848..c2597b705169 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -1195,8 +1195,10 @@ build_access_from_expr (tree expr, gimple *stmt, bool 
write)
 {
   /* This means the aggregate is accesses as a whole in a way other than an
 assign statement and thus cannot be removed even if we had a scalar
-replacement for everything.  */
-  if (cannot_scalarize_away_bitmap)
+replacement for everything. However, when the STMT is a call to
+DEFERRED_INIT, we should not set this bit.  */
+  if (cannot_scalarize_away_bitmap 
+ && !gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
bitmap_set_bit (cannot_scalarize_away_bitmap, DECL_UID (access->base));
   return true;
 }


> or add the following to your patch, which is probably slightly
> mor efficient (but it has been only very mildly tested).
> 
> 
> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
> index d1280e5f884..602b0fb3a2d 100644
> --- a/gcc/tree-sra.c
> +++ b/gcc/tree-sra.c
> @@ -1395,7 +1395,12 @@ scan_function (void)
> 
>  t = gimple_call_lhs (stmt);
>  if (t && !disqualify_if_bad_bb_terminating_stmt (stmt, t, NULL))
> -   ret |= build_access_from_expr (t, stmt, true);
> +   {
> + if (gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
> +   ret |= !!build_access_from_expr_1 (t, stmt, true);
> + else
> +   ret |= build_access_from_expr (t, stmt, true);
> +   }
>  break;

Thanks for the patch, but I don’t quite understand the above change:

When the call is IFN_DEFERRED_INIT, you used build_access_from_expr_1 instead 
of build_access_from_expr to avoid setting “cannot_scalarize_away_bitmap” bit.
But why adding “!” To this call?


> 
>case GIMPLE_ASM:
> 
> 
> 
> And unfortunately I just spotted another potential problem in:
> 
>> +static enum assignment_mod_result
>> +sra_modify_deferred_init (gimple *stmt, gimple_stmt_iterator *gsi)
>> +{
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  tree init_type = gimple_call_arg (stmt, 1);
>> +  tree is_vla = gimple_call_arg (stmt, 2);
>> +
>> +  struct access *lhs_access = get_access_for_expr (lhs);
>> +  if (!lhs_access)
>> +return SRA_AM_NONE;
>> +
>> +  location_t loc = gimple_location (stmt);
>> +
>> +  if (lhs_access->grp_to_be_replaced)
>> +{
>> +  tree lhs_repl = get_access_replacement (lhs_access);
>> +  gimple_call_set_lhs (stmt, lhs_repl);
>> +  tree arg0_repl = TYPE_SIZE_UNIT (TREE_TYPE (lhs_repl));
>> +  gimple_call_set_arg (stmt, 0, arg0_repl);
>> +  sra_stats.deferred_init++;
> 
> I think you want to add:
> 
>  gcc_assert (!lhs_access->first_child);
>  return SRA_AM_MODIFIED;

Okay. 
> 
> here, otherwise you risk that (in a case of a single-field structure)
> the call you have just modified here...
> 
>> +}
>> +
>> +  if (lhs_access->first_child)
>> +generate_subtree_deferred_init (lhs_access->first_child,
>> +init_type, is_vla, gsi, loc);
>> +  if (lhs_access->grp_covered)
>> +{
>> +  unlink_stmt_vdef (stmt);
>> +  gsi_remove (gsi, true);
> 
> ...will be deleted here.

I see. Thanks a lot for spotting this issue.
> 
>> +  release_defs (stmt);
>> +  return SRA_AM_REMOVED;
>> +}
>> +
>> +  return SRA_AM_MODIFIED;
>> +}
> 
> Sorry again for spotting this late and perhaps mis-communicating about
> the cannot_scalarize_away_bitmap issue earlier.  Apart from these two
> things, I believe the tree-sra.c bits are fine.

Thank you for the detailed review.

Qing
> 
> Martin
> 
> 
> 
> 
>> 2. for Pattern init, call __builtin_clear_padding after the call to 
>> .DEFERRED_INIT to initialize the paddings to zeroes;
>> 3. for partially or fully initialized auto variables, call   
>> __builtin_clear_padding before the real initialization to initialize
>>the paddings to zeroes.
>> 4. Update the documentation with padding initialization to zeroes.
>> 5. in order to reuse __builtin_clear_padding for auto init purpose, add one 
>> more dummy argument to indiciate 

Re: [PATCH] Replace evrp use in loop versioning with ranger.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
On Mon, Jul 26, 2021 at 4:18 PM Richard Sandiford
 wrote:
>
> Aldy Hernandez  writes:
> > This patch replaces the evrp_range_analyzer in the loop versioning code
> > with an on-demand ranger.
> >
> > Everything was pretty straightforward, except that range_of_expr requires
> > a gimple statement as context to provide context-aware ranges.  I didn't see
> > a convenient place where the statement was saved, so I made a vector indexed
> > by SSA names.  As an alternative, I tried to use the loop's first statement,
> > but that proved to be insufficient.
>
> The mapping is one-to-many though: there can be multiple statements
> for each SSA name.  Maybe that doesn't matter in this context and
> any of the statements can act as a representative.
>
> I'm surprised that the loop's first statement didn't work though,
> since the SSA name is supposedly known to be loop-invariant.  What went
> wrong when you tried that?

I was looking at the first statement of loop_info->block_list and one
of the dg.exp=loop-versioning* tests failed.  Perhaps I should have
used the loop itself, as in the attached patch.  With this patch all
of the loop-versioning tests pass.

>
> > I am not familiar with loop versioning, but if the DOM walk was only
> > necessary for the calls to record_ranges_from_stmt, this too could be
> > removed as the ranger will work without it.
>
> Yeah, that was the only reason.  If the information is available at
> version_for_unity (I guess it is) then we should just avoid recording
> the versioning there if so.
>
> How expensive is the check?  If the result is worth caching, perhaps
> we should have two bitmaps: the existing one, and one that records
> whether we've checked a particular SSA name.
>
> If the check is relatively cheap then that won't be worth it though.

If you're asking about the range_of_expr check, that's all cached, so
it should be pretty cheap.  Besides, we're no longer calculating
ranges for each statement in the IL, as we were doing in lv_dom_walker
with evrp's record_ranges_from_stmt.  Only statements of interest are
queried.

How about this patch, pending tests?

Aldy
From 8a350003d950869499d729921008abdb491d3a5e Mon Sep 17 00:00:00 2001
From: Aldy Hernandez 
Date: Sat, 24 Jul 2021 12:29:28 +0200
Subject: [PATCH] Replace evrp use in loop versioning with ranger.

This patch replaces the evrp_range_analyzer in the loop versioning code
with an on-demand ranger.

Tested on x86-64 Linux.

gcc/ChangeLog:

	* gimple-loop-versioning.cc (lv_dom_walker::lv_dom_walker): Remove
	use of m_range_analyzer.
	(loop_versioning::lv_dom_walker::before_dom_children): Same.
	(loop_versioning::lv_dom_walker::after_dom_children): Remove.
	(loop_versioning::prune_loop_conditions): Replace vr_values use
	with range_query interface.
	(pass_loop_versioning::execute): Use ranger.
---
 gcc/gimple-loop-versioning.cc | 44 ---
 1 file changed, 15 insertions(+), 29 deletions(-)

diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
index 4b70c5a4aab..46c3a508c8d 100644
--- a/gcc/gimple-loop-versioning.cc
+++ b/gcc/gimple-loop-versioning.cc
@@ -30,19 +30,17 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-loop.h"
 #include "ssa.h"
 #include "tree-scalar-evolution.h"
-#include "tree-chrec.h"
 #include "tree-ssa-loop-ivopts.h"
 #include "fold-const.h"
 #include "tree-ssa-propagate.h"
 #include "tree-inline.h"
 #include "domwalk.h"
-#include "alloc-pool.h"
-#include "vr-values.h"
-#include "gimple-ssa-evrp-analyze.h"
 #include "tree-vectorizer.h"
 #include "omp-general.h"
 #include "predict.h"
 #include "tree-into-ssa.h"
+#include "gimple-range.h"
+#include "tree-cfg.h"
 
 namespace {
 
@@ -261,14 +259,10 @@ private:
 lv_dom_walker (loop_versioning &);
 
 edge before_dom_children (basic_block) FINAL OVERRIDE;
-void after_dom_children (basic_block) FINAL OVERRIDE;
 
   private:
 /* The parent pass.  */
    loop_versioning &m_lv;
-
-/* Used to build context-dependent range information.  */
-evrp_range_analyzer m_range_analyzer;
   };
 
   /* Used to simplify statements based on conditions that are established
@@ -308,7 +302,7 @@ private:
   bool analyze_block (basic_block);
   bool analyze_blocks ();
 
-  void prune_loop_conditions (class loop *, vr_values *);
+  void prune_loop_conditions (class loop *);
   bool prune_conditions ();
 
   void merge_loop_info (class loop *, class loop *);
@@ -500,7 +494,7 @@ loop_info::worth_versioning_p () const
 }
 
loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
-  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
+  : dom_walker (CDI_DOMINATORS), m_lv (lv)
 {
 }
 
@@ -509,26 +503,12 @@ loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
 edge
 loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
 {
-  m_range_analyzer.enter (bb);
-
   if (bb == bb->loop_father->header)
-m_lv.prune_loop_conditions (bb->loop_father, 

Re: 0001-Don-t-skip-prologue-instructions-as-it-could-affect-.patch

2021-07-26 Thread Jeff Law via Gcc-patches




On 7/25/2021 7:47 PM, Bin.Cheng wrote:

On Sat, Jul 24, 2021 at 12:30 AM Jeff Law via Gcc-patches
 wrote:



On 7/14/2021 3:14 AM, bin.cheng via Gcc-patches wrote:

Hi,
I ran into a wrong code bug in code with deep template instantiation when 
working on sdx::simd.
The root cause, as described in the commit summary, is that we skip prologue
insns in init_alias_analysis.
This simple patch fixes the issue, however, it's hard to reduce a case because 
of heavy use of
templates.
Bootstrap and test on x86_64, is it OK?

It's a clear correctness improvement, but what's unclear to me is why
we'd want to skip them in the epilogue either.

I can only guess: there is nothing to initialize the epilogue for because
no code follows.
Yea, but couldn't the lack of analysis of the epilogue lead to a pass 
mis-optimizing code within the epilogue itself?  It's not terribly 
likely, but it just seems wrong to skip the epilogue like this.  
Remember, the aliasing bits are just an analysis phase to find the 
aliasing relationships that exist and we don't necessarily know how that 
data is going to be used.  It may in fact be safe now, but may not be 
safe in the future if someone added a late RTL pass that used the 
aliasing info in a new way.


The more I think about it, the more I think we should remove this 
hunk of code completely.  There is some chance for fallout, but I think 
it's unlikely.


Jeff



Re: [PATCH] testsuite: mips: use noinline attribute instead of -fno-inline

2021-07-26 Thread Maciej W. Rozycki
On Mon, 26 Jul 2021, Richard Sandiford wrote:

> >> Sorry, somehow I didn't see Richard's reply.  Perhaps a
> >> misconfiguration
> >> on my mail server.
> 
> I don't know where the problem lies, but for some reason I've been
> getting rejects when sending messages directly (via reply-all).

 Something about the .wang domain with your mail system perhaps?  The 
domain is not one of those we've been used to and must surely be one of 
the IANA's additions from a few years ago.  In any case it is valid and 
mail deliveries seem to work from here just fine.  I suggest talking to 
your IT staff.

$ host mengyan1223.wang
mengyan1223.wang has address 89.208.246.23
mengyan1223.wang has IPv6 address 2001:470:683e::1
mengyan1223.wang mail is handled by 10 mengyan1223.wang.
$

  Maciej


Re: [PATCH take 2] Fold bswap32(x) != 0 to x != 0 (and related transforms)

2021-07-26 Thread Jeff Law via Gcc-patches




On 7/24/2021 3:44 AM, Roger Sayle wrote:

My apologies for the short delay.  Thanks for explaining why these
transforms
don't usually require explicit tests for side-effects (but occasionally do).
This does simplify things; please find attached the shorter revised patch.

This patch has been retested on x86_64-pc-linux-gnu with a make bootstrap
and make -k check with no new failures.  Ok for mainline?

2021-07-24  Roger Sayle  
Marc Glisse  

gcc/ChangeLog
* match.pd (rotate): Simplify equality/inequality of rotations.
(bswap): Simplify equality/inequality tests of byte swapping.

gcc/testsuite/ChangeLog
* gcc.dg/fold-eqrotate-1.c: New test case.
* gcc.dg/fold-eqbswap-1.c: New test case.

OK.  Thanks!

jeff



[PUSHED] Implement operator_bitwise_xor::op1_op2_relation_effect.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
This patch adjusts XORing of ranges where the operands are known to be
equal or not equal.

We should probably do the same thing for the op[12]_range methods.
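As a concrete illustration (my example, not from the patch or its testsuite),
the relation lets the folder collapse an XOR whose operands are known equal:

/* If a == b is known on this path, a ^ b must be zero, so with the new
   op1_op2_relation_effect the return value can fold to 0.  */
int
f (int a, int b)
{
  if (a == b)
    return a ^ b;
  return -1;
}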

Tested on x86-64 Linux.

gcc/ChangeLog:

* range-op.cc (operator_bitwise_xor::op1_op2_relation_effect):
New.
---
 gcc/range-op.cc | 33 +
 1 file changed, 33 insertions(+)

diff --git a/gcc/range-op.cc b/gcc/range-op.cc
index 4bdd14dd6f7..b1fb25c77f8 100644
--- a/gcc/range-op.cc
+++ b/gcc/range-op.cc
@@ -3101,6 +3101,11 @@ public:
   const irange &lhs,
   const irange &op1,
  relation_kind rel = VREL_NONE) const;
+  virtual bool op1_op2_relation_effect (irange &lhs_range,
+   tree type,
+   const irange &op1_range,
+   const irange &op2_range,
+   relation_kind rel) const;
 } op_bitwise_xor;
 
 void
@@ -3134,6 +3139,34 @@ operator_bitwise_xor::wi_fold (irange &r, tree type,
 r.set_varying (type);
 }
 
+bool
+operator_bitwise_xor::op1_op2_relation_effect (irange &lhs_range,
+  tree type,
+  const irange &,
+  const irange &,
+  relation_kind rel) const
+{
+  if (rel == VREL_NONE)
+return false;
+
+  int_range<2> rel_range;
+
+  switch (rel)
+{
+case EQ_EXPR:
+  rel_range.set_zero (type);
+  break;
+case NE_EXPR:
+  rel_range.set_nonzero (type);
+  break;
+default:
+  return false;
+}
+
+  lhs_range.intersect (rel_range);
+  return true;
+}
+
 bool
 operator_bitwise_xor::op1_range (irange &r, tree type,
 const irange &lhs,
-- 
2.31.1



[PUSHED] Pass relationship to methods calling generic fold_range.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
Fix a small oversight in methods calling the base class fold_range.

Tested on x86-64 Linux.

gcc/ChangeLog:

* range-op.cc (operator_lshift::fold_range): Pass rel to
base class fold_range.
(operator_rshift::fold_range): Same.
---
 gcc/range-op.cc | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/range-op.cc b/gcc/range-op.cc
index e0be51dbc90..4bdd14dd6f7 100644
--- a/gcc/range-op.cc
+++ b/gcc/range-op.cc
@@ -1929,7 +1929,7 @@ bool
 operator_lshift::fold_range (irange &r, tree type,
 const irange &op1,
 const irange &op2,
-relation_kind rel ATTRIBUTE_UNUSED) const
+relation_kind rel) const
 {
   int_range_max shift_range;
   if (!get_shift_range (shift_range, type, op2))
@@ -1960,7 +1960,7 @@ operator_lshift::fold_range (irange &r, tree type,
 }
   else
 // Otherwise, invoke the generic fold routine.
-return range_operator::fold_range (r, type, op1, shift_range);
+return range_operator::fold_range (r, type, op1, shift_range, rel);
 }
 
 void
@@ -2189,7 +2189,7 @@ bool
 operator_rshift::fold_range (irange &r, tree type,
 const irange &op1,
 const irange &op2,
-relation_kind rel ATTRIBUTE_UNUSED) const
+relation_kind rel) const
 {
   int_range_max shift;
   if (!get_shift_range (shift, type, op2))
@@ -2201,7 +2201,7 @@ operator_rshift::fold_range (irange &r, tree type,
   return true;
 }
 
-  return range_operator::fold_range (r, type, op1, shift);
+  return range_operator::fold_range (r, type, op1, shift, rel);
 }
 
 void
-- 
2.31.1



Re: [PATCH] Remove legacy external declarations in toplev.h [PR101447]

2021-07-26 Thread Jeff Law via Gcc-patches




On 7/15/2021 1:58 AM, Richard Biener via Gcc-patches wrote:

On Thu, Jul 15, 2021 at 3:54 AM ashimida via Gcc-patches
 wrote:


External declarations in ./gcc/toplev.h are no longer used in the newest
version of gcc and should be cleaned up to avoid misunderstandings.

OK


gcc/ChangeLog:

  * toplev.h (set_random_seed):

I've fixed the ChangeLog and committed this to the trunk for Ashimida.

Ashimida -- in the future please don't close the PR until the change has 
been committed to the trunk.


Thanks,
Jeff



Re: [PATCH] assume built-in calls don't modify allocated objects (PR 101584)

2021-07-26 Thread Jeff Law via Gcc-patches




On 7/22/2021 4:27 PM, Martin Sebor via Gcc-patches wrote:

Passing a pointer to a built-in function escapes it, which in
turn causes objects pointed to by other potentially aliased
pointers to be assumed to be modified by other built-in calls.
This leads to a class of false negative -Wuninitialized
warnings that can be avoided by the warning code and without
making changes to the aliasing machinery.

In addition, GCC conservatively assumes that an object whose
address is passed as an argument to any directive in a printf
call is modified by the call.  This is necessary if the directive
isn't known because it could be %n, but in such a case it's
reasonable to assume the pointed-to type wouldn't be const-
qualified.  This assumption makes it easy to detect a class
of uninitialized reads that are not detected today.

The attached patch implements both assumptions: i.e., that a call
to a built-in function declared to take only const pointer
arguments, or to a variadic function with only const pointers
as arguments, doesn't modify any objects.

The change detects certain uninitialized accesses slightly earlier
which causes uninit-38.c failures.  As the comment in the test
explains, that's expected.  I've simply removed the failed tests
and left the rest.  They exercise the same functionality (MEM_REF
formatting).
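
For readers following along, a minimal sketch (mine, not one of the patch's
testcases) of the class of case the PR title describes; with the new
assumption, the puts call, whose only pointer argument is const-qualified,
no longer counts as possibly writing *p:

#include <stdio.h>
#include <stdlib.h>

int
f (void)
{
  int *p = malloc (sizeof *p);   /* *p is never initialized */
  printf ("%p\n", (void *) p);   /* escapes p */
  puts ("hello");                /* const char * only: with the patch,
                                    assumed not to modify *p */
  return *p;                     /* uninitialized read, now diagnosable */
}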

Tested on x86_64-linux.

Martin


gcc-101584.diff

PR tree-optimization/101584 - missing -Wuninitialized with an allocated object 
after a built-in call

gcc/ChangeLog:

PR tree-optimization/101584
* tree-ssa-uninit.c (builtin_call_nomodifying_p): New function.
(check_defs): Call it.

gcc/testsuite/ChangeLog:

PR tree-optimization/101584
* gcc.dg/uninit-38.c: Remove assertions.
* gcc.dg/uninit-41.c: New test.

OK
jeff



Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Jakub Jelinek via Gcc-patches
On Fri, Jul 09, 2021 at 12:16:15PM +0100, Kwok Cheung Yeung wrote:
> This is a WIP implementation of metadirectives as defined in the OpenMP 5.0
> spec. I intend to add support for metadirectives as specified in OpenMP 5.1
> later (where the directive can be selected dynamically at runtime), but am
> concentrating on the static part for now. Parsing has only been implemented
> in the C frontend so far. I am especially interested in feedback regarding
> certain aspects of the implementation before I become too committed to the
> current design.

Note, there is a partial overlap with the attribute syntax changes, see below.
c-family/c-omp.c now has omp_directives table that should be updated for
changes like this and then c_omp_categorize_directive that returns some
information about the directives given a directive name (though, that name
can be one, two or three tokens long, consider e.g. target enter data
or cancellation point directives).

For metadirective, I think very special case are declarative directives in
them, I'd tend to sorry for them at least for now, I'm pretty sure many
cases with them are just unimplementable and will need to be restricted in
the standard, others can be implemented with lots of effort.
Whether it is e.g. metadirective guarding declare target ... end declare
target pair that would only conditionally set declare target and instead of
a single bit to find out if something is declare target or not we'd until
resolved need to compute it for all possibilities, or e.g. conditional
declare reduction/declare mapper where the name lookup for reduction or map
directives would be dependent on metadirective resolution later on, etc.
I'm afraid a total nightmare nobody has really thought about details for it.

> 1) When parsing each directive variant, a vector of tokens is constructed
> and populated with the tokens for a regular equivalent pragma, along with
> the tokens for its clauses and the body. The parser routine for that pragma
> type is then called with these tokens, and the entire resulting parse tree
> is stored as a sub-tree of the metadirective tree structure.
> 
> This results in the body being parsed and stored once for each directive
> variant. I believe this is necessary because the body is parsed differently
> if there is a 'for' in the directive (using c_parser_omp_for_loop) compared
> to if there is not, plus clauses in the directive (e.g. tile, collapse) can
> change how the for loop is parsed.
> 
> As an optimisation, identical body trees could be merged together, but that
> can come later.

I'm afraid it isn't just an optimization and we need to be as smart as
possible.  I'm not sure it is possible to parse everything many times,
consider e.g. labels in the blocks, nested function definitions, variable
definitions, etc.
While OpenMP requires that essentially the code must be valid if the
metadirective is replaced by any of those mentioned directives which rules
quite some weirdo corner cases, nothing prevents e.g. two or more
when directives to be standalone directives (which don't have any body and
so whatever comes after them should be left parsed for later as normal
statement sequence), one or more to be normal constructs that accept a
structured block and one or more to be e.g. looping constructs (simd, for,
distribute, taskloop or combined versions of those).
Even when issues with labels etc. are somehow solved (e.g. for structured
blocks we have the restriction that goto, break, continue, or switch into
a case/default label, etc. can't be used to enter or exit the structured
block which could mean some cases can be handled through renaming seen
labels in all but one bodies), most important is to sync on where parsing
should continue after the metadirective.
I think it would be nice if the metadirective parsing at least made quick
analysis on what kind of bodies the directives will want and can use the new
c-omp.c infrastructure or if needed extend it (e.g. separate the 
C_OMP_DIR_CONSTRUCT
category into C_OMP_DIR_CONSTRUCT and C_OMP_DIR_LOOPING_CONSTRUCT where
the latter would be used for those that expect some omp loop after it).
One option would be then to parse the body as the most restricted construct
(looping (and determine highest needed collapse and ordered), then construct,
then standalone) and be able to adjust what we parsed into what the
different constructs need, but another option is the separate parsing of
the code after the directive multiple times, but at least in the order of
most restricted to least restricted, remember where to stop and don't parse
it multiple times at least for directives that need the same thing.
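
To make the parsing problem concrete, consider a sketch like the following
(my example, not from the WIP patch).  For the when variant the statement
after the metadirective must be parsed as a canonical loop nest, while for
the default variant the same for statement is just an ordinary statement
forming the parallel's structured block:

void
f (int n, double *a)
{
  #pragma omp metadirective \
      when (device={arch("nvptx")}: parallel for) \
      default (parallel)
  for (int i = 0; i < n; i++)
    a[i] *= 2.0;
}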

> 
> 2) Selectors in the device set (i.e. kind, isa, arch) resolve differently
> depending on whether the program is running on a target or on the host.
> Since we don't keep multiple versions of a function for each target on the
> host compiler, resolving metadirectives with these selectors needs to be
> delayed until after 

Re: [PATCH] Replace evrp use in loop versioning with ranger.

2021-07-26 Thread Richard Sandiford via Gcc-patches
Aldy Hernandez  writes:
> This patch replaces the evrp_range_analyzer in the loop versioning code
> with an on-demand ranger.
>
> Everything was pretty straightforward, except that range_of_expr requires
> a gimple statement as context to provide context-aware ranges.  I didn't see
> a convenient place where the statement was saved, so I made a vector indexed
> by SSA names.  As an alternative, I tried to use the loop's first statement,
> but that proved to be insufficient.

The mapping is one-to-many though: there can be multiple statements
for each SSA name.  Maybe that doesn't matter in this context and
any of the statements can act as a representative.

I'm surprised that the loop's first statement didn't work though,
since the SSA name is supposedly known to be loop-invariant.  What went
wrong when you tried that?

> I am not familiar with loop versioning, but if the DOM walk was only
> necessary for the calls to record_ranges_from_stmt, this too could be
> removed as the ranger will work without it.

Yeah, that was the only reason.  If the information is available at
version_for_unity (I guess it is) then we should just avoid recording
the versioning there if so.

How expensive is the check?  If the result is worth caching, perhaps
we should have two bitmaps: the existing one, and one that records
whether we've checked a particular SSA name.

If the check is relatively cheap then that won't be worth it though.
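
In case it is useful, a standalone sketch of the two-bitmap idea (all names
here are hypothetical, and plain arrays stand in for GCC's bitmap API):

#include <stdbool.h>

#define NUM_NAMES 1024

static unsigned char m_checked[NUM_NAMES / 8];  /* name already queried?  */
static unsigned char m_cached[NUM_NAMES / 8];   /* cached positive answer  */

static bool
test_bit (const unsigned char *bm, unsigned i)
{
  return (bm[i / 8] >> (i % 8)) & 1;
}

static void
set_bit (unsigned char *bm, unsigned i)
{
  bm[i / 8] |= (unsigned char) (1u << (i % 8));
}

/* Stand-in for the real (potentially expensive) query.  */
static bool
expensive_check (unsigned ssa_version)
{
  return ssa_version % 3 == 0;
}

static bool
cached_check (unsigned ssa_version)
{
  if (!test_bit (m_checked, ssa_version))
    {
      if (expensive_check (ssa_version))
        set_bit (m_cached, ssa_version);
      set_bit (m_checked, ssa_version);
    }
  return test_bit (m_cached, ssa_version);
}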

Thanks,
Richard

>
> Tested on x86-64 Linux.
>
> OK?
>
> gcc/ChangeLog:
>
>   * gimple-loop-versioning.cc (lv_dom_walker::lv_dom_walker): Remove
>   use of m_range_analyzer.
>   (loop_versioning::lv_dom_walker::before_dom_children): Same.
>   (loop_versioning::lv_dom_walker::after_dom_children): Remove.
>   (loop_versioning::loop_versioning): Allocate space for
>   m_ssa_context.
>   (loop_versioning::version_for_unity): Set m_ssa_context.
>   (loop_versioning::prune_loop_conditions): Replace vr_values use
>   with range_query interface.
>   (pass_loop_versioning::execute): Use ranger.
> ---
>  gcc/gimple-loop-versioning.cc | 49 ++-
>  1 file changed, 20 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc
> index 4b70c5a4aab..987994d4995 100644
> --- a/gcc/gimple-loop-versioning.cc
> +++ b/gcc/gimple-loop-versioning.cc
> @@ -30,19 +30,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-ssa-loop.h"
>  #include "ssa.h"
>  #include "tree-scalar-evolution.h"
> -#include "tree-chrec.h"
>  #include "tree-ssa-loop-ivopts.h"
>  #include "fold-const.h"
>  #include "tree-ssa-propagate.h"
>  #include "tree-inline.h"
>  #include "domwalk.h"
> -#include "alloc-pool.h"
> -#include "vr-values.h"
> -#include "gimple-ssa-evrp-analyze.h"
>  #include "tree-vectorizer.h"
>  #include "omp-general.h"
>  #include "predict.h"
>  #include "tree-into-ssa.h"
> +#include "gimple-range.h"
> +#include "tree-cfg.h"
>  
>  namespace {
>  
> @@ -261,14 +259,10 @@ private:
>  lv_dom_walker (loop_versioning &);
>  
>  edge before_dom_children (basic_block) FINAL OVERRIDE;
> -void after_dom_children (basic_block) FINAL OVERRIDE;
>  
>private:
>  /* The parent pass.  */
>  loop_versioning &m_lv;
> -
> -/* Used to build context-dependent range information.  */
> -evrp_range_analyzer m_range_analyzer;
>};
>  
>/* Used to simplify statements based on conditions that are established
> @@ -308,7 +302,7 @@ private:
>bool analyze_block (basic_block);
>bool analyze_blocks ();
>  
> -  void prune_loop_conditions (class loop *, vr_values *);
> +  void prune_loop_conditions (class loop *);
>bool prune_conditions ();
>  
>void merge_loop_info (class loop *, class loop *);
> @@ -355,6 +349,9 @@ private:
>  
>/* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
>auto_vec  m_address_list;
> +
> +  /* Gimple context for each SSA in loop_info::unity_names.  */
> +  auto_vec<gimple *> m_ssa_context;
>  };
>  
>  /* If EXPR is an SSA name and not a default definition, return the
> @@ -500,7 +497,7 @@ loop_info::worth_versioning_p () const
>  }
>  
>  loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
> -  : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
> +  : dom_walker (CDI_DOMINATORS), m_lv (lv)
>  {
>  }
>  
> @@ -509,26 +506,12 @@ loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
>  edge
>  loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
>  {
> -  m_range_analyzer.enter (bb);
> -
>if (bb == bb->loop_father->header)
> -m_lv.prune_loop_conditions (bb->loop_father, &m_range_analyzer);
> -
> -  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
> -   gsi_next (&si))
> -m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);
> +m_lv.prune_loop_conditions (bb->loop_father);
>  
>return NULL;
>  }
>  
> -/* Process 

Re: [PATCH 3/4]AArch64: correct dot-product RTL patterns for aarch64.

2021-07-26 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
> Hi,
>
> Sorry, it looks like I forgot to ask if this is OK for backport to GCC 9, 10, 11 after 
> some stew.

Yeah, OK for backports too.

Thanks,
Richard

>
> Thanks,
> Tamar
>
>> -Original Message-
>> From: Richard Sandiford 
>> Sent: Thursday, July 22, 2021 7:11 PM
>> To: Tamar Christina 
>> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
>> ; Marcus Shawcroft
>> ; Kyrylo Tkachov 
>> Subject: Re: [PATCH 3/4]AArch64: correct dot-product RTL patterns for
>> aarch64.
>> 
>> Tamar Christina  writes:
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> >* config/aarch64/aarch64-simd-builtins.def (sdot, udot): Rename to..
>> >(sdot_prod, udot_prod): ... This.
>> >* config/aarch64/aarch64-simd.md (aarch64_dot):
>> Merged
>> >into...
>> >(dot_prod): ... this.
>> >(aarch64_dot_lane, aarch64_dot_laneq):
>> >Change operands order.
>> >(sadv16qi): Use new operands order.
>> >* config/aarch64/arm_neon.h (vdot_u32, vdotq_u32, vdot_s32,
>> >vdotq_s32): Use new RTL ordering.
>> 
>> OK, thanks.
>> 
>> Richard


Re: [PATCH] testsuite: mips: use noinline attribute instead of -fno-inline

2021-07-26 Thread Richard Sandiford via Gcc-patches
Xi Ruoyao  writes:
> On Fri, 2021-07-23 at 11:18 +0800, Xi Ruoyao wrote:
>> On Fri, 2021-07-23 at 04:21 +0200, Maciej W. Rozycki wrote:
>> > On Fri, 9 Jul 2021, Richard Sandiford via Gcc-patches wrote:
>> > 
>> > > > > > > The "smallest fix" is simply adding -fno-inline into
>> > > > > > > mips.exp. 
>> > > > > > > However
>> > > > > > > I don't like it because I agree with you that mips.exp
>> > > > > > > shouldn't
>> > > > > > > care
>> > > > > > > about dg-options, at least don't do it too much.
>> > > > > > As I said in the other message, I think the smallest fix is
>> > > > > > the
>> > > > > > way
>> > > > > > to
>> > > > > > go though.
>> > > > > THanks for chiming in Richard.  I didn't know all the
>> > > > > background
>> > > > > here.   
>> > > > > Let's just go with the small fix based on your
>> > > > > recommendation.  We
>> > > > > can
>> > > > > always revisit if we keep running into issues in this code.
>> > > > 
>> > > > Pushed at 3b33b113.
>> > > 
>> > > It looks like that was the originally posted patch though.  It
>> > > probably
>> > > wasn't very clear, but by smallest fix, I meant adding inline to:
>> > 
>> >  Xi, will you revert your commit that was not approved and apply the
>> > correct fix?
>> 
>> Sorry, somehow I didn't see Richard's reply.  Perhaps a
>> misconfiguration
>> on my mail server.

I don't know where the problem lies, but for some reason I've been
getting rejects when sending messages directly (via reply-all).
I should have said something on-list, sorry.

I'll try replying only via the list to see if that helps.

>> The "correct" fix is 
>> 
>> --- a/gcc/testsuite/gcc.target/mips/mips.exp
>> +++ b/gcc/testsuite/gcc.target/mips/mips.exp
>> @@ -325,6 +325,7 @@ foreach option {
>>  finite-math-only
>>  fixed-hi
>>  fixed-lo
>> +    inline
>>  lax-vector-conversions
>>  omit-frame-pointer
>>  optimize-sibling-calls
>> 
>> right?  I'll do a regtest and if there is no problem I'll commit it.
>
> Done at 863737b8 and 19e05058.

Great, thanks!

Richard


Re: [PATCH 2/2] Backwards jump threader rewrite with ranger.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
PING * 2

BTW, this is also needed for:

a) evrp_range_analyzer removal from DOM threader (I have follow-up patches).

b) VRP replacement with evrp along with VRP threader removal (I also
have patches).

Thanks.
Aldy


On Thu, Jul 15, 2021 at 4:57 PM Aldy Hernandez  wrote:
>
> As mentioned in my previous email, these are some minor changes to the
> previous revision.  All I'm changing here is the call into the solver
> to use range_of_expr and range_of_stmt.  Everything else remains the
> same.
>
> Tested on x86-64 Linux.
>
> On Mon, Jul 5, 2021 at 5:39 PM Aldy Hernandez  wrote:
> >
> > PING.
> >
> > Aldy



Committed: Re: [Patch, fortran V2] PR fortran/93308/93963/94327/94331/97046 problems raised by descriptor handling

2021-07-26 Thread Tobias Burnus

I have now committed José's patch with the two nits fixed
(cf. my on-top patch to which I just replied)

r12-2511-g0cbf03689e3e7d9d6002b8e5d159ef3716d0404c

Note:
I have slightly reworded the error message compared to both
the original patch and to my on-top suggestion.

Reason:
When calling a BIND(C) function from Fortran, it might happen
that an actual or effective argument is an allocatable or pointer
that is not allocated/associated (→ base_addr == NULL) but whose
dtype.attribute is 'other' because the dummy argument is
nonallocatable/nonpointer.
Likewise, when passing a base_addr == NULL from C to a Fortran-written
BIND(C) procedure where attribute == CFI_attribute_other.

Those errors are much more likely than having some other bug. Thus,
those now get an error of their own instead of a generic error,
even though the reason can be the same as for:

On the other hand, if the attribute != 0, 1 or 2, it is invalid, which
indicates either a bug in the compiler, random/uninitialized memory, or a
bug in the C code setting up the descriptor. Thus, the error message
is now different.
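
Schematically, the first of those situations looks like this on the C side
(hypothetical code, not from the testsuite):

#include <ISO_Fortran_binding.h>

extern void f (CFI_cdesc_t *);  /* subroutine f (x) bind(c), with a
                                   nonallocatable, nonpointer dummy */

void
bad_call (void)
{
  double d;
  CFI_CDESC_T (0) x;
  CFI_establish ((CFI_cdesc_t *) &x, &d, CFI_attribute_other,
                 CFI_type_double, 0, 0, NULL);
  x.base_addr = NULL;      /* base_addr NULL but attribute 'other' */
  f ((CFI_cdesc_t *) &x);  /* now gets the specific error above */
}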

Comments to the new wording + comments/remarks to this commit
(or any new or existing code) are welcome :-)

Thanks,

Tobias

PS: I wrote:

On 22.06.21 09:11, Tobias Burnus wrote:


On 21.06.21 22:29, Tobias Burnus wrote:

However, that's independent from the patch you had submitted
and which is fine except for the two tiny nits.

As I just did run into a test, which does trigger the error, I think
it would be useful to have something like the following on top
of your patch – what do you think?

(Two of the changes are the nit changes I mentioned in the
LGTM approval.)

commit 0cbf03689e3e7d9d6002b8e5d159ef3716d0404c
Author: Tobias Burnus 
Date:   Mon Jul 26 14:20:46 2021 +0200

PR fortran/93308/93963/94327/94331/97046 problems raised by descriptor handling

Fortran: Fix attributes and bounds in ISO_Fortran_binding.

2021-07-26  José Rui Faustino de Sousa  
Tobias Burnus  

PR fortran/93308
PR fortran/93963
PR fortran/94327
PR fortran/94331
PR fortran/97046

gcc/fortran/ChangeLog:

* trans-decl.c (convert_CFI_desc): Only copy out the descriptor
if necessary.
* trans-expr.c (gfc_conv_gfc_desc_to_cfi_desc): Updated attribute
handling which reflect a previous intermediate version of the
standard. Only copy out the descriptor if necessary.

libgfortran/ChangeLog:

* runtime/ISO_Fortran_binding.c (cfi_desc_to_gfc_desc): Add code
to verify the descriptor. Correct bounds calculation.
(gfc_desc_to_cfi_desc): Add code to verify the descriptor.

gcc/testsuite/ChangeLog:

* gfortran.dg/ISO_Fortran_binding_1.f90: Add pointer attribute,
this test is still erroneous but now it compiles.
* gfortran.dg/bind_c_array_params_2.f90: Update regex to match
code changes.
* gfortran.dg/PR93308.f90: New test.
* gfortran.dg/PR93963.f90: New test.
* gfortran.dg/PR94327.c: New test.
* gfortran.dg/PR94327.f90: New test.
* gfortran.dg/PR94331.c: New test.
* gfortran.dg/PR94331.f90: New test.
* gfortran.dg/PR97046.f90: New test.
---
 gcc/fortran/trans-decl.c   |  32 +--
 gcc/fortran/trans-expr.c   |  24 +-
 .../gfortran.dg/ISO_Fortran_binding_1.f90  |   2 +-
 gcc/testsuite/gfortran.dg/PR93308.f90  |  52 +
 gcc/testsuite/gfortran.dg/PR93963.f90  | 150 
 gcc/testsuite/gfortran.dg/PR94327.c|  70 ++
 gcc/testsuite/gfortran.dg/PR94327.f90  | 195 
 gcc/testsuite/gfortran.dg/PR94331.c|  73 ++
 gcc/testsuite/gfortran.dg/PR94331.f90  | 252 +
 gcc/testsuite/gfortran.dg/PR97046.f90  |  58 +
 .../gfortran.dg/bind_c_array_params_2.f90  |   2 +-
 libgfortran/runtime/ISO_Fortran_binding.c  |  56 -
 12 files changed, 933 insertions(+), 33 deletions(-)

diff --git a/gcc/fortran/trans-decl.c b/gcc/fortran/trans-decl.c
index bf8783a35f8..784f7b61ce1 100644
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@@ -4539,22 +4539,28 @@ convert_CFI_desc (gfc_wrapped_block * block, gfc_symbol *sym)
   gfc_add_expr_to_block (_block, incoming);
   incoming = gfc_finish_block (_block);
 
-
   /* Convert the gfc descriptor back to the CFI type before going
 	 out of scope, if the CFI type was present at entry.  */
-  gfc_init_block 

[PUSHED] Abstract out conditional simplification out of execute_vrp.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
VRP simplifies conditionals involving casted values outside of the main
folding mechanism, because this optimization inhibits the VRP jump
threader from threading through the comparison.
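
The shape of conditional being rewritten is roughly (my example, not from
the testsuite):

int
f (unsigned char c)
{
  int i = c;    /* widening type conversion */
  if (i == 5)   /* rewritable as c == 5, but doing that early would
                   inhibit jump threading through the comparison */
    return 1;
  return 0;
}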

As part of replacing VRP with an evrp instance, I am making sure we do
everything VRP does.  Hence, I am abstracting this functionality out so
we can call it from elsewhere.

ISTM that when the proposed ranger-based jump threader can handle
everything the forward threader does, there will be no need for this
optimization to be done outside of the evrp folder.  Perhaps we can fold
this into the substitute_using_ranges class.  But that's further down
the line.

Also, there is no need to pass a vr_values around, when the base
range_query class will do.  I fixed this, at it makes it trivial to pass
down a ranger or evrp instance.

Tested on x86-64 Linux.

gcc/ChangeLog:

* tree-vrp.c (vrp_simplify_cond_using_ranges): Rename vr_values
with range_query.
(execute_vrp): Abstract out simplification of conditionals...
(simplify_casted_conds): ...here.
---
 gcc/tree-vrp.c | 39 +++
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
index a9c31bcedb5..58111f83183 100644
--- a/gcc/tree-vrp.c
+++ b/gcc/tree-vrp.c
@@ -4359,7 +4359,7 @@ vrp_jump_threader::after_dom_children (basic_block bb)
subsequent passes.  */
 
 static void
-vrp_simplify_cond_using_ranges (vr_values *query, gcond *stmt)
+vrp_simplify_cond_using_ranges (range_query *query, gcond *stmt)
 {
   tree op0 = gimple_cond_lhs (stmt);
   tree op1 = gimple_cond_rhs (stmt);
@@ -4423,6 +4423,27 @@ vrp_simplify_cond_using_ranges (vr_values *query, gcond *stmt)
 }
 }
 
+/* A comparison of an SSA_NAME against a constant where the SSA_NAME
+   was set by a type conversion can often be rewritten to use the RHS
+   of the type conversion.  Do this optimization for all conditionals
+   in FUN.
+
+   However, doing so inhibits jump threading through the comparison.
+   So that transformation is not performed until after jump threading
+   is complete.  */
+
+static void
+simplify_casted_conds (function *fun, range_query *query)
+{
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fun)
+{
+  gimple *last = last_stmt (bb);
+  if (last && gimple_code (last) == GIMPLE_COND)
+   vrp_simplify_cond_using_ranges (query, as_a <gcond *> (last));
+}
+}
+
 /* Main entry point to VRP (Value Range Propagation).  This pass is
loosely based on J. R. C. Patterson, ``Accurate Static Branch
Prediction by Value Range Propagation,'' in SIGPLAN Conference on
@@ -4519,21 +4540,7 @@ execute_vrp (struct function *fun, bool warn_array_bounds_p)
   vrp_jump_threader threader (fun, &vrp_vr_values);
   threader.thread_jumps ();
 
-  /* A comparison of an SSA_NAME against a constant where the SSA_NAME
- was set by a type conversion can often be rewritten to use the
- RHS of the type conversion.
-
- However, doing so inhibits jump threading through the comparison.
- So that transformation is not performed until after jump threading
- is complete.  */
-  basic_block bb;
-  FOR_EACH_BB_FN (bb, fun)
-{
-  gimple *last = last_stmt (bb);
-  if (last && gimple_code (last) == GIMPLE_COND)
-   vrp_simplify_cond_using_ranges (&vrp_vr_values,
-   as_a <gcond *> (last));
-}
+  simplify_casted_conds (fun, &vrp_vr_values);
 
   free_numbers_of_iterations_estimates (fun);
 
-- 
2.31.1



Re: [PATCH] ipa-devirt: check precision mismatch of enum values [PR101396]

2021-07-26 Thread Richard Sandiford via Gcc-patches
Xi Ruoyao via Gcc-patches  writes:
> We are comparing enum values (in wide_int) to check for ODR violations.
> However, if we compare two wide_int values with different precisions,
> we'll trigger an assert, leading to an ICE.  With the enum-base syntax
> introduced in C++11, it's easy to run into this situation.
>
> To fix the issue, we need to explicitly check this kind of mismatch,
> and emit a proper warning message if there is such one.
>
> Bootstrapped & regtested on x86_64-linux-gnu.  Ok for trunk?
>
> gcc/
>
>   PR ipa/101396
>   * ipa-devirt.c (ipa_odr_read_section): Compare the precision of
> enum values, and emit a warning if they mismatch.

OK, thanks.

Richard

> gcc/testsuite/
>
>   PR ipa/101396
>   * g++.dg/lto/pr101396_0.C: New test.
>   * g++.dg/lto/pr101396_1.C: New test.
> ---
>  gcc/ipa-devirt.c  |  9 +
>  gcc/testsuite/g++.dg/lto/pr101396_0.C | 12 
>  gcc/testsuite/g++.dg/lto/pr101396_1.C | 10 ++
>  3 files changed, 31 insertions(+)
>  create mode 100644 gcc/testsuite/g++.dg/lto/pr101396_0.C
>  create mode 100644 gcc/testsuite/g++.dg/lto/pr101396_1.C
>
> diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c
> index 8cd1100aba9..8deec75b2df 100644
> --- a/gcc/ipa-devirt.c
> +++ b/gcc/ipa-devirt.c
> @@ -4193,6 +4193,8 @@ ipa_odr_read_section (struct lto_file_decl_data 
> *file_data, const char *data,
> if (do_warning != -1 || j >= this_enum.vals.length ())
>   continue;
> if (strcmp (id, this_enum.vals[j].name)
> +   || (val.get_precision() !=
> +   this_enum.vals[j].val.get_precision())
> || val != this_enum.vals[j].val)
>   {
> warn_name = xstrdup (id);
> @@ -4260,6 +4262,13 @@ ipa_odr_read_section (struct lto_file_decl_data 
> *file_data, const char *data,
>   "name %qs differs from name %qs defined"
>   " in another translation unit",
>   this_enum.vals[j].name, warn_name);
> +   else if (this_enum.vals[j].val.get_precision() !=
> +warn_value.get_precision())
> + inform (this_enum.vals[j].locus,
> + "name %qs is defined as %u-bit while another "
> + "translation unit defines it as %u-bit",
> + warn_name, this_enum.vals[j].val.get_precision(),
> + warn_value.get_precision());
> /* FIXME: In case there is easy way to print wide_ints,
>perhaps we could do it here instead of overflow check.  */
> else if (wi::fits_shwi_p (this_enum.vals[j].val)
> diff --git a/gcc/testsuite/g++.dg/lto/pr101396_0.C 
> b/gcc/testsuite/g++.dg/lto/pr101396_0.C
> new file mode 100644
> index 000..b7a2947a880
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/lto/pr101396_0.C
> @@ -0,0 +1,12 @@
> +/* { dg-lto-do link } */
> +
> +enum A : __UINT32_TYPE__ { // { dg-lto-warning "6: type 'A' violates the 
> C\\+\\+ One Definition Rule" }
> +  a, // { dg-lto-note "3: name 'a' is defined as 32-bit while another 
> translation unit defines it as 64-bit" }
> +  b,
> +  c
> +};
> +
> +int main()
> +{
> +  return (int) A::a;
> +}
> diff --git a/gcc/testsuite/g++.dg/lto/pr101396_1.C 
> b/gcc/testsuite/g++.dg/lto/pr101396_1.C
> new file mode 100644
> index 000..a6d032d694d
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/lto/pr101396_1.C
> @@ -0,0 +1,10 @@
> +enum A : __UINT64_TYPE__ { // { dg-lto-note "6: an enum with different value 
> name is defined in another translation unit" }
> +  a, // { dg-lto-note "3: mismatching definition" }
> +  b,
> +  c
> +};
> +
> +int f(enum A x)
> +{
> +  return (int) x;
> +}


Re: [WIP, OpenMP] OpenMP metadirectives support

2021-07-26 Thread Kwok Cheung Yeung

Ping? Does anyone have any opinions on how this issue should be resolved?

On 09/07/2021 12:16 pm, Kwok Cheung Yeung wrote:
3) In the OpenMP examples (version 5.0.1), section 9.7, the example 
metadirective.3.c does not work as expected.


#pragma omp declare target
void exp_pi_diff(double *d, double my_pi){
    #pragma omp metadirective \
    when( construct={target}: distribute parallel for ) \
    default( parallel for simd)
...
int main()
{
    ...
    #pragma omp target teams map(tofrom: d[0:N])
    exp_pi_diff(d,my_pi);
    ...
    exp_pi_diff(d,my_pi);

In the first call to exp_pi_diff in an '#pragma omp target' construct, the 
metadirective is expected to expand to 'distribute parallel for', but in the 
second (without the '#pragma omp target'), it should expand to 'parallel for simd'.


During OMP expansion of the 'omp target', it creates a child function that calls 
exp_pi_diff:


__attribute__((omp target entrypoint))
void main._omp_fn.0 (const struct .omp_data_t.12 & restrict .omp_data_i)
{
   ...
    <bb 2> :
   __builtin_GOMP_teams (0, 0);
   exp_pi_diff (d.13, my_pi);

This is not a problem on the offload compiler (since by definition its copy of 
exp_pi_diff must be in a 'target'), but if the host device is used, the same 
version of exp_pi_diff is called in both target and non-target contexts.


What would be the best way to solve this? Offhand, I can think of two solutions:

(a) Recursively go through all functions that can be reached via a target region 
and create clones for each, redirecting all function calls in the clones to the 
new cloned versions. Resolve the metadirectives in the clones and originals 
separately.

Maybe this could be done at the same time as when marking functions implicitly 
'declare target'? It seems a lot of work for one special case though...


(b) Make the construct selector a dynamic selector when OpenMP 5.1 metadirective 
support is implemented. Keep track of the current construct list every time an 
OpenMP construct is entered or exited, and make the decision at runtime.

I think this would be easier to implement at runtime (assuming that the 
infrastructure for OpenMP 5.1 was already in place) since this a host-side 
issue, but it probably goes against the intent of the specification, given that 
the 'construct' selector set appeared in the 5.0 specification before dynamic 
replacements became available.


Thanks

Kwok


[PATCH] Fold (X<

2021-07-26 Thread Roger Sayle

The easiest way to motivate these additions to match.pd is with the
following example:

unsigned int foo(unsigned char i) {
  return i | (i<<8) | (i<<16) | (i<<24);
}

which mainline with -O2 on x86_64 currently generates:
foo:movzbl  %dil, %edi
movl%edi, %eax
movl%edi, %edx
sall$8, %eax
sall$16, %edx
orl %edx, %eax
orl %edi, %eax
sall$24, %edi
orl %edi, %eax
ret

but with this patch now becomes:
foo:movzbl  %dil, %eax
imull   $16843009, %eax, %eax
ret

Interestingly, this transformation is already applied when using
addition, allowing synth_mult to select an optimal sequence, but
not when using the equivalent bit-wise ior or xor operators.

The solution is to use tree_nonzero_bits to check that the
potentially non-zero bits of each operand don't overlap, which
ensures that BIT_IOR_EXPR and BIT_XOR_EXPR produce the same
results as PLUS_EXPR, which effectively generalizes the old
fold_plusminus_mult_expr.  Technically, the transformation
is to canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to
X*(C1+C2), where X and X<<C are handled as special cases of X*C.
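
To see why disjoint nonzero bits license treating IOR and XOR like PLUS,
here is a standalone check (my illustration, not part of the patch):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (uint32_t i = 0; i < 256; i++)
    {
      uint32_t a = i * 0x100;    /* X*C1: nonzero bits only in 8..15 */
      uint32_t b = i * 0x10000;  /* X*C2: nonzero bits only in 16..23 */
      assert ((a & b) == 0);             /* operands are bit-wise disjoint */
      assert ((a | b) == a + b);         /* IOR behaves like PLUS */
      assert ((a ^ b) == a + b);         /* so does XOR */
      assert (a + b == i * (0x100 + 0x10000));  /* X*(C1+C2) */
    }
  return 0;
}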

gcc/ChangeLog
* match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and
(X*C1)^(X*C2) as X*(C1+C2), and related variants, using
tree_nonzero_bits to ensure that operands are bit-wise disjoint.

gcc/testsuite/ChangeLog
* gcc.dg/fold-ior-4.c: New test.

Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

diff --git a/gcc/match.pd b/gcc/match.pd
index beb8d27..0347bea 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2833,6 +2833,113 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (convert (mult (convert:t @0) { cst; })
 #endif
 
+/* Canonicalize (X*C1)|(X*C2) and (X*C1)^(X*C2) to (C1+C2)*X when
+   tree_nonzero_bits allows IOR and XOR to be treated like PLUS.
+   Likewise, handle (X< 0
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+   (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+  wide_int c = wi::add (wi::to_wide (@2),
+wi::lshift (wone, wi::to_wide (@4))); }
+(mult @1 { wide_int_to_tree (type, c); }
+ (simplify
+  (op:c (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+   (lshift:s@4 (convert@1 @2) INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+   && tree_int_cst_sgn (@5) > 0
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+  wide_int c = wi::add (wi::to_wide (@3),
+wi::lshift (wone, wi::to_wide (@5))); }
+(mult @1 { wide_int_to_tree (type, c); }
+ (simplify
+  (op:c (mult:s@0 @1 INTEGER_CST@2)
+   @1)
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (mult @1
+{ wide_int_to_tree (type,
+wi::add (wi::to_wide (@2), 1)); })))
+ (simplify
+  (op:c (mult:s@0 (convert@1 @2) INTEGER_CST@3)
+   (convert@1 @2))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (mult @1
+{ wide_int_to_tree (type,
+wi::add (wi::to_wide (@3), 1)); })))
+ (simplify
+  (op (lshift:s@0 @1 INTEGER_CST@2)
+  (lshift:s@3 @1 INTEGER_CST@4))
+  (if (INTEGRAL_TYPE_P (type)
+   && tree_int_cst_sgn (@2) > 0
+   && tree_int_cst_sgn (@4) > 0
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
+   (with { tree t = type;
+  if (!TYPE_OVERFLOW_WRAPS (t))
+t = unsigned_type_for (t);
+  wide_int wone = wi::one (TYPE_PRECISION (t));
+  wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)),
+wi::lshift (wone, wi::to_wide (@4))); }
+(convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })
+ (simplify
+  (op (lshift:s@0 (convert@1 @2) INTEGER_CST@3)
+  (lshift:s@4 (convert@1 @2) INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+   && tree_int_cst_sgn (@3) > 0
+   && tree_int_cst_sgn (@5) > 0
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (with { tree t = type;
+  if (!TYPE_OVERFLOW_WRAPS (t))
+t = unsigned_type_for (t);
+  wide_int wone = wi::one (TYPE_PRECISION (t));
+  wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@3)),
+wi::lshift (wone, wi::to_wide (@5))); }
+(convert (mult:t (convert:t @1) { wide_int_to_tree (t,c); })
+ (simplify
+  (op:c (lshift:s@0 @1 INTEGER_CST@2)
+   @1)
+  (if (INTEGRAL_TYPE_P (type)
+   && tree_int_cst_sgn (@2) > 0
+   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
+   (with { tree t = type;
+  if (!TYPE_OVERFLOW_WRAPS (t))
+t = unsigned_type_for (t);
+  wide_int wone = wi::one 

[x86_64 PATCH] Decrement followed by cmov improvements.

2021-07-26 Thread Roger Sayle

The following patch to the x86_64 backend improves the code generated
for a decrement followed by a conditional move.  The primary change is
to recognize that after subtracting one, checking the result is -1 (or
equivalently that the original value was zero) can be implemented using
the borrow/carry flag instead of requiring an explicit test instruction.
This is achieved by a new define_insn_and_split that allows combine to
split the desired sequence/composite into a *subsi_3 and *movsicc_noc.

The other change with this patch is/are a pair of peephole2 optimizations
to eliminate register-to-register moves generated during register
allocation.  During reload, the compiler doesn't know that inverting
the condition of a conditional cmove can sometimes reduce register
pressure, but this is easy to tidy up during the peephole2 pass (where
swapping the order of the insn's operands performs the required
logic inversion).

Both improvements are demonstrated by the case below:

int foo(int x) {
  if (x == 0)
x = 16;
  else x--;
  return x;
}

Before:
foo:leal-1(%rdi), %eax
testl   %edi, %edi
movl$16, %edx
cmove   %edx, %eax
ret

After:
foo:subl$1, %edi
movl$16, %eax
cmovnc  %edi, %eax
ret

And the value of the peephole2 clean-up can be seen on its own in:

int bar(int x) {
  x--;
  if (x == 0)
x = 16;
  return x;
}

Before:
bar:movl%edi, %eax
movl$16, %edx
subl$1, %eax
cmove   %edx, %eax
ret

After:
bar:subl$1, %edi
movl$16, %eax
cmovne  %edi, %eax
ret

These idioms were inspired by the source code of NIST SciMark4's
Random_nextDouble function, where the tweaks above result in
a ~1% improvement in the MonteCarlo benchmark kernel.

This patch has been tested on x86_64-pc-linux-gnu with a
"make bootstrap" and "make -k check" with no new failures.

Ok for mainline?


2021-07-26  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.md (*dec_cmov): New define_insn_and_split
to generate a conditional move using the carry flag after sub $1.
(peephole2): Eliminate a register-to-register move by inverting
the condition of a conditional move.

gcc/testsuite/ChangeLog
* gcc.target/i386/dec-cmov-1.c: New test.
* gcc.target/i386/dec-cmov-2.c: New test.

Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 8b809c4..a4f512f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6755,6 +6755,29 @@
? GEU : LTU, VOIDmode, cc, const0_rtx);
 })
 
+;; Help combine use borrow flag to test for -1 after dec (add $-1).
+(define_insn_and_split "*dec_cmov"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+   (if_then_else:SWI248
+(match_operator 1 "bt_comparison_operator"
+ [(match_operand:SWI248 2 "register_operand" "0") (const_int 0)])
+(plus:SWI248 (match_dup 2) (const_int -1))
+(match_operand:SWI248 3 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_CMOVE"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2) (const_int 1)))
+ (set (match_dup 0) (minus:SWI248 (match_dup 2) (const_int 1)))])
+   (set (match_dup 0)
+   (if_then_else:SWI248 (match_dup 4) (match_dup 0) (match_dup 3)))]
+{
+  rtx cc = gen_rtx_REG (CCCmode, FLAGS_REG);
+  operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[1]) == NE
+   ? GEU : LTU, VOIDmode, cc, const0_rtx);
+})
+
 (define_insn "*subsi_3_zext"
   [(set (reg FLAGS_REG)
(compare (match_operand:SI 1 "register_operand" "0")
@@ -19068,6 +19091,70 @@
 gcc_unreachable ();
 })
 
+;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1).
+;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1
+(define_peephole2
+ [(set (match_operand:SWI248 0 "register_operand")
+   (match_operand:SWI248 1 "register_operand"))
+  (parallel [(set (reg FLAGS_REG) (match_operand 5))
+(set (match_dup 0) (match_operand:SWI248 6))])
+  (set (match_operand:SWI248 2 "register_operand")
+   (match_operand:SWI248 3))
+  (set (match_dup 0)
+   (if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator"
+[(reg FLAGS_REG) (const_int 0)])
+   (match_dup 0)
+   (match_dup 2)))]
+ "TARGET_CMOVE
+  && REGNO (operands[2]) != REGNO (operands[0])
+  && REGNO (operands[2]) != REGNO (operands[1])
+  && peep2_reg_dead_p (1, operands[1])
+  && peep2_reg_dead_p (4, operands[2])
+  && !reg_overlap_mentioned_p (operands[0], operands[3])"
+ [(parallel [(set (match_dup 7) (match_dup 8))
+(set (match_dup 1) (match_dup 9))])
+  (set (match_dup 0) (match_dup 3))
+  (set (match_dup 0) (if_then_else:SWI248 (match_dup 4)
+ 

Re: [patch][version5]add -ftrivial-auto-var-init and variable attribute "uninitialized" to gcc

2021-07-26 Thread Martin Jambor
Hi,

On Sun, Jul 25 2021, Qing Zhao via Gcc-patches wrote:
> Hi,
>
> This is the 5th version of the patch for the new security feature for GCC.
>
> I have tested it with bootstrap on both x86 and aarch64, regression testing 
> on both x86 and aarch64.
> Also compile and run CPU2017, without any issue.
>
> Please take a look and let me know your comments and suggestions.
>
> thanks.
>
> Qing
>
> **Compare with the 4th version, the following are the major changes:
>
> 1. delete the code for handling "grp_to_be_debug_replaced" since they are not 
> needed per Martin Jambor's suggestion.

sorry if I did not make myself clear in my last email, but the deferred
init calls should not result into setting any bits in
cannot_scalarize_away_bitmap in the SRA pass, otherwise you'll get
different optimization with and without -ftrivial-auto-var-init.

So you either need to change build_access_from_expr like I described in
my email or add the following to your patch, which is probably slightly
mor efficient (but it has been only very mildly tested).


diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
index d1280e5f884..602b0fb3a2d 100644
--- a/gcc/tree-sra.c
+++ b/gcc/tree-sra.c
@@ -1395,7 +1395,12 @@ scan_function (void)
 
  t = gimple_call_lhs (stmt);
  if (t && !disqualify_if_bad_bb_terminating_stmt (stmt, t, NULL))
-   ret |= build_access_from_expr (t, stmt, true);
+   {
+ if (gimple_call_internal_p (stmt, IFN_DEFERRED_INIT))
+   ret |= !!build_access_from_expr_1 (t, stmt, true);
+ else
+   ret |= build_access_from_expr (t, stmt, true);
+   }
  break;
 
case GIMPLE_ASM:



And unfortunately I just spotted another potential problem in:

> +static enum assignment_mod_result
> +sra_modify_deferred_init (gimple *stmt, gimple_stmt_iterator *gsi)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree init_type = gimple_call_arg (stmt, 1);
> +  tree is_vla = gimple_call_arg (stmt, 2);
> +
> +  struct access *lhs_access = get_access_for_expr (lhs);
> +  if (!lhs_access)
> +return SRA_AM_NONE;
> +
> +  location_t loc = gimple_location (stmt);
> +
> +  if (lhs_access->grp_to_be_replaced)
> +{
> +  tree lhs_repl = get_access_replacement (lhs_access);
> +  gimple_call_set_lhs (stmt, lhs_repl);
> +  tree arg0_repl = TYPE_SIZE_UNIT (TREE_TYPE (lhs_repl));
> +  gimple_call_set_arg (stmt, 0, arg0_repl);
> +  sra_stats.deferred_init++;

I think you want to add:

  gcc_assert (!lhs_access->first_child);
  return SRA_AM_MODIFIED;

here, otherwise you risk that (in a case of a single-field structure)
the call you have just modified here...

> +}
> +
> +  if (lhs_access->first_child)
> +generate_subtree_deferred_init (lhs_access->first_child,
> + init_type, is_vla, gsi, loc);
> +  if (lhs_access->grp_covered)
> +{
> +  unlink_stmt_vdef (stmt);
> +  gsi_remove (gsi, true);

...will be deleted here.

> +  release_defs (stmt);
> +  return SRA_AM_REMOVED;
> +}
> +
> +  return SRA_AM_MODIFIED;
> +}

Sorry again for spotting this late and perhaps mis-communicating about
the cannot_scalarize_away_bitmap issue earlier.  Apart from these two
things, I believe the tree-sra.c bits are fine.

Martin




> 2. for Pattern init, call __builtin_clear_padding after the call to 
> .DEFERRED_INIT to initialize the paddings to zeroes;
> 3. for partially or fully initialized auto variables, call   
> __builtin_clear_padding before the real initialization to initialize
> the paddings to zeroes.
> 4. Update the documentation with padding initialization to zeroes.
> 5. in order to reuse __builtin_clear_padding for auto init purpose, add one 
> more dummy argument to indicate whether it's for auto init or not,
>if for auto init, do not emit error messages to avoid confusing users.
> 6. Add new testing cases to verify padding initializations.
> 7. rename some of the old testing cases to make the file name reflecting the 
> testing purpose per Kees Cook's suggestions.
>
> **Please see version 4 at:
> https://gcc.gnu.org/pipermail/gcc-patches/2021-July/574642.html
>
> **ChangeLog is:
> gcc/ChangeLog:
>
> 2021-07-23  qing zhao  
>
> * builtins.c (expand_builtin_memset): Make external visible.
> * builtins.h (expand_builtin_memset): Declare extern.
> * common.opt (ftrivial-auto-var-init=): New option.
> * doc/extend.texi: Document the uninitialized attribute.
> * doc/invoke.texi: Document -ftrivial-auto-var-init.
> * flag-types.h (enum auto_init_type): New enumerated type
> auto_init_type.
> * gimple-fold.c (clear_padding_type): Add one new parameter.
> (clear_padding_union): Likewise.
> (clear_padding_emit_loop): Likewise.
> (clear_type_padding_in_mask): Likewise.
> (gimple_fold_builtin_clear_padding): Handle this new 

Re: [PATCH] RISC-V: Enable overlap-by-pieces in case of fast unaliged access

2021-07-26 Thread Andrew Waterman
On Thu, Jul 22, 2021 at 10:27 AM Palmer Dabbelt  wrote:
>
> On Thu, 22 Jul 2021 06:29:46 PDT (-0700), gcc-patches@gcc.gnu.org wrote:
> > Could you add a testcase? Otherwise LGTM.
> >
> > Option: -O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64
> > void foo(char *dst){
> >__builtin_memset(dst, 0, 15);
> > }
>
> I'd like to see:
>
> * Test results.  This is only on for one target right now, so relying on
>   it to just work on others isn't a good idea.
> * Something to demonstrate this doesn't break -mstrict-align.
> * Some sort of performance analysis.  Most machines that support
>   unaligned access do so with some performance degredation,

Also, some machines that gracefully support misaligned accesses under
most circumstances nevertheless experience a perf degradation when the
load depends on two stores that overlap partially but not fully.  This
transformation will obviously trigger such behavior from time to time.
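
A sketch of that hazard (mine, not from the patch):

#include <stdint.h>
#include <string.h>

uint32_t
f (char *dst)
{
  memset (dst, 0, 15);      /* may become sd at dst+0 and sd at dst+7 */
  uint32_t v;
  memcpy (&v, dst + 5, 4);  /* load depends on both overlapping stores */
  return v;                 /* store-to-load forwarding may stall here */
}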

Note, I'm not objecting to this patch; I'm only responding to Palmer's
comment.  It makes sense to enable this kind of optimization for
-mtune=native etc., just not for standard software distributions.


>   it's unclear
>   that this conversion will actually manifest a performance improvement.
>   I don't have a C906 and don't know what workloads people care about
>   running on one, but I'm certainly not convinced this is a win --
>   what's listed here looks to be the best case, and that's only saving
>   two instructions to generate a pretty pathological sequence
>   (misaligned access that conflicts with a prior store).
>
> Jojo: do you have any description of the C906 pipeline?  Specifically in
> this case it'd be good to know how it handles unaligned accesses.
>
> >
> > On Thu, Jul 22, 2021 at 8:53 PM Christoph Muellner via Gcc-patches
> >  wrote:
> >>
> >> This patch enables the overlap-by-pieces feature of the by-pieces
> >> infrastructure for inlining builtins in case the target has set
> >> riscv_slow_unaligned_access_p to false.
> >>
> >> To demonstrate the effect for targets with fast unaligned access,
> >> the following code sequences are generated for a 15-byte memset-zero.
> >>
> >> Without overlap_op_by_pieces we get:
> >>   8e:   00053023sd  zero,0(a0)
> >>   92:   00052423sw  zero,8(a0)
> >>   96:   00051623sh  zero,12(a0)
> >>   9a:   00050723sb  zero,14(a0)
> >>
> >> With overlap_op_by_pieces we get:
> >>   7e:   00053023sd  zero,0(a0)
> >>   82:   000533a3sd  zero,7(a0)
> >>
> >> gcc/ChangeLog:
> >>
> >> * config/riscv/riscv.c (riscv_overlap_op_by_pieces): New function.
> >> (TARGET_OVERLAP_OP_BY_PIECES_P): Connect to
> >> riscv_overlap_op_by_pieces.
> >>
> >> Signed-off-by: Christoph Muellner 
> >> ---
> >>  gcc/config/riscv/riscv.c | 11 +++
> >>  1 file changed, 11 insertions(+)
> >>
> >> diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
> >> index 576960bb37c..98c76ba657a 100644
> >> --- a/gcc/config/riscv/riscv.c
> >> +++ b/gcc/config/riscv/riscv.c
> >> @@ -5201,6 +5201,14 @@ riscv_slow_unaligned_access (machine_mode, unsigned 
> >> int)
> >>return riscv_slow_unaligned_access_p;
> >>  }
> >>
> >> +/* Implement TARGET_OVERLAP_OP_BY_PIECES_P.  */
> >> +
> >> +static bool
> >> +riscv_overlap_op_by_pieces (void)
> >> +{
> >> +  return !riscv_slow_unaligned_access_p;
> >> +}
> >> +
> >>  /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
> >>
> >>  static bool
> >> @@ -5525,6 +5533,9 @@ riscv_asan_shadow_offset (void)
> >>  #undef TARGET_SLOW_UNALIGNED_ACCESS
> >>  #define TARGET_SLOW_UNALIGNED_ACCESS riscv_slow_unaligned_access
> >>
> >> +#undef TARGET_OVERLAP_OP_BY_PIECES_P
> >> +#define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces
> >> +
> >>  #undef TARGET_SECONDARY_MEMORY_NEEDED
> >>  #define TARGET_SECONDARY_MEMORY_NEEDED riscv_secondary_memory_needed
> >>
> >> --
> >> 2.31.1
> >>


[PUSHED] Pass gimple context to array_bounds_checker.

2021-07-26 Thread Aldy Hernandez via Gcc-patches
I have changed the use of the array_bounds_checker in VRP to use a
ranger in my local tree to make sure there are no regressions when using
either VRP or the ranger.  In doing so I noticed that the checker
does not pass context to get_value_range, which causes the ranger to miss a
few cases.  This patch fixes the oversight.
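
A small example of why the statement context matters (my illustration, not
from the testsuite):

int a[10];

int
f (int i)
{
  if (i < 0 || i >= 10)
    return 0;
  return a[i];  /* with this statement as context, i is known to be in
                   [0, 9]; a context-free query cannot see the guard */
}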

Tested on x86-64 Linux using the array bounds checker both with VRP and
the ranger.

gcc/ChangeLog:

* gimple-array-bounds.cc (array_bounds_checker::get_value_range):
Add gimple argument.
(array_bounds_checker::check_array_ref): Same.
(array_bounds_checker::check_addr_expr): Same.
(array_bounds_checker::check_array_bounds): Pass statement to
check_array_bounds and check_addr_expr.
* gimple-array-bounds.h (check_array_bounds): Add gimple argument.
(check_addr_expr): Same.
(get_value_range): Same.
---
 gcc/gimple-array-bounds.cc | 17 +
 gcc/gimple-array-bounds.h  |  6 +++---
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/gcc/gimple-array-bounds.cc b/gcc/gimple-array-bounds.cc
index 8dfd6f9500a..598c76bf52e 100644
--- a/gcc/gimple-array-bounds.cc
+++ b/gcc/gimple-array-bounds.cc
@@ -43,9 +43,9 @@ along with GCC; see the file COPYING3.  If not see
 // break the dependency on equivalences for this pass.
 
 const value_range *
-array_bounds_checker::get_value_range (const_tree op)
+array_bounds_checker::get_value_range (const_tree op, gimple *stmt)
 {
-  return ranges->get_value_range (op);
+  return ranges->get_value_range (op, stmt);
 }
 
 /* Try to determine the DECL that REF refers to.  Return the DECL or
@@ -173,7 +173,7 @@ trailing_array (tree arg, tree *pref)
 
 bool
 array_bounds_checker::check_array_ref (location_t location, tree ref,
-  bool ignore_off_by_one)
+  gimple *stmt, bool ignore_off_by_one)
 {
   if (warning_suppressed_p (ref, OPT_Warray_bounds))
 /* Return true to have the caller prevent warnings for enclosing
@@ -287,7 +287,7 @@ array_bounds_checker::check_array_ref (location_t location, 
tree ref,
   const value_range *vr = NULL;
   if (TREE_CODE (low_sub) == SSA_NAME)
 {
-  vr = get_value_range (low_sub);
+  vr = get_value_range (low_sub, stmt);
   if (!vr->undefined_p () && !vr->varying_p ())
{
  low_sub = vr->kind () == VR_RANGE ? vr->max () : vr->min ();
@@ -563,7 +563,8 @@ array_bounds_checker::check_mem_ref (location_t location, 
tree ref,
address of an ARRAY_REF, and call check_array_ref on it.  */
 
 void
-array_bounds_checker::check_addr_expr (location_t location, tree t)
+array_bounds_checker::check_addr_expr (location_t location, tree t,
+  gimple *stmt)
 {
   /* For the most significant subscript only, accept taking the address
  of the just-past-the-end element.  */
@@ -575,7 +576,7 @@ array_bounds_checker::check_addr_expr (location_t location, 
tree t)
   bool warned = false;
   if (TREE_CODE (t) == ARRAY_REF)
{
- warned = check_array_ref (location, t, ignore_off_by_one);
+ warned = check_array_ref (location, t, stmt, ignore_off_by_one);
  ignore_off_by_one = false;
}
   else if (TREE_CODE (t) == MEM_REF)
@@ -728,14 +729,14 @@ array_bounds_checker::check_array_bounds (tree *tp, int 
*walk_subtree,
   bool warned = false;
   array_bounds_checker *checker = (array_bounds_checker *) wi->info;
   if (TREE_CODE (t) == ARRAY_REF)
-warned = checker->check_array_ref (location, t,
+warned = checker->check_array_ref (location, t, wi->stmt,
   false/*ignore_off_by_one*/);
   else if (TREE_CODE (t) == MEM_REF)
 warned = checker->check_mem_ref (location, t,
 false /*ignore_off_by_one*/);
   else if (TREE_CODE (t) == ADDR_EXPR)
 {
-  checker->check_addr_expr (location, t);
+  checker->check_addr_expr (location, t, wi->stmt);
   *walk_subtree = false;
 }
   else if (inbounds_memaccess_p (t))
diff --git a/gcc/gimple-array-bounds.h b/gcc/gimple-array-bounds.h
index fa64262777d..d8f7ff7a89f 100644
--- a/gcc/gimple-array-bounds.h
+++ b/gcc/gimple-array-bounds.h
@@ -31,10 +31,10 @@ public:
 
 private:
   static tree check_array_bounds (tree *tp, int *walk_subtree, void *data);
-  bool check_array_ref (location_t, tree, bool ignore_off_by_one);
+  bool check_array_ref (location_t, tree, gimple *, bool ignore_off_by_one);
   bool check_mem_ref (location_t, tree, bool ignore_off_by_one);
-  void check_addr_expr (location_t, tree);
-  const value_range *get_value_range (const_tree op);
+  void check_addr_expr (location_t, tree, gimple *);
+  const value_range *get_value_range (const_tree op, gimple *);
 
   struct function *fun;
   range_query *ranges;
-- 
2.31.1



Re: [PATCH] libstdc++: Add missing std::move in ranges::copy/move/reverse_copy [PR101599]

2021-07-26 Thread Jonathan Wakely via Gcc-patches
On Fri, 23 Jul 2021 at 17:39, Patrick Palka via Libstdc++
 wrote:
>
> In passing, this also renames the template parameter _O2 to _Out2 in
> ranges::partition_copy and uglify its function parameters out_true
> and out_false.
>
> Tested on x86_64-pc-linux-gnu, does this look OK for trunk+branches?

Yes, thanks.

> PR libstdc++/101599
>
> libstdc++-v3/ChangeLog:
>
> * include/bits/ranges_algo.h (__reverse_copy_fn::operator()):
> Add missing std::move in return statement.
> (__partition_copy_fn::operator()): Rename template parameter
> _O2 to _Out2.  Uglify function parameters out_true and out_false.
> * include/bits/ranges_algobase.h (__copy_or_move): Add missing
> std::move to recursive call that unwraps a __normal_iterator
> output iterator.
> * testsuite/25_algorithms/copy/constrained.cc (test06): New test.
> * testsuite/25_algorithms/move/constrained.cc (test05): New test.
> ---
>  libstdc++-v3/include/bits/ranges_algo.h   | 20 +--
>  libstdc++-v3/include/bits/ranges_algobase.h   |  2 +-
>  .../25_algorithms/copy/constrained.cc | 13 
>  .../25_algorithms/move/constrained.cc | 13 
>  4 files changed, 37 insertions(+), 11 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/ranges_algo.h 
> b/libstdc++-v3/include/bits/ranges_algo.h
> index 83371a4bdf0..8462521c369 100644
> --- a/libstdc++-v3/include/bits/ranges_algo.h
> +++ b/libstdc++-v3/include/bits/ranges_algo.h
> @@ -1343,7 +1343,7 @@ namespace ranges
> *__result = *__tail;
> ++__result;
>   }
> -   return {__i, __result};
> +   return {__i, std::move(__result)};
>}
>
>  template
> @@ -2423,14 +2423,14 @@ namespace ranges
>struct __partition_copy_fn
>{
>  template _Sent,
> -weakly_incrementable _Out1, weakly_incrementable _O2,
> +weakly_incrementable _Out1, weakly_incrementable _Out2,
>  typename _Proj = identity,
>  indirect_unary_predicate> _Pred>
>requires indirectly_copyable<_Iter, _Out1>
> -   && indirectly_copyable<_Iter, _O2>
> -  constexpr partition_copy_result<_Iter, _Out1, _O2>
> +   && indirectly_copyable<_Iter, _Out2>
> +  constexpr partition_copy_result<_Iter, _Out1, _Out2>
>operator()(_Iter __first, _Sent __last,
> -_Out1 __out_true, _O2 __out_false,
> +_Out1 __out_true, _Out2 __out_false,
>  _Pred __pred, _Proj __proj = {}) const
>{
> for (; __first != __last; ++__first)
> @@ -2450,18 +2450,18 @@ namespace ranges
>}
>
>  template -weakly_incrementable _O2,
> +weakly_incrementable _Out2,
>  typename _Proj = identity,
>  indirect_unary_predicate, _Proj>>
>_Pred>
>requires indirectly_copyable, _Out1>
> -   && indirectly_copyable, _O2>
> -  constexpr partition_copy_result, _Out1, 
> _O2>
> -  operator()(_Range&& __r, _Out1 out_true, _O2 out_false,
> +   && indirectly_copyable, _Out2>
> +  constexpr partition_copy_result, _Out1, 
> _Out2>
> +  operator()(_Range&& __r, _Out1 __out_true, _Out2 __out_false,
>  _Pred __pred, _Proj __proj = {}) const
>{
> return (*this)(ranges::begin(__r), ranges::end(__r),
> -  std::move(out_true), std::move(out_false),
> +  std::move(__out_true), std::move(__out_false),
>std::move(__pred), std::move(__proj));
>}
>};
> diff --git a/libstdc++-v3/include/bits/ranges_algobase.h 
> b/libstdc++-v3/include/bits/ranges_algobase.h
> index c1037657c4c..78c295981d5 100644
> --- a/libstdc++-v3/include/bits/ranges_algobase.h
> +++ b/libstdc++-v3/include/bits/ranges_algobase.h
> @@ -244,7 +244,7 @@ namespace ranges
>else if constexpr (__is_normal_iterator<_Out>)
> {
>   auto [__in,__out]
> -   = ranges::__copy_or_move<_IsMove>(__first, __last, 
> __result.base());
> +   = ranges::__copy_or_move<_IsMove>(std::move(__first), __last, 
> __result.base());
>   return {std::move(__in), decltype(__result){__out}};
> }
>else if constexpr (sized_sentinel_for<_Sent, _Iter>)
> diff --git a/libstdc++-v3/testsuite/25_algorithms/copy/constrained.cc 
> b/libstdc++-v3/testsuite/25_algorithms/copy/constrained.cc
> index 77ecf99d5b1..a05948a49c6 100644
> --- a/libstdc++-v3/testsuite/25_algorithms/copy/constrained.cc
> +++ b/libstdc++-v3/testsuite/25_algorithms/copy/constrained.cc
> @@ -26,6 +26,7 @@
>  using __gnu_test::test_container;
>  using __gnu_test::test_range;
>  using __gnu_test::input_iterator_wrapper;
> +using __gnu_test::input_iterator_wrapper_nocopy;
>  using __gnu_test::output_iterator_wrapper;
>  using __gnu_test::forward_iterator_wrapper;
>
> @@ -214,6 +215,17 @@ test05()
>return ok;
>  }
>
> +void
> 

Re: [PATCH] libstdc++: Fix up implementation of LWG 3533 [PR101589]

2021-07-26 Thread Jonathan Wakely via Gcc-patches
On Fri, 23 Jul 2021 at 17:35, Patrick Palka via Libstdc++
 wrote:
>
> In r12-569 I accidentally applied the LWG 3533 change that
> was intended for elements_view::iterator::base to
> elements_view::base instead.
>
> This patch corrects this, and also applies the corresponding LWG 3533
> change to lazy_split_view::inner-iter::base now that we implement P2210.
>
> Tested on x86_64-pc-linux-gnu, does this look OK for trunk and release
> branches?

OK, thanks.


>
> PR libstdc++/101589
>
> libstdc++-v3/ChangeLog:
>
> * include/std/ranges (lazy_split_view::_InnerIter::base): Make
> the const& overload unconstrained and return a const reference
> as per LWG 3533.  Make unconditionally noexcept.
> (elements_view::base): Revert accidental r12-569 change.
> (elements_view::_Iterator::base): Make the const& overload
> unconstrained and return a const reference as per LWG 3533.
> Make unconditionally noexcept.
> ---
>  libstdc++-v3/include/std/ranges | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
> index d791e15d096..50b414e8c8c 100644
> --- a/libstdc++-v3/include/std/ranges
> +++ b/libstdc++-v3/include/std/ranges
> @@ -3103,8 +3103,8 @@ namespace views::__adaptor
> : _M_i(std::move(__i))
>   { }
>
> - constexpr iterator_t<_Base>
> - base() const& requires copyable>
> + constexpr const iterator_t<_Base>&
> + base() const& noexcept
>   { return _M_i_current(); }
>
>   constexpr iterator_t<_Base>
> @@ -3786,8 +3786,8 @@ namespace views::__adaptor
> : _M_base(std::move(base))
>{ }
>
> -  constexpr const _Vp&
> -  base() const & noexcept
> +  constexpr _Vp
> +  base() const& requires copy_constructible<_Vp>
>{ return _M_base; }
>
>constexpr _Vp
> @@ -3913,9 +3913,8 @@ namespace views::__adaptor
> : _M_current(std::move(i._M_current))
>   { }
>
> - constexpr iterator_t<_Base>
> - base() const&
> -   requires copyable>
> + constexpr const iterator_t<_Base>&
> + base() const& noexcept
>   { return _M_current; }
>
>   constexpr iterator_t<_Base>
> --
> 2.32.0.349.gdaab8a564f
>



Re: [PATCH] RISC-V: Enable overlap-by-pieces in case of fast unaligned access

2021-07-26 Thread Kito Cheng via Gcc-patches
LGTM, but I would like to wait for Palmer's ack.

I am fine with the current scheme, which simply checks
riscv_slow_unaligned_access_p. If we run into performance issues, we can
consider changing mtune's slow_unaligned_access field to a number rather
than a boolean value, so that we can build a better cost model for it.
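
If we ever do that, a rough sketch of the direction (all names and
values below are hypothetical, not part of this patch):

  /* Grade unaligned accesses by relative cost instead of a boolean,
     so each uarch can state how expensive they really are.  */
  struct riscv_unaligned_costs
  {
    unsigned int unaligned_access_cost;  /* 0 = free, larger = slower.  */
  };

  static const struct riscv_unaligned_costs thead_c906_unaligned = { 0 };

  static bool
  overlap_op_by_pieces_p (const struct riscv_unaligned_costs *t)
  {
    /* Only emit overlapping stores when unaligned accesses are free.  */
    return t->unaligned_access_cost == 0;
  }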


On Fri, Jul 23, 2021 at 6:55 AM Christoph Müllner via Gcc-patches
 wrote:
>
> I have added tests for memset and memcpy.
> Additionally, I have added a test to ensure that -mstrict-align is
> still working.
> I've run the complete GCC test suite with and without the resulting
> patch with no regressions
> (rv32imac/ilp32/medlow, rv32imafdc/ilp32d/medlow,
> rv64imac/lp64/medlow, rv64imafdc/lp64d/medlow).
>
> I have not changed the patch itself, because I think it is reasonable
> to assume that overlapping access
> (i.e. reducing store instructions) will always be cheaper on targets
> that don't have penalties for unaligned accesses.
> If maintainers disagree, I can change accordingly.
>
> The new patch can be found here:
> https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575864.html
>
>
> On Thu, Jul 22, 2021 at 8:23 PM Christoph Müllner  
> wrote:
> >
> > On Thu, Jul 22, 2021 at 7:27 PM Palmer Dabbelt  wrote:
> > >
> > > On Thu, 22 Jul 2021 06:29:46 PDT (-0700), gcc-patches@gcc.gnu.org wrote:
> > > > Could you add a testcase? Otherwise LGTM.
> > > >
> > > > Option: -O2 -mtune=thead-c906 -march=rv64gc -mabi=lp64
> > > > void foo(char *dst){
> > > >__builtin_memset(dst, 0, 15);
> > > > }
> > >
> > > I'd like to see:
> > >
> > > * Test results.  This is only on for one target right now, so relying on
> > >   it to just work on others isn't a good idea.
> >
> > So you prefer the previous version of the patch, where each
> > tune struct can decide if that feature should be enabled or not?
> >
> > > * Something to demonstrate this doesn't break -mstrict-align.
> >
> > -mstrict-align sets riscv_slow_unaligned_access_p to true.
> > This will be returned by TARGET_SLOW_UNALIGNED_ACCESS.
> > So, this cannot happen. I will add an additional test for that.
> >
> > > * Some sort of performance analysis.  Most machines that support
> > >   unaligned access do so with some performance degradation, it's unclear
> > >   that this conversion will actually manifest a performance improvement.
> > >   I don't have a C906 and don't know what workloads people care about
> > >   running on one, but I'm certainly not convinced this is a win --
> > >   what's listed here looks to be the best case, and that's only saving
> > >   two instructions to generate a pretty pathological sequence
> > >   (misaligned access that conflicts with a prior store).
> > >
> > > Jojo: do you have any description of the C906 pipeline?  Specifically in
> > > this case it'd be good to know how it handles unaligned accesses.
> >
> > The tune struct already includes the field slow_unaligned_access.
> > For c906 this is set to false. So the answer is: c906 handles unaligned
> > access reasonably well (assuming the contents of its tune struct are 
> > correct).
> >
> > Note that the overlapping access only happens if unaligned accesses
> > are allowed.
> > If slow_unaligned_access is set, then you will get a sequence of
> > store-byte instructions.
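> >
> > For illustration (hand-written, not taken from the testsuite), the
> > 15-byte memset-zero from the patch description would then expand to
> > byte stores only, roughly:
> >
> >   sb zero,0(a0)
> >   sb zero,1(a0)
> >   ...
> >   sb zero,14(a0)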
> >
> > >
> > > >
> > > > On Thu, Jul 22, 2021 at 8:53 PM Christoph Muellner via Gcc-patches
> > > >  wrote:
> > > >>
> > > >> This patch enables the overlap-by-pieces feature of the by-pieces
> > > >> infrastructure for inlining builtins in case the target has set
> > > >> riscv_slow_unaligned_access_p to false.
> > > >>
> > > >> To demonstrate the effect for targets with fast unaligned access,
> > > >> the following code sequences are generated for a 15-byte memset-zero.
> > > >>
> > > >> Without overlap_op_by_pieces we get:
> > > >>   8e:   00053023sd  zero,0(a0)
> > > >>   92:   00052423sw  zero,8(a0)
> > > >>   96:   00051623sh  zero,12(a0)
> > > >>   9a:   00050723sb  zero,14(a0)
> > > >>
> > > >> With overlap_op_by_pieces we get:
> > > >>   7e:   00053023sd  zero,0(a0)
> > > >>   82:   000533a3sd  zero,7(a0)
> > > >>
> > > >> gcc/ChangeLog:
> > > >>
> > > >> * config/riscv/riscv.c (riscv_overlap_op_by_pieces): New 
> > > >> function.
> > > >> (TARGET_OVERLAP_OP_BY_PIECES_P): Connect to
> > > >> riscv_overlap_op_by_pieces.
> > > >>
> > > >> Signed-off-by: Christoph Muellner 
> > > >> ---
> > > >>  gcc/config/riscv/riscv.c | 11 +++
> > > >>  1 file changed, 11 insertions(+)
> > > >>
> > > >> diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c
> > > >> index 576960bb37c..98c76ba657a 100644
> > > >> --- a/gcc/config/riscv/riscv.c
> > > >> +++ b/gcc/config/riscv/riscv.c
> > > >> @@ -5201,6 +5201,14 @@ riscv_slow_unaligned_access (machine_mode, 
> > > >> unsigned int)
> > > >>return riscv_slow_unaligned_access_p;
> > > >>  }
> > > >>
> > > 

Re: [PATCH 3/3] [PR libfortran/101305] Fix ISO_Fortran_binding.h paths in gfortran testsuite

2021-07-26 Thread Tobias Burnus

Hi Sandra,

On 23.07.21 22:43, Sandra Loosemore wrote:

On 7/23/21 8:15 AM, Tobias Burnus wrote:

On 21.07.21 12:17, Tobias Burnus wrote:

On 13.07.21 23:28, Sandra Loosemore wrote:

ISO_Fortran_binding.h is now generated in the libgfortran build
directory where it is on the default include path.  Adjust includes
[...]

Unfortunately, that does not help.

It seems as if the following works in the *.exp file:
[...]

I'm not seeing the include path failures Tobias is seeing, [...] why
this isn't working for Tobias.  :-S


I also do not have any idea – I did bootstrap before into an empty directory
and I don't think I had other patches applied.

I have no idea why it did not work – nor why it now works. I did now (again?):

* Reset all patches + re-apply your three patches
* Bootstrap into an empty directory with
  $ /configure --prefix=... --enable-multiarch 
--enable-languages=c,c++,fortran,lto,objc
  $ make -j12 && make install
  $ cd gcc
  $ make check-fortran RUNTESTFLAGS="dg.exp=ISO_Fortran_binding_1.f90 
--target_board=unix\{,-m32\}"

and now I got:
=== gfortran Summary for unix ===
# of expected passes 12
=== gfortran Summary for unix/-m32 ===
# of expected passes 12

Thus, it (mostly) works.
(I also did a more complete 'make check-fortran' run.)

 * * *

I did say that it mostly works because of:

$ find x86_64-pc-linux-gnu/ -name ISO_Fortran_binding.h
x86_64-pc-linux-gnu/libgfortran/ISO_Fortran_binding.h
x86_64-pc-linux-gnu/32/libgfortran/ISO_Fortran_binding.h

And when looking at the -B lines, I see for the '' alias '-m64' run:
-B.../build/gcc/testsuite/gfortran/../../
-B.../build/x86_64-pc-linux-gnu/./libgfortran/
-B.../build/x86_64-pc-linux-gnu/./libgfortran/.libs
-B.../build/x86_64-pc-linux-gnu/./libquadmath/.libs

which is fine (second line ensures the ISO*.h file is found.)

But for -m32, I see:

-B.../build/gcc/testsuite/gfortran/../../
-B.../build/x86_64-pc-linux-gnu/./libgfortran/
-B.../build/x86_64-pc-linux-gnu/32/libgfortran/.libs
-B.../build/x86_64-pc-linux-gnu/32/libquadmath/.libs

That also works, but it uses again the same directory for ISO*.h,
such that the -m64 header file is used instead of the -m32 one.

 * * *

I am not sure whether it really matters – the difference between
the header files is (on x86-64-gnu-linux):

-#define CFI_type_int128_t (CFI_type_Integer + (16 << CFI_type_kind_shift))
-#define CFI_type_int_least128_t (CFI_type_Integer + (16 << 
CFI_type_kind_shift))
-#define CFI_type_int_fast128_t (CFI_type_Integer + (16 << CFI_type_kind_shift))
+#define CFI_type_int128_t -2
+#define CFI_type_int_least128_t -2
+#define CFI_type_int_fast128_t -2

There might be larger differences on other multi-arch systems, but at least
for x86-64, it seems to be harmless. (-2 = not available).

For instance, there might be an issue on Windows. (I keep forgetting what
sizeof(long) is with -m64 – is it the same as sizeof(int), as with -m32?
Or is it 64 bit with -m64?)
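
For the record: 64-bit Windows uses the LLP64 data model, so
sizeof(long) stays 4 bytes there even with -m64, while Linux/x86-64 is
LP64 with an 8-byte long. A one-liner to check on any target:

  #include <stdio.h>
  int main (void) { printf ("%zu\n", sizeof (long)); return 0; }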

 * * *

Thus, I am puzzled why it failed before and no longer does. But given that it
works now:

LGTM, thanks for the patch and sorry for the confusion!

Tobias

PS: Still, it would be nice if the proper multi-lib ISO*.h could be found;
while it usually does not matter, it could do so in some cases.



Re: [PATCH] Analyzer: Refactor callstring to work with pairs of supernodes.

2021-07-26 Thread Ankur Saini via Gcc-patches



> On 25-Jul-2021, at 7:59 PM, Prathamesh Kulkarni 
>  wrote:
> 
> !(*this == other)

Thanks for the suggestion.
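
Applied as suggested; as a sketch, the change amounts to the following
(the exact signature is the one in the attached patch):

  bool
  call_string::operator!= (const call_string &other) const
  {
    return !(*this == other);
  }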

Here is the updated patch.



Thank you
- Ankur 

Re: [PATCH V2] simplify-rtx: Push sign/zero-extension inside vec_duplicate

2021-07-26 Thread Jonathan Wright via Gcc-patches
Hi,

This updated patch fixes the two-operators-per-row style issue in the
aarch64-simd.md RTL patterns and integrates the simplify-rtx.c change
as suggested.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  

* config/aarch64/aarch64-simd.md: Push sign/zero-extension
inside vec_duplicate for all patterns.
* simplify-rtx.c (simplify_context::simplify_unary_operation_1):
Push sign/zero-extension inside vec_duplicate.



From: Richard Sandiford 
Sent: 22 July 2021 18:36
To: Jonathan Wright 
Cc: gcc-patches@gcc.gnu.org ; Kyrylo Tkachov 

Subject: Re: [PATCH] simplify-rtx: Push sign/zero-extension inside 
vec_duplicate 
 
Jonathan Wright  writes:
> Hi,
>
> As a general principle, vec_duplicate should be as close to the root
> of an expression as possible. Where unary operations have
> vec_duplicate as an argument, these operations should be pushed
> inside the vec_duplicate.
>
> This patch modifies unary operation simplification to push
> sign/zero-extension of a scalar inside vec_duplicate.
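>
> For example (modes chosen purely for illustration):
>
>   (sign_extend:V8HI (vec_duplicate:V8QI (reg:QI x)))
>
> is canonicalized to
>
>   (vec_duplicate:V8HI (sign_extend:HI (reg:QI x)))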
>
> This patch also updates all RTL patterns in aarch64-simd.md to use
> the new canonical form.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu and
> x86_64-none-linux-gnu - no issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-19  Jonathan Wright  
>
> * config/aarch64/aarch64-simd.md: Push sign/zero-extension
> inside vec_duplicate for all patterns.
> * simplify-rtx.c (simplify_context::simplify_unary_operation_1):
> Push sign/zero-extension inside vec_duplicate.
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 74890989cb3045798bf8d0241467eaaf72238297..99a95a54248041906b9a0ad742d3a0dca9733b35
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2092,14 +2092,14 @@
>  
>  (define_insn "aarch64_mlal_hi_n_insn"
>    [(set (match_operand: 0 "register_operand" "=w")
> -    (plus:
> -  (mult:
> -  (ANY_EXTEND: (vec_select:
> - (match_operand:VQ_HSI 2 "register_operand" "w")
> - (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> -  (ANY_EXTEND: (vec_duplicate:
> -    (match_operand: 4 "register_operand" ""
> -  (match_operand: 1 "register_operand" "0")))]
> + (plus:
> +   (mult:
> +   (ANY_EXTEND: (vec_select:
> +  (match_operand:VQ_HSI 2 "register_operand" "w")
> +  (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
> +  (vec_duplicate: (ANY_EXTEND:
> +  (match_operand: 4 "register_operand" ""
> +   (match_operand: 1 "register_operand" "0")))]

Sorry to nitpick, since this is pre-existing, but I think the pattern
would be easier to read with one operation per line.  I.e.:

    (plus:
  (mult:
    (ANY_EXTEND:
  (vec_select:
    (match_operand:VQ_HSI 2 "register_operand" "w")
    (match_operand:VQ_HSI 3 "vect_par_cnst_hi_half" "")))
    (vec_duplicate:
  (ANY_EXTEND:
    (match_operand: 4 "register_operand" ""
  (match_operand: 1 "register_operand" "0")))]

Same for the other patterns with similar doubling of operators.
(It looks like you've fixed other indentation problems though, thanks.)

> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index 
> 2d169d3f9f70c85d396adaed124b6c52aca98f07..f885816412f7576d2535f827562d2b425a6a553b
>  100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -903,6 +903,18 @@ simplify_context::simplify_unary_operation_1 (rtx_code 
> code, machine_mode mode,
>    rtx temp, elt, base, step;
>    scalar_int_mode inner, int_mode, op_mode, op0_mode;
>  
> +  /* Extending a VEC_DUPLICATE of a scalar should be canonicalized to a
> + VEC_DUPLICATE of an extended scalar. This is outside of the main switch
> + as we may wish to push all unary operations inside VEC_DUPLICATE. */
> +  if ((code == SIGN_EXTEND || code == ZERO_EXTEND)
> +  && GET_CODE (op) == VEC_DUPLICATE
> +  && GET_MODE_NUNITS (GET_MODE (XEXP (op, 0))).to_constant () == 1)
> +    {
> +  rtx x = simplify_gen_unary (code, GET_MODE_INNER (mode),
> +   XEXP (op, 0), GET_MODE (XEXP (op, 0)));
> +  return gen_vec_duplicate (mode, x);
> +    }
> +
>    switch (code)
>  {
>  case NOT:

This is really an extension of the existing code:

  if (VECTOR_MODE_P (mode)
  && vec_duplicate_p (op, &elt)
  && code != VEC_DUPLICATE)
    {
  /* Try applying the operator to ELT and see if that simplifies.
 We can duplicate the result if so.

 The reason we don't use simplify_gen_unary is that it isn't
 necessarily a win to convert things like:


Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.

2021-07-26 Thread Hongtao Liu via Gcc-patches
Correcting the mailing list – please reply under this email.

On Mon, Jul 26, 2021 at 4:47 PM liuhongt  wrote:
>
> Hi:
>   As described in the PR, the pinsr instruction has poor throughput on SKX
> and CLX, which leads to worse vectorized performance in some cases.
> This patch adds a cost member named integer_to_sse to model the pinsr/movd
> instructions used by vector construction; the cost is the same as sse_op on
> other targets, but twice sse_op on CLX/SKX.
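>
> For reference, this is the kind of vector construction affected:
> building a vector from scalar integers goes through movd/pinsrd
> (illustrative snippet, not the testcase from the patch):
>
>   #include <immintrin.h>
>   __m128i
>   make_vec (int a, int b, int c, int d)
>   {
>     /* Typically compiles to a movd plus three pinsrd.  */
>     return _mm_set_epi32 (d, c, b, a);
>   }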
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>   Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/99881
> * config/i386/i386.h (processor_costs): Add new member
> integer_to_sse.
> * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> generic_cost, core_cost): Initialize integer_to_sse same value
> as sse_op.
> (skylake_cost): Initialize integer_to_sse twice as much as sse_op.
> * config/i386/i386.c (ix86_builtin_vectorization_cost):
> Use integer_to_sse instead of sse_op to calculate the cost of
> vec_construct.
>
> gcc/testsuite/ChangeLog:
>
> PR target/99881
> * gcc.target/i386/pr99881.c: New test.
> ---
>  gcc/config/i386/i386.c  |  6 ++-
>  gcc/config/i386/i386.h  |  1 +
>  gcc/config/i386/x86-tune-costs.h| 26 +
>  gcc/testsuite/gcc.target/i386/pr99881.c | 49 +
>  4 files changed, 81 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index ff96134fb37..fbebd2d8f9a 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum 
> vect_cost_for_stmt type_of_cost,
>case vec_construct:
> {
>   /* N element inserts into SSE vectors.  */
> - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> + int cost
> +   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> +   ix86_cost->sse_op
> +   : ix86_cost->integer_to_sse);
> +
>   /* One vinserti128 for combining two SSE vectors for AVX256.  */
>   if (GET_MODE_BITSIZE (mode) == 256)
> cost += ix86_vec_cost (mode, ix86_cost->addss);
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 0c2c93daf32..d1e1c225990 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -165,6 +165,7 @@ struct processor_costs {
>const int xmm_move, ymm_move, /* cost of moving XMM and YMM register.  */
> zmm_move;
>const int sse_to_integer;/* cost of moving SSE register to integer.  */
> +  const int integer_to_sse;/* cost of moving integer to SSE register.  */
>const int gather_static, gather_per_elt; /* Cost of gather load is computed
>as static + per_item * nelts. */
>const int scatter_static, scatter_per_elt; /* Cost of gather store is
> diff --git a/gcc/config/i386/x86-tune-costs.h 
> b/gcc/config/i386/x86-tune-costs.h
> index ffe810f2bcb..67cfa006196 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -102,6 +102,7 @@ struct processor_costs ix86_size_cost = {/* costs for 
> tuning for size */
>in 128bit, 256bit and 512bit */
>3, 3, 3, /* cost of moving XMM,YMM,ZMM 
> register */
>3,   /* cost of moving SSE register to 
> integer.  */
> +  COSTS_N_BYTES (2),   /* cost of moving integer to sse 
> register.  */
>5, 0,/* Gather load static, 
> per_elt.  */
>5, 0,/* Gather store static, 
> per_elt.  */
>0,   /* size of l1 cache  */
> @@ -211,6 +212,7 @@ struct processor_costs i386_cost = {/* 386 
> specific costs */
>{4, 8, 16, 32, 64},  /* cost of unaligned stores.  */
>2, 4, 8, /* cost of moving XMM,YMM,ZMM 
> register */
>3,   /* cost of moving SSE register to 
> integer.  */
> +  COSTS_N_INSNS (1),   /* cost of moving integer to sse 
> register.  */
>4, 4,/* Gather load static, 
> per_elt.  */
>4, 4,/* Gather store static, 
> per_elt.  */
>0,   /* size of l1 

Re: [PATCH v2, Fortran] [PR libfortran/101317] Bind(c): Improve error checking in CFI_* functions

2021-07-26 Thread Tobias Burnus

Hi Sandra,

On 25.07.21 06:11, Sandra Loosemore wrote:

Congratulation – we have found a bug in the spec, which is also
present in the current draft (21-007). I have now written to J3:
https://mailman.j3-fortran.org/pipermail/j3/2021-July/013189.html


That discussion seems to have wandered off in some other direction,
so I'm not sure whether it really clarifies this problem.


I concur. I do hope that it will be at some point discussed and clarified.

But for now:


For the purposes of this patch I have left in the test for elem_len >
0 in CFI_establish where the standard explicitly has that requirement
and removed it from the other functions where I'd added it just to be
consistent.

I think that makes sense.

OK, I have done that throughout the file, and also made the wording
change you asked for.  While I was at it, I went through all the
diagnostic messages in the file and simplified the wording of a few
other messages as well, fixed typos and inconsistent capitalization
and missing punctuation and things like that.

Thanks!

Here's a new patch.


LGTM.

Thanks,

Tobias



[PATCH] gimple-fold: Fix up __builtin_clear_padding on classes with virtual inheritence [PR101586]

2021-07-26 Thread Jakub Jelinek via Gcc-patches
Hi!

For the following testcase, B is a 16-byte type containing an 8-byte
virtual pointer and a 1-byte A member, and C contains two FIELD_DECLs,
one of type B with a size of just 8 bytes, followed by a field of type
A with 1-byte size.
The __builtin_clear_padding code was upset about the B-typed FIELD_DECL
containing FIELD_DECLs beyond the field size and triggered an
assertion failure.
This patch makes it ignore all FIELD_DECLs that are (fully) beyond the sz
passed from the caller (except for the flexible array member
diagnostics, which are kept).
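
A sketch of the layout involved (assuming a 64-bit target; the offsets
illustrate the description above, they are not dumped from the
compiler):

  struct C, 16 bytes total:
    [0..7]   FIELD_DECL of type B, DECL_SIZE just 8 bytes (the vptr)
    [8]      FIELD_DECL of type A, 1 byte
    [9..15]  tail padding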

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-07-26  Jakub Jelinek  

PR middle-end/101586
* gimple-fold.c (clear_padding_type): Ignore FIELD_DECLs with byte
positions above or equal to sz except for diagnostics of flexible
array members.

* g++.dg/torture/builtin-clear-padding-4.C: New test.

--- gcc/gimple-fold.c.jj2021-07-20 10:08:09.911687453 +0200
+++ gcc/gimple-fold.c   2021-07-23 12:00:32.212892331 +0200
@@ -4697,6 +4697,8 @@ clear_padding_type (clear_padding_struct
if (fldsz == 0)
  continue;
HOST_WIDE_INT pos = int_byte_position (field);
+   if (pos >= sz)
+ continue;
HOST_WIDE_INT bpos
  = tree_to_uhwi (DECL_FIELD_BIT_OFFSET (field));
bpos %= BITS_PER_UNIT;
@@ -4772,6 +4774,8 @@ clear_padding_type (clear_padding_struct
else
  {
HOST_WIDE_INT pos = int_byte_position (field);
+   if (pos >= sz)
+ continue;
HOST_WIDE_INT fldsz = tree_to_shwi (DECL_SIZE_UNIT (field));
gcc_assert (pos >= 0 && fldsz >= 0 && pos >= cur_pos);
clear_padding_add_padding (buf, pos - cur_pos);
--- gcc/testsuite/g++.dg/torture/builtin-clear-padding-4.C.jj   2021-07-23 
12:53:48.372773954 +0200
+++ gcc/testsuite/g++.dg/torture/builtin-clear-padding-4.C  2021-07-23 
12:50:41.855114231 +0200
@@ -0,0 +1,44 @@
+// PR middle-end/101586
+
+struct A { char a; };
+struct B : virtual A {};
+struct C : B {};
+struct D : virtual A, C {};
+
+__attribute__((noipa)) A *
+baz (C *p, D *q)
+{
+  if (p)
+    return dynamic_cast <A *> (p);
+  else
+    return dynamic_cast <A *> (q);
+}
+
+void
+foo ()
+{ 
+  C c;
+  c.a = 42;
+  __builtin_clear_padding (&c);
+  A *p = baz (&c, 0);
+  if (c.a != 42 || p->a != 42)
+__builtin_abort ();
+}
+
+void
+bar ()
+{
+  D d;
+  d.a = 42;
+  __builtin_clear_padding (&d);
+  A *p = baz (0, &d);
+  if (d.a != 42 || p->a != 42)
+__builtin_abort ();
+}
+
+int
+main ()
+{
+  foo ();
+  bar ();
+}

Jakub



[committed] openmp: Add support for omp attributes section and scan directives

2021-07-26 Thread Jakub Jelinek via Gcc-patches
Hi!

This patch adds support for expressing the section and scan directives
using the attribute syntax and additionally fixes some bugs in the attribute
syntax directive handling.
For now it requires that the scan and section directives appear as the only
attribute, not combined with other OpenMP or non-OpenMP attributes on the same
statement.

Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk.

2021-07-26  Jakub Jelinek  

* parser.h (struct cp_lexer): Add orphan_p member.
* parser.c (cp_parser_statement): Don't change in_omp_attribute_pragma
upon restart from CPP_PRAGMA handling.  Fix up condition when a lexer
should be destroyed and adjust saved_tokens if it records tokens from
the to be destroyed lexer.
(cp_parser_omp_section_scan): New function.
(cp_parser_omp_scan_loop_body): Use it.  If
parser->lexer->in_omp_attribute_pragma, allow optional comma
after scan.
(cp_parser_omp_sections_scope): Use cp_parser_omp_section_scan.

* g++.dg/gomp/attrs-1.C: Use attribute syntax even for section
and scan directives.
* g++.dg/gomp/attrs-2.C: Likewise.
* g++.dg/gomp/attrs-6.C: New test.
* g++.dg/gomp/attrs-7.C: New test.
* g++.dg/gomp/attrs-8.C: New test.

--- gcc/cp/parser.h.jj  2021-07-23 09:37:23.615877596 +0200
+++ gcc/cp/parser.h 2021-07-23 19:17:58.125904410 +0200
@@ -117,6 +117,10 @@ struct GTY (()) cp_lexer {
   /* True if we're in the context of OpenMP directives written as C++11
  attributes turned into pragma.  */
   bool in_omp_attribute_pragma;
+
+  /* True for in_omp_attribute_pragma lexer that should be destroyed
+ when it is no longer in use.  */
+  bool orphan_p;
 };
 
 
--- gcc/cp/parser.c.jj  2021-07-23 09:37:23.619877539 +0200
+++ gcc/cp/parser.c 2021-07-23 19:31:58.739126553 +0200
@@ -11901,10 +11901,9 @@ cp_parser_statement (cp_parser* parser,
   tree statement, std_attrs = NULL_TREE;
   cp_token *token;
   location_t statement_location, attrs_loc;
-  bool in_omp_attribute_pragma;
+  bool in_omp_attribute_pragma = parser->lexer->in_omp_attribute_pragma;
 
  restart:
-  in_omp_attribute_pragma = parser->lexer->in_omp_attribute_pragma;
   if (if_p != NULL)
 *if_p = false;
   /* There is no statement yet.  */
@@ -11951,6 +11950,7 @@ cp_parser_statement (cp_parser* parser,
 the statement.  */
  cp_parser_label_for_labeled_statement (parser, std_attrs);
  in_compound = false;
+ in_omp_attribute_pragma = parser->lexer->in_omp_attribute_pragma;
  goto restart;
 
case RID_IF:
@@ -12034,6 +12034,7 @@ cp_parser_statement (cp_parser* parser,
 
  cp_parser_label_for_labeled_statement (parser, std_attrs);
  in_compound = false;
+ in_omp_attribute_pragma = parser->lexer->in_omp_attribute_pragma;
  goto restart;
}
 }
@@ -12058,13 +12059,28 @@ cp_parser_statement (cp_parser* parser,
cp_parser_pragma (parser, pragma_compound, if_p);
   else if (!cp_parser_pragma (parser, pragma_stmt, if_p))
do_restart = true;
-  if (lexer->in_omp_attribute_pragma && !in_omp_attribute_pragma)
-   {
- gcc_assert (parser->lexer != lexer);
+  if (parser->lexer != lexer
+ && lexer->in_omp_attribute_pragma
+ && (!in_omp_attribute_pragma || lexer->orphan_p))
+   {
+ if (saved_tokens.lexer == lexer)
+   {
+ if (saved_tokens.commit)
+   cp_lexer_commit_tokens (lexer);
+ gcc_assert (lexer->saved_tokens.length () == saved_tokens.len);
+ saved_tokens.lexer = parser->lexer;
+ saved_tokens.commit = false;
+ saved_tokens.len = parser->lexer->saved_tokens.length ();
+   }
  cp_lexer_destroy (lexer);
+ lexer = parser->lexer;
}
   if (do_restart)
goto restart;
+  if (parser->lexer == lexer
+ && lexer->in_omp_attribute_pragma
+ && !in_omp_attribute_pragma)
+   parser->lexer->orphan_p = true;
   return;
 }
   else if (token->type == CPP_EOF)
@@ -40775,6 +40791,77 @@ cp_finish_omp_range_for (tree orig, tree
 cp_finish_decomp (decl, decomp_first_name, decomp_cnt);
 }
 
+/* Return true if next tokens contain a standard attribute that contains
+   omp::directive (DIRECTIVE).  */
+
+static bool
+cp_parser_omp_section_scan (cp_parser *parser, const char *directive,
+   bool tentative)
+{
+  size_t n = cp_parser_skip_attributes_opt (parser, 1), i;
+  if (n < 10)
+return false;
+  for (i = 5; i < n - 4; i++)
+if (cp_lexer_nth_token_is (parser->lexer, i, CPP_NAME)
+   && cp_lexer_nth_token_is (parser->lexer, i + 1, CPP_OPEN_PAREN)
+   && cp_lexer_nth_token_is (parser->lexer, i + 2, CPP_NAME))
+  {
+   tree first = cp_lexer_peek_nth_token (parser->lexer, i)->u.value;
+   tree second = cp_lexer_peek_nth_token